# KNN Model

The first model I'd like to try is a KNN model. I expect that this model will perform the worst out of the four models I've decided to create. This is because... (**TODO:** finish this explanation.)

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

In [2]:
# Folder holding the FMA metadata CSVs, relative to this notebook.
path = '../data/fma_metadata/'

# Both files use the first column as the row index. tracks.csv has a two-row
# column header, features.csv a three-row one.
tracks = pd.read_csv(
    path + 'tracks.csv',
    header=[0, 1],
    index_col=0,
)
features = pd.read_csv(
    path + 'features.csv',
    header=[0, 1, 2],
    index_col=0,
)

## Retrieve all of the features and labels

In [21]:
# Build boolean row masks over `tracks`: one for membership in the "small"
# subset, and one per partition. FMA ships with the tracks already assigned
# to training / validation / test, so that split is reused as-is.
small = tracks[('set', 'subset')].eq('small')
train = tracks[('set', 'split')].eq('training')
val = tracks[('set', 'split')].eq('validation')
test = tracks[('set', 'split')].eq('test')

# Genre labels (top-level genre) for each partition of the small subset.
genre_column = ('track', 'genre_top')
y_train = tracks[genre_column].loc[small & train]
y_val = tracks[genre_column].loc[small & val]
y_test = tracks[genre_column].loc[small & test]

# MFCC features. Only the per-coefficient means are used for now: the author's
# note is that FMA's feature statistics come from librosa's mfcc output, and
# that the common practice in articles and papers is to summarise each
# coefficient by its mean and use that vector of means as the features.
X_train_mfcc = features['mfcc']['mean'].loc[small & train]
X_val_mfcc = features['mfcc']['mean'].loc[small & val]
X_test_mfcc = features['mfcc']['mean'].loc[small & test]

# Quick sanity check on partition sizes and dimensionality.
print('{} training examples'.format(len(y_train)))
print('{} cross validation examples'.format(len(y_val)))
print('{} testing examples'.format(len(y_test)))
print('{} features, {} classes'.format(X_train_mfcc.shape[1], len(np.unique(y_train))))

6400 training examples
800 cross validation examples
800 testing examples
20 features, 8 classes


In music, the term chroma feature or chromagram closely relates to the twelve different pitch classes. This could be a useful feature because different types of music likely use different pitches more frequently.

In [22]:
# Per-bin mean of the chroma_cens features for each partition of the small subset.
X_train_chroma_cens = features['chroma_cens']['mean'].loc[small & train]
X_val_chroma_cens = features['chroma_cens']['mean'].loc[small & val]
X_test_chroma_cens = features['chroma_cens']['mean'].loc[small & test]

In [23]:
# Place the MFCC means and the chroma means side by side (column-wise) for each
# partition. NOTE: both blocks carry numeric column labels, so the combined
# frame contains repeated labels; select by position if you need one of them.
X_train = pd.concat(
    [X_train_mfcc, X_train_chroma_cens],
    axis='columns',
    sort=False,
)
X_val = pd.concat(
    [X_val_mfcc, X_val_chroma_cens],
    axis='columns',
    sort=False,
)
X_test = pd.concat(
    [X_test_mfcc, X_test_chroma_cens],
    axis='columns',
    sort=False,
)

### Adding more features

Now that I've combined the MFCC and chroma means, I'd like to add a couple more features to the model. The features that I'm going to be adding are the Spectral Centroid, Spectral Rolloff, Zero Crossing Rate, RMSE, and Spectral Bandwidth.

In [24]:
feats = ['spectral_centroid', 'spectral_bandwidth', 'spectral_rolloff', 'zcr', 'rmse']

# Each design matrix paired with the row mask that selects its tracks.
splits = (
    (X_train, small & train),
    (X_val, small & val),
    (X_test, small & test),
)

# Append the mean of every extra feature as a new column on all three frames.
for feat in feats:
    for design, rows in splits:
        design[feat] = features.loc[rows, feat]['mean']

X_train.head()

number,01,02,03,04,05,06,07,08,09,10,...,08,09,10,11,12,spectral_centroid,spectral_bandwidth,spectral_rolloff,zcr,rmse
track_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,-163.772964,116.696678,-41.753826,29.144329,-15.050158,18.879372,-8.918165,12.002118,-4.253151,1.359791,...,0.248795,0.196245,0.175809,0.200713,0.319972,1639.583252,1607.474365,3267.804688,0.085629,3.188761
5,-205.440491,132.215073,-16.085823,41.514759,-7.642954,16.942802,-5.651261,9.569445,0.503157,8.673513,...,0.293982,0.346324,0.289821,0.246368,0.220939,1292.95813,1512.917358,2773.931885,0.053114,3.251386
10,-135.864822,157.040085,-53.453247,17.198896,6.868035,13.934344,-11.749298,8.360711,-5.130381,0.233845,...,0.349137,0.268424,0.243144,0.268941,0.236763,1360.028687,1420.259644,2603.491943,0.077515,3.89381
140,-225.713318,139.332825,-13.097699,44.533356,2.4684,28.328743,-9.931481,10.810857,3.002879,-0.937692,...,0.19197,0.291551,0.319938,0.198516,0.120607,1232.633789,1475.625366,2583.01416,0.052379,2.953848
141,-253.143906,155.716324,-16.636627,23.683815,6.045957,11.692952,-9.947761,6.887814,-3.273322,-6.340906,...,0.181313,0.177233,0.296048,0.331963,0.218315,941.244141,1192.835571,1905.394531,0.040267,2.576761


## Combine train and validation data

I'm doing this because I am going to use a grid search with K-fold cross-validation. Therefore, there is no need for a separate cross-validation set.

In [25]:
# Fold the validation split into the training data, since the grid search
# below does its own cross-validation.
#
# Rows whose track id already belongs to the validation split are dropped
# before concatenating, so re-running this cell does not stack another copy of
# the validation set onto X_train / y_train. On a first run nothing is dropped
# and the result is simply train followed by validation.
X_train = pd.concat(
    [X_train.drop(index=X_val.index, errors='ignore'), X_val],
    axis=0,
    sort=False,
)
y_train = pd.concat(
    [y_train.drop(index=y_val.index, errors='ignore'), y_val],
    axis=0,
    sort=False,
)

print('{} training examples'.format(y_train.size))

7200 training examples


In [8]:
# Learn the standardisation statistics from the training data only, then apply
# that same fitted transform to both the training and the test features.
scaler = StandardScaler().fit(X_train)

X_train_sc, X_test_sc = (scaler.transform(frame) for frame in (X_train, X_test))

## Train the model

In [26]:
# Hyper-parameter grid: both neighbour-weighting schemes, with k from 15 to 30
# inclusive.
parameters = {
    'weights': ('uniform', 'distance'),
    'n_neighbors': range(15, 31),
}

# Exhaustive search over the grid using the estimator's default settings
# otherwise; cross-validation and scoring are left at GridSearchCV's defaults.
neigh = KNeighborsClassifier()
clf = GridSearchCV(estimator=neigh, param_grid=parameters)

In [27]:
# Run the grid search on the standardised training features.
clf.fit(X=X_train_sc, y=y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                            metric='minkowski',
                                            metric_params=None, n_jobs=None,
                                            n_neighbors=5, p=2,
                                            weights='uniform'),
             iid='deprecated', n_jobs=None,
             param_grid={'n_neighbors': range(15, 31),
                         'weights': ('uniform', 'distance')},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [28]:
# Summarise the grid search: the winning hyper-parameters first, then the
# cross-validated score of every candidate.
print("Best parameters set found on development set:")
print()
print(clf.best_params_)
print()
print("Grid scores on development set:")
print()
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    # Mean score across the CV folds with a +/- two-standard-deviation band.
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
# The search was fitted on standardised features (X_train_sc), so the held-out
# set must go through the same scaler. Predicting on the raw X_test would feed
# the KNN distances on a completely different scale and produce meaningless
# predictions.
y_true, y_pred = y_test, clf.predict(X_test_sc)
print(classification_report(y_true, y_pred))
print()

Best parameters set found on development set:

{'n_neighbors': 22, 'weights': 'distance'}

Grid scores on development set:

0.373 (+/-0.052) for {'n_neighbors': 15, 'weights': 'uniform'}
0.376 (+/-0.051) for {'n_neighbors': 15, 'weights': 'distance'}
0.371 (+/-0.049) for {'n_neighbors': 16, 'weights': 'uniform'}
0.375 (+/-0.053) for {'n_neighbors': 16, 'weights': 'distance'}
0.376 (+/-0.054) for {'n_neighbors': 17, 'weights': 'uniform'}
0.380 (+/-0.051) for {'n_neighbors': 17, 'weights': 'distance'}
0.379 (+/-0.053) for {'n_neighbors': 18, 'weights': 'uniform'}
0.380 (+/-0.054) for {'n_neighbors': 18, 'weights': 'distance'}
0.382 (+/-0.054) for {'n_neighbors': 19, 'weights': 'uniform'}
0.380 (+/-0.053) for {'n_neighbors': 19, 'weights': 'distance'}
0.383 (+/-0.055) for {'n_neighbors': 20, 'weights': 'uniform'}
0.385 (+/-0.052) for {'n_neighbors': 20, 'weights': 'distance'}
0.382 (+/-0.056) for {'n_neighbors': 21, 'weights': 'uniform'}
0.389 (+/-0.049) for {'n_neighbors': 21, 'weights':

  _warn_prf(average, modifier, msg_start, len(result))
