# KNN Regression

In [1]:
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn import neighbors

%matplotlib inline

## Create Dataset

In [2]:
# Toy song dataset: two predictors (duration, loudness) and the bpm target.
# Built in one shot from a dict so the three columns are declared together.
music = pd.DataFrame({
    'duration': [
        184, 134, 243, 186, 122, 197, 294, 382, 102, 264,
        205, 110, 307, 110, 397, 153, 190, 192, 210, 403,
        164, 198, 204, 253, 234, 190, 182, 401, 376, 102,
    ],
    'loudness': [
        18, 34, 43, 36, 22, 9, 29, 22, 10, 24,
        20, 10, 17, 51, 7, 13, 19, 12, 21, 22,
        16, 18, 4, 23, 34, 19, 14, 11, 37, 42,
    ],
    'bpm': [
        105, 90, 78, 75, 120, 110, 80, 100, 105, 60,
        70, 105, 95, 70, 90, 105, 70, 75, 102, 100,
        100, 95, 90, 80, 90, 80, 100, 105, 70, 65,
    ],
})

In [3]:
# Shared accumulator for grid-search results: one entry per evaluated model,
# with 'k' and 'cv_score' kept as parallel lists.
model_params = {name: [] for name in ('k', 'cv_score')}

In [4]:
def clear_dict():
    """Rebind every key of the module-level ``model_params`` to a new empty list."""
    emptied = {name: [] for name in model_params}
    model_params.update(emptied)

In [5]:
def predict_bpm(df, k, weights='uniform', num_folds=None):
    """Cross-validate a KNN regressor that predicts bpm from duration and loudness.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'duration', 'loudness' and 'bpm'.
    k : int
        Number of neighbors passed to ``KNeighborsRegressor``.
    weights : str, default 'uniform'
        Neighbor weighting scheme passed to ``KNeighborsRegressor``.
    num_folds : int or None, default None
        Passed as ``cv`` to ``cross_val_score``. When falsy, no scoring is
        done and nothing is recorded.

    Returns
    -------
    float or None
        Mean cross-validation score across folds (the estimator's default
        scorer, which is R^2 for scikit-learn regressors), or ``None`` when
        ``num_folds`` is falsy.

    Side effects
    ------------
    When scoring runs, the mean score is appended to the module-level
    ``model_params['cv_score']`` and a summary line is printed.
    """
    knn = neighbors.KNeighborsRegressor(n_neighbors=k, weights=weights)
    X = df.loc[:, ['duration', 'loudness']]
    Y = df['bpm']

    # The previous version fit the model on all rows and predicted on a
    # 0-50 diagonal grid whose result was never used (and which lies outside
    # the observed duration range). cross_val_score clones and refits the
    # estimator per fold, so that work had no effect on the reported score.
    if not num_folds:
        return None

    cv_score = cross_val_score(knn, X, Y, cv=num_folds)
    mean_score = cv_score.mean()
    model_params['cv_score'].append(mean_score)
    print(f'Cross validation score: {mean_score:0.3f} +/-{cv_score.std():0.3f}')
    return mean_score

In [6]:
def grid_search(df, k_list, weights='uniform', num_folds=10):
    """Cross-validate one KNN model per value in ``k_list`` and tabulate the scores.

    Parameters
    ----------
    df : pandas.DataFrame
        Data forwarded unchanged to ``predict_bpm``.
    k_list : iterable of int
        Neighbor counts to evaluate, in order.
    weights : str, default 'uniform'
        Neighbor weighting scheme forwarded to ``predict_bpm``.
    num_folds : int, default 10
        Number of cross-validation folds forwarded to ``predict_bpm``. Must be
        truthy: ``predict_bpm`` records a score only when it is, and without
        scores the 'k' and 'cv_score' columns end up with different lengths
        and the DataFrame cannot be built.

    Returns
    -------
    pandas.DataFrame
        One row per evaluated ``k`` with columns 'k' and 'cv_score'.
    """
    # Results are accumulated in the module-level ``model_params``. Reset it
    # first so leftovers from an interrupted or failed earlier run cannot leak
    # into this one, and reset it again on the way out even if scoring raises.
    clear_dict()
    try:
        for k in k_list:
            model_params['k'].append(k)
            predict_bpm(df, k, weights=weights, num_folds=num_folds)

        return pd.DataFrame(model_params)
    finally:
        clear_dict()

In [7]:
# Uniform-weight search over k = 1, 4, 7, ..., 19 with 5-fold cross-validation.
k_list = np.arange(1, 20, step=3)
cv_results_uniform = grid_search(music, k_list, weights='uniform', num_folds=5)

Cross validation score: -1.542 +/-2.337
Cross validation score: -0.844 +/-0.627
Cross validation score: -0.459 +/-0.381
Cross validation score: -0.421 +/-0.388
Cross validation score: -0.376 +/-0.366
Cross validation score: -0.320 +/-0.258
Cross validation score: -0.336 +/-0.303


In [8]:
# Rank the uniform-weight runs, highest mean CV score first.
cv_results_uniform.sort_values('cv_score', ascending=False)

Unnamed: 0,k,cv_score
5,16,-0.320449
6,19,-0.33557
4,13,-0.375911
3,10,-0.420533
2,7,-0.459495
1,4,-0.844387
0,1,-1.542004


In [9]:
# Distance-weighted search over k = 1, 4, 7, ..., 22 with 5-fold cross-validation.
k_list = np.arange(1, 25, step=3)
cv_results_weighted = grid_search(music, k_list, weights='distance', num_folds=5)

Cross validation score: -1.542 +/-2.337
Cross validation score: -0.635 +/-0.820
Cross validation score: -0.302 +/-0.394
Cross validation score: -0.265 +/-0.371
Cross validation score: -0.248 +/-0.333
Cross validation score: -0.229 +/-0.298
Cross validation score: -0.227 +/-0.298
Cross validation score: -0.231 +/-0.302


In [10]:
# Rank the distance-weighted runs, highest mean CV score first.
cv_results_weighted.sort_values('cv_score', ascending=False)

Unnamed: 0,k,cv_score
6,19,-0.226719
5,16,-0.228537
7,22,-0.230851
4,13,-0.247531
3,10,-0.264842
2,7,-0.302368
1,4,-0.634759
0,1,-1.542004


## Conclusions
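Even after adding an additional feature, the model is still performing horribly: every mean cross-validation score (R², the default score for a scikit-learn regressor) is negative, meaning each model does worse than simply predicting the mean bpm. The model using weighting by distance with 19 neighbors performed the best (mean score of about -0.227). Note that the distance-weighted search covered k up to 22 while the uniform search stopped at 19, so the two grids are not identical.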