# KNN

In [1]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import neighbors
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

## Regression

### Basic single model:

In [2]:
# Load the Boston housing data: features in `.data`, regression target in `.target`.
# NOTE(review): load_boston was removed in scikit-learn 1.2 -- this notebook needs
# an older release; confirm the pinned version.
boston = load_boston()

# Shuffle once with a fixed seed, then hold out 20% of the rows for testing.
# train_test_split shuffles again by default, so it is seeded as well; without
# that seed the split (and every score derived from it) changes on each run.
X, y = shuffle(boston.data, boston.target, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=1
)

In [3]:
# Rule-of-thumb neighbourhood size: the square root of the number of training
# samples, truncated to an integer.
n_train = len(X_train)
k = int(n_train ** 0.5)
print("k: " + str(k))

k: 20


In [4]:
# Baseline: a single unweighted k-NN regressor using the heuristic k,
# evaluated by mean squared error on the held-out split.
params = dict(n_neighbors=k, weights="uniform")

kkn_r = neighbors.KNeighborsRegressor(**params)
kkn_r.fit(X_train, y_train)

y_pred = kkn_r.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE: %.4f" % mse)

MSE: 48.4191


### Grid search:

In [5]:
# GridSearchCV is imported with the other imports in the first cell.
#
# Candidate neighbourhood sizes: the half-open window [k // 2, k + k // 2)
# around the square-root heuristic k. For the non-negative integer k computed
# above this is the same range as int(k - k/2) .. int(k + k/2), written with
# integer arithmetic so no float round-trip is involved.
param_grid = {
    'n_neighbors': np.arange(k // 2, k + k // 2),
    'weights': ["uniform"],
}

# `scoring` is left unset, so candidates are ranked by the estimator's own
# score method (for a regressor presumably R^2, not the MSE printed for the
# single model -- confirm before comparing the two numbers).
knn_r = GridSearchCV(neighbors.KNeighborsRegressor(), param_grid)
knn_r.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'weights': ['uniform'], 'n_neighbors': array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
       27, 28, 29])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [6]:
# Alphabetical list of the result columns recorded by the regression grid search.
sorted(knn_r.cv_results_)

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'mean_train_score',
 'param_n_neighbors',
 'param_weights',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split0_train_score',
 'split1_test_score',
 'split1_train_score',
 'split2_test_score',
 'split2_train_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score',
 'std_train_score']

In [7]:
# Every parameter combination the regression grid search evaluated, one dict per candidate.
knn_r.cv_results_["params"]

({'n_neighbors': 10, 'weights': 'uniform'},
 {'n_neighbors': 11, 'weights': 'uniform'},
 {'n_neighbors': 12, 'weights': 'uniform'},
 {'n_neighbors': 13, 'weights': 'uniform'},
 {'n_neighbors': 14, 'weights': 'uniform'},
 {'n_neighbors': 15, 'weights': 'uniform'},
 {'n_neighbors': 16, 'weights': 'uniform'},
 {'n_neighbors': 17, 'weights': 'uniform'},
 {'n_neighbors': 18, 'weights': 'uniform'},
 {'n_neighbors': 19, 'weights': 'uniform'},
 {'n_neighbors': 20, 'weights': 'uniform'},
 {'n_neighbors': 21, 'weights': 'uniform'},
 {'n_neighbors': 22, 'weights': 'uniform'},
 {'n_neighbors': 23, 'weights': 'uniform'},
 {'n_neighbors': 24, 'weights': 'uniform'},
 {'n_neighbors': 25, 'weights': 'uniform'},
 {'n_neighbors': 26, 'weights': 'uniform'},
 {'n_neighbors': 27, 'weights': 'uniform'},
 {'n_neighbors': 28, 'weights': 'uniform'},
 {'n_neighbors': 29, 'weights': 'uniform'})

In [8]:
# Mean test score over the CV splits for each candidate; positions line up
# with the entries of cv_results_["params"].
knn_r.cv_results_["mean_test_score"]

array([ 0.35712023,  0.34865975,  0.32869747,  0.32207005,  0.30487975,
        0.30378971,  0.30587992,  0.29804225,  0.29315995,  0.29141645,
        0.29140695,  0.29543737,  0.29467113,  0.2888999 ,  0.28543624,
        0.28052632,  0.27805718,  0.27338327,  0.27181168,  0.2717212 ])

In [9]:
# Pick the candidate with the highest mean cross-validated test score and
# show its parameters followed by that score.
mean_scores = knn_r.cv_results_["mean_test_score"]
best_index = np.argmax(mean_scores)

print(knn_r.cv_results_["params"][best_index])
print()
print(mean_scores[best_index])

{'weights': 'uniform', 'n_neighbors': 10}

0.357120233397


## Classification

In [10]:
# Recast the regression data as a three-class problem.
# Feature column 7 is binned into the class label ("short" below 3, "medium"
# below 6, "long" otherwise), removed from the feature vector, and the original
# regression target is appended as the last feature instead.
# NOTE(review): column 7 is presumably the DIS distance feature -- confirm
# against boston.feature_names.
new_boston = {"data": [], "target": []}

for row, price in zip(boston["data"], boston["target"]):
    binned_value = row[7]
    label = "short" if binned_value < 3 else ("medium" if binned_value < 6 else "long")
    new_boston["target"].append(label)

    # np.delete returns a copy, so the rows of the original dataset are untouched.
    features = np.append(np.delete(row, 7), price)
    new_boston["data"].append(features)

# Class balance as (short, medium, long).
tuple(new_boston["target"].count(label) for label in ("short", "medium", "long"))

(240, 179, 87)

In [11]:
# Shuffle the classification data with a fixed seed, then hold out 20% for testing.
# train_test_split shuffles again by default, so it gets a seed of its own;
# otherwise the split and the grid-search scores differ on every run.
# Note that X, y and the train/test names are rebound here, replacing the
# regression split created earlier.
X, y = shuffle(new_boston["data"], new_boston["target"], random_state=1)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=1
)

In [12]:
# Grid-search a k-NN classifier over the half-open window [k // 2, k + k // 2)
# of neighbourhood sizes around the heuristic k (k is a non-negative int, so
# integer division gives the same bounds as truncating k -/+ k/2).
neighbor_counts = np.arange(k // 2, k + k // 2)
param_grid = dict(n_neighbors=neighbor_counts, weights=["uniform"])

knn_c = GridSearchCV(neighbors.KNeighborsClassifier(), param_grid)
knn_c.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'weights': ['uniform'], 'n_neighbors': array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
       27, 28, 29])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [13]:
# Alphabetical list of the result columns recorded by the classification grid search.
sorted(knn_c.cv_results_)

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'mean_train_score',
 'param_n_neighbors',
 'param_weights',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split0_train_score',
 'split1_test_score',
 'split1_train_score',
 'split2_test_score',
 'split2_train_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score',
 'std_train_score']

In [14]:
# Every parameter combination the classification grid search evaluated, one dict per candidate.
knn_c.cv_results_["params"]

({'n_neighbors': 10, 'weights': 'uniform'},
 {'n_neighbors': 11, 'weights': 'uniform'},
 {'n_neighbors': 12, 'weights': 'uniform'},
 {'n_neighbors': 13, 'weights': 'uniform'},
 {'n_neighbors': 14, 'weights': 'uniform'},
 {'n_neighbors': 15, 'weights': 'uniform'},
 {'n_neighbors': 16, 'weights': 'uniform'},
 {'n_neighbors': 17, 'weights': 'uniform'},
 {'n_neighbors': 18, 'weights': 'uniform'},
 {'n_neighbors': 19, 'weights': 'uniform'},
 {'n_neighbors': 20, 'weights': 'uniform'},
 {'n_neighbors': 21, 'weights': 'uniform'},
 {'n_neighbors': 22, 'weights': 'uniform'},
 {'n_neighbors': 23, 'weights': 'uniform'},
 {'n_neighbors': 24, 'weights': 'uniform'},
 {'n_neighbors': 25, 'weights': 'uniform'},
 {'n_neighbors': 26, 'weights': 'uniform'},
 {'n_neighbors': 27, 'weights': 'uniform'},
 {'n_neighbors': 28, 'weights': 'uniform'},
 {'n_neighbors': 29, 'weights': 'uniform'})

In [15]:
# Mean test score over the CV splits for each candidate; positions line up
# with the entries of cv_results_["params"].
knn_c.cv_results_["mean_test_score"]

array([ 0.81188119,  0.78960396,  0.79950495,  0.78960396,  0.78465347,
        0.78712871,  0.7970297 ,  0.79950495,  0.7970297 ,  0.78960396,
        0.78960396,  0.78712871,  0.78712871,  0.78465347,  0.79455446,
        0.78712871,  0.7970297 ,  0.77227723,  0.77970297,  0.78217822])

In [16]:
# Pick the classifier candidate with the highest mean cross-validated test
# score and show its parameters followed by that score.
mean_scores = knn_c.cv_results_["mean_test_score"]
best_index = np.argmax(mean_scores)

print(knn_c.cv_results_["params"][best_index])
print()
print(mean_scores[best_index])

{'weights': 'uniform', 'n_neighbors': 10}

0.811881188119
