# KNN

First we need to import the relevant libraries, namely the dataset, Python's `numpy`, a graphing library, and the machine learning library `scikit-learn`

In [10]:
from sklearn.datasets import load_boston
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn import neighbors
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV

## Regression

The Boston dataset was originally devised for regression, so we'll first show a simple regression model in `scikit-learn`

### Basic single model:

Let's load the data and split into training and testing portions

In [3]:
boston = load_boston()

X, y = shuffle(boston.data, boston.target, random_state=1)

X = preprocessing.scale(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)

array([[-0.41270653,  0.92862338, -1.30687771, ..., -0.02567665,
         0.44105193, -0.71811916],
       [-0.41548823,  1.87285388, -1.07342276, ..., -0.39556665,
         0.44105193, -0.76717998],
       [-0.39175357, -0.48772236, -0.61672651, ..., -0.2568579 ,
         0.3954402 ,  0.86163938],
       ..., 
       [ 0.26527877, -0.48772236,  1.01599907, ...,  0.80657583,
         0.44105193,  0.94153844],
       [-0.3799724 , -0.48772236, -0.72032214, ..., -0.48803915,
         0.22012011, -0.24853698],
       [-0.40911792, -0.48772236, -0.75534039, ...,  0.34421334,
         0.44105193, -0.54430367]])

We'll keep with how we determined `k` in `R`, by taking the square root of the number of observations

In [4]:
k = int(len(X_train) ** (1/2))
print("k: " + str(k))

k: 20


We now define the parameters for the model, and given them to the model object, fit the model to the data, and calculate the MSE on the testing data

In [5]:
params = {'n_neighbors': k, 'weights': "uniform"}

kkn_r = neighbors.KNeighborsRegressor(**params)

kkn_r.fit(X_train, y_train)
mse = mean_squared_error(y_test, kkn_r.predict(X_test))
print("MSE: %.4f" % mse)
print(kkn_r.score(X_test, y_test))

MSE: 34.5423
0.619724559909


## Grid search:

A more thorough analysis allows for checking multiple values for any parameter, let's look for the best model by looking at a range of values for `k`

In [5]:
from sklearn.model_selection import GridSearchCV

param_grid = {'n_neighbors': np.arange(int(k-(k/2)), int(k+(k/2)), 1),
              'weights': ["uniform"]}

knn_r = GridSearchCV(neighbors.KNeighborsRegressor(), param_grid)
knn_r.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'weights': ['uniform'], 'n_neighbors': array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
       27, 28, 29])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

The cross-validated results come in the form of a dictionary with the following keys

In [6]:
sorted(knn_r.cv_results_.keys())

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'mean_train_score',
 'param_n_neighbors',
 'param_weights',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split0_train_score',
 'split1_test_score',
 'split1_train_score',
 'split2_test_score',
 'split2_train_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score',
 'std_train_score']

If we want to know explicitly which parameter combinations were computed, we can check the `params` key

In [7]:
knn_r.cv_results_["params"]

({'n_neighbors': 10, 'weights': 'uniform'},
 {'n_neighbors': 11, 'weights': 'uniform'},
 {'n_neighbors': 12, 'weights': 'uniform'},
 {'n_neighbors': 13, 'weights': 'uniform'},
 {'n_neighbors': 14, 'weights': 'uniform'},
 {'n_neighbors': 15, 'weights': 'uniform'},
 {'n_neighbors': 16, 'weights': 'uniform'},
 {'n_neighbors': 17, 'weights': 'uniform'},
 {'n_neighbors': 18, 'weights': 'uniform'},
 {'n_neighbors': 19, 'weights': 'uniform'},
 {'n_neighbors': 20, 'weights': 'uniform'},
 {'n_neighbors': 21, 'weights': 'uniform'},
 {'n_neighbors': 22, 'weights': 'uniform'},
 {'n_neighbors': 23, 'weights': 'uniform'},
 {'n_neighbors': 24, 'weights': 'uniform'},
 {'n_neighbors': 25, 'weights': 'uniform'},
 {'n_neighbors': 26, 'weights': 'uniform'},
 {'n_neighbors': 27, 'weights': 'uniform'},
 {'n_neighbors': 28, 'weights': 'uniform'},
 {'n_neighbors': 29, 'weights': 'uniform'})

The `mean_test_score` key gives the score for each of the above combinations on the CV

In [8]:
knn_r.cv_results_["mean_test_score"]

array([ 0.39907869,  0.39370905,  0.3894494 ,  0.38306966,  0.3740351 ,
        0.35892923,  0.34931848,  0.33089222,  0.3246813 ,  0.32128488,
        0.31796372,  0.30691265,  0.30151601,  0.29615633,  0.2925673 ,
        0.28837524,  0.28258091,  0.27758829,  0.26868124,  0.26432613])

We can identify the best scoring model with `numpy`'s `argmax` method

In [9]:
best_index = np.argmax(knn_r.cv_results_["mean_test_score"])

print(knn_r.cv_results_["params"][best_index])
print()
print(max(knn_r.cv_results_["mean_test_score"]))

{'weights': 'uniform', 'n_neighbors': 10}

0.399078686639


Score on testing data

In [10]:
knn_r.score(X_test, y_test)

0.46765454698877595

# Classification

Following the `R` demonstration, we can also do classfication, first we'll need to convert the distances to three different groupings

In [11]:
new_boston = {"data": [],
              "target": []}

for i, arr in enumerate(boston["data"]):
    
    new_arr = arr
    
    if arr[7] < 3:
        new_boston["target"].append("short")
    elif arr[7] < 6:
        new_boston["target"].append("medium")
    else:
        new_boston["target"].append("long")
        
    new_arr = np.delete(new_arr, 7)
    new_arr = np.append(new_arr, boston["target"][i])

    new_boston["data"].append(new_arr)
    
new_boston["target"].count("short"), new_boston["target"].count("medium"), new_boston["target"].count("long")

(240, 179, 87)

Now we can reassign the new data to training and testing

In [12]:
X, y = shuffle(new_boston["data"], new_boston["target"], random_state=1)
X = preprocessing.scale(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, test_size=0.2)

In [13]:
param_grid = {'n_neighbors': np.arange(int(k-(k/2)), int(k+(k/2)), 1),
              'weights': ["uniform"]}

knn_c = GridSearchCV(neighbors.KNeighborsClassifier(), param_grid)
knn_c.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_neighbors': array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
       27, 28, 29]), 'weights': ['uniform']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [14]:
sorted(knn_c.cv_results_.keys())

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'mean_train_score',
 'param_n_neighbors',
 'param_weights',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split0_train_score',
 'split1_test_score',
 'split1_train_score',
 'split2_test_score',
 'split2_train_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score',
 'std_train_score']

In [15]:
knn_c.cv_results_["params"]

({'n_neighbors': 10, 'weights': 'uniform'},
 {'n_neighbors': 11, 'weights': 'uniform'},
 {'n_neighbors': 12, 'weights': 'uniform'},
 {'n_neighbors': 13, 'weights': 'uniform'},
 {'n_neighbors': 14, 'weights': 'uniform'},
 {'n_neighbors': 15, 'weights': 'uniform'},
 {'n_neighbors': 16, 'weights': 'uniform'},
 {'n_neighbors': 17, 'weights': 'uniform'},
 {'n_neighbors': 18, 'weights': 'uniform'},
 {'n_neighbors': 19, 'weights': 'uniform'},
 {'n_neighbors': 20, 'weights': 'uniform'},
 {'n_neighbors': 21, 'weights': 'uniform'},
 {'n_neighbors': 22, 'weights': 'uniform'},
 {'n_neighbors': 23, 'weights': 'uniform'},
 {'n_neighbors': 24, 'weights': 'uniform'},
 {'n_neighbors': 25, 'weights': 'uniform'},
 {'n_neighbors': 26, 'weights': 'uniform'},
 {'n_neighbors': 27, 'weights': 'uniform'},
 {'n_neighbors': 28, 'weights': 'uniform'},
 {'n_neighbors': 29, 'weights': 'uniform'})

In [16]:
knn_c.cv_results_["mean_test_score"]

array([ 0.78712871,  0.76980198,  0.76732673,  0.77227723,  0.77970297,
        0.76980198,  0.77970297,  0.76237624,  0.76732673,  0.76980198,
        0.76980198,  0.76237624,  0.75247525,  0.76237624,  0.75742574,
        0.75247525,  0.75247525,  0.75247525,  0.7450495 ,  0.75247525])

In [17]:
best_index = np.argmax(knn_c.cv_results_["mean_test_score"])

print(knn_c.cv_results_["params"][best_index])
print()
print(max(knn_c.cv_results_["mean_test_score"]))

{'n_neighbors': 10, 'weights': 'uniform'}

0.787128712871


In [18]:
knn_c.score(X_test, y_test)

0.81372549019607843