# KNN

First, we import the relevant libraries: the dataset loader, Python's `numpy`, a graphing library (`matplotlib`), and the machine learning library `scikit-learn`.

In [1]:
from sklearn.datasets import load_boston
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn import neighbors, preprocessing
from sklearn.model_selection import GridSearchCV

In [2]:
# Seed NumPy's global random number generator (seed value 1) for repeatable runs.
np.random.seed(1)

## Regression

The Boston dataset was originally devised for regression, so we'll first show a simple regression model in `scikit-learn`

### Basic single model:

Let's load the data and split into training and testing portions

In [3]:
# Load the Boston housing data: `boston.data` holds the features,
# `boston.target` the values the regressor will predict.
boston = load_boston()

# Shuffle features and targets together, then hold out 20% of the rows for testing.
X, y = shuffle(boston.data, boston.target, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, train_size=0.8)

# Standardise with statistics learned from the training rows only, and apply
# that same fitted transformation to the test rows.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

We'll stick with the way we determined `k` in `R`: taking the square root of the number of training observations.

In [4]:
# Square-root heuristic: k is the integer part of sqrt(number of training rows).
k = int(len(X_train) ** 0.5)
print("k: " + str(k))

k: 20


We now define the parameters for the model, give them to the model object, fit the model to the training data, and calculate the MSE (along with the model's score) on the testing data.

In [5]:
# A single KNN regressor: k from the square-root heuristic, all neighbours weighted equally.
params = dict(n_neighbors=k, weights="uniform")
kkn_r = neighbors.KNeighborsRegressor(**params).fit(X_train, y_train)

# Held-out performance: mean squared error first, then the estimator's own
# `score` (R^2 for scikit-learn regressors).
test_predictions = kkn_r.predict(X_test)
mse = mean_squared_error(y_test, test_predictions)
print("MSE: %.4f" % mse)
print(kkn_r.score(X_test, y_test))

MSE: 24.9820
0.736265561435


### Grid search:

A more thorough analysis allows for checking multiple values for any parameter. Let's look for the best model by searching over a range of values for `k`.

In [6]:
# GridSearchCV is already imported in the notebook's import cell, so it is not
# re-imported here.

# Candidate settings: every k from 1 up to (but not including) k + k // 2,
# i.e. roughly 1.5x the square-root heuristic, for both weighting schemes.
param_grid = {'n_neighbors': np.arange(1, k + k // 2),
              'weights': ["uniform", "distance"]}

# Cross-validate every combination in the grid on the training data.
# The fitted search object is the cell's last expression so its repr is displayed.
knn_r = GridSearchCV(neighbors.KNeighborsRegressor(), param_grid)
knn_r.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'weights': ['uniform', 'distance'], 'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

The cross-validated results come in the form of a dictionary with the following keys

In [7]:
# Iterating a dict yields its keys, so sorting it lists the result names alphabetically.
sorted(knn_r.cv_results_)

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'mean_train_score',
 'param_n_neighbors',
 'param_weights',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split0_train_score',
 'split1_test_score',
 'split1_train_score',
 'split2_test_score',
 'split2_train_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score',
 'std_train_score']

If we want to know explicitly which parameter combinations were computed, we can check the `params` key

In [8]:
# One dict per grid point, giving the `n_neighbors` / `weights` pair that was evaluated.
knn_r.cv_results_["params"]

({'n_neighbors': 1, 'weights': 'uniform'},
 {'n_neighbors': 1, 'weights': 'distance'},
 {'n_neighbors': 2, 'weights': 'uniform'},
 {'n_neighbors': 2, 'weights': 'distance'},
 {'n_neighbors': 3, 'weights': 'uniform'},
 {'n_neighbors': 3, 'weights': 'distance'},
 {'n_neighbors': 4, 'weights': 'uniform'},
 {'n_neighbors': 4, 'weights': 'distance'},
 {'n_neighbors': 5, 'weights': 'uniform'},
 {'n_neighbors': 5, 'weights': 'distance'},
 {'n_neighbors': 6, 'weights': 'uniform'},
 {'n_neighbors': 6, 'weights': 'distance'},
 {'n_neighbors': 7, 'weights': 'uniform'},
 {'n_neighbors': 7, 'weights': 'distance'},
 {'n_neighbors': 8, 'weights': 'uniform'},
 {'n_neighbors': 8, 'weights': 'distance'},
 {'n_neighbors': 9, 'weights': 'uniform'},
 {'n_neighbors': 9, 'weights': 'distance'},
 {'n_neighbors': 10, 'weights': 'uniform'},
 {'n_neighbors': 10, 'weights': 'distance'},
 {'n_neighbors': 11, 'weights': 'uniform'},
 {'n_neighbors': 11, 'weights': 'distance'},
 {'n_neighbors': 12, 'weights': 'unifor

The `mean_test_score` key gives the mean cross-validation score for each of the above combinations, in the same order.

In [9]:
# Mean cross-validation test score per grid point, aligned index-for-index with `params`.
knn_r.cv_results_["mean_test_score"]

array([ 0.65196087,  0.65196087,  0.68616272,  0.70788426,  0.72378716,
        0.75479941,  0.70218773,  0.74137525,  0.71139667,  0.74532754,
        0.71158707,  0.74667568,  0.70462123,  0.74202155,  0.70471201,
        0.73864603,  0.69630483,  0.73255173,  0.6917272 ,  0.72737057,
        0.68728466,  0.72377333,  0.68379587,  0.72158735,  0.67545972,
        0.71550431,  0.66517757,  0.70823355,  0.66014866,  0.70418569,
        0.65604771,  0.70022502,  0.64678475,  0.69265114,  0.64227177,
        0.68867076,  0.637639  ,  0.68481846,  0.63235158,  0.68043671,
        0.62473719,  0.67383375,  0.6193789 ,  0.66944228,  0.61344146,
        0.66430505,  0.60576799,  0.65794801,  0.59871414,  0.65225043,
        0.59372617,  0.64816131,  0.58921686,  0.64458522,  0.58606534,
        0.64094321,  0.5806255 ,  0.63642241])

We can identify the best scoring model with `numpy`'s `argmax` method

In [10]:
# Position of the highest mean cross-validation score; the same position in
# `params` names the winning parameter combination.
cv_scores = knn_r.cv_results_["mean_test_score"]
best_index = cv_scores.argmax()

print(knn_r.cv_results_["params"][best_index])
print()
print(max(cv_scores))

{'weights': 'distance', 'n_neighbors': 3}

0.754799410737


Score on testing data

In [11]:
# Evaluate the fitted grid search on the held-out test split.
knn_r.score(X_test, y_test)

0.88741690895044401

## Classification

Following the `R` demonstration, we can also do classification. First, we'll need to convert the distances into three different groupings.

In [12]:
# Build a classification version of the data set:
#   * the label is a bucket of feature column 7 (the distance being grouped):
#     below 3 -> "short", below 6 -> "medium", otherwise "long";
#   * column 7 is removed from the features, and the original regression
#     target is appended as the last feature instead.
new_boston = {"data": [], "target": []}

for row, original_target in zip(boston["data"], boston["target"]):
    distance = row[7]

    if distance < 3:
        label = "short"
    elif distance < 6:
        label = "medium"
    else:
        label = "long"
    new_boston["target"].append(label)

    # np.delete returns a new array, so the original row is left untouched.
    features = np.append(np.delete(row, 7), original_target)
    new_boston["data"].append(features)

# Class balance, displayed as (short, medium, long).
tuple(new_boston["target"].count(name) for name in ("short", "medium", "long"))

(240, 179, 87)

Now we can reassign the new data to training and testing

In [13]:
# Shuffle the rebuilt features and labels together, then hold out 20% for testing.
X, y = shuffle(new_boston["data"], new_boston["target"], random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, train_size=0.8)

# Refit the scaler on the new training rows and apply it to both splits.
scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [14]:
# Same search space as for regression: k from 1 up to (but not including)
# k + k // 2, under both neighbour-weighting schemes.
neighbor_counts = np.arange(1, k + k // 2)
param_grid = {'n_neighbors': neighbor_counts,
              'weights': ["uniform", "distance"]}

# Cross-validated grid search with a KNN classifier on the training data.
# The fitted search object is the cell's last expression so its repr is displayed.
knn_c = GridSearchCV(neighbors.KNeighborsClassifier(), param_grid)
knn_c.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'weights': ['uniform', 'distance'], 'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [15]:
# Iterating a dict yields its keys, so sorting it lists the result names alphabetically.
sorted(knn_c.cv_results_)

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'mean_train_score',
 'param_n_neighbors',
 'param_weights',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split0_train_score',
 'split1_test_score',
 'split1_train_score',
 'split2_test_score',
 'split2_train_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score',
 'std_train_score']

In [16]:
# One dict per grid point, giving the `n_neighbors` / `weights` pair that was evaluated.
knn_c.cv_results_["params"]

({'n_neighbors': 1, 'weights': 'uniform'},
 {'n_neighbors': 1, 'weights': 'distance'},
 {'n_neighbors': 2, 'weights': 'uniform'},
 {'n_neighbors': 2, 'weights': 'distance'},
 {'n_neighbors': 3, 'weights': 'uniform'},
 {'n_neighbors': 3, 'weights': 'distance'},
 {'n_neighbors': 4, 'weights': 'uniform'},
 {'n_neighbors': 4, 'weights': 'distance'},
 {'n_neighbors': 5, 'weights': 'uniform'},
 {'n_neighbors': 5, 'weights': 'distance'},
 {'n_neighbors': 6, 'weights': 'uniform'},
 {'n_neighbors': 6, 'weights': 'distance'},
 {'n_neighbors': 7, 'weights': 'uniform'},
 {'n_neighbors': 7, 'weights': 'distance'},
 {'n_neighbors': 8, 'weights': 'uniform'},
 {'n_neighbors': 8, 'weights': 'distance'},
 {'n_neighbors': 9, 'weights': 'uniform'},
 {'n_neighbors': 9, 'weights': 'distance'},
 {'n_neighbors': 10, 'weights': 'uniform'},
 {'n_neighbors': 10, 'weights': 'distance'},
 {'n_neighbors': 11, 'weights': 'uniform'},
 {'n_neighbors': 11, 'weights': 'distance'},
 {'n_neighbors': 12, 'weights': 'unifor

In [17]:
# Mean cross-validation test score per grid point, aligned index-for-index with `params`.
knn_c.cv_results_["mean_test_score"]

array([ 0.8539604 ,  0.8539604 ,  0.83168317,  0.8539604 ,  0.85643564,
        0.86138614,  0.83415842,  0.86633663,  0.83910891,  0.85148515,
        0.81930693,  0.84653465,  0.80940594,  0.82673267,  0.7970297 ,
        0.83168317,  0.7970297 ,  0.81683168,  0.79207921,  0.83168317,
        0.80445545,  0.81930693,  0.79950495,  0.82425743,  0.78712871,
        0.80940594,  0.79455446,  0.81683168,  0.78217822,  0.81188119,
        0.78960396,  0.80693069,  0.7970297 ,  0.80693069,  0.77970297,
        0.80693069,  0.77722772,  0.8019802 ,  0.77227723,  0.79950495,
        0.77970297,  0.79455446,  0.76980198,  0.79207921,  0.77227723,
        0.79207921,  0.75990099,  0.78960396,  0.7549505 ,  0.78465347,
        0.75      ,  0.78465347,  0.75990099,  0.78960396,  0.75247525,
        0.78960396,  0.75      ,  0.78960396])

In [18]:
# Position of the highest mean cross-validation score; the same position in
# `params` names the winning parameter combination.
cv_scores = knn_c.cv_results_["mean_test_score"]
best_index = cv_scores.argmax()

print(knn_c.cv_results_["params"][best_index])
print()
print(max(cv_scores))

{'weights': 'distance', 'n_neighbors': 4}

0.866336633663


In [19]:
# Evaluate the fitted classifier grid search on the held-out test split.
knn_c.score(X_test, y_test)

0.82352941176470584