## Nearest neighbors

## Part I

In [1]:
import numpy as np
import os
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

# Load the numpy .npz file
with np.load(os.path.join('data', 'cifar4-train.npz'), allow_pickle=False) as data:
    cifar4_data = dict(data.items())
    
print('Data loaded')
print('It is a dictionary with keys:', list(cifar4_data.keys()))

Data loaded
It is a dictionary with keys: ['pixels', 'overfeat', 'labels', 'names', 'allow_pickle']


In [2]:
X = cifar4_data['overfeat']
y = cifar4_data['labels']

print('X shape:',X.shape)
print('y shape:', y.shape)

X shape: (5000, 4096)
y shape: (5000,)


In [3]:
from sklearn.decomposition import PCA

# PCA to reduce dimensions. 
# Using 164 as this is the result from the previous exercise ensuring 90% of PVE explained
pca = PCA(n_components=164)
X = pca.fit_transform(X)
print('X shape:',X.shape)

X shape: (5000, 164)


Number of dimensions have been reduced to 164.

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Create the StandardScaler
scaler = StandardScaler()

# Create k-NN classifier
knn = KNeighborsClassifier(algorithm='brute')

# Create the pipeline
pipe = Pipeline([('scaler', scaler), ('knn', knn)])

In [5]:
# Splitting the data
from sklearn.model_selection import train_test_split

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y,train_size=4000,test_size=1000,
                                          random_state=0, stratify=y)
# Split train data again
X_train_s, X_val, y_train_s, y_val = train_test_split(X_train, y_train,train_size=3200,test_size=800,
                                          random_state=0, stratify=y_train)

# checking the shapes
print('X_train:', X_train.shape, X_train.dtype)
print('y_train:', y_train.shape, y_train.dtype)
print('X_train_s:', X_train_s.shape, X_train_s.dtype)
print('y_train_s:', y_train_s.shape, y_train_s.dtype)
print('X_val:', X_val.shape, X_val.dtype)
print('y_val:', y_val.shape, y_val.dtype)
print('X_test:', X_test.shape, X_test.dtype)
print('y_test:', y_test.shape, y_test.dtype)

X_train: (4000, 164) float32
y_train: (4000,) int64
X_train_s: (3200, 164) float32
y_train_s: (3200,) int64
X_val: (800, 164) float32
y_val: (800,) int64
X_test: (1000, 164) float32
y_test: (1000,) int64


In [6]:
pipe.fit(X_train_s, y_train_s)

# Evaluate on validation set
accuracy = pipe.score(X_val, y_val)

# Print accuracy
print('k-nearest neighbors accuracy: {:.3f}'.format(accuracy))

k-nearest neighbors accuracy: 0.609


Tune **k** and **distance metric (L1 and L2)** using Grid Search

In [7]:
# Define a set of reasonable values
k_values = np.arange(5, 105, 5)
distance_types = [1,2]

# Save accuracy on validation set
validation_scores = []

# Grid search
for k in k_values:
    for d in distance_types:
        # Set hyperparameters
        pipe.set_params(knn__n_neighbors = k, knn__p = d)
        
        # Fit a k-NN classifier
        pipe.fit(X_train_s, y_train_s)
        
        # Evaluate on test set
        accuracy = pipe.score(X_val, y_val)
        
        # Save accuracy
        validation_scores.append({
            'n neighbors': k,
            'distance metric': d,
            'validation accuracy': accuracy
        })
        
# Create DataFrame with test scores
scores_df = pd.DataFrame(validation_scores)

# Top five scores
scores_df.sort_values(by='validation accuracy', ascending=False).head()

Unnamed: 0,distance metric,n neighbors,validation accuracy
6,1,20,0.65125
3,2,10,0.63875
4,1,15,0.63125
8,1,25,0.6275
10,1,30,0.6275


In [8]:
# Best combination
idx_max = scores_df.loc[scores_df['validation accuracy'].idxmax()]

print('Top accuracy on validation set:',idx_max[2],
      ' with k:',idx_max[1],
      ' with distance metric:',idx_max[0])

Top accuracy on validation set: 0.65125  with k: 20.0  with distance metric: 1.0


Fit the k-NN classifier with the tuned parameters to the entire train data and evaluate the accuracy on the test set.

In [9]:
# Create k-NN classifier
knn = KNeighborsClassifier(n_neighbors=15,algorithm='brute',p=1)

# Create the pipeline and fit it to training data
pipe.fit(X_train, y_train)

# Evaluate on test set
accuracy = pipe.score(X_test, y_test)

# Print accuracy
print('k-nearest neighbors accuracy: {:.3f}'.format(accuracy))

k-nearest neighbors accuracy: 0.582


## Part II

For the second task, pick an image from the test set and plot its 10 nearest neighbors from the train one.
To achieve this, extract the PCA transformer and the  KNeighborsClassifier estimator from your pipeline.
Apply PCA to your test image and pass the result to the kneighbors(X, n_neighbors) function of your k-NN estimator.
You can take a look at its documentation on this page.

In [21]:
# Function to plot the i-th image from some X/y arrays
def show_image(idx, X, y):
    # Get image pixels from the input matrix X
    flat_img = X[idx] 
    
    img = flat_img.reshape(64, 64)
    
    # Print label using the output vector y
    print('Label:', y[idx])
    
    # Plot image with pyplot
    plt.imshow(img/255)
    plt.show()

In [22]:
# Show the first 5 images from the test set
for image in range(0,5):
    show_image(image, X_test, y_test)

ValueError: cannot reshape array of size 164 into shape (64,64)