# CSE 291 Homework3 - KNN  

#### Chao Yu 
#### Wen Liang

## K Nearest Neighbor
---
### MNIST
#### Part 1. Read the data and split to training and test sets

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import confusion_matrix, accuracy_score, zero_one_loss

import warnings

# Silence every warning for the rest of the session (this also hides
# deprecation notices emitted by the libraries used below).
warnings.filterwarnings('ignore')

# Read the CSV without treating any row as a header, then transpose it so that
# every original CSV row becomes a column of the frame.
mnist_train_path = './MNIST/train.csv'
mnist_train_df = pd.read_csv(mnist_train_path, header=None).T
# NOTE(review): after the transpose the last column is the last CSV row, and the
# cells below treat every column other than column 0 as a sample. Labelling it
# 'Class' therefore looks unintended -- confirm before relying on that name.
last_column = mnist_train_df.columns[-1]
mnist_train_df.rename(columns={last_column:'Class'}, inplace=True)

In [2]:
# Coerce every column to a numeric dtype; values that cannot be parsed become NaN.
# `DataFrame.convert_objects` is deprecated (and removed in later pandas), so use
# the documented replacement `pd.to_numeric`, applied column by column.
# Column 0 is dropped by the next cell, so how its text entries are coerced
# does not affect the data that is used.
mnist_train_df = mnist_train_df.apply(pd.to_numeric, errors='coerce')

In [3]:
# Feature block: everything except the row labelled 0 and the column labelled 0.
X = mnist_train_df.drop(0, axis = 0).drop(0, axis = 1)
# Label vector: the first row of the frame once column 0 is removed.
# `.values` replaces the deprecated `DataFrame.as_matrix()` and returns the same array.
y = mnist_train_df.drop(0, axis = 1).values.T[:, 0]

# X.T puts one sample per row; hold out 25% of the samples with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X.T, y, test_size=0.25, random_state=42)

#### Feature selection

In [4]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier

In [5]:
# Keep only the first 5000 training rows (presumably to keep the experiments below fast).
X_train2 = X_train[0:5000]

In [6]:
# Labels for the same first 5000 training rows.
y_train2 = y_train[0:5000]

In [7]:
# Rank features with an extra-trees ensemble and keep the ones SelectFromModel
# retains. The ensemble is randomised, so seed it: without a seed the number of
# selected features changes from run to run.
clf = ExtraTreesClassifier(random_state=42)
# Always fit on the raw training rows (not on X_train2, which this cell
# overwrites) so that re-running the cell gives the same result.
X_train_raw = X_train[0:len(y_train2)]
clf = clf.fit(X_train_raw, y_train2)
model = SelectFromModel(clf, prefit=True)
X_train2 = model.transform(X_train_raw)

In [8]:
# Reduce the held-out test rows to the features kept by the fitted selector.
X_test2 = model.transform(X_test)

In [9]:
# Shape check: (rows kept, number of features retained by the selector).
X_train2.shape

(5000, 257)

In [10]:
# Shape check: the test matrix after the same selector has been applied.
X_test2.shape

(10500, 257)

In [11]:
# Shape check: number of held-out labels, to compare with the row count of X_test2.
y_test.shape

(10500,)

#### KNN implemented from scratch (without scikit-learn)

In [12]:
def compute_distances_no_loops(X_train, X_test):
    """
    Compute the Euclidean distance between every test point and every
    training point without explicit Python loops.

    Inputs:
    - X_train: array-like of shape (num_train, D), one training point per row.
    - X_test: array-like of shape (num_test, D), one test point per row.

    Returns:
    - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
      is the Euclidean distance between X_test[i] and X_train[j].
    """
    # Work on float64 arrays so that lists / DataFrames are accepted and
    # squaring cannot overflow a narrow integer dtype.
    X_train = np.asarray(X_train, dtype=np.float64)
    X_test = np.asarray(X_test, dtype=np.float64)

    # Expand ||p - q||^2 into ||p||^2 + ||q||^2 - 2 p.q so the whole distance
    # matrix comes from one matrix product plus broadcasting.
    test_sq = (X_test ** 2).sum(axis=1, keepdims=True)   # column, one entry per test point
    train_sq = (X_train ** 2).sum(axis=1)                # row, one entry per training point
    sq_dists = test_sq + train_sq - 2 * X_test.dot(X_train.T)

    # Round-off in the expansion can leave tiny negative values for
    # (near-)identical points; clamp them so the square root never yields NaN.
    np.maximum(sq_dists, 0, out=sq_dists)
    return np.sqrt(sq_dists)


In [13]:
 def predict_labels(dists, k, y_train):
     """
     Given a matrix of distances between test points and training points,
     predict a label for each test point by majority vote among its k
     nearest training points.

     Inputs:
     - dists: A numpy array of shape (num_test, num_train) where dists[i, j]
       gives the distance between the ith test point and the jth training point.
     - k: Number of nearest neighbours that vote; must be at least 1.
     - y_train: Array-like of shape (num_train,) holding the training labels.
       Labels must be non-negative integers (a requirement of np.bincount).

     Returns:
     - y: A numpy array of shape (num_test,) containing predicted labels for the
       test data, where y[i] is the predicted label for the ith test point.
       When several labels tie, the smallest label wins.

     Raises:
     - ValueError: if k is smaller than 1.
     """
     if k < 1:
         raise ValueError("k must be a positive integer, got %r" % (k,))

     dists = np.asarray(dists)
     # Convert once so that plain lists (or other sequences) can be indexed
     # with an array of neighbour positions.
     labels = np.asarray(y_train)

     num_test = dists.shape[0]
     y_pred = np.zeros(num_test)
     # `range` works on both Python 2 and Python 3 (`xrange` is Python 2 only).
     for i in range(num_test):
         # Labels of the k training points closest to test point i.
         nearest = np.argsort(dists[i])[:k]
         closest_y = labels[nearest]
         # Majority vote; np.argmax returns the first (smallest) label on ties.
         y_pred[i] = np.argmax(np.bincount(closest_y))

     return y_pred

In [16]:
# KNN without using sklearn
num_folds = 5
k_choices = [1, 3, 5, 8, 10, 12, 15, 20, 50, 100]

# Keep the folds as plain lists: np.array_split may return folds of unequal
# length, which cannot be stacked into one rectangular array.
X_train_folds = np.array_split(X_train2, num_folds)
y_train_folds = np.array_split(y_train2, num_folds)
k_to_accuracies = {}
for n in range(num_folds):
    # Train on every fold except fold n, validate on fold n.
    X_train_dat = np.concatenate(X_train_folds[:n] + X_train_folds[n + 1:])
    y_train_dat = np.concatenate(y_train_folds[:n] + y_train_folds[n + 1:])
    # The distance matrix does not depend on k, so compute it once per fold
    # and reuse it for every k instead of recomputing it inside the k loop.
    dists = compute_distances_no_loops(X_train_dat, X_train_folds[n])
    for k in k_choices:
        y_validation_pred = predict_labels(dists, k, y_train_dat)
        # Fraction of correctly predicted examples, relative to the real
        # size of this validation fold.
        num_correct = np.sum(y_validation_pred == y_train_folds[n])
        accuracy = float(num_correct) / len(y_train_folds[n])
        k_to_accuracies.setdefault(k, []).append(accuracy)
# Single-argument print(...) emits the same text on Python 2 and Python 3.
for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print('k = %d, accuracy = %f' % (k, accuracy))
    print('mean of k=%d,accuracy=%f' % (k, np.mean(k_to_accuracies[k])))

KeyboardInterrupt: 

### Use scikit-learn KNN  

#### Default metric: Minkowski distance

In [51]:
from sklearn.neighbors import KNeighborsClassifier

In [120]:
# Re-check the shape of the training matrix before the scikit-learn experiments.
X_train2.shape

(5000, 259)

In [121]:
# Label vector length, to compare with the row count of X_train2.
y_train2.shape

(5000,)

In [68]:
# Use the whole training subset (X_train2 / y_train2) for the cross-validation runs below.
Xsample_train=X_train2
ysample_train=y_train2

In [55]:
#### full data, 5 folds
num_folds = 5
k_choices = [1, 3, 5, 8, 10]

# Keep the folds as plain lists: np.array_split may return folds of unequal
# length, which cannot be stacked into one rectangular array.
X_train_folds = np.array_split(Xsample_train, num_folds)
y_train_folds = np.array_split(ysample_train, num_folds)
k_to_accuracies = {}
for k in k_choices:
    # Single-argument print(...) emits the same text on Python 2 and Python 3.
    print('K =  %d  is processing' % k)
    for n in range(num_folds):
        # Train on every fold except fold n, validate on fold n.
        X_train_dat = np.concatenate(X_train_folds[:n] + X_train_folds[n + 1:])
        y_train_dat = np.concatenate(y_train_folds[:n] + y_train_folds[n + 1:])
        neigh = KNeighborsClassifier(n_neighbors=k)
        neigh.fit(X_train_dat, y_train_dat)
        y_validation_pred = neigh.predict(X_train_folds[n])
        # Fraction of correctly predicted examples, relative to the real
        # size of this validation fold.
        num_correct = np.sum(y_validation_pred == y_train_folds[n])
        accuracy = float(num_correct) / len(y_train_folds[n])
        k_to_accuracies.setdefault(k, []).append(accuracy)
for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print('k = %d, accuracy = %f' % (k, accuracy))
    print('mean of k=%d,accuracy=%f' % (k, np.mean(k_to_accuracies[k])))

K =  1  is processing
K =  3  is processing
K =  5  is processing
K =  8  is processing
K =  10  is processing
k = 1, accuracy = 0.929000
k = 1, accuracy = 0.921000
k = 1, accuracy = 0.932000
k = 1, accuracy = 0.932000
k = 1, accuracy = 0.930000
mean of k=1,accuracy=0.928800
k = 3, accuracy = 0.932000
k = 3, accuracy = 0.919000
k = 3, accuracy = 0.927000
k = 3, accuracy = 0.939000
k = 3, accuracy = 0.920000
mean of k=3,accuracy=0.927400
k = 5, accuracy = 0.937000
k = 5, accuracy = 0.915000
k = 5, accuracy = 0.932000
k = 5, accuracy = 0.944000
k = 5, accuracy = 0.917000
mean of k=5,accuracy=0.929000
k = 8, accuracy = 0.927000
k = 8, accuracy = 0.912000
k = 8, accuracy = 0.926000
k = 8, accuracy = 0.933000
k = 8, accuracy = 0.918000
mean of k=8,accuracy=0.923200
k = 10, accuracy = 0.923000
k = 10, accuracy = 0.915000
k = 10, accuracy = 0.926000
k = 10, accuracy = 0.925000
k = 10, accuracy = 0.913000
mean of k=10,accuracy=0.920400


In [56]:
#### full data, 2 folds
num_folds = 2
k_choices = [1, 3, 5, 8, 10]

# Keep the folds as plain lists: np.array_split may return folds of unequal
# length, which cannot be stacked into one rectangular array.
X_train_folds = np.array_split(Xsample_train, num_folds)
y_train_folds = np.array_split(ysample_train, num_folds)
k_to_accuracies = {}
for k in k_choices:
    # Single-argument print(...) emits the same text on Python 2 and Python 3.
    print('K =  %d  is processing' % k)
    for n in range(num_folds):
        # Train on every fold except fold n, validate on fold n.
        X_train_dat = np.concatenate(X_train_folds[:n] + X_train_folds[n + 1:])
        y_train_dat = np.concatenate(y_train_folds[:n] + y_train_folds[n + 1:])
        neigh = KNeighborsClassifier(n_neighbors=k)
        neigh.fit(X_train_dat, y_train_dat)
        y_validation_pred = neigh.predict(X_train_folds[n])
        # Fraction of correctly predicted examples, relative to the real
        # size of this validation fold.
        num_correct = np.sum(y_validation_pred == y_train_folds[n])
        accuracy = float(num_correct) / len(y_train_folds[n])
        k_to_accuracies.setdefault(k, []).append(accuracy)
for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print('k = %d, accuracy = %f' % (k, accuracy))
    print('mean of k=%d,accuracy=%f' % (k, np.mean(k_to_accuracies[k])))

K =  1  is processing
K =  3  is processing
K =  5  is processing
K =  8  is processing
K =  10  is processing
k = 1, accuracy = 0.910800
k = 1, accuracy = 0.915200
mean of k=1,accuracy=0.913000
k = 3, accuracy = 0.909600
k = 3, accuracy = 0.913600
mean of k=3,accuracy=0.911600
k = 5, accuracy = 0.914400
k = 5, accuracy = 0.912800
mean of k=5,accuracy=0.913600
k = 8, accuracy = 0.908000
k = 8, accuracy = 0.906400
mean of k=8,accuracy=0.907200
k = 10, accuracy = 0.903600
k = 10, accuracy = 0.904000
mean of k=10,accuracy=0.903800


In [69]:
#### full data, leave one out
# One fold per sample. Size the split from the data that is actually split
# (Xsample_train), so the fold count can never exceed the number of rows.
num_folds = len(Xsample_train)
k_choices = [1, 3, 5, 8, 10]

# Keep the folds as plain lists rather than stacking them into one array.
X_train_folds = np.array_split(Xsample_train, num_folds)
y_train_folds = np.array_split(ysample_train, num_folds)
k_to_accuracies = {}
for k in k_choices:
    # Single-argument print(...) emits the same text on Python 2 and Python 3.
    print('K =  %d  is processing' % k)
    for n in range(num_folds):
        # Train on every sample except sample n, validate on sample n.
        X_train_dat = np.concatenate(X_train_folds[:n] + X_train_folds[n + 1:])
        y_train_dat = np.concatenate(y_train_folds[:n] + y_train_folds[n + 1:])
        neigh = KNeighborsClassifier(n_neighbors=k)
        neigh.fit(X_train_dat, y_train_dat)
        y_validation_pred = neigh.predict(X_train_folds[n])
        # Fraction of correctly predicted examples, relative to the real
        # size of this validation fold.
        num_correct = np.sum(y_validation_pred == y_train_folds[n])
        accuracy = float(num_correct) / len(y_train_folds[n])
        k_to_accuracies.setdefault(k, []).append(accuracy)
# Only the mean is reported: there is one accuracy value per held-out sample.
for k in sorted(k_to_accuracies):
    print('mean of k=%d,accuracy=%f' % (k, np.mean(k_to_accuracies[k])))

K =  1  is processing
K =  3  is processing
K =  5  is processing
K =  8  is processing
K =  10  is processing
mean of k=1,accuracy=0.933000
mean of k=3,accuracy=0.935000
mean of k=5,accuracy=0.931400
mean of k=8,accuracy=0.928400
mean of k=10,accuracy=0.927800


#### 50% of the training data (TD)

In [76]:
# First half of the 5000-row training subset. Floor division keeps the slice
# bound an integer on both Python 2 and Python 3 (a float bound is rejected).
Xsample_train=X_train2[0:5000 // 2]
ysample_train=y_train2[0:5000 // 2]

In [59]:
#### 50 percent data, 5 folds
num_folds = 5
k_choices = [1, 3, 5, 8, 10]

# Keep the folds as plain lists: np.array_split may return folds of unequal
# length, which cannot be stacked into one rectangular array.
X_train_folds = np.array_split(Xsample_train, num_folds)
y_train_folds = np.array_split(ysample_train, num_folds)
k_to_accuracies = {}
for k in k_choices:
    # Single-argument print(...) emits the same text on Python 2 and Python 3.
    print('K =  %d  is processing' % k)
    for n in range(num_folds):
        # Train on every fold except fold n, validate on fold n.
        X_train_dat = np.concatenate(X_train_folds[:n] + X_train_folds[n + 1:])
        y_train_dat = np.concatenate(y_train_folds[:n] + y_train_folds[n + 1:])
        neigh = KNeighborsClassifier(n_neighbors=k)
        neigh.fit(X_train_dat, y_train_dat)
        y_validation_pred = neigh.predict(X_train_folds[n])
        # Fraction of correctly predicted examples, relative to the real
        # size of this validation fold (not a hard-coded fold count).
        num_correct = np.sum(y_validation_pred == y_train_folds[n])
        accuracy = float(num_correct) / len(y_train_folds[n])
        k_to_accuracies.setdefault(k, []).append(accuracy)
for k in sorted(k_to_accuracies):
    print('mean of k=%d,accuracy=%f' % (k, np.mean(k_to_accuracies[k])))

K =  1  is processing
K =  3  is processing
K =  5  is processing
K =  8  is processing
K =  10  is processing
k = 1, accuracy = 0.910000
k = 1, accuracy = 0.896000
k = 1, accuracy = 0.904000
k = 1, accuracy = 0.916000
k = 1, accuracy = 0.918000
mean of k=1,accuracy=0.908800
k = 3, accuracy = 0.920000
k = 3, accuracy = 0.906000
k = 3, accuracy = 0.902000
k = 3, accuracy = 0.898000
k = 3, accuracy = 0.906000
mean of k=3,accuracy=0.906400
k = 5, accuracy = 0.924000
k = 5, accuracy = 0.910000
k = 5, accuracy = 0.908000
k = 5, accuracy = 0.910000
k = 5, accuracy = 0.900000
mean of k=5,accuracy=0.910400
k = 8, accuracy = 0.914000
k = 8, accuracy = 0.908000
k = 8, accuracy = 0.906000
k = 8, accuracy = 0.896000
k = 8, accuracy = 0.894000
mean of k=8,accuracy=0.903600
k = 10, accuracy = 0.924000
k = 10, accuracy = 0.900000
k = 10, accuracy = 0.904000
k = 10, accuracy = 0.896000
k = 10, accuracy = 0.882000
mean of k=10,accuracy=0.901200


In [60]:
#### 50 percent data, 2 folds
num_folds = 2
k_choices = [1, 3, 5, 8, 10]

# Keep the folds as plain lists: np.array_split may return folds of unequal
# length, which cannot be stacked into one rectangular array.
X_train_folds = np.array_split(Xsample_train, num_folds)
y_train_folds = np.array_split(ysample_train, num_folds)
k_to_accuracies = {}
for k in k_choices:
    # Single-argument print(...) emits the same text on Python 2 and Python 3.
    print('K =  %d  is processing' % k)
    for n in range(num_folds):
        # Train on every fold except fold n, validate on fold n.
        X_train_dat = np.concatenate(X_train_folds[:n] + X_train_folds[n + 1:])
        y_train_dat = np.concatenate(y_train_folds[:n] + y_train_folds[n + 1:])
        neigh = KNeighborsClassifier(n_neighbors=k)
        neigh.fit(X_train_dat, y_train_dat)
        y_validation_pred = neigh.predict(X_train_folds[n])
        # Fraction of correctly predicted examples, relative to the real
        # size of this validation fold (not a hard-coded fold count).
        num_correct = np.sum(y_validation_pred == y_train_folds[n])
        accuracy = float(num_correct) / len(y_train_folds[n])
        k_to_accuracies.setdefault(k, []).append(accuracy)
for k in sorted(k_to_accuracies):
    print('mean of k=%d,accuracy=%f' % (k, np.mean(k_to_accuracies[k])))

K =  1  is processing
K =  3  is processing
K =  5  is processing
K =  8  is processing
K =  10  is processing
k = 1, accuracy = 0.900800
k = 1, accuracy = 0.892800
mean of k=1,accuracy=0.896800
k = 3, accuracy = 0.901600
k = 3, accuracy = 0.876000
mean of k=3,accuracy=0.888800
k = 5, accuracy = 0.898400
k = 5, accuracy = 0.872000
mean of k=5,accuracy=0.885200
k = 8, accuracy = 0.889600
k = 8, accuracy = 0.865600
mean of k=8,accuracy=0.877600
k = 10, accuracy = 0.888000
k = 10, accuracy = 0.863200
mean of k=10,accuracy=0.875600


In [77]:
#### 50 percent data, leave one out
# One fold per sample.
num_folds = len(Xsample_train)
k_choices = [1, 3, 5, 8, 10]

# Keep the folds as plain lists rather than stacking them into one array.
X_train_folds = np.array_split(Xsample_train, num_folds)
y_train_folds = np.array_split(ysample_train, num_folds)
k_to_accuracies = {}
for k in k_choices:
    # Single-argument print(...) emits the same text on Python 2 and Python 3.
    print('K =  %d  is processing' % k)
    for n in range(num_folds):
        # Train on every sample except sample n, validate on sample n.
        X_train_dat = np.concatenate(X_train_folds[:n] + X_train_folds[n + 1:])
        y_train_dat = np.concatenate(y_train_folds[:n] + y_train_folds[n + 1:])
        neigh = KNeighborsClassifier(n_neighbors=k)
        neigh.fit(X_train_dat, y_train_dat)
        y_validation_pred = neigh.predict(X_train_folds[n])
        # Fraction of correctly predicted examples, relative to the real
        # size of this validation fold.
        num_correct = np.sum(y_validation_pred == y_train_folds[n])
        accuracy = float(num_correct) / len(y_train_folds[n])
        k_to_accuracies.setdefault(k, []).append(accuracy)
for k in sorted(k_to_accuracies):
    print('mean of k=%d,accuracy=%f' % (k, np.mean(k_to_accuracies[k])))

K =  1  is processing
K =  3  is processing
K =  5  is processing
K =  8  is processing
K =  10  is processing
mean of k=1,accuracy=0.917600
mean of k=3,accuracy=0.918000
mean of k=5,accuracy=0.920800
mean of k=8,accuracy=0.911600
mean of k=10,accuracy=0.908800


#### 75% of the training data (TD)

In [78]:
# First three quarters of the 5000-row training subset. Floor division keeps the
# slice bound an integer on both Python 2 and Python 3 (a float bound is rejected).
Xsample_train=X_train2[0:5000 * 3 // 4]
ysample_train=y_train2[0:5000 * 3 // 4]

In [79]:
#### 75 percent data, 5 folds
num_folds = 5
k_choices = [1, 3, 5, 8, 10]

# Keep the folds as plain lists: np.array_split may return folds of unequal
# length, which cannot be stacked into one rectangular array.
X_train_folds = np.array_split(Xsample_train, num_folds)
y_train_folds = np.array_split(ysample_train, num_folds)
k_to_accuracies = {}
for k in k_choices:
    for n in range(num_folds):
        # Train on every fold except fold n, validate on fold n.
        X_train_dat = np.concatenate(X_train_folds[:n] + X_train_folds[n + 1:])
        y_train_dat = np.concatenate(y_train_folds[:n] + y_train_folds[n + 1:])
        neigh = KNeighborsClassifier(n_neighbors=k)
        neigh.fit(X_train_dat, y_train_dat)
        y_validation_pred = neigh.predict(X_train_folds[n])
        # Fraction of correctly predicted examples, relative to the real
        # size of this validation fold (not a hard-coded fold count).
        num_correct = np.sum(y_validation_pred == y_train_folds[n])
        accuracy = float(num_correct) / len(y_train_folds[n])
        k_to_accuracies.setdefault(k, []).append(accuracy)
# Single-argument print(...) emits the same text on Python 2 and Python 3.
for k in sorted(k_to_accuracies):
    print('mean of k=%d,accuracy=%f' % (k, np.mean(k_to_accuracies[k])))

mean of k=1,accuracy=0.921333
mean of k=3,accuracy=0.921067
mean of k=5,accuracy=0.920000
mean of k=8,accuracy=0.919467
mean of k=10,accuracy=0.917333


In [80]:
#### 75 percent data, 2 folds
num_folds = 2
k_choices = [1, 3, 5, 8, 10]

# Keep the folds as plain lists: np.array_split may return folds of unequal
# length, which cannot be stacked into one rectangular array.
X_train_folds = np.array_split(Xsample_train, num_folds)
y_train_folds = np.array_split(ysample_train, num_folds)
k_to_accuracies = {}
for k in k_choices:
    # Single-argument print(...) emits the same text on Python 2 and Python 3.
    print('K =  %d  is processing' % k)
    for n in range(num_folds):
        # Train on every fold except fold n, validate on fold n.
        X_train_dat = np.concatenate(X_train_folds[:n] + X_train_folds[n + 1:])
        y_train_dat = np.concatenate(y_train_folds[:n] + y_train_folds[n + 1:])
        neigh = KNeighborsClassifier(n_neighbors=k)
        neigh.fit(X_train_dat, y_train_dat)
        y_validation_pred = neigh.predict(X_train_folds[n])
        # Fraction of correctly predicted examples, relative to the real
        # size of this validation fold (not a hard-coded fold count).
        num_correct = np.sum(y_validation_pred == y_train_folds[n])
        accuracy = float(num_correct) / len(y_train_folds[n])
        k_to_accuracies.setdefault(k, []).append(accuracy)
for k in sorted(k_to_accuracies):
    print('mean of k=%d,accuracy=%f' % (k, np.mean(k_to_accuracies[k])))

K =  1  is processing
K =  3  is processing
K =  5  is processing
K =  8  is processing
K =  10  is processing
mean of k=1,accuracy=0.909067
mean of k=3,accuracy=0.903467
mean of k=5,accuracy=0.906933
mean of k=8,accuracy=0.900800
mean of k=10,accuracy=0.894933


In [82]:
#### 75 percent data, leave one out
# One fold per sample.
num_folds = len(Xsample_train)
k_choices = [1, 3, 5, 8, 10]

# Keep the folds as plain lists rather than stacking them into one array.
X_train_folds = np.array_split(Xsample_train, num_folds)
y_train_folds = np.array_split(ysample_train, num_folds)
k_to_accuracies = {}
for k in k_choices:
    # Single-argument print(...) emits the same text on Python 2 and Python 3.
    print('K =  %d  is processing' % k)
    for n in range(num_folds):
        # Train on every sample except sample n, validate on sample n.
        X_train_dat = np.concatenate(X_train_folds[:n] + X_train_folds[n + 1:])
        y_train_dat = np.concatenate(y_train_folds[:n] + y_train_folds[n + 1:])
        neigh = KNeighborsClassifier(n_neighbors=k)
        neigh.fit(X_train_dat, y_train_dat)
        y_validation_pred = neigh.predict(X_train_folds[n])
        # Fraction of correctly predicted examples, relative to the real
        # size of this validation fold.
        num_correct = np.sum(y_validation_pred == y_train_folds[n])
        accuracy = float(num_correct) / len(y_train_folds[n])
        k_to_accuracies.setdefault(k, []).append(accuracy)
for k in sorted(k_to_accuracies):
    print('mean of k=%d,accuracy=%f' % (k, np.mean(k_to_accuracies[k])))

K =  1  is processing
K =  3  is processing
K =  5  is processing
K =  8  is processing
K =  10  is processing
mean of k=1,accuracy=0.925867
mean of k=3,accuracy=0.928533
mean of k=5,accuracy=0.926933
mean of k=8,accuracy=0.926400
mean of k=10,accuracy=0.922667


#### euclidean distance

In [115]:
# Point the sample variables back at the first 5000 rows of the training subset
# for the distance-metric comparisons that follow.
Xsample_train=X_train2[0:5000]
ysample_train=y_train2[0:5000]

In [116]:
#### full data, 5 folds
num_folds = 5
k_choices = [1, 3, 5, 8, 10]

# Keep the folds as plain lists: np.array_split may return folds of unequal
# length, which cannot be stacked into one rectangular array.
X_train_folds = np.array_split(Xsample_train, num_folds)
y_train_folds = np.array_split(ysample_train, num_folds)
k_to_accuracies = {}
for k in k_choices:
    # Single-argument print(...) emits the same text on Python 2 and Python 3.
    print('K =  %d  is processing' % k)
    for n in range(num_folds):
        # Train on every fold except fold n, validate on fold n.
        X_train_dat = np.concatenate(X_train_folds[:n] + X_train_folds[n + 1:])
        y_train_dat = np.concatenate(y_train_folds[:n] + y_train_folds[n + 1:])
        # Same procedure as the default-metric run, with the metric named explicitly.
        neigh = KNeighborsClassifier(n_neighbors=k, metric = 'euclidean')
        neigh.fit(X_train_dat, y_train_dat)
        y_validation_pred = neigh.predict(X_train_folds[n])
        # Fraction of correctly predicted examples, relative to the real
        # size of this validation fold.
        num_correct = np.sum(y_validation_pred == y_train_folds[n])
        accuracy = float(num_correct) / len(y_train_folds[n])
        k_to_accuracies.setdefault(k, []).append(accuracy)
for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print('k = %d, accuracy = %f' % (k, accuracy))
    print('mean of k=%d,accuracy=%f' % (k, np.mean(k_to_accuracies[k])))

K =  1  is processing
K =  3  is processing
K =  5  is processing
K =  8  is processing
K =  10  is processing
k = 1, accuracy = 0.928000
k = 1, accuracy = 0.918000
k = 1, accuracy = 0.931000
k = 1, accuracy = 0.933000
k = 1, accuracy = 0.932000
mean of k=1,accuracy=0.928400
k = 3, accuracy = 0.934000
k = 3, accuracy = 0.917000
k = 3, accuracy = 0.923000
k = 3, accuracy = 0.935000
k = 3, accuracy = 0.921000
mean of k=3,accuracy=0.926000
k = 5, accuracy = 0.935000
k = 5, accuracy = 0.923000
k = 5, accuracy = 0.926000
k = 5, accuracy = 0.937000
k = 5, accuracy = 0.919000
mean of k=5,accuracy=0.928000
k = 8, accuracy = 0.923000
k = 8, accuracy = 0.909000
k = 8, accuracy = 0.924000
k = 8, accuracy = 0.930000
k = 8, accuracy = 0.915000
mean of k=8,accuracy=0.920200
k = 10, accuracy = 0.921000
k = 10, accuracy = 0.907000
k = 10, accuracy = 0.918000
k = 10, accuracy = 0.925000
k = 10, accuracy = 0.914000
mean of k=10,accuracy=0.917000


#### Mahalanobis Distance

In [117]:
#### full data, 5 folds
num_folds = 5
k_choices = [1, 3, 5, 8, 10]

# Keep the folds as plain lists: np.array_split may return folds of unequal
# length, which cannot be stacked into one rectangular array.
X_train_folds = np.array_split(Xsample_train, num_folds)
y_train_folds = np.array_split(ysample_train, num_folds)
k_to_accuracies = {}
for k in k_choices:
    # Single-argument print(...) emits the same text on Python 2 and Python 3.
    print('K =  %d  is processing' % k)
    for n in range(num_folds):
        # Train on every fold except fold n, validate on fold n.
        X_train_dat = np.concatenate(X_train_folds[:n] + X_train_folds[n + 1:])
        y_train_dat = np.concatenate(y_train_folds[:n] + y_train_folds[n + 1:])
        # The feature covariance of the training folds parameterises the metric.
        # NOTE(review): confirm this covariance is well-conditioned for the
        # selected features; the recorded run of this cell was interrupted.
        neigh = KNeighborsClassifier(n_neighbors=k, metric = 'mahalanobis', metric_params={'V': np.cov(X_train_dat.T)})
        neigh.fit(X_train_dat, y_train_dat)
        y_validation_pred = neigh.predict(X_train_folds[n])
        # Fraction of correctly predicted examples, relative to the real
        # size of this validation fold.
        num_correct = np.sum(y_validation_pred == y_train_folds[n])
        accuracy = float(num_correct) / len(y_train_folds[n])
        k_to_accuracies.setdefault(k, []).append(accuracy)
for k in sorted(k_to_accuracies):
    for accuracy in k_to_accuracies[k]:
        print('k = %d, accuracy = %f' % (k, accuracy))
    print('mean of k=%d,accuracy=%f' % (k, np.mean(k_to_accuracies[k])))

K =  1  is processing
K =  3  is processing
K =  5  is processing
K =  8  is processing
K =  10  is processing


KeyboardInterrupt: 

#### Evaluate the best model on the test set

In [147]:
# Fit the chosen model (k = 5) on the whole feature-selected training subset.
neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(X_train2, y_train2)
# Only the first `num_eval` held-out rows are scored; name the size once
# instead of repeating the literal in the slices and the denominator.
num_eval = 200
y_eval = y_test[0:num_eval]
y_validation_pred = neigh.predict(X_test2[0:num_eval])
# Fraction of correctly predicted examples, relative to the number of rows
# actually scored (correct even if fewer than `num_eval` rows exist).
num_correct = np.sum(y_validation_pred == y_eval)
print(float(num_correct) / len(y_eval))

0.935
