## K-NN Classifier

In [1]:
from init import *

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn import datasets
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
import pickle
import time

# LOCATION = "/Volumes/My Passport for Mac/Pickle Backups"
LOCATION = "pickles"

In [3]:
%matplotlib inline

In [4]:
X = train_features.reshape(50000, 3*32*32)
Xt = test_features.reshape(10000, 3*32*32)
y = train_labels.flatten()
yt = test_labels.flatten()

In [5]:
msg("[K-NN] Training")
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X, y)
done()

pydump(knn, "pickles/knn_model.pickle")

[K-NN] Training ... done.


In [16]:
msg("[K-NN] Testing Accuracy")

count = 0
start = time.time()
def predict_batch(x):
    global count
    global start
    count += 25
    elapsed = time.time()-start
    print("\rClassifiying {}. Elapsed Time: {}m {}s ... ".format(count, int(elapsed/60), int(elapsed%60)), end="")
    return knn.predict_proba(x)

Xt_pred_proba = np.array([predict_batch(Xt[i*25:(i+1)*25]) for i in range(400)]).reshape(-1, 10)
# Xt_pred = np.apply_along_axis(predict_with_update, 1, Xt).reshape(-1, 1)
# Xt_pred = knn.predict(Xt)

pydump(Xt_pred_proba, "pickles/knn_predict_proba.pickle")

msg(metrics.accuracy_score(yt, Xt_pred_proba.argmax(axis=1).reshape(-1,1)))
done()

Classifiying 10000. Elapsed Time: 40m 9s ... 0.3386 ... done.


## K-NN w/ PCA

In [5]:
X_pca = pyload("/Volumes/My Passport for Mac/Pickle Backups/data_X_pca_200.pickle")
Xt_pca = pyload("/Volumes/My Passport for Mac/Pickle Backups/data_Xt_pca_200.pickle")

In [8]:
msg("[K-NN PCA] Training")
knn_pca = KNeighborsClassifier(n_neighbors=10)
knn_pca.fit(X_pca, y)
done()

pydump(knn_pca, "pickles/knn_pca_model.pickle")

[K-NN PCA] Training ... done.


In [10]:
msg("[K-NN PCA] Testing Accuracy")

count = 0
start = time.time()
def predict_batch(x):
    global count
    global start
    count += 25
    elapsed = time.time()-start
    print("\rClassifiying {}. Elapsed Time: {}m {}s ... ".format(count, int(elapsed/60), int(elapsed%60)), end="")
    return knn_pca.predict_proba(x)

Xt_pca_pred_proba = np.array([predict_batch(Xt_pca[i*25:(i+1)*25]) for i in range(400)]).reshape(-1, 10)
# Xt_pred = np.apply_along_axis(predict_with_update, 1, Xt).reshape(-1, 1)
# Xt_pred = knn.predict(Xt)

pydump(Xt_pca_pred_proba, "pickles/knn_pca_predict_proba.pickle")

msg(metrics.accuracy_score(yt, Xt_pca_pred_proba.argmax(axis=1).reshape(-1,1)))
done()

Classifiying 10000. Elapsed Time: 2m 9s ... 0.3654 ... done.


In [12]:
pca_100 = PCA(n_components=100)
pca_100.fit(X,y)

PCA(copy=True, n_components=100, whiten=False)

In [13]:
X_pca_100 = pca_100.transform(X)
Xt_pca_100 = pca_100.transform(Xt)

pydump(pca_100, "/Volumes/My Passport for Mac/Pickle Backups/pca_100.pickle")
pydump(X_pca_100, "/Volumes/My Passport for Mac/Pickle Backups/X_pca_100.pickle")
pydump(Xt_pca_100, "/Volumes/My Passport for Mac/Pickle Backups/Xt_pca_100.pickle")

The history saving thread hit an unexpected error (OperationalError('unable to open database file',)).History will not be written to the database.


### PCA / 100

In [5]:
def KNN_PCA(c=100):
    msg("[K-NN PCA {}] Preparing PCA ...".format(c))
    pca_100 = PCA(n_components=c)
    pca_100.fit(X,y)
    
    X_pca_100 = pca_100.transform(X)
    Xt_pca_100 = pca_100.transform(Xt)

    pydump(pca_100, "{}/pca_{}.pickle".format(LOCATION, c))
    pydump(X_pca_100, "{}/X_pca_{}.pickle".format(LOCATION, c))
    pydump(Xt_pca_100, "{}/Xt_pca_{}.pickle".format(LOCATION, c))
    
    msg("[K-NN PCA {}] Training".format(c))
    knn_pca_100 = KNeighborsClassifier(n_neighbors=10)
    knn_pca_100.fit(X_pca_100, y)
    done()
    
    pydump(knn_pca_100, "{}/knn_pca_{}_model.pickle".format(LOCATION, c))

    msg("[K-NN PCA {}] Testing Accuracy".format(c))
    
    global count
    global start
    count = 0
    start = time.time()
    def predict_batch(x):
        global count
        global start
        count += 25
        elapsed = time.time()-start
        print("\rClassifiying {}. Elapsed Time: {}m {}s ... ".format(count, int(elapsed/60), int(elapsed%60)), end="")
        return knn_pca_100.predict_proba(x)

    Xt_pca_pred_proba = np.array([predict_batch(Xt_pca_100[i*25:(i+1)*25]) for i in range(400)]).reshape(-1, 10)
    # Xt_pred = np.apply_along_axis(predict_with_update, 1, Xt).reshape(-1, 1)
    # Xt_pred = knn.predict(Xt)

    pydump(Xt_pca_pred_proba, "{}/knn_pca_{}_predict_proba.pickle".format(LOCATION, c))

    msg(metrics.accuracy_score(yt, Xt_pca_pred_proba.argmax(axis=1).reshape(-1,1)))
    done()

In [6]:
KNN_PCA(75)

[K-NN PCA 75] Preparing PCA ... ... [K-NN PCA 75] Training ... done.
Classifiying 10000. Elapsed Time: 1m 12s ... 0.3977 ... done.


In [7]:
KNN_PCA(50)

[K-NN PCA 50] Preparing PCA ... ... [K-NN PCA 50] Training ... done.
Classifiying 10000. Elapsed Time: 0m 48s ... 0.4012 ... done.


In [None]:
for i in [40,30,25,20,15,10,5,4,3,2,1]:
    KNN_PCA(i)

[K-NN PCA 40] Preparing PCA ... ... 

In [9]:
import time
class NearestNeighbor(object):
  def __init__(self):
    pass

  def train(self, X, y):
    """ X is N x D where each row is an example. Y is 1-dimension of size N """
    # the nearest neighbor classifier simply remembers all the training data
    self.Xtr = X
    self.ytr = y

  def predict(self, X):
    """ X is N x D where each row is an example we wish to predict label for """
    num_test = X.shape[0]
    # lets make sure that the output type matches the input type
    Ypred = np.zeros(num_test, dtype = self.ytr.dtype)
    
    start = time.time()
    # loop over all test rows
    for i in range(num_test):
      # find the nearest training image to the i'th test image
      # using the L1 distance (sum of absolute value differences)
      distances = np.sum(np.abs(self.Xtr - X[i,:]), axis = 1)
      min_index = np.argmin(distances) # get the index with smallest distance
      Ypred[i] = self.ytr[min_index] # predict the label of the nearest example
      elapsed = time.time()-start
      print("\rClassified {} ... Elapsed Time: {}m {}s".format(i, int(elapsed/60), int(elapsed%60)),end="")

    return Ypred

nn = NearestNeighbor() # create a Nearest Neighbor classifier class
nn.train(X, y) # train the classifier on the training images and labels
Xt_predict = nn.predict(Xt) # predict labels on the test images
# and now print the classification accuracy, which is the average number
# of examples that are correctly predicted (i.e. label matches)
print('accuracy: %f' % ( metrics.accuracy_score(yt, Xt_predict)))

Classified 9999 ... Elapsed Time: 79m 44saccuracy: 0.249200
