## Miniproject: Classification with k-Nearest Neighbors 

In [1]:
# packages
import os
import pandas as pd
import math
from scipy import io
import numpy as np
from numpy import squeeze
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import zero_one_loss
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot') 

In [2]:
# Load the per-trial session table from the working directory.
allData = pd.read_csv('sessionTrialTable.csv')

# Remove any rows with NaNs.
# dropna is called as a method with an explicit keyword: pandas >= 2.0 makes
# `axis` keyword-only, so the positional form
# pd.DataFrame.dropna(allData, 'index') raises TypeError there.
allDataNN = allData.dropna(axis='index')
print("After Drop NaN")
print("num_rows =",len(allDataNN))

After Drop NaN
num_rows = 4997


In [3]:
# Pull the target label (ketBool) and a few per-trial metadata columns out of
# the NaN-free table.
ketBool, timeSinceKetamine, sessionDate, trialNum = (
    allDataNN[name]
    for name in ('ketamineAdministered', 'timeSinceKetamine', 'sessionDate', 'trialNum')
)

# Columns used as model inputs.
feature_columns = [
    'animalName', 'totalCellNum', 'gender', 'genotype', 'weight_g',
    'ketamine_day', 'correlationScore', 'lickAccuracy', 'lickNumber',
    'avgFR', 'avgSingleCellVariance', 'varianceFR', 'avgTrialSpeed',
    'varianceSpeed', 'medianCellDepth',
]
neuralData = allDataNN[feature_columns]

# Convert categorical columns to integer codes. Work on a copy so neuralData
# keeps the original values; the single encoder is refit for each column.
le = LabelEncoder()
neuralData_LE = neuralData.copy()
for categorical_column in ('animalName', 'gender', 'genotype'):
    neuralData_LE[categorical_column] = le.fit_transform(neuralData_LE[categorical_column])

In [4]:
# Hold out 20% of the trials for a final evaluation.
# The hold-out pieces are named X_ho / y_ho because the cross-validation cell
# rebinds X_test / y_test to per-fold lists, which would silently discard a
# hold-out set stored under those names. The fixed random_state makes the
# split (and every number derived from it) reproducible across kernel restarts.
X, X_ho, y, y_ho = train_test_split(
    neuralData_LE,
    ketBool.values.ravel(),
    test_size=0.2,
    random_state=2019,
)

In [5]:
# Split for cross validation, use 10 stratified folds
num_folds = 10
XA = np.array(X)
yA = np.array(y)
skf = StratifiedKFold(n_splits=num_folds)

# Materialize the (fit indices, validation indices) pair for every fold once,
# then slice the arrays; list position i corresponds to fold i.
fold_indices = list(skf.split(XA, yA))
X_train = [XA[fit_idx] for fit_idx, _ in fold_indices]
X_test = [XA[val_idx] for _, val_idx in fold_indices]
y_train = [yA[fit_idx] for fit_idx, _ in fold_indices]
y_test = [yA[val_idx] for _, val_idx in fold_indices]

In [6]:
# KNN (k=5) fit and scored on the whole training set.
# NOTE: each data point is included in its own nearest neighbors, so these
# numbers are optimistic and a bit meaningless.
knn = KNeighborsClassifier(metric='euclidean', n_neighbors=5)
knn.fit(X, y)
y_pred = knn.predict(X)

train_error = zero_one_loss(y, y_pred)
train_accuracy = accuracy_score(y, y_pred)
print(train_error)
print(train_accuracy)

0.0525394045534151
0.9474605954465849


In [7]:
# Cross-validate a k=5 classifier: fit on each fold's training part, score on
# its held-out part, then average the per-fold metrics.
zo_loss, accuracy = [], []
for i in range(num_folds):
    fit_X, fit_y = X_train[i], y_train[i]
    val_X, val_y = X_test[i], y_test[i]

    knn = KNeighborsClassifier(n_neighbors=5, metric='euclidean')
    knn.fit(fit_X, fit_y)
    y_pred = knn.predict(val_X)

    zo_loss.append(zero_one_loss(val_y, y_pred))
    accuracy.append(accuracy_score(val_y, y_pred))

avg_zo_loss, avg_acc = np.mean(zo_loss), np.mean(accuracy)
print("Average zero-one loss across folds:", avg_zo_loss)
print("Average accuracy across folds:", avg_acc)

Average zero-one loss across folds: 0.07880409408808806
Average accuracy across folds: 0.921195905911912


In [8]:
# Sweep over candidate neighbor counts. For each k, run the full
# cross-validation and record the mean zero-one loss and mean accuracy, so
# avg_zo[j] / avg_ac[j] correspond to k_values[j].
k_values = [1,3,5,6,7,8,10,15,20]
avg_zo, avg_ac = [], []
for k in k_values:
    zo_loss, accuracy = [], []
    for i in range(num_folds):
        knn = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
        knn.fit(X_train[i], y_train[i])
        y_pred = knn.predict(X_test[i])
        zo_loss.append(zero_one_loss(y_test[i], y_pred))
        accuracy.append(accuracy_score(y_test[i], y_pred))
    avg_zo.append(np.mean(zo_loss))
    avg_ac.append(np.mean(accuracy))

In [9]:
# Mean zero-one loss across folds, one entry per k in [1, 3, 5, 6, 7, 8, 10, 15, 20].
avg_zo

[0.08831539415871349,
 0.0747915752598454,
 0.07880409408808806,
 0.08455723942024637,
 0.08456161601010007,
 0.08655787536172102,
 0.08981038475240471,
 0.09506478634241465,
 0.10607295670597941]

In [10]:
# Mean accuracy across folds, one entry per k in [1, 3, 5, 6, 7, 8, 10, 15, 20].
avg_ac

[0.9116846058412864,
 0.9252084247401546,
 0.921195905911912,
 0.9154427605797537,
 0.9154383839899,
 0.9134421246382789,
 0.9101896152475952,
 0.9049352136575852,
 0.8939270432940205]

### Best model: k=3, cross-validated accuracy = 0.925

### Now what if we augment the data with interaction features? Does that help?

In [11]:
# Augment the label-encoded features with pairwise interaction (product) terms.
AugData = neuralData_LE.copy()

# Every column in the first group is multiplied by every column in the second.
# The iteration order fixes the order of the new columns.
interaction_left = [
    'animalName', 'totalCellNum', 'gender', 'genotype',
    'weight_g', 'ketamine_day', 'medianCellDepth',
]
interaction_right = [
    'correlationScore', 'lickAccuracy', 'lickNumber', 'avgFR',
    'avgSingleCellVariance', 'varianceFR', 'avgTrialSpeed', 'varianceSpeed',
]

for left in interaction_left:
    for right in interaction_right:
        # New column is named "<left>x<Right>" with the right-hand name's
        # first letter upper-cased, e.g. 'genderxAvgFR'.
        product_name = left + 'x' + right[0].upper() + right[1:]
        AugData[product_name] = AugData[left] * AugData[right]

# Standardize every column (original and interaction) to zero mean and unit variance.
# NOTE(review): the scaler is fit on all rows, before the train/hold-out split,
# so hold-out statistics influence the scaling; consider fitting it on the
# training portion only.
stdNeuralDataAug = StandardScaler().fit_transform(AugData)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


In [35]:
# Hold out 20% of the standardized, augmented data for the final evaluation;
# the fixed random_state makes the split repeatable.
X, X_ho, y, y_ho = train_test_split(
    stdNeuralDataAug,
    ketBool.values.ravel(),
    test_size=0.2,
    random_state=2019,
)

In [27]:
# Split for cross validation, use 10 stratified folds
num_folds = 10
XA = np.array(X)
yA = np.array(y)
skf = StratifiedKFold(n_splits=num_folds)

# Collect each fold's (fit indices, validation indices) once, then slice the
# arrays so that list position i holds the data for fold i.
fold_indices = list(skf.split(XA, yA))
X_train = [XA[fit_idx] for fit_idx, _ in fold_indices]
X_test = [XA[val_idx] for _, val_idx in fold_indices]
y_train = [yA[fit_idx] for fit_idx, _ in fold_indices]
y_test = [yA[val_idx] for _, val_idx in fold_indices]

In [31]:
# Repeat the neighbor-count sweep on the augmented folds. For each k, the mean
# zero-one loss and mean accuracy over all folds are appended, so
# avg_zo[j] / avg_ac[j] correspond to k_values[j].
k_values = [1,3,5,6,7,8,10,15,20]
avg_zo, avg_ac = [], []
for k in k_values:
    zo_loss, accuracy = [], []
    for i in range(num_folds):
        knn = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
        knn.fit(X_train[i], y_train[i])
        y_pred = knn.predict(X_test[i])
        zo_loss.append(zero_one_loss(y_test[i], y_pred))
        accuracy.append(accuracy_score(y_test[i], y_pred))
    avg_zo.append(np.mean(zo_loss))
    avg_ac.append(np.mean(accuracy))

In [32]:
# Mean zero-one loss across folds on the augmented data, one entry per k in
# [1, 3, 5, 6, 7, 8, 10, 15, 20].
avg_zo

[0.06780590816192601,
 0.06078963931024571,
 0.06379527528297052,
 0.06830092688079302,
 0.06879342370889821,
 0.07155155344720908,
 0.07205093313083208,
 0.07330219251370322,
 0.07580595191219947]

In [33]:
# Mean accuracy across folds on the augmented data, one entry per k in
# [1, 3, 5, 6, 7, 8, 10, 15, 20].
avg_ac

[0.932194091838074,
 0.9392103606897544,
 0.9362047247170293,
 0.931699073119207,
 0.9312065762911018,
 0.928448446552791,
 0.9279490668691679,
 0.9266978074862969,
 0.9241940480878006]

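##### Augmentation gets us to a best cross-validated accuracy of 0.939 with k = 3 (note: the augmented features are also standardized, unlike the baseline, so part of the gain may come from scaling)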

In [34]:
# Refit the k=3 classifier on the full (non-hold-out) augmented training data.
knn = KNeighborsClassifier(metric='euclidean', n_neighbors=3)
knn.fit(X, y)


In [36]:
# Score the fitted classifier on the hold-out split.
y_pred = knn.predict(X_ho)
accuracy = accuracy_score(y_ho, y_pred)
zo_loss = zero_one_loss(y_ho, y_pred)

In [37]:
# Accuracy of the classifier on the hold-out split.
accuracy

0.931