In [124]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier #KNN classifier
from sklearn.svm import SVC #SVM classifier
from sklearn.ensemble import RandomForestClassifier #Random forest classifier
from sklearn.metrics import accuracy_score


In [125]:
train = pd.read_csv('/content/drive/MyDrive/AML/digit-train.csv')
test = pd.read_csv('/content/drive/MyDrive/AML/digit-test.csv')

In [126]:
print("Training set shape : ", train.shape)
print("Test set shape : ", test.shape)

Training set shape :  (4198, 785)
Test set shape :  (4198, 785)


In [127]:
train.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [128]:
test.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,7,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [129]:
cols = []
for (columnName, columnData) in train.iteritems():
    x = train[columnName].unique()
    if (len(x) == 1):
      if x ==0 :
        cols.append(columnName)

len(cols) #127 columns with only 0 values. we can get rid of these columns
train = train.drop(columns = cols)

In [130]:
cols2 = []
for (columnName, columnData) in test.iteritems():
    x = test[columnName].unique()
    if (len(x) == 1):
      if x ==0 :
        cols2.append(columnName)

#if len(cols2) == len(cols):
  #print("yes, train and test set 0 valued columns are same")
test = test.drop(columns = cols)

len(cols2)

129

In [131]:
#No null values
train.isnull().values.any()

False

In [132]:
#create a dataframe with all training data except the target column
X_train = train.drop(columns=['label'])
X_test = test.drop(columns=['label'])
#separate target values
y_train = train['label'].values
y_test = test['label'].values


## KNN ##

In [133]:
# Create KNN classifier
scores = []
for k in range(2,10):
  knn = KNeighborsClassifier(n_neighbors = k)
  knn.fit(X_train,y_train)
  accuracy = knn.score(X_test, y_test)
  scores.append([k,accuracy,knn])


In [134]:
scores.sort(key = lambda x: x[1], reverse = True)
scores

[[3, 0.9359218675559791, KNeighborsClassifier(n_neighbors=3)],
 [5, 0.9344926155312053, KNeighborsClassifier()],
 [4, 0.9333015721772272, KNeighborsClassifier(n_neighbors=4)],
 [6, 0.9306812767984755, KNeighborsClassifier(n_neighbors=6)],
 [7, 0.9304430681276799, KNeighborsClassifier(n_neighbors=7)],
 [8, 0.9304430681276799, KNeighborsClassifier(n_neighbors=8)],
 [9, 0.92663172939495, KNeighborsClassifier(n_neighbors=9)],
 [2, 0.9232968080038113, KNeighborsClassifier(n_neighbors=2)]]

In [135]:
bestmodel = scores[0]
bestmodel[2].predict(X_test)[0:5]
print("Best model number of neighbors = ",bestmodel[0])
print("Best model accuracy = ",bestmodel[1])

Best model number of neighbors =  3
Best model accuracy =  0.9359218675559791


## SVM ##

In [136]:
# Making the SVM Classifer
kernel_list = ['linear','poly','rbf','sigmoid']
scores_svm = []
for i in kernel_list:
  classifier = SVC(kernel=i)
  classifier.fit(X_train, y_train)
  y_pred = classifier.predict(X_test)
  accuracy = accuracy_score(y_test,y_pred)
  scores_svm.append([i,accuracy,classifier])


In [137]:
scores_svm.sort(key = lambda x: x[1], reverse = True)
scores_svm

[['rbf', 0.9537875178656503, SVC()],
 ['poly', 0.9406860409718913, SVC(kernel='poly')],
 ['linear', 0.9090042877560743, SVC(kernel='linear')],
 ['sigmoid', 0.8384945212005717, SVC(kernel='sigmoid')]]

In [138]:
bestmodel_svm = scores_svm[0]
print(bestmodel_svm[2].predict(X_test)[0:5],"\n")

print("Best model kernel = ",bestmodel_svm[0])
print("Best model accuracy = ",bestmodel_svm[1])

[0 0 7 3 5] 

Best model kernel =  rbf
Best model accuracy =  0.9537875178656503


## Random Forest ##

In [139]:
#Create a Gaussian Classifier
scores_rf = []
for estimator in range(100,1000,100):
  rf_classifier = RandomForestClassifier(n_estimators=estimator)
  rf_classifier.fit(X_train,y_train)
  y_pred_rf=rf_classifier.predict(X_test)
  rf_accuracy = accuracy_score(y_test, y_pred_rf)
  scores_rf.append([estimator,rf_accuracy,rf_classifier])


In [140]:
scores_rf.sort(key = lambda x: x[1], reverse = True)
scores_rf

[[800, 0.9435445450214388, RandomForestClassifier(n_estimators=800)],
 [300, 0.9418770843258695, RandomForestClassifier(n_estimators=300)],
 [500, 0.9416388756550739, RandomForestClassifier(n_estimators=500)],
 [900, 0.9416388756550739, RandomForestClassifier(n_estimators=900)],
 [400, 0.9411624583134827, RandomForestClassifier(n_estimators=400)],
 [200, 0.940924249642687, RandomForestClassifier(n_estimators=200)],
 [700, 0.9406860409718913, RandomForestClassifier(n_estimators=700)],
 [600, 0.939018580276322, RandomForestClassifier(n_estimators=600)],
 [100, 0.9349690328727965, RandomForestClassifier()]]

In [141]:
bestmodel_rf = scores_rf[0]
print(bestmodel_rf[2].predict(X_test)[0:5],"\n")

print("Best model number of estimators = ",bestmodel_rf[0])
print("Best model accuracy = ",bestmodel_rf[1])

[0 0 7 3 5] 

Best model number of estimators =  800
Best model accuracy =  0.9435445450214388
