In [99]:
import h5py
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


In [100]:
df = h5py.File("SVHN_single_grey1.h5",'r')

In [101]:
type(df)

h5py._hl.files.File

In [102]:
#Extract data to build KNN 

In [103]:
X_train = df['X_train'][:3000]
y_train = df['y_train'][:3000]
X_test = df['X_test'][:1000]
y_test = df['y_test'][:1000]

In [104]:
nsamples, nx, ny = X_train.shape
nsamples_test,nx_test,ny_test = X_test.shape
X_train = X_train.reshape((nsamples,nx*ny))
X_test = X_test.reshape((nsamples_test,nx_test*ny_test))

In [105]:
print(len(X_train))
print(len(X_test))

3000
1000


In [106]:
#keeping 10% of data for testing 

In [107]:
print("training data points :{}".format(len(X_train)))
print("testing data points :{}".format(len(X_test)))

training data points :3000
testing data points :1000


In [108]:
#Initialize knn with k 

k = range(1,30,2)
accuracies = []

In [109]:
#Idea is to loop over various values of k and then finding the optimal one 

for k in range(1,30,2):
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train,y_train)
    score = model.score(X_test,y_test)
    print("k=",k," Accuracy",score*100)
    accuracies.append(score)

k= 1  Accuracy 32.1
k= 3  Accuracy 30.7
k= 5  Accuracy 33.300000000000004
k= 7  Accuracy 32.800000000000004
k= 9  Accuracy 34.599999999999994
k= 11  Accuracy 34.5
k= 13  Accuracy 34.4
k= 15  Accuracy 34.699999999999996
k= 17  Accuracy 35.4
k= 19  Accuracy 35.0
k= 21  Accuracy 34.8
k= 23  Accuracy 33.300000000000004
k= 25  Accuracy 33.5
k= 27  Accuracy 33.6
k= 29  Accuracy 33.2


In [110]:
#Build model with k = 15

In [114]:
model = KNeighborsClassifier(n_neighbors=17)
model.fit(X_train,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=17, p=2,
           weights='uniform')

In [115]:
#Print the score and then confusion matrix

model.score(X_test,y_test)*100

35.4

In [116]:
#Print the classification report 
print(classification_report(y_test,model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.35      0.48      0.40       117
           1       0.31      0.56      0.40        93
           2       0.38      0.35      0.36        92
           3       0.30      0.31      0.30        85
           4       0.38      0.43      0.40        96
           5       0.36      0.19      0.25       105
           6       0.23      0.28      0.25        94
           7       0.61      0.53      0.57       107
           8       0.43      0.21      0.28       100
           9       0.28      0.21      0.24       111

   micro avg       0.35      0.35      0.35      1000
   macro avg       0.36      0.35      0.35      1000
weighted avg       0.37      0.35      0.35      1000



In [117]:
#Confusion matrix

print(confusion_matrix(y_test,model.predict(X_test)))

[[56 10  3  3 13  3 15  5  3  6]
 [ 8 52  4  4  9  4  7  3  1  1]
 [ 6 13 32 12  3  2  3  7  3 11]
 [ 8 15  6 26  3  7  7  4  2  7]
 [ 6 19  6  7 41  3  5  4  1  4]
 [12 21  5 11  4 20 12  7  7  6]
 [25  8  1  5 10  4 26  1  4 10]
 [ 3 15 11  9  4  4  2 57  0  2]
 [20  5  5  3 11  5 18  1 21 11]
 [18 11 11  8  9  3 16  5  7 23]]


In [118]:
#KNN is not doing any good lets do neural networks 

In [120]:
import tensorflow as tf


In [154]:
X_train = df['X_train'][:]
y_train = df['y_train'][:]
X_test = df['X_test'][:]
y_test = df['y_test'][:]

In [155]:
print("training data points :{}".format(len(X_train)))
print("testing data points :{}".format(len(X_test)))



training data points :42000
testing data points :18000


In [156]:
y_train = tf.keras.utils.to_categorical(y_train, num_classes=10)
y_test = tf.keras.utils.to_categorical(y_test, num_classes=10)

In [157]:
nsamples, nx, ny = X_train.shape
nsamples_test,nx_test,ny_test = X_test.shape
X_train = X_train.reshape((nsamples,nx*ny))
X_test = X_test.reshape((nsamples_test,nx_test*ny_test))

In [217]:
#Initialize the Graph 

model = tf.keras.models.Sequential()

#Normalise the data  

model.add(tf.keras.layers.BatchNormalization())

In [218]:
#?tf.keras.layers.Reshape

In [219]:
#Build the Graph

model.add(tf.keras.layers.Dense(200, activation='relu'))

model.add(tf.keras.layers.Dense(100, activation='relu'))

model.add(tf.keras.layers.Dense(60, activation='relu'))

model.add(tf.keras.layers.Dense(30, activation='relu'))

model.add(tf.keras.layers.Dense(10, activation='softmax'))

In [220]:
#Create optimizer 

sgd_opt = tf.keras.optimizers.SGD(lr=.05)

In [221]:
#Compile the model 

model.compile(optimizer=sgd_opt,loss='categorical_crossentropy',metrics=['accuracy'])

In [222]:
model.fit(X_train,y_train,
         validation_data=(X_test,y_test),
         epochs=50,
         batch_size=32)

Train on 42000 samples, validate on 18000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x10b3120eb38>

In [209]:
y_predicted = model.predict(X_test)

In [210]:
y_test[0]
y_test[1]

array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0.], dtype=float32)

In [249]:
K=np.argmax(y_predicted[0])
y_predicted[1]

array([6.3866377e-05, 8.3509386e-03, 1.1852890e-02, 3.5921633e-03,
       8.6426735e-06, 1.3220310e-04, 5.1556826e-03, 9.2254573e-01,
       1.7186999e-04, 1.2442768e-03], dtype=float32)

In [212]:
model.evaluate(X_test, y_test, verbose=False)

[0.5664899996783999, 0.8371111]

In [259]:
y_pred = (y_predicted > 0.5).astype(int) 
line_y_pred=np.argmax(y_pred, axis=1)
line_y_test=np.argmax(y_test, axis=1)

In [260]:
line_y_pred[1]

7

In [262]:
print(confusion_matrix(line_y_pred,line_y_test))

[[1764  435  317  754  371  433  366  323  554  776]
 [  11 1348   10    9   19    5    7   27    7   12]
 [   2   10 1425   19    8    8    7   56    7   13]
 [   4    4   15  869    4   11    2    5   12    6]
 [   4   10    5    3 1390    6    8    4    2   12]
 [   0    2    3   37    1 1263   45    0   12   15]
 [  16    3    3   10   10   33 1383    1   42    1]
 [   7   14   20    9    6    2    2 1390    2    6]
 [   0    2    2    7    0    6   11    1 1169   14]
 [   6    0    3    2    3    1    1    1    5  949]]


In [263]:
print(classification_report(line_y_pred,line_y_test))

              precision    recall  f1-score   support

           0       0.97      0.29      0.45      6093
           1       0.74      0.93      0.82      1455
           2       0.79      0.92      0.85      1555
           3       0.51      0.93      0.66       932
           4       0.77      0.96      0.85      1444
           5       0.71      0.92      0.80      1378
           6       0.75      0.92      0.83      1502
           7       0.77      0.95      0.85      1458
           8       0.65      0.96      0.77      1212
           9       0.53      0.98      0.68       971

   micro avg       0.72      0.72      0.72     18000
   macro avg       0.72      0.88      0.76     18000
weighted avg       0.80      0.72      0.68     18000



**Upon checking the classification report we can confirm that DNN is doing far better as compared to the KNN F1 score for 
knn was in range from .25 to .57 where as in DNN model F1 score is from .45 to .85**

**When we see the classification score it is 83% for DNN where as 35.4 for KNN although we took less no of data points in KNN **

**Precision and recall is also low for KNN as F1 score has given clear intution **

Trade off :

Although we are getting good score in DNN but we don't have control over algoritham it is totally black box where as in traditional machine learning algoritham we can tell or explain how the predictions are made like in our KNN we know that we have taken 17 nearest neighbour and we will predict the class according to higher no of votes 