# Algorithm to learn digit numbers

In [1]:
import numpy as np
import pandas as pd
import sklearn as sk

Read the training data and the data that has to be predicted

In [2]:
data_train_df = pd.read_csv('train.csv')

In [3]:
data_train_df.head(4)

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
data_train_df.shape

(42000, 785)

In [12]:
#check if there are any nan values
data_train_df.isnull().values.any()

False

We have just found the shape of our training data: 42000 samples with 784 features (as the first column is the number, hence 'y').

This dataframe has to be converted to a numpy array to start using sklearn

In [13]:
data_train = data_train_df.as_matrix()

First, lets define what our X and y will be

In [14]:
X = data_train[:,1:]
y = data_train[:,0]

Now, lets divide this data into training (75%) and test data (25%)

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

Lets scale our data

In [16]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()


X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)



Now we can start training our algorithms.  First lets use neural networks

In [17]:
#using neural networks
from sklearn.neural_network import MLPClassifier

#train the algorithm
model_nn = MLPClassifier(hidden_layer_sizes=100)
model_nn.fit(X_train, y_train);




In [18]:
from sklearn.metrics import accuracy_score

#predict the test data
y_pred_test_nn = model_nn.predict(X_test) 

#calculate the accuracy
print('neural networks accuracy =', accuracy_score(y_pred_test_nn, y_test))

neural networks accuracy = 0.971238095238


Using Decision tree

In [20]:
from sklearn.tree import DecisionTreeClassifier

model_dt = DecisionTreeClassifier()
model_dt.fit(X_train, y_train)
y_pred_test_dt = model_dt.predict(X_test)

print('Decision Tree accuracy =', accuracy_score(y_pred_test_dt, y_test))

Decision Tree accuracy = 0.851333333333


Finally, using a random forest classifier

In [21]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier(criterion='entropy', n_estimators=20, random_state=33, min_samples_leaf=10)
model_rf.fit(X_train, y_train)
y_pred_test_rf = model_rf.predict(X_test)

print('Random forest accuracy =', accuracy_score(y_pred_test_rf, y_test))

Random forest accuracy = 0.940952380952


For the sake of curiosity, lets look at the confusion matrix of the random forest algorithm

In [22]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_pred_test_rf, y_test)

array([[1055,    0,    8,    3,    2,    2,    6,    0,    3,   11],
       [   0, 1148,    4,    1,    4,    5,    4,    7,   13,    3],
       [   1,    5,  960,   21,    2,    2,    1,   18,    5,    6],
       [   1,    8,    8, 1021,    0,   24,    0,    1,   17,   19],
       [   1,    3,    7,    3,  967,    7,    2,    8,   10,   21],
       [   1,    1,    4,   31,    2,  847,    9,    0,   13,    8],
       [   5,    2,   14,    2,   12,   20,  964,    1,    9,    0],
       [   1,    3,    9,    7,    5,    0,    0, 1037,    2,   10],
       [  10,    6,    6,   19,    7,   12,    6,    4,  896,   11],
       [   2,    2,    5,   14,   31,    7,    0,   20,   10,  985]])