## Classification on MNIST dataset - Voulgari Eleni

### Step 1: Prepare Project

   1. Load libraries
   2. Load dataset

In [50]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV
from pandas.plotting import scatter_matrix
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.ensemble import VotingClassifier

In [4]:
mnist = pd.read_csv('digit_recognizer_dataset.csv')
target = mnist.label
data = mnist.drop(['label'], axis=1)

### Step 2: Define Problem

##### What is your task? What are your goals? What do you want to achieve?

We are trying to evaluate a bunch of different classification algorithms on hand written digit recognition.
The goal is to choose the best model for predicting the hand written digits and try to improve the accuracy of the recognition.

### Step 3: Exploratory Analysis

##### Understand your data: Take a “peek” of your data, answer basic questions about the dataset. Summarise your data. Explore descriptive statistics and visualisations.

In [5]:
print data.head(10)
print target.head(10)

   pixel0  pixel1  pixel2  pixel3  pixel4  pixel5  pixel6  pixel7  pixel8  \
0       0       0       0       0       0       0       0       0       0   
1       0       0       0       0       0       0       0       0       0   
2       0       0       0       0       0       0       0       0       0   
3       0       0       0       0       0       0       0       0       0   
4       0       0       0       0       0       0       0       0       0   
5       0       0       0       0       0       0       0       0       0   
6       0       0       0       0       0       0       0       0       0   
7       0       0       0       0       0       0       0       0       0   
8       0       0       0       0       0       0       0       0       0   
9       0       0       0       0       0       0       0       0       0   

   pixel9    ...     pixel774  pixel775  pixel776  pixel777  pixel778  \
0       0    ...            0         0         0         0         0   
1     

In [6]:
print data.shape
print target.shape

(42000, 784)
(42000,)


In [7]:
pd.isnull(mnist).any()

label       False
pixel0      False
pixel1      False
pixel2      False
pixel3      False
pixel4      False
pixel5      False
pixel6      False
pixel7      False
pixel8      False
pixel9      False
pixel10     False
pixel11     False
pixel12     False
pixel13     False
pixel14     False
pixel15     False
pixel16     False
pixel17     False
pixel18     False
pixel19     False
pixel20     False
pixel21     False
pixel22     False
pixel23     False
pixel24     False
pixel25     False
pixel26     False
pixel27     False
pixel28     False
            ...  
pixel754    False
pixel755    False
pixel756    False
pixel757    False
pixel758    False
pixel759    False
pixel760    False
pixel761    False
pixel762    False
pixel763    False
pixel764    False
pixel765    False
pixel766    False
pixel767    False
pixel768    False
pixel769    False
pixel770    False
pixel771    False
pixel772    False
pixel773    False
pixel774    False
pixel775    False
pixel776    False
pixel777    False
pixel778  

In [8]:
data.describe()

        pixel0   pixel1   pixel2   pixel3   pixel4   pixel5   pixel6   pixel7  \
count  42000.0  42000.0  42000.0  42000.0  42000.0  42000.0  42000.0  42000.0   
mean       0.0      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
std        0.0      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
min        0.0      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
25%        0.0      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
50%        0.0      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
75%        0.0      0.0      0.0      0.0      0.0      0.0      0.0      0.0   
max        0.0      0.0      0.0      0.0      0.0      0.0      0.0      0.0   

        pixel8   pixel9    ...         pixel774      pixel775      pixel776  \
count  42000.0  42000.0    ...     42000.000000  42000.000000  42000.000000   
mean       0.0      0.0    ...         0.219286      0.117095      0.059024   
std        0.0      0.0    ...   

In [13]:
mnist.groupby('label').size()

label
0    4132
1    4684
2    4177
3    4351
4    4072
5    3795
6    4137
7    4401
8    4063
9    4188
dtype: int64

### Step 4: Prepare Data

##### Data Cleaning/Data Wrangling/Collect more data (if necessary).

In [17]:
normalizer = Normalizer()
normalized_data = normalizer.fit_transform(data)
scaler = StandardScaler()
scaled_data = scaler.fit_transform(normalized_data)
scaled_data[1]

array([  0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
        -5.25695663e-03,  -6.85749399e-03,  -4.87955846e-03,
        -4.87955846e-03,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
         0.00000000e+00,   0.00000000e+00,  -4.87955846e-03,
        -5.59459946e-03,  -9.29615683e-03,  -1.17896976e-02,
        -1.61397448e-02,  -2.03688475e-02,  -2.60785075e-02,
        -2.99119891e-02,  -3.16459385e-02,  -3.23331345e-02,
        -3.25905358e-02,  -2.96669935e-02,  -2.96319963e-02,
        -2.77933638e-02,  -2.34898481e-02,  -1.75477552e-02,
        -1.51109978e-02,

### Step 5: Feature Engineering

##### Feature selection/feture engineering (as in new features)/data transformations.

In [19]:
pca = PCA(n_components=100)
pca_data = pca.fit_transform(data)
print pca_data

[[ -661.5957796   -699.31132836   183.28203935 ...,   -34.62238991
    -48.55246591    14.84437281]
 [ 1701.45168525  -360.55155498  -501.80559307 ...,    10.19707807
     38.02567162  -122.93827258]
 [ -886.89443397  -293.76578243    67.15530855 ...,    63.28549983
    -71.83626531   -88.94508098]
 ..., 
 [  336.26998179   950.01528008  -168.60541588 ...,   -40.26384167
    -61.38786345   -14.72619862]
 [  327.39939165    62.01608939   697.53640036 ...,   -27.71568231
     24.63794174   121.52892383]
 [ -364.24919995   418.89302673    62.89982267 ...,    44.81797071
    -45.5302838     58.49988236]]


### Step 6: Algorithm Selection

##### Select a set of algorithms to apply, select evaluation metrics, and evaluate/compare algorithms.

- We are going to compare a set of algorithms, regarding their accuracy:
1. Random Forest
2. Linear Discriminant Analysis
3. K-Nearest Neighbors
4. Support Vector Machines
5. Naive Bayes
6. Logistic Regression

In [30]:
# Because it takes forever to run it with 42.000 samples, we choose a subset of 5.000 samples
sample5000 = mnist.sample(n=5000)
target5000 = sample5000.label
data5000 = sample5000.drop(['label'], axis=1)
data5000.shape

(5000, 784)

In [31]:
# Evaluation metric:accuracy
scoring = 'accuracy'
kfold = KFold(n_splits=10, random_state=7)

In [33]:
models = []
models.append(('RF',  RandomForestClassifier()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('SVM', SVC()))
models.append(('NB',  GaussianNB()))
models.append(('LR',  LogisticRegression()))


results = []
names   = []

for name, model in models:
    cv_results = cross_val_score(model, data5000, target5000, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    print("%03s: %f (+/- %f)" % (name, cv_results.mean(), cv_results.std()))

 RF: 0.888400 (+/- 0.012831)
LDA: 0.836400 (+/- 0.010613)
KNN: 0.933200 (+/- 0.008060)
SVM: 0.109400 (+/- 0.012200)
 NB: 0.552000 (+/- 0.030146)
 LR: 0.814400 (+/- 0.012484)


### Step 7: Model Training

##### Apply ensembles and improve performance by hyperparameter optimisation.

In [35]:
X_train, X_test, Y_train, Y_test = train_test_split(data5000, target5000, test_size=0.33, random_state=7)

In [36]:
KNN = KNeighborsClassifier()
model = KNN.fit(X_train, Y_train)
predictions = model.predict(X_test)  # We predict the Ys of the test set the model we have created 
print "Accuracy for test data is", accuracy_score(Y_test, predictions)

Accuracy for test data is 0.927878787879


In [37]:
# because we have reduced the dataset to 5000 samples we will perform normalization, scaling and PCA again for this dataset

normalized_data5000 = normalizer.fit_transform(data5000)
scaler = StandardScaler()
scaled_data5000 = scaler.fit_transform(normalized_data5000)

pca_data5000 = pca.fit_transform(data5000)

In [38]:
X_train, X_test, Y_train, Y_test = train_test_split(scaled_data5000, target5000, test_size=0.33, random_state=7)

In [39]:
model = KNN.fit(X_train, Y_train)
predictions = model.predict(X_test)  # We predict the Ys of the test set the model we have created 
print "Accuracy for test data is", accuracy_score(Y_test, predictions)

Accuracy for test data is 0.880606060606


In [40]:
X_train, X_test, Y_train, Y_test = train_test_split(pca_data5000, target5000, test_size=0.33, random_state=7)

In [41]:
model = KNN.fit(X_train, Y_train)
predictions = model.predict(X_test)  # We predict the Ys of the test set the model we have created 
print "Accuracy for test data is", accuracy_score(Y_test, predictions)

Accuracy for test data is 0.933333333333


In [42]:
# Try to find the best hyperparameters for KNN for the data which has been transformed with pca
params = {"n_neighbors": np.arange(5,15,30), "metric": ["euclidean", "cityblock", "minkowski"], "weights": ["uniform", "distance"]}
grid = GridSearchCV(KNN, params)
grid.fit(X_train, (np.array(Y_train)).ravel())
print grid.best_params_

{'n_neighbors': 5, 'metric': 'euclidean', 'weights': 'distance'}


In [44]:
predictions = grid.predict(X_test)
print "Accuracy for test data is", accuracy_score(Y_test, predictions)

Accuracy for test data is 0.936363636364


In [49]:
estimators = []
model1 = RandomForestClassifier()
estimators.append(('RM', model1))
model2 = LinearDiscriminantAnalysis()
estimators.append(('LDA', model2))
model3 = KNeighborsClassifier()
estimators.append(('KNN', model3))
model4 = SVC()
estimators.append(('SVM', model4))
model5 = GaussianNB()
estimators.append(('NB', model5))
model6 = LogisticRegression()
estimators.append(('LR', model6))
ensemble = VotingClassifier(estimators)
results = cross_val_score(ensemble, X_train, Y_train, cv=kfold)
print(results.mean())

0.903582089552


### Step 8: Finalise Model

##### Predictions on validation set, create model from the entire (training) dataset.

In [46]:
KNN_param = KNeighborsClassifier(n_neighbors = 5, metric = 'euclidean', weights = 'distance')
model_param = KNN_param.fit(X_train, Y_train)
predictions_param = KNN_param.predict(X_test)
print "Accuracy for test data is", accuracy_score(Y_test, predictions_param)

Accuracy for test data is 0.936363636364


In [47]:
model_param = KNN_param.fit(pca_data5000, target5000)

In [48]:
model_param_all = KNN_param.fit(data, target)