# Obligatory MNIST
Often described as the 'Hello World' of Computer Vision, the famous MNIST dataset contains handwritten digits from 0 to 9. 
The data is provided by the [Kaggle MNIST Competition](https://www.kaggle.com/c/digit-recognizer). 

Instead of going straight to ConvNets, I will first try Random Forests and SVM (as the competition suggests) and then compare the ConvNet results against those baselines.

**NOTE**: The initial versions will be 'quick and dirty', lacking reasoning and commentary for my choices; afterwards I will iterate on the notebook to polish it.

## 1. Imports

In [1]:
import numpy as np
import pandas as pd #because its easier
import matplotlib.pyplot as plt

## 2. Data Loading

In [2]:
# Read the Kaggle train and test CSVs; both paths are relative to the notebook's working directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./datasets/MNIST/train.csv', './datasets/MNIST/test.csv')
)

In [3]:
# Preview the first rows of the training frame: the label column followed by the pixel columns.
train_df.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the training frame.
train_df.describe()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
count,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,...,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0
mean,4.456643,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.219286,0.117095,0.059024,0.02019,0.017238,0.002857,0.0,0.0,0.0,0.0
std,2.88773,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.31289,4.633819,3.274488,1.75987,1.894498,0.414264,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,254.0,254.0,253.0,253.0,254.0,62.0,0.0,0.0,0.0,0.0


In [5]:
# Row/column counts of both frames; the training frame carries one extra column (the label).
train_df.shape, test_df.shape

((42000, 785), (28000, 784))

In [6]:
# Split the training frame into labels and pixel features, then divide every
# pixel value by 255. Column 0 is the label, so the features start at column 1.
# NOTE(review): 255 is presumably the maximum 8-bit intensity, which would put
# the features in [0, 1] - confirm against the dataset description.
y_train = train_df['label'].values
X_train = np.divide(train_df.iloc[:, 1:].values, 255)
X_test = np.divide(test_df.values, 255)

In [7]:
# Sanity-check the array shapes: X_train and y_train should share their first
# dimension, and X_train and X_test should share their second.
X_train.shape, y_train.shape, X_test.shape

((42000, 784), (42000,), (28000, 784))

In [9]:
# Confirm that `.values` produced NumPy arrays rather than pandas objects.
type(X_train), type(y_train), type(X_test)

(numpy.ndarray, numpy.ndarray, numpy.ndarray)

## 3. Models

In [17]:
# define a submission generation function
def make_submission(y_hat, filename):
    """Write predictions to a two-column submission CSV.

    Parameters
    ----------
    y_hat : array-like
        One predicted label per test sample, in test-set order.
    filename : str
        Name of the CSV file to create inside ``./submissions/``.

    Notes
    -----
    The file has the columns ``ImageId`` (1-based row number) and ``Label``.
    ``./submissions/`` is created if it does not exist yet. Prints ``Done!``
    once the file has been written.
    """
    import os  # local import so the cell does not depend on the imports cell

    # Size the ImageId column from the argument itself. Reading the length
    # from a notebook-global prediction array would raise NameError on a
    # fresh kernel, or silently mismatch when another model's predictions
    # are still lying around in the namespace.
    indices = list(range(1, len(y_hat) + 1))
    predictions_df = pd.DataFrame({'ImageId': indices, 'Label': y_hat})
    file_path = './submissions/'
    # Avoid a failure in to_csv when the output folder is missing.
    os.makedirs(file_path, exist_ok=True)
    predictions_df.to_csv(file_path + filename, index=False)
    print('Done!')

### 3.1 Shallow Algos

In [10]:
# utils
from sklearn.model_selection import KFold, cross_val_score, train_test_split
# RF
from sklearn.ensemble import RandomForestClassifier
# SVM
from sklearn.svm import SVC
# KNN
from sklearn.neighbors import KNeighborsClassifier
# Optimization
from sklearn.model_selection import GridSearchCV

#### 3.1.1 RF

In [10]:
# Baseline random forest: 1000 entropy-criterion trees with a fixed seed,
# scored with 5-fold cross-validation on the full training set.
rf = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    random_state=0,
    n_jobs=-1,
)
rf_scores = cross_val_score(estimator=rf, X=X_train, y=y_train, cv=5)

In [11]:
# Average the five per-fold cross-validation scores into a single number.
rf_scores.mean()

0.9659288094726095

In [12]:
# Keep only the first 10,000 samples so that grid searches run faster.
X_subset, y_subset = X_train[:10000], y_train[:10000]

In [13]:
# Grid search over forest size and tree shape on the 10k-sample subset,
# using 5-fold cross-validation for every candidate.
# The forest is seeded so repeated runs select the same parameters; an
# unseeded forest makes the "best" combination vary from run to run.
rf = RandomForestClassifier(random_state=0)
rf_param_grid = {
    'n_estimators': [100, 250, 500, 1000, 2000, 3000],
    # NOTE(review): depth is capped at 5, which may be too shallow for
    # 784-feature inputs - consider adding deeper values or None.
    'max_depth': [1, 3, 5],
    # 'sqrt' of the feature count, half of the features, or a single feature per split.
    'max_features': ['sqrt', 0.5, 1],
    'min_samples_leaf': [1, 3, 5, 25],
}
grid_search_rf = GridSearchCV(rf, rf_param_grid, cv=5, n_jobs=-1)
grid_search_rf.fit(X_subset, y_subset)
best_params = grid_search_rf.best_params_
print(best_params)

{'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'n_estimators': 500, 'n_jobs': -1}


In [14]:
# Predict on the test set with the grid search's best estimator. No training
# happens in this cell: predict() relies on the model that GridSearchCV refit
# during fit(), and fit() was given X_subset rather than the full training set.
y_predicted_rf = grid_search_rf.predict(X_test)

#### Export them predictions

In [18]:
# Submit the random-forest predictions. They are stored in `y_predicted_rf`;
# `y_predicted` is not assigned by any RF cell, so using it here would either
# fail on a fresh kernel or export another model's predictions.
make_submission(y_predicted_rf, 'MNIST_RF_20190126_submission_2.csv')

1st RF Submission: 0.93885  
2nd RF Submission: 0.85714 <- A grid search on the whole data could lead to better results

#### 3.1.2 SVM

In [13]:
# SVMs do not scale well to large datasets, but one is run here for comparison.
# C=20 is an unvalidated first guess, not a tuned value.
svc = SVC(C=20, kernel='rbf')
svc_scores = cross_val_score(estimator=svc, X=X_train, y=y_train, cv=5, n_jobs=-1)

In [15]:
# Average the five per-fold cross-validation scores of the SVC.
svc_scores.mean()

0.9598813113114162

In [None]:
# Two sub-grids: a linear kernel swept over C, and an RBF kernel swept over
# every C x gamma combination. The search runs on the 10k-sample subset.
svc_param_grid = [
    {
        'kernel': ['linear'],
        'C': [10.0, 30.0, 100.0, 300.0, 1000.0, 3000.0, 10000.0, 30000.0],
    },
    {
        'kernel': ['rbf'],
        'C': [1.0, 3.0, 10.0, 30.0, 100.0, 300.0, 1000.0],
        'gamma': [0.01, 0.03, 0.1, 0.3, 1.0, 3.0],
    },
]
svc = SVC()
grid_search_svc = GridSearchCV(estimator=svc, param_grid=svc_param_grid, cv=5, n_jobs=-1)
grid_search_svc.fit(X_subset, y_subset)
best_params = grid_search_svc.best_params_
print(best_params)  # shown purely out of curiosity

In [None]:
# Test-set predictions from the SVC grid search, which was fitted on X_subset.
y_predicted = grid_search_svc.predict(X_test)

#### Export them predictions

In [None]:
# Export the SVC grid-search predictions. The file is labelled SVM rather than
# RF so it cannot be mistaken for a random-forest submission.
make_submission(y_predicted, 'MNIST_SVM_20190130_submission_3.csv')

#### 3.1.3 KNN

In [16]:
# 4-nearest-neighbour baseline with distance-weighted votes, scored with
# 5-fold cross-validation; the cell displays the mean fold score.
knn = KNeighborsClassifier(weights='distance', n_neighbors=4)
knn_scores = cross_val_score(estimator=knn, X=X_train, y=y_train, cv=5, n_jobs=-1)
knn_scores.mean()

0.9691189207242517

In [None]:
# Grid search over the vote weighting and the neighbour count (3 to 6) on the
# full training set, using 5-fold cross-validation.
knn_param_grid = [{'weights': ["uniform", "distance"], 'n_neighbors': list(range(3, 7))}]
# Build a fresh estimator from the class. `knn` is already bound to a
# KNeighborsClassifier instance by an earlier cell, and calling an instance
# (`knn()`) raises TypeError.
knn = KNeighborsClassifier()
grid_search_knn = GridSearchCV(knn, knn_param_grid, cv=5, n_jobs=-1)
grid_search_knn.fit(X_train, y_train)
best_params = grid_search_knn.best_params_
print(best_params)  # shown purely out of curiosity

In [19]:
# KNN's cross-validation score was even higher than the random forest's, so it
# is worth a submission: fit the same configuration on the full training set.
knn = KNeighborsClassifier(n_neighbors=4, weights='distance')
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=4, p=2,
           weights='distance')

In [20]:
# Test-set predictions from the KNN model fitted on the full training set.
y_predicted = knn.predict(X_test)
# Alternative, if the KNN grid search has been run:
#y_predicted = grid_search_knn.predict(X_test)

In [21]:
# Export the KNN predictions as a submission file.
make_submission(y_predicted, 'MNIST_KNN_20190130_submission_3.csv')

Done!


### 3.2 Deep Algos

#### 3.2.1 Convolutional Neural Network - Keras Approach

In [19]:
import keras
from keras import backend as K
from keras import regularizers
from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Dense, Dropout, Flatten
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import Adam
from keras.metrics import categorical_accuracy

Using TensorFlow backend.


In [21]:
# One-hot encode the training labels into 10 columns, one per digit class (0-9).
y_enc_train = to_categorical(y_train, num_classes = 10)

In [23]:
# Turn each flat 784-value row into a 28x28 image with a single trailing
# channel axis, the 4-D layout the convolutional layers are fed with.
X_scl_train = np.reshape(X_train, (-1, 28, 28, 1))
X_scl_test = np.reshape(X_test, (-1, 28, 28, 1))