# Obligatory MNIST
Often described as the 'Hello World' of Computer Vision, the famous MNIST dataset contains handwritten digits from 0 to 9.
The data is provided by the [Kaggle MNIST Competition](https://www.kaggle.com/c/digit-recognizer). 

Instead of going straight to ConvNets, I will first try Random Forests and SVMs (as the competition suggests) and then compare their results with what I get from the ConvNets.

**NOTE**: The initial versions will be 'quick and dirty', lacking reasoning and commentary for my choices; afterwards I will iterate on the notebook to polish it.

## 1. Imports

In [1]:
import numpy as np
import pandas as pd #because its easier
import matplotlib.pyplot as plt

## 2. Data Loading

In [2]:
# Read the Kaggle train and test CSVs into DataFrames.
# The paths are relative, so they resolve against the current working directory.
train_df = pd.read_csv('./datasets/MNIST/train.csv')
test_df = pd.read_csv('./datasets/MNIST/test.csv')

In [3]:
# Preview the first rows of the training data.
train_df.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Per-column summary statistics (count, mean, std, min, quartiles, max).
train_df.describe()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
count,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,...,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0,42000.0
mean,4.456643,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.219286,0.117095,0.059024,0.02019,0.017238,0.002857,0.0,0.0,0.0,0.0
std,2.88773,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,6.31289,4.633819,3.274488,1.75987,1.894498,0.414264,0.0,0.0,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,254.0,254.0,253.0,253.0,254.0,62.0,0.0,0.0,0.0,0.0


In [5]:
# (rows, columns) of both frames; the training frame carries one extra column, the label.
train_df.shape, test_df.shape

((42000, 785), (28000, 784))

In [6]:
# Split features from the target and rescale the pixel columns.
# The features are selected by dropping 'label' by name, so the split does not
# depend on 'label' being the first column of train.csv.
# Dividing by 255.0 maps the pixel values into [0, 1] (assumes 8-bit
# intensities, i.e. a maximum of 255) and always yields float arrays.
X_train = train_df.drop(columns='label').values / 255.0
y_train = train_df['label'].values
X_test = test_df.values / 255.0

In [7]:
# Verify shapes: the feature matrices should be 2-D with one column per pixel,
# and the label vector 1-D with one entry per training row.
X_train.shape, y_train.shape, X_test.shape

((42000, 784), (42000,), (28000, 784))

In [8]:
# Verify types: `.values` should have turned all three into NumPy arrays.
type(X_train), type(y_train), type(X_test)

(numpy.ndarray, numpy.ndarray, numpy.ndarray)

## 3. Models

### 3.1 Shallow Algos

In [10]:
# utils
from sklearn.model_selection import KFold, cross_val_score, train_test_split
# RF
from sklearn.ensemble import RandomForestClassifier
# SVM
from sklearn.svm import SVC
# Optimization
from sklearn.model_selection import GridSearchCV

#### 3.1.1 RF

In [11]:
# Baseline random forest: 5-fold cross-validation on the training set,
# scored with the estimator's default scorer.
rf = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    random_state=0,
    n_jobs=-1,
)
rf_scores = cross_val_score(rf, X_train, y_train, cv=5)

In [12]:
# Average of the per-fold cross-validation scores.
rf_scores.mean()

0.9658573893954726

In [None]:
# Exhaustive 5-fold grid search over forest size and tree depth
# (6 x 9 = 54 candidate settings), using `rf` as the base estimator.
rf_param_grid = {
    'n_estimators': [100, 250, 500, 1000, 2000, 3000],
    'max_depth': list(range(1, 10)),
}
grid_search_rf = GridSearchCV(estimator=rf, param_grid=rf_param_grid, cv=5)
grid_search_rf.fit(X_train, y_train)

# Report the best-scoring parameter combination.
best_params = grid_search_rf.best_params_
print(best_params)

In [14]:
# Fit the final forest on the full training set, then label the test set.
# NOTE(review): these hyperparameters are identical to the baseline `rf`;
# the grid-search `best_params` are not applied here. Confirm this is intended.
rf_opt = RandomForestClassifier(
    n_estimators=1000,
    criterion='entropy',
    random_state=0,
    n_jobs=-1,
)
rf_opt.fit(X_train, y_train)
y_predicted = rf_opt.predict(X_test)

1
2
3
4
5
6
7
8
9
10
