# ML Workflow with MNIST

## 0. Import libraries

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="ticks")

# Artificial neural network packages
from keras.models import *
from keras.layers import *
from keras.wrappers.scikit_learn import *
from keras.optimizers import *
from keras.metrics import *
from keras.callbacks import *
from keras.utils.np_utils import * 
import keras.backend as K

from sklearn.model_selection import train_test_split

import os

## 1. Import data

In [None]:
# Load the training and test CSVs and record how many rows each contains.
# numTrain is used further down to split the combined data back apart.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
numTrain = len(train)
numTest = len(test)
print(f"Length of training data is: {numTrain} entries")
print(f"Length of test data is: {numTest} entries")

## 2. Preliminary analysis

In [None]:
# Print the frame summary for the training data, then compute per-column
# summary statistics. describe() is the last expression of the cell, so the
# notebook renders its result as the cell output.
train.info()
train.describe()

In [None]:
# Same overview for the test data: frame summary first, then per-column
# summary statistics rendered as the cell output.
test.info()
test.describe()

### 2.1. Univariate analysis

#### 2.1.1 The dependent variable

We have a look at the distribution of the dependent variable.

In [None]:
# Plot how many training samples fall into each class.
# The labels are integer class ids, so use exactly one bin per integer value
# (bin edges at k - 0.5 and k + 0.5). A fixed number of equal-width bins that
# does not match the number of classes leaves empty bins at irregular
# positions and draws bars that are not centred on the class values.
labels = train["label"]
class_ids = np.arange(labels.min(), labels.max() + 1)
class_edges = np.arange(labels.min(), labels.max() + 2) - 0.5
_ = plt.hist(labels, bins=class_edges, rwidth=0.8)
# One tick per class so every bar is labelled with its class id.
_ = plt.xticks(class_ids)

#### 2.1.2 Missing values

In [None]:
# True if at least one cell of the training frame is missing: the first any()
# reduces each column to a single flag, the second reduces those flags.
train.isnull().any().any()

In [None]:
# True if at least one cell of the test frame is missing (column-wise any(),
# then any() over the per-column flags).
test.isnull().any().any()

From these 2 cells, we quickly determine that there are no NA values.

## 3. Data cleaning and preprocessing

In [None]:
# Stack the training and test features into one frame so that both sets go
# through exactly the same preprocessing steps.
#
# The label column is dropped from `train` *before* concatenating, so both
# frames share the same columns and pd.concat keeps their original order.
# Concatenating frames whose columns differ makes some pandas versions sort
# the columns lexicographically by default; that would silently reorder the
# pixel columns and scramble the layout the later reshape to
# (-1, 28, 28, 1) depends on.
# train.drop(...) returns a new frame, so `train` keeps its label column for
# the cells below. ignore_index=True yields a fresh 0..n-1 index.
fullData = pd.concat([train.drop('label', axis=1), test], ignore_index=True)

### 3.1. Configuring categorical features
There are some features that should be categorical, but were entered as numeric data. We will change the data types now.

In [None]:
# Pull the target column out of the training frame.
y_train = train["label"]

In [None]:
# One-hot encode the integer class labels, the target format expected by the
# softmax output layer and categorical_crossentropy loss used for the model.
y_train = to_categorical(y_train)

### 3.2. Normalizing and scaling

In [None]:
# Rescale every feature by 1/255 (assumes raw intensities lie in 0-255, which
# maps them into the 0-1 range).
fullData = fullData.div(255.0)

### 3.3. Reshaping

In [None]:
# Turn each flat row into a 28x28 single-channel image; -1 lets NumPy infer
# the number of samples. The result is a plain ndarray, not a DataFrame.
fullData = np.reshape(fullData.values, (-1, 28, 28, 1))

## 4. Model building and evaluation

### 4.1. Validation method

In [None]:
# Undo the earlier concatenation: the first numTrain rows are the training
# samples, everything after them belongs to the test set.
X_train, X_test = fullData[:numTrain], fullData[numTrain:]

In [None]:
# Seed NumPy's global RNG, then hold out 20% of the training samples as a
# validation set. The split itself is pinned by random_state.
np.random.seed(1)
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=1,
)

### 4.2. Model building

In [None]:
# Convolutional classifier built from two conv stages followed by a dense
# head. Layers are listed in the order they are applied.
model = Sequential([
    # Stage 1: two 5x5 convolutions with 32 filters on the 28x28x1 input,
    # then 2x2 max-pooling and dropout.
    Conv2D(filters=32, kernel_size=(5, 5), padding='Same',
           activation='relu', input_shape=(28, 28, 1)),
    Conv2D(filters=32, kernel_size=(5, 5), padding='Same',
           activation='relu'),
    MaxPool2D(pool_size=(2, 2)),
    Dropout(0.5),

    # Stage 2: two 3x3 convolutions with 64 filters, then 2x2 max-pooling
    # with stride 2 and dropout.
    Conv2D(filters=64, kernel_size=(3, 3), padding='Same',
           activation='relu'),
    Conv2D(filters=64, kernel_size=(3, 3), padding='Same',
           activation='relu'),
    MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
    Dropout(0.5),

    # Head: flatten the feature maps, one hidden dense layer, and a 10-way
    # softmax producing the class probabilities.
    Flatten(),
    Dense(256, activation="relu"),
    Dense(10, activation="softmax"),
])

In [None]:
# RMSprop optimizer with every hyper-parameter spelled out explicitly; a
# decay of 0.0 means no built-in learning-rate decay is applied here.
optimizer = RMSprop(
    lr=0.001,
    rho=0.9,
    epsilon=1e-08,
    decay=0.0,
)

In [None]:
# Configure training: categorical cross-entropy on the one-hot targets,
# with accuracy reported as an additional metric.
model.compile(
    loss="categorical_crossentropy",
    optimizer=optimizer,
    metrics=["accuracy"],
)

In [None]:
# Print an overview of the assembled network.
model.summary()

In [None]:
# Callback that watches the validation loss and multiplies the learning rate
# by 0.5 once it has stopped improving for 3 epochs, never going below 1e-5.
# verbose=1 logs each reduction.
learning_rate_reduction = ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=3, min_lr=0.00001, verbose=1
)

In [None]:
# Train for 30 epochs with mini-batches of 128 samples. The hold-out split is
# evaluated during training, which provides the val_loss that the
# learning-rate callback monitors.
# The return value of fit() is kept instead of being discarded: it records
# the per-epoch loss/metric values, which are needed to inspect the training
# curves afterwards.
history = model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=30,
    validation_data=(X_val, y_val),
    callbacks=[learning_rate_reduction],
)

In [None]:
# Score the trained model on the hold-out split. The result is the cell's
# last expression, so the notebook displays it (loss followed by the
# compiled metrics).
model.evaluate(X_val,y_val,batch_size = 128)

### 4.3. Model evaluation

Work in progress.

### 4.4. Model ensembling

In [None]:
# Predictions
y_pred_prob = model.predict(X_test)
y_pred = np.argmax(y_pred_prob,axis = 1)
results = pd.read_csv("../input/sample_submission.csv")
results['Label'] = y_pred
results.to_csv("submission.csv",index = False)

## 5. Acknowledgements

1. [How to choose CNN Architecture MNIST](https://www.kaggle.com/cdeotte/how-to-choose-cnn-architecture-mnist) by [Chris Deotte](https://www.kaggle.com/cdeotte)
2. [Introduction to CNN Keras - 0.997 (top 6%)](https://www.kaggle.com/yassineghouzam/introduction-to-cnn-keras-0-997-top-6) by [Yassine Ghouzam](https://www.kaggle.com/yassineghouzam)

## 6. Future work

1. Model evaluation via confusion matrix and examining false negatives/positives
2. Reasoned model ensembling
3. Experiment with minimal networks to achieve performance benchmarks
4. Experiment with fast.ai package