# Digit Recognition

In this project, the goal is to correctly identify the digits in a dataset of tens of thousands of images of handwritten digits.

In [None]:
from IPython.display import Image
from IPython.core.display import HTML
# Embed a remote sample picture of MNIST digits in the notebook output.
mnist_samples_url = 'https://www.researchgate.net/profile/Hugo_Larochelle/publication/200744481/figure/fig1/AS:668968306098181@1536505881710/Samples-from-the-MNIST-digit-recognition-data-set-Here-a-black-pixel-corresponds-to-an.png'
Image(url=mnist_samples_url)

### Import Libraries 

In [None]:
# Importing numpy, pandas and Series + DataFrame:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

# Imports for plotting:
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's 'whitegrid' theme for the plots drawn below.
sns.set_style('whitegrid')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

import sklearn

### Importing Data 

In [None]:
# Load the labelled training set from the input directory.
train_csv_path = '../input/train.csv'
digit_train_df = pd.read_csv(train_csv_path)

In [None]:
# Load the test set (used for the final predictions) from the input directory.
test_csv_path = '../input/test.csv'
digit_test_df = pd.read_csv(test_csv_path)

### Visualising Data for 'Digit Recognition dataset' 

In [None]:
# Preview the first rows of the training DataFrame.
digit_train_df.head()

In [None]:
# Preview the last rows of the training DataFrame.
digit_train_df.tail()

In [None]:
# (rows, columns) of the training DataFrame.
digit_train_df.shape

In [None]:
# Bar chart of how often each digit occurs in the training set.
# `factorplot` was renamed `catplot` in seaborn 0.9 and later removed;
# `countplot` draws the same count bars and is available in every release.
sns.countplot(x='label', data=digit_train_df)

In [None]:
# The test set has no label column, so it is only used for the final predictions and submission:
digit_test_df.head()

In [None]:
# (rows, columns) of the test DataFrame.
digit_test_df.shape

In [None]:
# Convert both DataFrames into float32 NumPy matrices for training and testing:
training = digit_train_df.values.astype('float32')

testing = digit_test_df.values.astype('float32')

In [None]:
# Visualising the digit in a random row (the flat row needs to be reshaped to its original 28x28 dimensions):
from numpy import random
# Pick any row of the training matrix. The upper bound comes from the data
# itself (not a hard-coded 42000) and the range starts at 0, so the first
# row can be drawn too.
i = np.random.randint(0, training.shape[0])

# Column 0 holds the label; the remaining 784 columns are the flattened image.
plt.imshow(training[i, 1:].reshape(28,28))
label = training[i,0]
print(label)

### Training the Model 

In [None]:
# Separate the label column from the pixel columns, then scale the pixels by 1/255:
pixels, y_train = training[:, 1:], training[:, 0]
X_train = pixels / 255

In [None]:
# Apply the same 1/255 pixel scaling to the test matrix (it has no label column):
X_test = np.divide(testing, 255)

In [None]:
# Importing keras, Keras is a high-level neural networks API, written in Python and capable of running on top of 
# TensorFlow, CNTK, or Theano. 
import keras
from keras.preprocessing.image import ImageDataGenerator

In [None]:
# Importing train_test_split from sklearn:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows as a validation set (fixed seed for repeatability).
split_parts = train_test_split(X_train, y_train, test_size=0.2, random_state=9697)
X_train, X_validate, y_train, y_validate = split_parts

In [None]:
# Turn every flat row into a 28x28 single-channel image: (samples, 28, 28, 1).
image_shape = (28, 28, 1)
X_train = X_train.reshape((X_train.shape[0],) + image_shape)
X_test = X_test.reshape((X_test.shape[0],) + image_shape)
X_validate = X_validate.reshape((X_validate.shape[0],) + image_shape)

In [None]:
# Data augmentation: small random rotations, zooms and shifts.
# The pixels were already scaled by 1/255 above, hence rescale = 1.
data_generator = ImageDataGenerator(rescale = 1, rotation_range = 12, zoom_range = 0.15, width_shift_range = 0.125,
                                    height_shift_range = 0.125)

# `fit` only gathers data-dependent statistics (featurewise centering/std, ZCA).
# Those must come from the training images alone: fitting on the test set as
# well would overwrite them and leak test data into preprocessing.
data_generator.fit(X_train)

In [None]:
# Shape of the training images after the split and reshape.
X_train.shape

In [None]:
# Shape of the validation images after the split and reshape.
X_validate.shape

In [None]:
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, Dropout
from keras. optimizers import Adam
from keras.callbacks import TensorBoard

In [None]:
# Create an empty Sequential model; the layers are appended one by one with .add():
cnn_model = Sequential()

In [None]:
# First convolution: 32 kernels of 5x5 with ReLU on the 28x28x1 input image
# (64 kernels could be used instead of 32).
first_conv = Conv2D(
    filters=32,
    kernel_size=(5, 5),
    padding='Same',
    input_shape=(28, 28, 1),
    activation='relu',
)
cnn_model.add(first_conv)


In [None]:
# Downsample the feature maps with 2x2 max pooling.
pooling = MaxPooling2D(pool_size=(2, 2))
cnn_model.add(pooling)

In [None]:
# Second convolution: 64 kernels of 3x3 with ReLU.
second_conv = Conv2D(filters=64, kernel_size=(3, 3), activation='relu')
cnn_model.add(second_conv)

In [None]:
# Randomly drop half of the activations during training.
dropout = Dropout(rate=0.5)
cnn_model.add(dropout)

In [None]:
# Flatten the feature maps into one single array per sample, ready for the dense layers:
cnn_model.add(Flatten())

In [None]:
# Fully connected hidden layer with 32 units.
# `output_dim` is the legacy Keras 1 argument name; since Keras 2 it is `units`.
cnn_model.add(Dense(units = 32, activation = 'relu'))

In [None]:
# Output layer: one unit per digit class (0-9).
# Softmax turns the ten outputs into a probability distribution, which is what
# the sparse_categorical_crossentropy loss used at compile time expects;
# independent sigmoid outputs do not sum to 1.
# `output_dim` is the legacy Keras 1 argument name; since Keras 2 it is `units`.
cnn_model.add(Dense(units = 10, activation = 'softmax'))

In [None]:
# Compile with the Adam optimizer (learning rate 0.001) and track accuracy.
# The rate is passed positionally because the keyword was renamed from `lr` to
# `learning_rate` in newer Keras releases; the first positional argument is the
# learning rate in both.
# Labels are plain integers, hence the *sparse* categorical cross-entropy.
cnn_model.compile(loss = 'sparse_categorical_crossentropy', optimizer = Adam(0.001), metrics = ['accuracy'])

In [None]:
# Number of full passes over the training data used when fitting the model:
epochs = 10

In [None]:
# Train on the training split and report loss/accuracy on the validation split
# after every epoch. `nb_epoch` is the legacy Keras 1 spelling; the argument is
# named `epochs` since Keras 2.
cnn = cnn_model.fit(X_train, y_train, batch_size = 512, epochs = epochs, verbose = 1,
                    validation_data = (X_validate, y_validate))

### Evaluating the Model

In [None]:
# Score the model on the held-out validation split. Evaluating on the data it
# was trained on would overstate accuracy, and no labelled test set exists here.
# `evaluate` returns [loss, accuracy] for the metrics given at compile time.
evaluation = cnn_model.evaluate(X_validate, y_validate)
print('Validation accuracy: {:.3f}'.format(evaluation[1]))

In [None]:
# Visualising the training model accuracy:
import os

# Older Keras records accuracy under 'acc'/'val_acc', newer releases under
# 'accuracy'/'val_accuracy'; use whichever key this run produced.
acc_key = 'acc' if 'acc' in cnn.history else 'accuracy'

plt.figure(figsize = (10,5))
plt.plot(cnn.history[acc_key])
plt.plot(cnn.history['val_' + acc_key])

# Adding title, labels and legend (the second curve is the validation split):
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['train', 'validation'], loc = 'lower right')

plt.show()

In [None]:
# Visualising the training and validation loss per epoch:
plt.figure(figsize = (10,5))
plt.plot(cnn.history['loss'])
plt.plot(cnn.history['val_loss'])

# Adding title, labels and legend ('val_loss' comes from the validation split,
# so that curve is labelled accordingly rather than as "test"):
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['train', 'validation'], loc = 'upper right')

plt.show()

In [None]:
# Predicted class per training image, used for the confusion matrix below.
# `predict_classes` was removed from newer Keras releases; taking the argmax of
# the per-class scores from `predict` gives the same labels in every version.
predicted_classes = np.argmax(cnn_model.predict(X_train), axis = -1)

In [None]:
# Show the first 25 training images in a 5x5 grid, each titled with the
# predicted and the true class.
grid_rows, grid_cols = 5, 5
fig, axes = plt.subplots(grid_rows, grid_cols, figsize=(12, 12))
axes = axes.ravel()

title_template = 'Prediction Class = {:0.1f}\n True Class = {:0.1f}'
for idx, ax in enumerate(axes):
    ax.imshow(X_train[idx].reshape(28, 28))
    ax.set_title(title_template.format(predicted_classes[idx], y_train[idx]))
    ax.axis('off')

# Extra horizontal spacing so neighbouring titles do not overlap.
plt.subplots_adjust(wspace=0.5)

In [None]:
# Importing the confusion matrix:
from sklearn.metrics import confusion_matrix
# Rows are the true labels, columns the predicted labels.
cm = confusion_matrix(y_true=y_train, y_pred=predicted_classes)

In [None]:
# Draw the confusion matrix as an annotated heatmap.
# The sum of the diagonal elements is the total number of correct predictions.
plt.figure(figsize=(14, 10))
sns.heatmap(data=cm, annot=True, fmt='g', cmap='YlGnBu')

In [None]:
# To obtain classification report:
from sklearn.metrics import classification_report

# One display name per digit class: 'label0' ... 'label9'.
num_classes = 10
target_names = list(map('label{}'.format, range(num_classes)))

# Per-class precision, recall and F1 of the predictions against the true labels.
report = classification_report(y_train, predicted_classes, target_names=target_names)
print(report)

### Make predictions for the test set

In [None]:
# Predicted digit for every test image.
# `predict_classes` was removed from newer Keras releases; taking the argmax of
# the per-class scores from `predict` gives the same labels in every version.
prediction = np.argmax(cnn_model.predict(X_test), axis = -1)

In [None]:
# Shape of the prediction array for the test set.
prediction.shape

In [None]:
# Image ids are 1-based. The length is taken from the predictions instead of a
# hard-coded 28000, so the id and label columns always line up.
id_list = np.arange(1, len(prediction) + 1)

In [None]:
# Display the generated image ids.
id_list

In [None]:
# Build the submission table with the columns in the order ImageId, Label:
submission_columns = ['ImageId', 'Label']
final_result = pd.DataFrame({'ImageId': id_list, 'Label': prediction}, columns=submission_columns)

In [None]:
# Write the submission table to digit_output.csv, leaving out the DataFrame index:
final_result.to_csv(path_or_buf='digit_output.csv', index=False)