# Digit Recognizer

## Learn computer vision fundamentals with the famous MNIST data

Author: Facundo Galán  
Date: 2020-10-17  
Link: https://www.kaggle.com/c/digit-recognizer/notebooks  
Following the tutorial of [shweta2407](https://www.kaggle.com/shweta2407/mnist-digit-recognition-using-cnn-0-99-accuracy)

### Install and import all necessary libraries

In [None]:
!pip install kaggle pandas seaborn sklearn matplotlib pydot graphviz

In [None]:
!cp ../kaggle.json ~/.kaggle/kaggle.json
!kaggle --version

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os
import datetime

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Dropout, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D, AveragePooling2D
from tensorflow.keras.layers import Input, Dense, GlobalAveragePooling2D, Dropout, BatchNormalization
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import plot_model
from tensorflow.keras.applications.xception import Xception
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.callbacks import TensorBoard

print('\nAll libraries have been installed and imported correctly.')


print(tf.__version__)

# Enable memory growth on every visible GPU so TensorFlow allocates GPU memory
# on demand. Iterating over the device list (instead of indexing [0]) keeps
# this cell working on CPU-only machines, where the list is empty, and covers
# all devices on multi-GPU hosts.
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

### Download dataset

In [None]:
!kaggle competitions download -c digit-recognizer

!unzip digit-recognizer.zip -do digit-recognizer

### Load data

In [None]:
# Read the training and test CSV files from the extracted dataset directory.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('./digit-recognizer/train.csv', './digit-recognizer/test.csv')
)

# Show the first rows of the training frame as the cell output.
train_data.head()

### Separate the label and features

In [None]:
# Split the training frame into the feature columns (everything except
# 'label') and the target column ('label').
x_train = train_data.drop(columns='label')
y_train = train_data['label']

# The combined frame is not used again; drop the name so it can be reclaimed.
del train_data

### Plot the data

In [None]:
sns.set(style='white', context='notebook', palette='Paired')

# Pass the labels through the `x` keyword. Recent seaborn releases do not
# accept the plotted variable positionally (the first positional parameter is
# `data`), so `sns.countplot(y_train)` either warns or draws the wrong plot.
sns.countplot(x=y_train)

# Number of training samples per label, shown as the cell output.
y_train.value_counts()

### Check for null or missing values

In [None]:
# Summarise, per feature column, whether any training value is missing.
x_train.isna().any().describe()

In [None]:
# Summarise, per feature column, whether any test value is missing.
test_data.isna().any().describe()

### Normalization and reshaping

In [None]:
# Scale every value by 1/255 (assumes 8-bit pixel intensities - TODO confirm)
# and reshape each row into a 28x28 single-channel image.
x_train = (x_train / 255.0).values.reshape(-1, 28, 28, 1)
test_data = (test_data / 255.0).values.reshape(-1, 28, 28, 1)

# Report the resulting array shapes.
print(x_train.shape)
print(test_data.shape)

### Encode the y_train (labels)

In [None]:
# One-hot encode the labels into 10-element vectors.
y_train = keras.utils.to_categorical(y_train, num_classes=10)

# Print the encoded form of the first label as a quick sanity check.
print(y_train[0])

### Split the data into training and validation sets

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/validation partition is reproducible.
random_seed = 131

# Hold out 10% of the samples for validation.
x, x_val, y, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.1,
    random_state=random_seed,
)

### Visualize some of the data

In [None]:
# Draw the first nine training images in a 3x3 grid.
plt.figure(figsize=(10, 10))

for position in range(1, 10):
    plt.subplot(3, 3, position)
    # Drop the trailing channel axis so imshow receives a 2-D array.
    plt.imshow(x[position - 1][:, :, 0])

### Set up TensorBoard to monitor training

In [None]:
%load_ext tensorboard
!rm -rf ./logs

logdir = os.path.join("./logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)

### Model Architecture

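A LeNet-5-style model will be used. Note that this variant uses 3×3 convolution kernels and ReLU activations, whereas the classic LeNet-5 uses 5×5 kernels and tanh.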

In [None]:
# Two conv + average-pooling stages followed by three dense layers; the final
# 10-unit softmax layer outputs one probability per class.
model = keras.Sequential([
    Conv2D(filters=6, kernel_size=(3, 3), activation='relu', input_shape=(28, 28, 1)),
    AveragePooling2D(),
    Conv2D(filters=16, kernel_size=(3, 3), activation='relu'),
    AveragePooling2D(),
    Flatten(),
    Dense(units=120, activation='relu'),
    Dense(units=84, activation='relu'),
    Dense(units=10, activation='softmax'),
])

model.summary()

In [None]:
# Categorical cross-entropy matches the one-hot encoded labels.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Training hyper-parameters.
EPOCHS = 20
BATCH = 1000

# Train on the training split, validate on the held-out split after each
# epoch, and log to TensorBoard through the callback.
history = model.fit(
    x=x,
    y=y,
    batch_size=BATCH,
    epochs=EPOCHS,
    verbose=1,
    shuffle=True,
    validation_data=(x_val, y_val),
    callbacks=[tensorboard_callback],
)

In [None]:
# Launch the TensorBoard UI for the runs stored under ./logs.
# NOTE(review): --bind_all makes the server listen on all network interfaces;
# confirm that exposure is intended on shared machines.
%tensorboard --bind_all --logdir ./logs

### Plot the validation loss and training loss

In [None]:
# Per-epoch metrics recorded by model.fit.
loss_curves = history.history

# Training loss in red, validation loss in the default colour.
plt.plot(loss_curves['loss'], color='r')
plt.plot(loss_curves['val_loss'])

plt.title('model loss')
plt.xlabel('epochs')
plt.ylabel('loss')
plt.legend(['training', 'validation'], loc='upper right')
plt.show()

### Evaluate the model

In [None]:
rows = 5
cols = 5

# Predict the whole preview batch with a single call instead of invoking
# model.predict once per image inside the plotting loop.
preview_images = test_data[:rows * cols]
preview_labels = np.argmax(model.predict(preview_images), axis=1)

# Show each test image with its predicted label in a rows x cols grid.
plt.figure(figsize=(10, 10))
for index, (img, pred) in enumerate(zip(preview_images, preview_labels)):
    plt.subplot(rows, cols, index + 1)
    # Drop the trailing channel axis so imshow receives a 2-D array.
    plt.imshow(img[:, :, 0])
    plt.xlabel('Predicted : {}'.format(pred))

plt.tight_layout()
plt.show()

### Submission

In [None]:
# Classify the whole test set. model.predict batches internally, so the data
# does not need to be sliced by hand. Unlike a fixed `28000 // 1000` loop,
# this also covers test sets whose size is not a multiple of the batch size.
probabilities = model.predict(test_data, batch_size=1000, verbose=1)

# The predicted label is the index of the highest class probability.
results = np.argmax(probabilities, axis=1).tolist()

print(str(len(results)) + ' images analyzed.')

In [None]:
# Build the submission table: a 1-based ImageId column next to the predicted
# Label. The id range is derived from the number of predictions rather than a
# hard-coded 28001, so both columns always have the same length.
submission = pd.DataFrame({
    'ImageId': np.arange(1, len(results) + 1),
    'Label': results,
})

In [None]:
# Write the submission table to ./submission.csv; index=False leaves out the
# DataFrame index so only the ImageId and Label columns are written.
submission.to_csv('./submission.csv', index=False)

In [None]:
# Upload the generated CSV to the digit-recognizer competition via the Kaggle CLI.
# NOTE(review): "Message" looks like a placeholder submission description -
# consider replacing it with something that identifies this run.
!kaggle competitions submit -c digit-recognizer -f ./submission.csv -m "Message"