### Problem: Identify the Digits


In [None]:
import cv2
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Activation, MaxPooling2D, Dropout, Flatten, BatchNormalization
from tensorflow.keras.optimizers import Adam, SGD

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
## Seed numpy and TensorFlow to make runs more reproducible
SEED = 101
np.random.seed(SEED)
# `tf.set_random_seed` is not available at the top level of TensorFlow 2.x,
# where the equivalent call is `tf.random.set_seed`. Use whichever one the
# installed version provides so the cell works on both.
if hasattr(tf, "random") and hasattr(tf.random, "set_seed"):
    tf.random.set_seed(SEED)
else:
    tf.set_random_seed(SEED)

#### File Paths

In [None]:
# Folder that holds the training-label CSV.
train_labels_path = "datasets/av_mnist_data/"
# Image folders. The trailing slash matters: file names are appended to these
# strings with plain `+` concatenation.
train_path, test_path = (
    "datasets/av_mnist_data/mnist_av_images/train/",
    "datasets/av_mnist_data/mnist_av_images/test/",
)

#### Data Preparation

In [None]:
# Training labels; the loading cell looks rows up through the `filename`
# and `label` columns of this frame.
labels_csv = train_labels_path + 'mnist_av_train_labels.csv'
labels = pd.read_csv(labels_csv)

In [None]:
## Loading train data
# Build the file-name -> label mapping once, instead of scanning the whole
# `labels` frame with a boolean mask for every single image.
label_by_file = dict(zip(labels['filename'].values, labels['label'].values))

# List the folder once so images and labels are guaranteed to line up.
train_files = os.listdir(train_path)

# Each image is read as a single-channel (grayscale) array.
x_train_lst = [
    cv2.imread(train_path + name, cv2.IMREAD_GRAYSCALE) for name in train_files
]
# A file with no entry in the labels CSV raises KeyError here, rather than
# silently producing an empty label that breaks a later cell.
y_train_lst = [label_by_file[name] for name in train_files]

In [None]:
## Loading test data
# Build the list directly; a comprehension used only for its `append` side
# effect would also allocate a throwaway list of `None` values.
x_test_lst = [
    cv2.imread(test_path + name, cv2.IMREAD_GRAYSCALE)
    for name in os.listdir(test_path)
]

In [None]:
## Turn the collected Python lists into numpy arrays
x_train_org, x_test_org = np.array(x_train_lst), np.array(x_test_lst)
# squeeze removes any length-1 axes from the collected labels
y_train_org = np.squeeze(y_train_lst)

In [None]:
## Report the shapes of the raw arrays
for caption, array in (
    ("x train shape:", x_train_org),
    ("y train shape:", y_train_org),
    ("x test shape:", x_test_org),
):
    print(caption, array.shape)

In [None]:
## Show one training image together with its label
index = 1
sample_image, sample_label = x_train_org[index, :, :], y_train_org[index]
print("Image has digit:", sample_label)
plt.imshow(sample_image)

In [None]:
## Scale the pixel values by dividing both image sets by 255.0
x_train, x_test = (pixels / 255.0 for pixels in (x_train_org, x_test_org))

## One-hot encode the integer labels
y_train = tf.keras.utils.to_categorical(y_train_org)

In [None]:
## Report the shapes after scaling / one-hot encoding
shape_report = (
    ("x train shape:", x_train.shape),
    ("y train shape:", y_train.shape),
    ("x test shape:", x_test.shape),
)
for caption, shape in shape_report:
    print(caption, shape)

In [None]:
## Add a trailing axis of length 1 to both image arrays
samples, rows, cols = x_train.shape[:3]
x_train = x_train.reshape((samples, rows, cols, 1))

samples, rows, cols = x_test.shape[:3]
x_test = x_test.reshape((samples, rows, cols, 1))

In [None]:
## Report the shapes after adding the trailing axis
for pair in (
    ("x train shape:", x_train.shape),
    ("y train shape:", y_train.shape),
    ("x test shape:", x_test.shape),
):
    print(*pair)

In [None]:
## Save data as numpy object
# (Disabled.) Uncomment to write the prepared arrays as .npy files into the
# current working directory.
# NOTE(review): the disabled loading code further down reads from './data/',
# not from the current directory — confirm which location is intended.
# np.save('x_train.npy',x_train)
# np.save('x_test.npy',x_test)
# np.save('y_train.npy',y_train)

## CNN using Keras

In [None]:
## Load the npy objects data
# (Disabled.) Uncomment to reload previously saved arrays, memory-mapped via
# mmap_mode='r+', instead of rebuilding them from the image files.
# NOTE(review): these paths point at './data/', while the disabled save code
# earlier writes to the current directory — confirm which location is intended.
# x_train = np.load('./data/x_train.npy',mmap_mode='r+')
# y_train = np.load('./data/y_train.npy',mmap_mode='r+')
# x_test = np.load('./data/x_test.npy',mmap_mode='r+')

In [None]:
## Report the shapes of the arrays that go into the model
captions = ("x train shape:", "y train shape:", "x test shape:")
arrays = (x_train, y_train, x_test)
for caption, array in zip(captions, arrays):
    print(caption, array.shape)

In [None]:
# Shape of a single sample: every axis of x_train except the first one.
input_shape = tuple(x_train.shape[1:])
input_shape

In [None]:
# Two convolutional blocks followed by a dense head ending in a 10-way softmax.
model = Sequential([
    # Block 1: 32 filters of 5x5 -> batch norm -> 2x2 max-pool -> dropout
    Conv2D(32, kernel_size=(5, 5), activation='relu', input_shape=input_shape),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.3),

    # Block 2: 64 filters of 3x3 -> batch norm -> 2x2 max-pool -> dropout
    Conv2D(64, kernel_size=(3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.35),

    # Head: flatten, two ReLU dense layers with dropout, then softmax
    Flatten(),
    Dense(120, activation='relu'),
    Dropout(0.35),
    Dense(100, activation='relu'),
    Dropout(0.5),
    Dense(10, activation='softmax'),
])

In [None]:
# Adam optimizer with learning rate 0.005 and decay 0.0001.
adam = Adam(
    lr=0.005,
    decay=1e-4,
)

In [None]:
# Categorical cross-entropy fits the one-hot labels; accuracy is tracked too.
model.compile(
    loss='categorical_crossentropy',
    optimizer=adam,
    metrics=['accuracy'],
)

In [None]:
# Train for 150 epochs in batches of 128; validation_split=0.3 sets aside
# 30% of the training data for validation.
history = model.fit(
    x=x_train,
    y=y_train,
    epochs=150,
    batch_size=128,
    validation_split=0.3,
)

In [None]:
## Compare the training curves with the validation curves

# list all data in history
print(history.history.keys())

# Older tf.keras releases record accuracy under 'acc'/'val_acc', newer ones
# under 'accuracy'/'val_accuracy'. Use whichever key is present so the cell
# does not fail with KeyError on either.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'

# One figure per metric: (history key, figure title, y-axis label).
for metric_key, title, ylabel in (
    (acc_key, 'model accuracy', 'accuracy'),
    ('loss', 'model loss', 'loss'),
):
    plt.plot(history.history[metric_key])
    plt.plot(history.history['val_' + metric_key])
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('epoch')
    # The second curve comes from the validation split of the training data,
    # not from the test set, so label it accordingly.
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

In [None]:
# evaluate returns the loss followed by the compiled metrics; index 1 is accuracy.
train_scores = model.evaluate(x_train, y_train)
print("Train set accuracy :", train_scores[1])

In [None]:
## Save model
# (Disabled.) Uncomment to write the trained model to an .h5 file in the
# current working directory.
# model.save('./av_mnist_model.h5')

In [None]:
## Load model
# (Disabled.) Uncomment to restore a previously saved model from
# 'av_mnist_model.h5' instead of training again.
# from tensorflow.keras.models import load_model
# model = load_model('av_mnist_model.h5')

In [None]:
## Generate predictions for the test data and write them to ./solution.csv

# List the folder once so file names and predictions stay aligned.
test_files = os.listdir(test_path)
output = []

if test_files:
    # Read every image as grayscale and stack them into one batch, so the
    # model is called once instead of once per file.
    batch = np.stack(
        [cv2.imread(test_path + name, cv2.IMREAD_GRAYSCALE) for name in test_files]
    )
    # Same preprocessing as for training: trailing axis of length 1, then
    # division by 255.0.
    batch = batch.reshape(batch.shape + (1,)) / 255.0

    # `Sequential.predict_classes` is not available in newer TensorFlow
    # releases; the argmax over the softmax output gives the same class index.
    predicted = np.argmax(model.predict(batch), axis=-1)

    output = [
        {'filename': name, 'label': int(digit)}
        for name, digit in zip(test_files, predicted)
    ]

# Explicit columns keep the CSV header stable even when there are no rows.
df = pd.DataFrame(output, columns=['filename', 'label'])
df.to_csv('./solution.csv',index=False)

#### Results 

**Train data accuracy:** 99.77%   
**Test data accuracy:** 99.428%