# Step 0a: Dataset retrieval

If the Kaggle API token (`kaggle.json`) has not already been uploaded to the session, upload it first:
```
from google.colab import files
# Upload kaggle.json (the Kaggle API token); the setup commands below copy it
# from the current working directory.
files.upload()
```

Install the Kaggle CLI, set up authentication, then download the dataset:
```
# Install the Kaggle CLI from PyPI
!pip install -q kaggle

# Kaggle auth: copy the uploaded token into ~/.kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!ls ~/.kaggle
# Restrict the token to its owner; use the same ~ path as the copy above
# rather than a hard-coded /root home directory.
!chmod 600 ~/.kaggle/kaggle.json

# Download dataset
!kaggle datasets download -d crawford/emnist

# Dataset: extract the "balanced" files only (test set, train set, label mapping)
!unzip emnist.zip emnist-balanced-test.csv emnist-balanced-train.csv emnist-balanced-mapping.txt
```

# Step 0b: Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools

from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.utils import np_utils
import sklearn.metrics as metrics

# Step 1: Dataset reading

In [None]:
"""
Read dataset
"""

train = pd.read_csv("emnist-balanced-train.csv", delimiter=',')
test = pd.read_csv("emnist-balanced-test.csv", delimiter=',')
# NOTE(review): no `header=` is passed above, so pandas uses the first row of
# each CSV as column names -- confirm the files really start with a header row.

# The mapping file is space-separated with no header; its first column becomes
# the index. `read_csv(..., squeeze=True)` was deprecated in pandas 1.4 and
# removed in 2.0, so the single remaining column is squeezed into a Series
# after the read instead.
mapp = pd.read_csv("emnist-balanced-mapping.txt", delimiter=' ',
                   index_col=0, header=None).squeeze("columns")
print("Train: %s, Test: %s, Map: %s" %(train.shape, test.shape, mapp.shape))

In [None]:
"""
Split dataset into train + test set
"""

# Column 0 goes to the *_y variables, every remaining column to *_x.
# Each full table is dropped as soon as it has been split.
train_y, train_x = train.iloc[:, 0], train.iloc[:, 1:]
del train

test_y, test_x = test.iloc[:, 0], test.iloc[:, 1:]
del test

print(train_x.shape, train_y.shape, test_x.shape, test_y.shape)

# Step 2: Preprocessing

In [None]:
"""
Correct image orientations and normalize them
"""

# Image dimensions in pixels
HEIGHT = 28
WIDTH = 28


def rotate(image):
    """Turn a flat pixel vector into a re-oriented (HEIGHT, WIDTH) image.

    The vector is reshaped to 2-D, mirrored left-to-right, and then rotated
    by 90 degrees with ``np.rot90``.

    Args:
        image: array with HEIGHT * WIDTH elements.

    Returns:
        The re-oriented 2-D array.
    """
    grid = image.reshape((HEIGHT, WIDTH))
    return np.rot90(np.fliplr(grid))

# Apply rotate() to every row (axis 1 holds one flattened image per row)
train_x = np.apply_along_axis(rotate, 1, np.asarray(train_x))
print ("train_x:",train_x.shape)

test_x = np.apply_along_axis(rotate, 1, np.asarray(test_x))
print ("test_x:",test_x.shape)

# Convert to float32 and divide the pixel values by 255
train_x = train_x.astype('float32') / 255
test_x = test_x.astype('float32') / 255

In [None]:
"""
Show sample images from the train set to verify
"""

# Show training samples 100-108 in a 3x3 grid, each titled with its character.
# The grid position is counted separately from the sample index: adding the
# sample index to 330 produced 431..439, which matplotlib reads as a 4x3 grid.
for position, sample in enumerate(range(100, 109), start=1):
    plt.subplot(3, 3, position)
    plt.imshow(train_x[sample], cmap=plt.get_cmap('gray'))
    plt.title(chr(mapp[train_y[sample]]))

In [None]:
# number of classes = count of distinct label values in the training labels
num_classes = train_y.nunique()
print(num_classes)

In [None]:
# One-hot encode both label vectors using the same class count
train_y, test_y = (
    np_utils.to_categorical(labels, num_classes)
    for labels in (train_y, test_y)
)
print("train_y: ", train_y.shape)
print("test_y: ", test_y.shape)

In [8]:
# Add a trailing single-channel axis so the arrays match the model's
# (HEIGHT, WIDTH, 1) input shape; -1 lets NumPy infer the sample count.
cnn_shape = (-1, HEIGHT, WIDTH, 1)
train_x = train_x.reshape(cnn_shape)
test_x = test_x.reshape(cnn_shape)

In [9]:
# Hold out 10% of the training data as a validation set (fixed random seed)
train_x, val_x, train_y, val_y = train_test_split(
    train_x,
    train_y,
    test_size=0.10,
    random_state=7,
)

# Step 3: Model building

In [None]:
# Build the CNN: two convolution + max-pooling stages followed by a dense
# classifier with dropout and a softmax output over num_classes.
# Size formula noted by the author: ((Si - Fi + 2P)/S) + 1
model = Sequential([
    Conv2D(filters=128, kernel_size=(5, 5), padding='same', activation='relu',
           input_shape=(HEIGHT, WIDTH, 1)),
    MaxPooling2D(pool_size=(2, 2), strides=(2, 2)),
    Conv2D(filters=64, kernel_size=(3, 3), padding='same', activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),
    Dense(units=128, activation='relu'),
    Dropout(.5),
    Dense(units=num_classes, activation='softmax'),
])

model.summary()

In [36]:
# Configure training: Adam optimizer, categorical cross-entropy loss,
# accuracy reported as the metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Step 4: Training

In [None]:
# Train for 20 epochs in batches of 512, evaluating on the validation split;
# the returned history object is kept for the curves plotted later.
history = model.fit(
    train_x,
    train_y,
    batch_size=512,
    epochs=20,
    verbose=1,
    validation_data=(val_x, val_y),
)

In [38]:
# Save the trained model to 'vanilla_cnn.h5' in the working directory
model.save('vanilla_cnn.h5')

# Step 5: Assessment

In [None]:
"""
Evaluate loss + accuracy
"""

# evaluate() returns the loss first and the compiled metric (accuracy) second
score = model.evaluate(test_x, test_y, verbose=0)
for caption, value in zip(("Test loss:", "Test accuracy:"), score):
    print(caption, value)

In [None]:
def plotgraph(epochs, acc, val_acc, title='Model accuracy', ylabel='Accuracy'):
    """Plot a training curve (blue) against its validation curve (red).

    The function is used for loss curves as well as accuracy curves, so the
    title and y-axis label can be overridden; the defaults keep the accuracy
    wording for existing three-argument calls.

    Args:
        epochs: x-axis values, one per epoch.
        acc: per-epoch values measured on the training set.
        val_acc: per-epoch values measured on the validation set.
        title: figure title.
        ylabel: y-axis label.
    """
    plt.plot(epochs, acc, 'b')
    plt.plot(epochs, val_acc, 'r')
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('Epoch')
    # Legend entries follow the order of the two plot() calls above.
    plt.legend(['Train', 'Val'], loc='upper left')
    plt.show()

# Pull the per-epoch training/validation curves out of the fit history
acc, val_acc, loss, val_loss = (
    history.history[key]
    for key in ('accuracy', 'val_accuracy', 'loss', 'val_loss')
)
# Epochs are numbered from 1 for the x-axis
epochs = range(1, len(acc) + 1)

In [None]:
"""
Plot accuracy curve
"""

# Training accuracy is drawn in blue, validation accuracy in red
plotgraph(epochs, acc, val_acc)

In [None]:
"""
Plot loss curve
"""

# Drawn directly instead of via plotgraph(epochs, loss, val_loss): that call
# titles the figure 'Model accuracy' and labels the y-axis 'Accuracy', which
# mislabels a loss curve.
plt.plot(epochs, loss, 'b')
plt.plot(epochs, val_loss, 'r')
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Val'], loc='upper left')
plt.show()

In [None]:
"""
Predict using the model
"""

# Turn the per-class scores into a boolean one-hot matrix that flags the
# highest-scoring class of every sample. Thresholding the scores at 0.5
# left a row all-False whenever no class exceeded 0.5; argmax over such a
# row reports class 0, and the error counts skipped those rows entirely.
y_prob = model.predict(test_x)
y_pred = np.zeros(y_prob.shape, dtype=bool)
y_pred[np.arange(y_prob.shape[0]), y_prob.argmax(axis=1)] = True

In [None]:
"""
Calculate and plot confusion matrix
"""

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Print a confusion matrix and draw it as an annotated heat map.

    Draws onto the current matplotlib figure; the caller is responsible for
    creating the figure and calling ``plt.show()``.

    Args:
        cm: 2-D matrix of counts; rows are labelled 'True label' and columns
            'Predicted label' on the plot.
        classes: tick labels, one per class.
        normalize: when True, each row is divided by its row total before
            printing and plotting.
        title: plot title.
        cmap: matplotlib colormap used for the heat map.
    """
    cm = np.asarray(cm)

    if normalize:
        counts = cm.astype('float')
        row_totals = counts.sum(axis=1, keepdims=True)
        # A row with a zero total (a class with no samples) stays at 0
        # instead of becoming NaN through a division by zero.
        cm = np.divide(counts, row_totals,
                       out=np.zeros_like(counts), where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Cell text: two decimals for ratios, plain integers for raw counts.
    fmt = '.2f' if normalize else 'd'
    # Cells above half the maximum get white text so it stays readable on
    # the darker end of the colormap.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Collapse the one-hot rows to class indices, then tabulate true vs. predicted
true_idx = test_y.argmax(axis=1)
pred_idx = y_pred.argmax(axis=1)
cm = metrics.confusion_matrix(true_idx, pred_idx)

# Tick labels: the character for every class index, via the mapping table
class_names = list(map(chr, (mapp[idx] for idx in range(num_classes))))
plt.figure(figsize=(50,20))
plot_confusion_matrix(cm, class_names, normalize=True)
plt.show()

In [None]:
"""
Draw incorrect predictions
"""

# Show up to 30 misclassified test images in a 3x10 grid, each titled
# "<predicted> | <true>".
f = plt.figure(figsize=(12,4))

# A row is shown when its prediction differs from the truth and at least one
# class is flagged in the prediction (rows with no flagged class are skipped).
has_prediction = y_pred.any(axis=1)
is_wrong = (test_y != y_pred).any(axis=1)
# Select the first 30 matching rows up front, so no row-by-row Python scan
# continues once the grid is full.
wrong_rows = np.flatnonzero(is_wrong & has_prediction)[:30]

for slot, row in enumerate(wrong_rows, start=1):
    # argmax gives the first flagged class of each one-hot row
    label_pred = chr(mapp[np.argmax(y_pred[row])])
    label_truth = chr(mapp[np.argmax(test_y[row])])

    sp = f.add_subplot(3, 10, slot)
    sp.axis('Off')
    sp.set_title('{pred} | {truth}'.format(
        pred=label_pred,
        truth=label_truth
    ))
    sp.imshow(test_x[row].reshape((HEIGHT, WIDTH)), cmap=plt.get_cmap('gray'))

In [None]:
"""
Count number of incorrect predictions
"""

# A row counts as incorrect when its prediction differs from the truth and
# at least one class is flagged in the prediction.
# NOTE: rows with no flagged class at all are not counted.
has_prediction = y_pred.any(axis=1)
is_wrong = (test_y != y_pred).any(axis=1)
count = int(np.count_nonzero(is_wrong & has_prediction))
print(count)