In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import itertools

from keras.utils.np_utils import to_categorical # convert to one-hot-encoding
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, AvgPool2D
from keras.optimizers import RMSprop
from keras.optimizers import Adam
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ReduceLROnPlateau
from keras.callbacks import LearningRateScheduler

Load data :

In [None]:
# Read the test and training tables from the notebook's input directory.
test = pd.read_csv('../input/test.csv')
train = pd.read_csv('../input/train.csv')
# The last expression of the cell is what gets displayed: the first five training rows.
train.head(n = 5)

Missing data :

In [None]:
# Count null entries per column of the training frame, then print only the
# columns that actually contain nulls (an empty Series means no missing values).
missing_data = train.isnull().sum()
print(missing_data.loc[missing_data > 0])

get train_x and train_y :

In [None]:
# Separate inputs from the target: every column except 'label' is a feature,
# and the 'label' column alone is the target.
train_x = train.drop(labels = 'label', axis = 'columns')
train_y = train.loc[:, 'label']

Normalization :
*  Scale every feature to a similar range so that the gradients don't blow up and training converges faster.

In [None]:
# Divide every value by 255.0 (presumably 8-bit pixel intensities, which would
# land in [0, 1] - confirm against the data).
# NOTE: both names are rebound from themselves, so running this cell a second
# time without reloading the data would scale the values twice.
train_x = train_x.div(255.0)
test = test.div(255.0)

Reshape to 28 x 28 x 1 :

In [None]:
# Turn each flat row into a 28 x 28 image with a single channel axis.
# The leading -1 lets NumPy infer the number of images from the data size.
# From here on train_x and test are NumPy arrays rather than DataFrames.
train_x = np.reshape(train_x.values, (-1, 28, 28, 1))
test = np.reshape(test.values, (-1, 28, 28, 1))

One-hot encode train_y, so that each label becomes a vector of the form [0, 0, ..., 1, ..., 0, 0] :

In [None]:
# One-hot encode the labels. num_classes is pinned to 10 so the encoding width
# always matches the 10-unit softmax output layer of the model; without it,
# to_categorical infers the width from the largest label present, which would
# be too narrow if the notebook were run on a subset lacking the highest digit.
train_y = to_categorical(train_y, num_classes = 10)

Split validation set :

In [None]:
# Hold out 10% of the training data for validation. The fixed random_state
# makes the shuffle, and therefore the split, repeatable between runs.
train_x, val_x, train_y, val_y = train_test_split(
    train_x,
    train_y,
    random_state = 2,
    test_size = 0.1,
)

Let's take a look at what an image looks like :

In [None]:
# Show the first training image. Indexing channel 0 drops the trailing
# single-channel axis so that imshow receives a plain 2-D array.
plt.imshow(train_x[0, :, :, 0], cmap = 'gray')

> **CNN :**
* convolution: extracts local features. Stacking more convolution layers can capture more complex features.
* pooling: downsamples the feature maps while keeping the most important values.
    * MaxPooling (20 epoch) => 0.9874
    * AvgPooling (20 epoch) => 0.9817
    * Max pooling is better for extracting the extreme features
    * Average pooling sometimes can't extract good features because it takes all values into account 
* drop: to prevent overfit
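* flatten: the feature maps at this point are still multi-dimensional (7x7x64 after the two pooling layers), so they must be flattened into a one-dimensional vector before they can be fed to the dense layers
* connect: fully connected (dense) layers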

In [None]:
# CNN classifier: two convolution/pooling stages followed by a dense head.
model = Sequential()

# Stage 1: two 5x5 convolutions with 32 filters each. 'same' padding keeps the
# 28x28 spatial size; the 2x2 max-pool then halves it.
model.add(Conv2D(filters = 32, kernel_size = (5,5), padding = 'same', activation = 'relu', input_shape = (28,28,1)))
model.add(Conv2D(filters = 32, kernel_size = (5,5), padding = 'same', activation = 'relu'))
model.add(MaxPool2D(pool_size = (2, 2)))
# model.add(AvgPool2D(pool_size = (2, 2)))  # average-pooling alternative
model.add(Dropout(0.25))


# Stage 2: two 3x3 convolutions with 64 filters each, followed by another
# 2x2 max-pool with stride 2 that halves the spatial size again.
model.add(Conv2D(filters = 64, kernel_size = (3,3), padding = 'same', activation = 'relu'))
model.add(Conv2D(filters = 64, kernel_size = (3,3), padding = 'same', activation = 'relu'))
model.add(MaxPool2D(pool_size = (2, 2), strides = (2,2)))
# model.add(AvgPool2D(pool_size = (2, 2), strides = (2,2)))  # average-pooling alternative
model.add(Dropout(0.25))

# Dense head: flatten the feature maps, one hidden layer of 256 units with
# heavier dropout, then a 10-way softmax (one output per class).
model.add(Flatten())
model.add(Dense(256, activation = "relu"))
model.add(Dropout(0.5))
model.add(Dense(10, activation = "softmax"))

# summary() prints the layer table itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
model.summary()

**Data generator :**
Generates more image data by rotating, shifting the center, zooming in or out, etc.

In [None]:
# On-the-fly augmentation for the training images: small random rotations
# (rotation_range = 10), horizontal/vertical shifts (0.1) and zoom (0.1).
# No flips are configured.
datagen = ImageDataGenerator(
    rotation_range = 10,
    width_shift_range = 0.1,
    height_shift_range = 0.1,
    zoom_range = 0.1,
)

Compile model

In [None]:
# Categorical cross-entropy matches the one-hot labels and softmax output;
# accuracy is tracked as the reported metric.
# NOTE(review): training is run with a LearningRateScheduler callback, which
# presumably overrides this initial lr of 1e-4 from the first epoch - confirm.
model.compile(
    optimizer = Adam(lr=1e-4),
    loss = 'categorical_crossentropy',
    metrics = ["accuracy"],
)

Set learning rate :
* If val_acc doesn't improve for 3 epochs, reduce the learning rate:
    * without adjusting learning rate (20 epoch): 0.9883
    * with adjusting learning rate (20 epoch): 0.9874
    * without adjusting learning rate (30 epoch): 0.9905
    * with adjusting learning rate (30 epoch): 0.9910, ReduceLROnPlateau only appeared once.
* reduce learning rate by 10% every epoch:
    * 30 epoch => 0.9943
    * much better
* reduce learning rate by 25% every epoch:
    * 30 epoch => 0.9912

In [None]:
def lr_schedule(epoch):
    """Return the learning rate for a given 0-based epoch index.

    Starts at 1e-3 and is multiplied by 0.75 for every completed epoch
    (a 25% reduction per epoch).
    """
    # NOTE(review): the notes in this notebook report a higher score with a
    # 10% reduction per epoch (factor 0.9) than with this 25% setting.
    return 1e-3 * 0.75 ** epoch


# Callback that applies lr_schedule during training.
annealer = LearningRateScheduler(lr_schedule)

# Alternative callback: halve the learning rate when the validation accuracy
# has not improved for 3 epochs. ReduceLROnPlateau is already imported in the
# import cell at the top of the notebook, so no import is needed here.
# annealer = ReduceLROnPlateau(monitor='val_acc', 
#                             patience=3, 
#                             verbose=1, 
#                             factor=0.5, 
#                             min_lr=0.00001)

Train the model:
* Tuning the number of epochs gives better results (about 25~30 works well).
* Tuning batch_size also improves the result:
    * 16 => 0.9874
    * 50 => 0.9895
    * 80 => 0.9921

In [None]:
# Train on augmented batches drawn from datagen and validate on the untouched
# hold-out split. With batch_size = 16 and steps_per_epoch = 500, one epoch
# sees 8000 augmented samples.
# NOTE(review): the notes in this notebook report better scores with a larger
# batch size and 25~30 epochs; the values here give a short run.
history = model.fit_generator(
    datagen.flow(train_x, train_y, batch_size = 16),
    steps_per_epoch = 500,
    epochs = 1,
    validation_data = (val_x, val_y),
    callbacks = [annealer],
    verbose = 2,
)

In [None]:
# Score the trained model on the validation split. evaluate() returns the loss
# followed by the compiled metrics, here a single accuracy value.
final_loss, final_acc = model.evaluate(x = val_x, y = val_y, verbose = 0)
print(
    "Final loss: {0:.4f}, final accuracy: {1:.4f}".format(final_loss, final_acc)
)

**Train History:**

In [None]:
def show_train_history(train_history, title, train, validation):
    """Plot a training curve and its validation counterpart on one figure.

    train_history: object whose ``history`` attribute maps metric names to
        per-epoch value sequences (as returned by the model's fit call).
    title: text shown as the figure title.
    train: key of the training metric; also used as the y-axis label.
    validation: key of the matching validation metric.
    """
    curves = train_history.history
    # Training curve first, validation second, so the legend order below matches.
    for metric_key in (train, validation):
        plt.plot(curves[metric_key])
    plt.title(title)
    plt.ylabel(train)
    plt.xlabel('Epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()
# Older Keras releases record the "accuracy" metric under the key 'acc', while
# newer ones (2.3 and later) record it under the name passed to compile(),
# i.e. 'accuracy'. Use whichever key this run actually produced so the plot
# does not fail with a KeyError.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
show_train_history(history, 'Accuracy', acc_key, 'val_' + acc_key)
show_train_history(history, 'Loss', 'loss', 'val_loss')

**Predict and submit :**

In [None]:
# predict() yields one row of class scores per test image; the predicted label
# is the index of the largest score in each row.
prediction = model.predict(test).argmax(axis = 1)

In [None]:
# Build the two-column submission table: ImageId numbers the predictions
# starting from 1, Label carries the predicted class for each one.
submission = pd.DataFrame({
    "ImageId": [image_id for image_id, _ in enumerate(prediction, start = 1)],
    "Label": prediction,
})
# index = False keeps the DataFrame's row index out of the file.
submission.to_csv("submission.csv", index = False)