<h1 style="color:steelblue; font-family:Ewert; font-size:200%;" class="font-effect-3d">Code Library, Style and Links</h1>

In [None]:
%%html
<style>
@import url('https://fonts.googleapis.com/css?family=Ewert|Roboto&effect=3d|ice|');
span {font-family:'Roboto'; color:black; text-shadow: 5px 5px 5px #aaa;}  
div.output_area pre{font-family:'Roboto'; font-size:110%; color: steelblue;}      
</style>

In [None]:
import numpy as np 
import pandas as pd 
import keras as ks

import os
import ast
import cv2
import warnings

import matplotlib.pylab as plt
%matplotlib inline

# Display the entries found under the relative "../input" directory.
os.listdir("../input")

In [None]:
I = 32 # image size in pixels
S = 1 # current number of the label set
T = 340 # number of labels in one set
N = 1750 # number of images with the same label in the training set

file_path = '../input/quickdraw-doodle-recognition/train_simplified/'
# os.listdir returns entries in arbitrary order. Sorting makes the mapping
# "class index -> label" identical on every run, so weights saved in one
# session still line up with `labels` in the next one.
# Only *.csv entries are kept so that stripping the extension below is safe.
files = sorted(el for el in os.listdir(file_path) if el.endswith('.csv'))
# label = file name without extension, spaces replaced by underscores
labels = [os.path.splitext(el)[0].replace(" ", "_") for el in files]
print(sorted(labels))

In [None]:
from skimage.transform import resize
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from keras.models import Sequential
from keras.layers.advanced_activations import LeakyReLU
from keras.layers import Activation, Dropout, Dense
from keras.layers import Conv2D, MaxPooling2D, GlobalMaxPooling2D
from keras.preprocessing.image import ImageDataGenerator

warnings.filterwarnings('ignore', category=UserWarning)

# Newer matplotlib releases ship the seaborn style sheets under a
# 'seaborn-v0_8-' prefix; use whichever name this installation provides
# instead of failing on a missing style.
for style_name in ('seaborn-whitegrid', 'seaborn-v0_8-whitegrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

# CSS properties applied to the pandas Styler tables shown in the notebook
style_dict = {'background-color':'gainsboro', 'color':'steelblue', 
              'border-color': 'white', 'font-family':'Roboto'}

In [None]:
# additional function
            
def get_image(data, time_color=True, size=None):
    """Render a drawing, given as a string of strokes, into a square image.

    Parameters
    ----------
    data : str
        Python literal of a list of strokes; element 0 of a stroke holds the
        x coordinates and element 1 the y coordinates of its points.
    time_color : bool, default True
        If True, stroke number ``t`` is drawn with intensity
        ``255 - min(t, 10) * 15`` (later strokes are darker, capped after the
        tenth stroke); otherwise every stroke is drawn with intensity 255.
    size : int, optional
        Side length of the returned image. Defaults to the notebook-level
        constant ``I``.

    Returns
    -------
    numpy.ndarray
        ``uint8`` array of shape ``(size, size)``.

    Notes
    -----
    Strokes are drawn on a 280x280 canvas with every point shifted by 10
    pixels, using 3-pixel-wide lines, and the canvas is then resized.
    Strokes consisting of a single point produce no line.
    """
    strokes = ast.literal_eval(data)
    side = I if size is None else size
    canvas = np.zeros((280, 280), np.uint8)
    for t, stroke in enumerate(strokes):
        # the colour depends only on the stroke number, so compute it once per stroke
        color = 255 - min(t, 10) * 15 if time_color else 255
        points = [(x + 10, y + 10) for x, y in zip(stroke[0], stroke[1])]
        # connect every pair of consecutive points of the stroke
        for start, end in zip(points, points[1:]):
            cv2.line(canvas, start, end, color, 3)
    return cv2.resize(canvas, (side, side))

<h1 style="color:steelblue; font-family:Ewert; font-size:200%;" class="font-effect-3d">Data Exploration</h1>

In [None]:
# Build the training images of label set S: the first N drawings of every
# label, appended label by label, so images[k*N:(k+1)*N] belong to class k.
data = pd.DataFrame(index=range(N), columns=labels[(S-1)*T:S*T])
images = []

for file_idx in range((S-1)*T, S*T):
    label = labels[file_idx]
    # Only the first N rows of the 'drawing' column are used, so do not
    # parse the rest of the (much larger) file.
    data[label] = pd.read_csv(file_path + files[file_idx],
                              usecols=['drawing'], nrows=N).drawing.values
    images.extend(get_image(drawing) for drawing in data[label])
    # DataFrame.drop returns a new frame; rebind it so the raw stroke
    # strings of this label are actually released.
    data = data.drop([label], axis=1)

images = np.array(images)
images.shape

In [None]:
# Show three rendered drawings side by side.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, sample_idx in zip(axes, (0, 30000, 60000)):
    ax.imshow(images[sample_idx])
fig.suptitle('Key Lines in the Pictures');

In [None]:
# One-hot targets: class id k repeated N times for k = 0..T-1,
# i.e. N consecutive rows per class.
class_ids = np.repeat(np.arange(T), N)
targets = ks.utils.to_categorical(class_ids, T).reshape(N*T, T)
targets.shape

In [None]:
# 80% of the samples go to training; the remaining 20% is cut in half
# into a validation part and a test part.
x_train, x_test, y_train, y_test = train_test_split(
    images, targets, test_size=0.2, random_state=1)
n = len(x_test) // 2
x_valid, x_test = x_test[:n], x_test[n:]
y_valid, y_test = y_test[:n], y_test[n:]

del images, targets

# append a trailing channel axis of size 1 to every image array
x_train, x_valid, x_test = (
    part.reshape(-1, I, I, 1) for part in (x_train, x_valid, x_test))
tuple(part.shape for part in (y_train, y_valid, y_test))

<h1 style="color:steelblue; font-family:Ewert; font-size:200%;" class="font-effect-3d">The Model</h1>

In [None]:
def top_3_categorical_accuracy(y_true, y_pred):
    """Top-k categorical accuracy metric from Keras, fixed to k=3."""
    top_k = ks.metrics.top_k_categorical_accuracy
    return top_k(y_true, y_pred, k=3)
def top_17_categorical_accuracy(y_true, y_pred):
    """Top-k categorical accuracy metric from Keras, fixed to k=17."""
    top_k = ks.metrics.top_k_categorical_accuracy
    return top_k(y_true, y_pred, k=17)
def categorical_accuracy(y_true, y_pred):
    """Named wrapper that forwards to the Keras categorical accuracy metric."""
    accuracy = ks.metrics.categorical_accuracy(y_true, y_pred)
    return accuracy

def build_model(input_shape, num_classes, metrics):
    """Create and compile the convolutional classifier.

    Architecture: Conv2D(32, 5x5, 'same') -> LeakyReLU -> MaxPooling -> Dropout(0.2)
    -> Conv2D(196, 5x5) -> LeakyReLU -> MaxPooling -> Dropout(0.2)
    -> GlobalMaxPooling -> Dense(1024) -> LeakyReLU -> Dropout(0.5)
    -> Dense(num_classes) -> softmax.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, passed to the first convolution.
    num_classes : int
        Number of units of the softmax output layer.
    metrics : list
        Metrics handed to ``compile``.

    Returns
    -------
    A ``Sequential`` model compiled with categorical cross-entropy and Adam.
    """
    net = Sequential()

    net.add(Conv2D(32, (5, 5), padding='same', input_shape=input_shape))
    net.add(LeakyReLU(alpha=0.02))

    net.add(MaxPooling2D(pool_size=(2, 2)))
    net.add(Dropout(0.2))

    net.add(Conv2D(196, (5, 5)))
    net.add(LeakyReLU(alpha=0.02))

    net.add(MaxPooling2D(pool_size=(2, 2)))
    net.add(Dropout(0.2))

    net.add(GlobalMaxPooling2D())

    net.add(Dense(1024))
    net.add(LeakyReLU(alpha=0.02))
    net.add(Dropout(0.5))

    net.add(Dense(num_classes))
    net.add(Activation('softmax'))

    net.compile(loss='categorical_crossentropy', optimizer='adam',
                metrics=metrics)
    return net

# The factory keeps its own name, so binding the instance to `model`
# no longer overwrites the function that creates it.
model = build_model(x_train.shape[1:], T,
                    [categorical_accuracy, top_17_categorical_accuracy])

In [None]:
# Callback that writes the model to disk, keeping only the best one.
checkpointer = ModelCheckpoint(
    filepath='weights.best.model.hdf5',
    save_best_only=True,
    verbose=2,
)
# Callback that halves the learning rate when val_loss has not improved for 5 epochs.
lr_reduction = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    verbose=2,
)

In [None]:
# Train on the in-memory arrays, validating on the validation split.
fit_options = dict(
    epochs=150,
    batch_size=2800,
    verbose=2,
    validation_data=(x_valid, y_valid),
    callbacks=[checkpointer, lr_reduction],
)
history = model.fit(x_train, y_train, **fit_options)

In [None]:
# Continue training on randomly zoomed, sheared, rotated and flipped images.
augmentation = dict(zoom_range=0.2, shear_range=0.2,
                    rotation_range=20, horizontal_flip=True)
data_generator = ImageDataGenerator(**augmentation)
augmented_batches = data_generator.flow(x_train, y_train, batch_size=2800)

generator_history = model.fit_generator(
    augmented_batches,
    steps_per_epoch=1000, epochs=10, verbose=2,
    validation_data=(x_valid, y_valid),
    callbacks=[checkpointer, lr_reduction])

<h1 style="color:steelblue; font-family:Ewert; font-size:200%;" class="font-effect-3d">Evaluation</h1>

In [None]:
# Restore the weights stored in the checkpoint file, then evaluate the model
# on the test split; the resulting scores are displayed as the cell output.
model.load_weights('weights.best.model.hdf5')
score = model.evaluate(x_test, y_test)
score

In [None]:
# Count, per label, how many test samples were classified correctly.
p_test = model.predict(x_test)
predicted_class = np.argmax(p_test, axis=1)
true_class = np.argmax(y_test, axis=1)
# class ids of the samples whose predicted class equals the true class
hits = predicted_class[predicted_class == true_class]
well_predicted = [labels[(S-1) * T + k] for k in hits]
u = np.unique(well_predicted, return_counts=True)
correct_by_label = pd.DataFrame({'labels':u[0],'correct predictions':u[1]})
correct_by_label.sort_values('correct predictions',ascending=False).style.set_properties(**style_dict)

<h1 style="color:steelblue; font-family:Ewert; font-size:200%;" class="font-effect-3d">Predictions</h1>

In [None]:
# drop the training / validation / test arrays before loading the test file
del x_train, x_valid, x_test, y_train, y_valid, y_test
# read the test file
test_csv = '../input/quickdraw-doodle-recognition/test_simplified.csv'
test_data = pd.read_csv(test_csv, index_col='key_id')
test_data.tail(3).T.style.set_properties(**style_dict)

In [None]:
# Render every test drawing and predict its class probabilities.
test_images = np.array([get_image(drawing) for drawing in test_data.drawing])

model_input = test_images.reshape(-1, I, I, 1)
test_predictions = model.predict(model_input)

In [None]:
def top_k_words(predictions, k, names, offset=0):
    """Return, for every prediction row, its k most probable labels as one string.

    Parameters
    ----------
    predictions : 2-D array, one row of class scores per sample.
    k : int, number of labels to keep per sample.
    names : sequence of label names.
    offset : int, position in ``names`` of class 0.

    Returns
    -------
    list of str: per sample, the k label names joined by spaces, best first.
    """
    return [" ".join(names[offset + i] for i in row.argsort()[-k:][::-1])
            for row in predictions]

# Class i of label set S is labels[(S-1)*T + i], as in the evaluation cell.
# The set size is taken from the prediction width because T is redefined below.
label_offset = (S - 1) * test_predictions.shape[1]

# best guesses among all label sets
test_labels = top_k_words(test_predictions, 3, labels, label_offset)

submission = pd.DataFrame({"key_id": test_data.index, "word": test_labels})
submission.to_csv('submission.csv', index=False)

# NOTE: T and N are redefined here and keep these values in the cells below.
T = 17; N = 10000
test_labels = top_k_words(test_predictions, T, labels, label_offset)

submission_best17 = pd.DataFrame({"key_id": test_data.index, "word": test_labels})
submission_best17.to_csv('submission_best17.csv', index=False)

<h1 style="color:steelblue; font-family:Ewert; font-size:200%;" class="font-effect-3d">In Progress</h1>

In [None]:
# Work with the first test drawing and the candidate labels predicted for it.
test_images0 = test_images[0]
test_labels0 = test_labels[0].split()
# empty frame: N rows, one column per candidate label
data0 = pd.DataFrame(index=range(N), columns=test_labels0)
test_labels0

In [None]:
# Render the first N drawings of every candidate label, label by label.
images0 = []
for label in test_labels0:
    # file names use spaces where the labels use underscores
    file = label.replace("_", " ") + ".csv"
    # Only the first N rows of the 'drawing' column are used, so do not
    # parse the rest of the file.
    data0[label] = pd.read_csv(file_path + file,
                               usecols=['drawing'], nrows=N).drawing.values
    images0.extend(get_image(drawing) for drawing in data0[label])
    # DataFrame.drop returns a new frame; rebind it so the raw stroke
    # strings of this label are actually released.
    data0 = data0.drop([label], axis=1)

# One-hot targets: class id k repeated N times for k = 0..T-1.
targets0 = ks.utils.to_categorical(np.repeat(np.arange(T), N), T).reshape(N*T, T)

images0 = np.array(images0)
images0.shape, targets0.shape

In [None]:
# 80% of the samples go to training; the remaining 20% is cut in half
# into a validation part and a test part.
x_train, x_test, y_train, y_test = train_test_split(
    images0, targets0, test_size=0.2, random_state=1)
n = len(x_test) // 2
x_valid, x_test = x_test[:n], x_test[n:]
y_valid, y_test = y_test[:n], y_test[n:]

del images0, targets0

# append a trailing channel axis of size 1 to every image array
x_train, x_valid, x_test = (
    part.reshape(-1, I, I, 1) for part in (x_train, x_valid, x_test))
tuple(part.shape for part in (y_train, y_valid, y_test))

In [None]:
def build_model(input_shape, num_classes, metrics):
    """Create and compile the convolutional classifier.

    Architecture: Conv2D(32, 5x5, 'same') -> LeakyReLU -> MaxPooling -> Dropout(0.2)
    -> Conv2D(196, 5x5) -> LeakyReLU -> MaxPooling -> Dropout(0.2)
    -> GlobalMaxPooling -> Dense(1024) -> LeakyReLU -> Dropout(0.5)
    -> Dense(num_classes) -> softmax.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample, passed to the first convolution.
    num_classes : int
        Number of units of the softmax output layer.
    metrics : list
        Metrics handed to ``compile``.

    Returns
    -------
    A ``Sequential`` model compiled with categorical cross-entropy and Adam.
    """
    net = Sequential()

    net.add(Conv2D(32, (5, 5), padding='same', input_shape=input_shape))
    net.add(LeakyReLU(alpha=0.02))

    net.add(MaxPooling2D(pool_size=(2, 2)))
    net.add(Dropout(0.2))

    net.add(Conv2D(196, (5, 5)))
    net.add(LeakyReLU(alpha=0.02))

    net.add(MaxPooling2D(pool_size=(2, 2)))
    net.add(Dropout(0.2))

    net.add(GlobalMaxPooling2D())

    net.add(Dense(1024))
    net.add(LeakyReLU(alpha=0.02))
    net.add(Dropout(0.5))

    net.add(Dense(num_classes))
    net.add(Activation('softmax'))

    net.compile(loss='categorical_crossentropy', optimizer='adam',
                metrics=metrics)
    return net

# The factory keeps its own name, so binding the instance to `model`
# no longer overwrites the function that creates it.
model = build_model(x_train.shape[1:], T,
                    [categorical_accuracy, top_3_categorical_accuracy])
# best weights of this second model go to their own checkpoint file
checkpointer = \
ModelCheckpoint(filepath='weights.best.model.0.hdf5', verbose=2, save_best_only=True)
lr_reduction = \
ReduceLROnPlateau(monitor='val_loss', patience=5, verbose=2, factor=0.5)
model.fit(x_train, y_train, epochs=100, batch_size=2800, verbose=2,
          validation_data=(x_valid, y_valid), callbacks=[checkpointer, lr_reduction]);

In [None]:
# Restore the checkpointed weights and classify the single test drawing.
model.load_weights('weights.best.model.0.hdf5')
test_predictions0 = model.predict(test_images0.reshape(1,I,I,1))[0]
# Three most probable candidate labels, best first.
top3_labels = [test_labels0[i] for i in test_predictions0.argsort()[-3:][::-1]]
# Join the label names themselves. Joining each label separately would
# insert a space between the characters of every name.
test_labels03 = " ".join(top3_labels)

In [None]:
# Show the test drawing with its predicted labels as the figure title.
fig = plt.figure(figsize=(5, 5))
plt.imshow(test_images0)
fig.suptitle(test_labels03);