In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory
import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Get requirements
import numpy as np
import pandas as pd
import keras
import tensorflow as tf
print("We're using TF", tf.__version__)
import keras
print("We are using Keras", keras.__version__)
np.random.seed(1234)
# Ok, get some stuff from Keras (model, layer, optimizers and ImageDataGenerator)
from keras import models
from keras import layers
from keras.utils import to_categorical  #keras.utils.to_categorical(y, num_classes=None)
import keras.backend as K
from keras.models import load_model
import h5py
from keras import optimizers
from keras.preprocessing.image import ImageDataGenerator
import  matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.cm as cm

In [None]:
# Load the train and test CSV files from ../input into DataFrames.
# The training frame carries a "label" column that is split off in the next cell;
# the test frame is later reshaped and used as image input only.
org_train = pd.read_csv('../input/train.csv')
org_test_images = pd.read_csv('../input/test.csv')
org_train.head() 

In [None]:
# Separate the target column from the pixel columns and show the class balance.
org_train_labels = org_train.loc[:, "label"]
org_train_images = org_train.drop(columns=["label"])
print(org_train_labels.value_counts())


In [None]:
# Reshape each flat pixel row into an image tensor (height = 28px, width = 28px, channels = 1).
# np.asarray accepts both the original DataFrame and an already-converted ndarray, so
# re-executing this cell no longer fails on a missing `.values` attribute.
org_train_images = np.asarray(org_train_images).reshape(-1, 28, 28, 1)
org_test_images = np.asarray(org_test_images).reshape(-1, 28, 28, 1)

In [None]:
# Cast to float32 and divide by 255 so pixel values are scaled down from the 0-255 range.
# NOTE: this rebinds the same names, so running the cell twice scales the data twice.
org_train_images = np.divide(org_train_images.astype(np.float32), 255.0)
org_test_images = np.divide(org_test_images.astype(np.float32), 255.0)
print(org_train_images.shape)

In [None]:
# Preview the first six training images side by side, with axis ticks hidden.
fig, preview_axes = plt.subplots(
    1, 6, figsize=(10, 10), subplot_kw={"xticks": [], "yticks": []}
)
for i, ax in enumerate(preview_axes):
    # Drop the trailing channel axis to get a 28x28 array for imshow.
    ax.imshow(org_train_images[i, :, :, 0], cmap='gray')
    

In [None]:
# Split the train and the validation set for the fitting, use stratify.
def getdata(random_state):
    """Split the module-level training data into stratified train/dev parts.

    Reads the globals ``org_train_images`` and ``org_train_labels``, holds out
    10% as the dev set (stratified on the labels) and one-hot encodes both
    label sets over 10 classes.

    Args:
        random_state: Seed forwarded to ``train_test_split``.

    Returns:
        Tuple ``(train_images, dev_images, train_labels, dev_labels)`` where
        the two label entries are the ``to_categorical`` encodings.
    """
    x_train, x_dev, y_train, y_dev = train_test_split(
        org_train_images,
        org_train_labels,
        test_size=0.1,
        random_state=random_state,
        stratify=org_train_labels,
    )
    y_train_onehot, y_dev_onehot = (
        keras.utils.to_categorical(y, 10) for y in (y_train, y_dev)
    )
    return (x_train, x_dev, y_train_onehot, y_dev_onehot)
# Create the train/dev split that cnn1 is trained and evaluated on (random_state=12).
train_images,dev_images,train_labels,dev_labels = getdata(12)
print (dev_images.shape)


In [None]:
# Data augmentation with Keras: ImageDataGenerator produces randomly transformed
# copies of the training images on the fly, batch by batch.
batch_size = 60
augmentation_settings = dict(
    rotation_range=12,         # degree range for random rotations
    shear_range=1,             # shear intensity
    zoom_range=0.12,           # random zoom amount
    width_shift_range=3/28,    # horizontal shift, as a fraction of total width
    height_shift_range=3/28,   # vertical shift, as a fraction of total height
)
datagen = ImageDataGenerator(**augmentation_settings)
datagen.fit(train_images)

# Pull a single augmented batch to inspect what the generator produces.
preview_flow = datagen.flow(train_images, train_labels, batch_size=batch_size)
train_images_batch, train_label_batch = next(preview_flow)

# Show the first half of that batch (shifted, rotated, sheared and zoomed digits).
plt.figure(figsize=(20, 20))
for i in range(batch_size // 2):
    plt.subplot(10, 10, i + 1, xticks=[], yticks=[])
    plt.imshow(train_images_batch[i].reshape(28, 28), cmap=plt.get_cmap('gray'))
print ('Images showing the augumented images')
plt.show()

In [None]:
#Setup the callbacks list
def callback():
    """Build a fresh list of Keras training callbacks, all monitoring ``val_loss``.

    Returns:
        list: ``[EarlyStopping, ModelCheckpoint, ReduceLROnPlateau]`` where
        - EarlyStopping uses a patience of 25 epochs,
        - ModelCheckpoint writes only the best result so far to ``my_model.h5``,
        - ReduceLROnPlateau halves the learning rate (factor 0.5) after 3 epochs
          without improvement, with a floor of 1e-4.
    """
    watched = 'val_loss'
    stop_when_stalled = keras.callbacks.EarlyStopping(
        monitor=watched,
        patience=25,
    )
    keep_best = keras.callbacks.ModelCheckpoint(
        filepath='my_model.h5',
        verbose=1,
        monitor=watched,
        save_best_only=True,
    )
    shrink_lr = keras.callbacks.ReduceLROnPlateau(
        monitor=watched,
        factor=0.5,
        patience=3,
        verbose=1,
        min_lr=0.0001,
    )
    return [stop_when_stalled, keep_best, shrink_lr]



In [None]:
# First network: SeparableConv2D -> BatchNormalization -> LeakyReLU units with
# SpatialDropout after each pooling step and GlobalAveragePooling before the dense head.
from keras.layers.advanced_activations import LeakyReLU

K.clear_session()


def _cnn1_conv_unit(filters, slope, **conv_kwargs):
    """Return one 3x3 'same'-padded SeparableConv2D + BatchNormalization + LeakyReLU unit."""
    return [
        layers.SeparableConv2D(filters, (3, 3), padding='same', **conv_kwargs),
        layers.BatchNormalization(),
        LeakyReLU(alpha=slope),
    ]


cnn1_layers = []

# 28x28 stage: three 32-filter units (LeakyReLU slope 0.2)
cnn1_layers += _cnn1_conv_unit(32, 0.2, input_shape=(28, 28, 1))
cnn1_layers += _cnn1_conv_unit(32, 0.2)
cnn1_layers += _cnn1_conv_unit(32, 0.2)
cnn1_layers += [layers.MaxPooling2D((2, 2), strides=(2, 2)), layers.SpatialDropout2D(0.3)]

# 14x14 stage: three 64-filter units (LeakyReLU slope 0.1)
for _ in range(3):
    cnn1_layers += _cnn1_conv_unit(64, 0.1)
cnn1_layers += [layers.MaxPooling2D((2, 2)), layers.SpatialDropout2D(0.3)]

# 7x7 stage: one 128-filter unit
cnn1_layers += _cnn1_conv_unit(128, 0.1, strides=1)
cnn1_layers += [layers.MaxPooling2D((2, 2)), layers.SpatialDropout2D(0.3)]

# 3x3 stage and classifier head; no activation between the last conv and the pooling
cnn1_layers += [
    layers.SeparableConv2D(192, (3, 3), padding='same'),
    layers.GlobalAveragePooling2D(),
    layers.Dense(256),
    LeakyReLU(alpha=0.1),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(10, activation='softmax'),
]

cnn1 = models.Sequential()
for cnn1_layer in cnn1_layers:
    cnn1.add(cnn1_layer)

opt = optimizers.Nadam(lr=0.02, beta_1=0.9, beta_2=0.999, epsilon=None, schedule_decay=0.004)
cnn1.compile(loss='categorical_crossentropy',
             optimizer=opt,
             metrics=['accuracy'])
cnn1.count_params()
cnn1.summary()


In [None]:
# Training draws its batches from datagen.flow(), which applies the random
# augmentations configured on `datagen` to every batch it yields, so the
# network keeps seeing newly transformed images.
bs=60 # batch size passed to datagen.flow() and used to derive steps per epoch
callbacks_list=callback() # fresh callback instances for this training run

In [None]:
# Train cnn1 on augmented batches for up to 50 epochs; the callbacks may stop earlier
# and keep the best model (by val_loss) in my_model.h5.
cnn1_batches = datagen.flow(train_images, train_labels, batch_size=bs)
cnn1_fit_options = dict(
    steps_per_epoch=train_images.shape[0] // bs,
    epochs=50,
    callbacks=callbacks_list,
    validation_data=(dev_images, dev_labels),
    validation_steps=dev_images.shape[0] // bs,
)
cnn1.fit_generator(cnn1_batches, **cnn1_fit_options)

In [None]:
# Restore the checkpoint written by ModelCheckpoint (best val_loss) and evaluate it on the dev split.
cnn1.load_weights('my_model.h5') # checkpoint path configured in callback()
dev_loss, dev_acc = cnn1.evaluate(dev_images, dev_labels)
print('dev_acc:', dev_acc)
# Persist cnn1 under its own file names, then delete the shared checkpoint file
# (presumably so the next model's run does not pick up a stale my_model.h5).
cnn1.save_weights('cnn1_weights.h5')
cnn1.save('cnn1_model.h5')
os.remove ('my_model.h5')

In [None]:
# Print a per-digit classification report and draw the confusion matrix for cnn1 on the dev split.
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import itertools
np.set_printoptions(precision=3)

# Take the arg-max of the predicted probabilities instead of calling
# Sequential.predict_classes, which is deprecated/removed in newer Keras releases;
# for a softmax output the two are equivalent.
pred = np.argmax(cnn1.predict(dev_images), axis=-1)
# dev_labels is one-hot encoded, so recover the integer class once and reuse it.
true_digits = np.argmax(dev_labels, axis=1)

target_names = [str(i) for i in range(10)]
print ('* Classification Report *')
print(classification_report(true_digits, pred, target_names=target_names, digits=3))

cnf_matrix = confusion_matrix(true_digits, pred)

import seaborn as sns
sns.heatmap(cnf_matrix, annot=True, fmt="d")
plt.ylabel('True label')
plt.xlabel('Predicted label')

plt.show()

In [None]:
# Plot some curves!  

def plot_curves(hist):
    """Plot per-epoch loss, accuracy and learning-rate curves from a training history.

    Args:
        hist: Object with a ``history`` dict mapping metric names to per-epoch
            lists. ``'loss'`` and ``'val_loss'`` are required. Accuracy is read
            from ``'acc'``/``'val_acc'`` or, when ``'acc'`` is absent, from
            ``'accuracy'``/``'val_accuracy'``. ``'lr'`` is optional.

    Returns:
        None. Every available curve group is drawn in its own 10x5 figure;
        groups whose keys are missing are skipped.

    Raises:
        KeyError: If ``'loss'`` or ``'val_loss'`` is missing.
    """
    import matplotlib.pyplot as plt

    history_dict = hist.history
    loss_values = history_dict['loss']
    val_loss_values = history_dict['val_loss']
    epochs = range(1, len(loss_values) + 1)

    def _draw(title, ylabel, curves):
        # One figure per metric group; `curves` holds (values, line format, legend label).
        plt.figure(figsize=(10, 5))
        for values, fmt, label in curves:
            plt.plot(epochs, values, fmt, label=label)
        plt.title(title)
        plt.xlabel('Epochs')
        plt.ylabel(ylabel)
        plt.legend()
        plt.show()

    _draw('Training and validation loss', 'Loss', [
        (loss_values, 'r', 'Training Loss'),
        (val_loss_values, 'b', 'Validation Loss'),
    ])

    # The accuracy metric is logged as 'acc' by some Keras versions and 'accuracy' by others.
    acc_key = 'acc' if 'acc' in history_dict else 'accuracy'
    if acc_key in history_dict:
        acc_curves = [(history_dict[acc_key], 'r', 'Training accuracy')]
        val_acc_key = 'val_' + acc_key
        if val_acc_key in history_dict:
            acc_curves.append((history_dict[val_acc_key], 'b', 'Validation accuracy'))
        _draw('Training and validation accuracy', 'Accuracy', acc_curves)

    # 'lr' is only present when a callback recorded the learning rate.
    if 'lr' in history_dict:
        _draw('Learning Rate', 'Learning Rate', [
            (history_dict['lr'], 'r', 'Learning Rate'),
        ])
    return
# Plot the curves for cnn1; plot_curves reads the per-epoch lists from cnn1.history.history.
plot_curves(cnn1.history)


In [None]:
# Second network: SeparableConv2D with ReLU + BatchNormalization and SpatialDropout,
# using Flatten instead of GlobalAveragePooling before the dense head.

# Draw a new train/dev split (random_state=14); this rebinds the global split arrays.
train_images,dev_images,train_labels,dev_labels = getdata(14)

cnn2 = models.Sequential([
    # 28x28
    layers.SeparableConv2D(32, (3, 3), activation='relu', padding='same', input_shape=(28, 28, 1)),
    layers.BatchNormalization(),
    layers.SeparableConv2D(32, (3, 3), activation='relu', padding='same'),
    layers.BatchNormalization(),
    layers.SeparableConv2D(32, (3, 3), strides=1, activation='relu', padding='same'),
    layers.BatchNormalization(),
    layers.MaxPooling2D((2, 2)),
    layers.SpatialDropout2D(0.3),
    # 14x14
    layers.SeparableConv2D(64, (3, 3), activation='relu', padding='same'),
    layers.BatchNormalization(),
    layers.SeparableConv2D(64, (3, 3), activation='relu', padding='same'),
    layers.BatchNormalization(),
    layers.SeparableConv2D(128, (3, 3), strides=1, activation='relu', padding='same'),
    layers.BatchNormalization(),
    layers.MaxPooling2D((2, 2)),
    layers.SpatialDropout2D(0.4),
    # 7x7
    layers.SeparableConv2D(128, (3, 3), activation='relu', padding='same'),
    layers.BatchNormalization(),
    layers.MaxPooling2D((2, 2)),
    layers.SpatialDropout2D(0.4),
    # Final conv uses the default padding (no padding= argument given)
    layers.SeparableConv2D(512, (3, 3), activation='relu'),
    layers.BatchNormalization(),
    # Convert the 3D feature maps to a 1D vector
    # (GlobalAveragePooling2D would be the alternative, see
    # https://github.com/keras-team/keras/issues/8470)
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(10, activation='softmax'),
])

opt = optimizers.Adam(lr=1e-2, beta_1=0.9, beta_2=0.999, decay=0.0)
cnn2.compile(loss='categorical_crossentropy',
             optimizer=opt,
             metrics=['accuracy'])
cnn2.summary()
cnn2.count_params()  # author's note: about 3.5x the size, hoped to perform better

In [None]:
# Train cnn2 with the same generator and callback setup as cnn1, but for up to 100 epochs.
callbacks_list = callback()  # fresh callback instances for this run
cnn2_batches = datagen.flow(train_images, train_labels, batch_size=bs)
cnn2_fit_options = dict(
    steps_per_epoch=train_images.shape[0] // bs,
    epochs=100,
    callbacks=callbacks_list,
    validation_data=(dev_images, dev_labels),
    validation_steps=dev_images.shape[0] // bs,
)
cnn2.fit_generator(cnn2_batches, **cnn2_fit_options)

In [None]:
# Restore the checkpoint written by ModelCheckpoint (best val_loss) and evaluate it on the dev split.
cnn2.load_weights('my_model.h5') # checkpoint path configured in callback()
dev_loss, dev_acc = cnn2.evaluate(dev_images, dev_labels)
print('dev_acc:', dev_acc)
# Persist cnn2 under its own file names, then delete the shared checkpoint file
# (presumably so the next model's run does not pick up a stale my_model.h5).
cnn2.save_weights('cnn2_weights.h5')
cnn2.save('cnn2_model.h5')
os.remove ('my_model.h5')

In [None]:
# Plot the curves for cnn2; plot_curves reads the per-epoch lists from cnn2.history.history.
plot_curves(cnn2.history)

In [None]:
# Third network: same building blocks as cnn2 but mixing 'same' and 'valid' padding,
# GlobalAveragePooling before the dense head, and RMSprop as the optimizer.

# Draw a new train/dev split (random_state=1564); this rebinds the global split arrays.
train_images,dev_images,train_labels,dev_labels = getdata(1564)
cnn3=models.Sequential()
# 28x28 input; try padding="same"/"valid" and check the shapes in the summary table
cnn3.add(layers.SeparableConv2D(32,(3,3),activation='relu',padding='same',input_shape=(28,28,1)))
cnn3.add(layers.BatchNormalization())
cnn3.add(layers.SeparableConv2D(32,(3,3),activation='relu',padding="valid"))  # 28 -> 26
cnn3.add(layers.BatchNormalization())
cnn3.add(layers.SeparableConv2D(64,(3,3),strides=1, activation='relu',padding="same"))
cnn3.add(layers.BatchNormalization())
cnn3.add(layers.MaxPooling2D((2,2)))  # 26 -> 13
cnn3.add(layers.SpatialDropout2D(0.4))
cnn3.add(layers.SeparableConv2D(128,(3,3),activation='relu',padding="valid"))  # 13 -> 11
cnn3.add(layers.BatchNormalization())
cnn3.add(layers.SeparableConv2D(96,(3,3),activation='relu',padding="same"))
cnn3.add(layers.BatchNormalization())
cnn3.add(layers.SeparableConv2D(64,(3,3),strides=1, activation='relu',padding="same"))
cnn3.add(layers.BatchNormalization())
cnn3.add(layers.MaxPooling2D((2,2)))  # 11 -> 5
cnn3.add(layers.SpatialDropout2D(0.4))
cnn3.add(layers.SeparableConv2D(256,(3,3),activation='relu',padding="valid"))  # 5 -> 3
cnn3.add(layers.BatchNormalization())
cnn3.add(layers.MaxPooling2D((2,2)))  # 3 -> 1
cnn3.add(layers.SpatialDropout2D(0.4))

cnn3.add(layers.GlobalAveragePooling2D())
cnn3.add(layers.Dense(128,activation='relu'))
cnn3.add(layers.Dense(10,activation='softmax'))
# Single assignment: the previous chained `opt=optimizer=...` also created an
# unused global named `optimizer`.
opt=optimizers.RMSprop(lr=1e-3)
cnn3.compile(optimizer=opt,
                loss='categorical_crossentropy',
                metrics=['accuracy'])
cnn3.summary()
cnn3.count_params()

In [None]:
# Train cnn3 with the same generator and callback setup as the other networks (up to 50 epochs).
callbacks_list = callback()  # fresh callback instances for this run
cnn3_batches = datagen.flow(train_images, train_labels, batch_size=bs)
cnn3_fit_options = dict(
    steps_per_epoch=train_images.shape[0] // bs,
    epochs=50,
    callbacks=callbacks_list,
    validation_data=(dev_images, dev_labels),
    validation_steps=dev_images.shape[0] // bs,
)
cnn3.fit_generator(cnn3_batches, **cnn3_fit_options)

In [None]:
# Restore the checkpoint written by ModelCheckpoint (best val_loss) into cnn3.
cnn3.load_weights('my_model.h5') # checkpoint path configured in callback()
# Evaluate cnn3 itself, i.e. the model whose checkpoint was just restored.
dev_loss, dev_acc = cnn3.evaluate(dev_images, dev_labels)
print('dev_acc:', dev_acc)
# Persist cnn3 under its own file names, then delete the shared checkpoint file.
cnn3.save_weights('cnn3_weights.h5')
cnn3.save('cnn3_model.h5')
os.remove ('my_model.h5')

In [None]:
# First load the 3 best results from all models, see cnn3 code for how to use Keras to automatically save the best one
cnn1.load_weights('cnn1_weights.h5')
cnn2.load_weights('cnn2_weights.h5')
cnn3.load_weights('cnn3_weights.h5')

models=[cnn1,cnn2,cnn3]

In [None]:
def ensemble(models, model_input):
    # collect all the outputs from the networks
    outputs = [model(model_input) for model in models] 
    # averaging outputs
    outputsAvg = layers.Average()(outputs)    
    modelEns = keras.models.Model(inputs=model_input,outputs=outputsAvg, name='ensemble')    
    return modelEns
  
def evaluate_error(model):
    pred = model.predict(test_images, batch_size = 60)
    pred = np.argmax(pred, axis=1)
    pred = np.expand_dims(pred, axis=1) # make same sha
    error = np.sum(np.not_equal(pred, test_labels)) / test_labels.shape[0]    
    return error

#All networks in the ensemble must have the same input shape (our case 28,28,1)
model_input = keras.models.Input(shape=models[0].input_shape[1:])  
modelEns = ensemble(models, model_input)

# We need to compile it - just to ignore a error when loading it :) 
modelEns.compile(optimizer='Adam',
                loss='categorical_crossentropy',
                metrics=['accuracy'])
modelEns.summary()
modelEns.save("MyEnsemble.h5")


In [None]:
# Reload the ensemble from MyEnsemble.h5; this replaces the in-memory `modelEns`.
from keras.models import load_model
modelEns=keras.models.load_model("MyEnsemble.h5")


In [None]:
# Evaluate the averaged ensemble on the current dev split.
# NOTE(review): dev_images/dev_labels here come from the most recent getdata() call
# (random_state=1564), while cnn1 and cnn2 were trained on splits drawn with other
# seeds (12 and 14). This dev set may therefore contain images those two networks
# were trained on, so the reported accuracy is likely optimistic.
test_loss, test_acc = modelEns.evaluate(dev_images, dev_labels)
print('Ensemble test_acc:', test_acc)  # Compare this result to all the individual ones. 
# averageing is one type of ensemble 

In [None]:
# predict results
results = modelEns.predict(org_test_images)

In [None]:
results = np.argmax(results,axis = 1)

In [None]:
results = pd.Series(results,name="Label")

In [None]:
print (results.shape)

In [None]:
submission = pd.concat([pd.Series(range(1,28001),name = "ImageId"),results],axis = 1)

submission.to_csv("test_labels.csv",index=False)