### An example using both an attention layer and a capsule layer in a GRU network for insincere text classification

##### **Acknowledgements**:
* [Shujian Liu](https://www.kaggle.com/shujian)
* [SRK](https://www.kaggle.com/sudalairajkumar/)
* [Dieter](https://www.kaggle.com/christofhenkel)
* [Khoi Nguyen](https://www.kaggle.com/suicaokhoailang)
* [Shujian Liu](https://www.kaggle.com/shujian/single-rnn-with-4-folds-clr)

In [None]:
%matplotlib inline

import os
import time
import numpy as np 
import pandas as pd
import gc
import tensorflow as tf
import pickle
import glob
import itertools

np.random.seed(7418880)
tf.set_random_seed(7418880)
from collections import defaultdict

from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.model_selection import  StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score


from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.engine.topology import Layer
from keras.callbacks import *
from keras.layers import *
from keras.models import *
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.initializers import *
from keras.optimizers import *
import keras.backend as K
import tensorflow as tf
import os
import time
import gc
import re
from unidecode import unidecode

import matplotlib
import matplotlib.pyplot as plt


In [None]:
# Registry of the pre-trained embedding files, keyed by a short embedding name.
# Every entry carries the same three fields:
#   'path'   - location of the embedding file
#   'format' - layout label of the file ('word2vec', 'glove' or '')
#   'binary' - binary flag of the file (note that 'glove' stores '' here)
embedding_path_dict = {
    'googlenews': {
        'path': '../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin',
        'format': 'word2vec',
        'binary': True,
    },
    'glove': {
        'path': '../input/embeddings/glove.840B.300d/glove.840B.300d.txt',
        'format': 'glove',
        'binary': '',
    },
    'glove_word2vec': {
        'path': '../input/glove.840B.300d.txt.word2vec',
        'format': 'word2vec',
        'binary': False,
    },
    'wiki': {
        'path': '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec',
        'format': 'word2vec',
        'binary': False,
    },
    'paragram': {
        'path': '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt',
        'format': '',
        'binary': False,
    },
}

In [None]:
#https://www.kaggle.com/danbrice/keras-plot-history-full-report-and-grid-search
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map and show it.

    :params cm: 2-D confusion matrix (rows are true labels, columns are
                predicted labels, matching the axis labels drawn below),
            classes: tick labels, one per class,
            normalize: when True every row is divided by its row sum and the
                cells are printed with two decimals; otherwise the cells are
                printed with the integer format 'd',
            cmap: matplotlib colour map used for the heat map.
    :return None; the figure is displayed with plt.show().
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
    title = 'Normalized confusion matrix' if normalize else 'Confusion matrix'
    cell_format = '.2f' if normalize else 'd'

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # Cells darker than half of the maximum get white text so the number
    # stays readable on top of the colour map.
    midpoint = cm.max() / 2.
    for row in range(cm.shape[0]):
        for col in range(cm.shape[1]):
            text_colour = "white" if cm[row, col] > midpoint else "black"
            plt.text(col, row, format(cm[row, col], cell_format),
                     horizontalalignment="center",
                     color=text_colour)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
    
## multiclass or binary report
## If binary (sigmoid output), set binary parameter to True
def full_multiclass_report(model,
                           x,
                           y_true,
                           classes,
                           batch_size=32,
                           binary=False):
    """Print accuracy, a classification report and the confusion matrix.

    :params model: object exposing `predict_classes(x, batch_size=...)`,
            x: inputs handed to the model,
            y_true: reference labels; unless `binary` is True they are
                collapsed to class numbers with an argmax over axis 1,
            classes: tick labels forwarded to plot_confusion_matrix,
            batch_size: batch size used for prediction,
            binary: set to True when `y_true` already holds class numbers
                (sigmoid output).
    :return None; everything is printed or plotted.
    """
    # One-hot targets become class numbers; binary targets pass through.
    labels = y_true if binary else np.argmax(y_true, axis=1)

    predicted = model.predict_classes(x, batch_size=batch_size)

    print("Accuracy : "+ str(accuracy_score(labels, predicted)))
    print("")

    print("Classification Report")
    print(classification_report(labels, predicted, digits=5))

    matrix = confusion_matrix(labels, predicted)
    print(matrix)
    plot_confusion_matrix(matrix, classes=classes)

In [None]:
# Get word embeddings
def get_embeddings(embedding_path_dict, emb_name):
    """Load one pre-trained embedding.

    :params embedding_path_dict: a dictionary containing the path, binary flag, and format of the desired embedding,
            emb_name: the name of the embedding to retrieve; one of
                'googlenews', 'glove', 'wiki' or 'paragram'
    :return embedding index: for 'glove', 'wiki' and 'paragram' a dictionary
            mapping each word to a float32 numpy vector
    :raises ValueError: if `emb_name` is not one of the supported names
    """

    def get_coefs(word, *arr):
        # First token of a line is the word, the rest are its coefficients.
        return word, np.asarray(arr, dtype='float32')

    # Fail early with a clear message instead of reaching the end of the
    # function with no result bound.
    if emb_name not in ('googlenews', 'glove', 'wiki', 'paragram'):
        raise ValueError("Unsupported embedding name: {!r}".format(emb_name))

    emb_path = embedding_path_dict[emb_name]['path']

    if emb_name == 'googlenews':
        bin_flag = embedding_path_dict[emb_name]['binary']
        # NOTE(review): `kv` is not defined or imported in the visible source
        # (presumably gensim's KeyedVectors - confirm and import it).
        # NOTE(review): this branch returns the `.vectors` attribute rather
        # than a word -> vector dict like the other branches; confirm that
        # callers relying on `.get(word)` / `.values()` can handle it.
        return kv.load_word2vec_format(emb_path, binary=bin_flag).vectors

    if emb_name in ('glove', 'wiki'):
        # Lines of 100 characters or fewer are skipped (e.g. a short header).
        with open(emb_path, encoding="utf8") as emb_file:
            return dict(get_coefs(*line.split(" "))
                        for line in emb_file if len(line) > 100)

    # 'paragram': undecodable bytes are dropped instead of raising.
    with open(emb_path, encoding="utf8", errors='ignore') as emb_file:
        return dict(get_coefs(*line.split(" ")) for line in emb_file)

#Convert GLoVe format into word2vec format
def glove_to_word2vec(embedding_path_dict, emb_name='glove', output_emb='glove_word2vec'):
    """
    Convert a GloVe-format embedding file into word2vec format.
    :params embedding_path_dict: a dictionary containing the path, binary flag, and format of each embedding,
            emb_name: key of the GloVe embedding whose 'path' is the input file,
            output_emb: key of the entry whose 'path' is the output file.
    :return whatever the glove2word2vec call returns
    """
    # NOTE(review): `glove2word2vec` is not imported in the visible source -
    # presumably it comes from gensim; confirm before using this helper.
    source_path, target_path = (
        embedding_path_dict[key]['path'] for key in (emb_name, output_emb)
    )
    return glove2word2vec(source_path, target_path)

Let us load the train and test sets. Set max_words to be 60000 and max_len to be 70.

In [None]:
# Read the competition train/test CSV files and report their shapes.
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")
print("Train shape : ", train_df.shape)
print("Test_shape : ", test_df.shape)

# Pull out the raw question strings (missing values become "_na_") and the
# training targets as numpy arrays.
train_X = train_df["question_text"].fillna("_na_").values
train_y = train_df['target'].values
test_X = test_df["question_text"].fillna("_na_").values

# The training frame is no longer needed once its columns are extracted;
# test_df is kept because its "qid" column is read when writing the submission.
del train_df
import gc
gc.collect()

embed_size = 300  # width of each embedding vector fed to the Embedding layer
max_words = 60000 # number of unique words
max_len = 70  # sequences are padded to this many tokens
neg_perc = 1  # NOTE(review): not referenced anywhere else in the visible source


In [None]:
# Peek at the first ten raw question strings before they are tokenized.
train_X[:10]

Tokenize the sentences

In [None]:
# Time the whole tokenization step.
start_time = time.time()

# The tokenizer is fitted on the training questions only; both sets are then
# converted to integer sequences with that same fitted tokenizer.
tokenizer = Tokenizer(num_words=max_words,lower=True)
tokenizer.fit_on_texts(list(train_X))
train_X=tokenizer.texts_to_sequences(train_X)
test_X=tokenizer.texts_to_sequences(test_X)

# Bring every sequence to exactly max_len entries.
train_X = pad_sequences(train_X, maxlen=max_len)
test_X = pad_sequences(test_X, maxlen=max_len)

print (train_X.shape, test_X.shape)
print("Total time for tokenizing = {:.0f} s".format(time.time()-start_time))

Tokenizing takes about 65 s on average in my experience. These numbers are interesting to know, since we have a 7200 s time limit for this competition.

In [None]:
def add_embed_to_matrix(embed_list, word_index, nb_words):
    """Build the element-wise SUM of several embedding matrices.

    For every embedding named in `embed_list` a (nb_words, embed_size) matrix
    is created: rows start as random normal draws using that embedding's
    overall mean and standard deviation, and the row of every word found in
    the embedding is replaced by its pre-trained vector. The per-embedding
    matrices are added together, so dividing the result by `len(embed_list)`
    gives the average embedding.

    Previously a found word's row in the shared matrix was overwritten by each
    embedding in turn, so only the last embedding containing the word survived
    and a later division by the number of embeddings merely shrank it.

    :params embed_list: names of the embeddings to combine (keys understood
                by get_embeddings),
            word_index: mapping word -> integer row index,
            nb_words: number of rows of the matrix; words whose index is
                >= nb_words are ignored.
    :return the summed matrix, or None when `embed_list` is empty
    """
    import gc

    embedding_matrix = None
    for embedding_name in embed_list:
        embeddings_index = get_embeddings(embedding_path_dict, embedding_name)

        # np.stack needs a real sequence, not a dict view.
        all_embs = np.stack(list(embeddings_index.values()))
        emb_mean, emb_std = all_embs.mean(), all_embs.std()
        embed_size = all_embs.shape[1]
        del all_embs

        # Random rows stand in for words this embedding does not contain.
        contribution = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
        for word, i in word_index.items():
            if i >= nb_words:
                continue
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                contribution[i] = embedding_vector

        if embedding_matrix is None:
            embedding_matrix = contribution
        else:
            embedding_matrix += contribution

        # Only one embedding index is held in memory at a time.
        del embeddings_index, contribution
        gc.collect()

    return embedding_matrix
        
# Time the construction of the embedding matrix.
start_time = time.time()

# Vocabulary learned by the tokenizer; the matrix gets at most max_words rows.
word_index = tokenizer.word_index
nb_words = min(max_words, len(word_index))
embedding_matrix = None

# Combine the listed embeddings, then divide by how many were requested.
embeddings_list=['glove','wiki']
embedding_matrix = add_embed_to_matrix(embeddings_list, word_index, nb_words)
embedding_matrix = embedding_matrix/len(embeddings_list)
        
print("Total time for embedding = {:.0f} s".format(time.time()-start_time))

Let us define the model now. Nothing fancy: a GRU/LSTM bilayer concatenated with an LSTM/GRU layer (don't ask me why), with a few pooling, dense and dropout layers thrown into the mix. Based on SRK's kernel.

In [None]:
def squash(x, axis=-1):
    """Divide `x` by its L2 norm taken along `axis`.

    K.epsilon() is added under the square root so an all-zero vector does not
    cause a division by zero. This is a plain normalisation; the alternative
    scaling sqrt(s) / (0.5 + s) * x, with s the squared norm, is deliberately
    not used here (the squared norm is noted to be really small).
    """
    return x / K.sqrt(K.sum(K.square(x), axis, keepdims=True) + K.epsilon())

# A Capsule Implement with Pure Keras
class Capsule(Layer):
    """Capsule layer with iterative routing, written against the Keras backend.

    Each input vector is projected into `num_capsule * dim_capsule` values,
    and `routings` rounds of routing combine the projections into
    `num_capsule` output vectors of size `dim_capsule`.

    # Arguments
        num_capsule: number of output capsules.
        dim_capsule: size of each output capsule vector.
        routings: number of routing iterations run in `call`.
        kernel_size: stored on the instance; not used by the visible code
            (its only use is commented out in `build`).
        share_weights: if True one projection kernel is shared by all input
            positions (applied with `K.conv1d`); otherwise each input
            position gets its own kernel (applied with `K.local_conv1d`).
        activation: 'default' selects `squash`; any other value is wrapped
            in a Keras `Activation` layer.
    """
    def __init__(self, num_capsule, dim_capsule, routings=3, kernel_size=(9, 1), share_weights=True,
                 activation='default', **kwargs):
        super(Capsule, self).__init__(**kwargs)
        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.routings = routings
        self.kernel_size = kernel_size
        self.share_weights = share_weights
        if activation == 'default':
            self.activation = squash
        else:
            self.activation = Activation(activation)

    def build(self, input_shape):
        """Create the projection kernel `self.W` from the input shape."""
        super(Capsule, self).build(input_shape)
        input_dim_capsule = input_shape[-1]
        if self.share_weights:
            # A single kernel of width 1, shared across all input positions.
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(1, input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     # shape=self.kernel_size,
                                     initializer='glorot_uniform',
                                     trainable=True)
        else:
            # One kernel per input position (second-to-last input dimension).
            input_num_capsule = input_shape[-2]
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(input_num_capsule,
                                            input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     initializer='glorot_uniform',
                                     trainable=True)

    def call(self, u_vecs):
        """Project the inputs and run `self.routings` routing iterations.

        Returns the activated capsule outputs of the last iteration.
        """
        if self.share_weights:
            u_hat_vecs = K.conv1d(u_vecs, self.W)
        else:
            u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

        # Dynamic (run-time) sizes, so the reshape works for any batch size.
        batch_size = K.shape(u_vecs)[0]
        input_num_capsule = K.shape(u_vecs)[1]
        u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
                                            self.num_capsule, self.dim_capsule))
        u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
        # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]

        # Routing logits start at zero.
        b = K.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, num_capsule, input_num_capsule]
        for i in range(self.routings):
            # Softmax acts on the last axis, so b is permuted to put the
            # num_capsule axis last, and both c and b are permuted back after.
            b = K.permute_dimensions(b, (0, 2, 1))  # shape = [None, input_num_capsule, num_capsule]
            c = K.softmax(b)
            c = K.permute_dimensions(c, (0, 2, 1))
            b = K.permute_dimensions(b, (0, 2, 1))
            # NOTE(review): batch_dot comes from tf.keras.backend while the
            # rest of the layer uses K - presumably deliberate; confirm.
            outputs = self.activation(tf.keras.backend.batch_dot(c, u_hat_vecs, [2, 2]))
            if i < self.routings - 1:
                # The logits are replaced (not incremented) by the agreement
                # between the current outputs and the projections; skipped on
                # the last iteration because b is not needed afterwards.
                b = tf.keras.backend.batch_dot(outputs, u_hat_vecs, [2, 3])

        return outputs

    def compute_output_shape(self, input_shape):
        """Return (None, num_capsule, dim_capsule)."""
        return (None, self.num_capsule, self.dim_capsule)
    

def f1_smart(y_true, y_pred):
    """Find the best achievable F1 and its threshold in one sorted pass.

    Samples are sorted by predicted score. For every cut between two
    neighbouring samples (everything above the cut is predicted positive)
    half of the F1 score is TP / (predicted positives + actual positives).

    :params y_true: numpy array of 0/1 labels,
            y_pred: numpy array of predicted scores, same length.
    :return (best F1, threshold), the threshold being the midpoint of the
            two scores that surround the best cut.
    """
    order = np.argsort(y_pred)
    n_pos = y_true.sum()
    n_samples = y_true.shape[0]

    # Positives that fall at or below each candidate cut are lost.
    positives_below = np.cumsum(y_true[order[:-1]])
    # predicted positives + actual positives for each candidate cut.
    denominators = np.arange(n_samples + n_pos - 1, n_pos, -1)
    half_f1 = (n_pos - positives_below) / denominators

    best = np.argmax(half_f1)
    threshold = (y_pred[order[best]] + y_pred[order[best + 1]]) / 2
    return 2 * half_f1[best], threshold

In [None]:
# https://www.kaggle.com/suicaokhoailang/lstm-attention-baseline-0-652-lb

class Attention(Layer):
    """Attention-weighted sum over axis 1 of a 3-D input.

    A score is computed for every step as tanh(x . W + b), exponentiated,
    optionally masked, and normalised by the sum over the steps; the output
    is the sum over axis 1 of the input weighted by those values.

    # Arguments
        step_dim: number of steps (size of axis 1 of the input); used to
            reshape the scores.
        W_regularizer, b_regularizer: regularizers for the weight and bias.
        W_constraint, b_constraint: constraints for the weight and bias.
        bias: whether to add a per-step bias to the scores.

    # Output shape
        (input_shape[0], features_dim), where features_dim is the size of
        the last input axis.
    """
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0  # filled in by build()
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the scoring weight (and optional bias) for a 3-D input."""
        assert len(input_shape) == 3

        # `shape` is passed by keyword, matching the Capsule layer, instead
        # of relying on a positional shape in the `name` slot.
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            # One bias value per step.
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return None: no mask is propagated to the following layers."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over axis 1."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Flatten to 2-D for the dot product, then restore one score per step.
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # Masked steps get zero weight before normalisation.
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # epsilon keeps the division safe when every weight is zero.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Return (input_shape[0], features_dim)."""
        return input_shape[0],  self.features_dim

In [None]:
# https://www.kaggle.com/hireme/fun-api-keras-f1-metric-cyclical-learning-rate/code

class CyclicLR(Callback):
    """This callback implements a cyclical learning rate policy (CLR).
    The method cycles the learning rate between two boundaries with
    some constant frequency, as detailed in this paper (https://arxiv.org/abs/1506.01186).
    The amplitude of the cycle can be scaled on a per-iteration or 
    per-cycle basis.
    This class has three built-in policies, as put forth in the paper.
    "triangular":
        A basic triangular cycle w/ no amplitude scaling.
    "triangular2":
        A basic triangular cycle that scales initial amplitude by half each cycle.
    "exp_range":
        A cycle that scales initial amplitude by gamma**(cycle iterations) at each 
        cycle iteration.
    For more detail, please see paper.
    
    # Example
        ```python
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., mode='triangular')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```
    
    Class also supports custom scaling functions:
        ```python
            clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.))
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., scale_fn=clr_fn,
                                scale_mode='cycle')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```    
    # Arguments
        base_lr: initial learning rate which is the
            lower boundary in the cycle.
        max_lr: upper boundary in the cycle. Functionally,
            it defines the cycle amplitude (max_lr - base_lr).
            The lr at any cycle is the sum of base_lr
            and some scaling of the amplitude; therefore 
            max_lr may not actually be reached depending on
            scaling function.
        step_size: number of training iterations per
            half cycle. Authors suggest setting step_size
            2-8 x training iterations in epoch.
        mode: one of {triangular, triangular2, exp_range}.
            Default 'triangular'.
            Values correspond to policies detailed above.
            If scale_fn is not None, this argument is ignored.
        gamma: constant in 'exp_range' scaling function:
            gamma**(cycle iterations)
        scale_fn: Custom scaling policy defined by a single
            argument lambda function, where 
            0 <= scale_fn(x) <= 1 for all x >= 0.
            mode parameter is ignored 
        scale_mode: {'cycle', 'iterations'}.
            Defines whether scale_fn is evaluated on 
            cycle number or cycle iterations (training
            iterations since start of cycle). Default is 'cycle'.

    # Raises
        ValueError: if scale_fn is None and mode is not one of the
            three built-in policies.

    # Note
        The iteration counter is kept between fit() calls: on_train_begin
        only restarts from base_lr when no iteration has been counted yet.
        Use a new instance (or call _reset()) to restart the cycle.
    """

    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super(CyclicLR, self).__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn is None:
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1/(2.**(x-1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma**(x)
                self.scale_mode = 'iterations'
            else:
                # Without this, scale_fn would stay unset and the first call
                # to clr() would fail with an unrelated AttributeError.
                raise ValueError(
                    "Unknown mode {!r}; expected 'triangular', 'triangular2' "
                    "or 'exp_range', or pass scale_fn".format(mode))
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        self.history = {}

        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Resets cycle iterations.
        Optional boundary/step size adjustment.
        """
        if new_base_lr is not None:
            self.base_lr = new_base_lr
        if new_max_lr is not None:
            self.max_lr = new_max_lr
        if new_step_size is not None:
            self.step_size = new_step_size
        self.clr_iterations = 0.
        
    def clr(self):
        """Return the learning rate for the current cycle iteration."""
        # cycle: 1-based index of the current full cycle (2 * step_size long).
        cycle = np.floor(1+self.clr_iterations/(2*self.step_size))
        # x: distance from the peak of the cycle, 0 at the peak and 1 at
        # either end, so (1 - x) is the triangular wave.
        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
        if self.scale_mode == 'cycle':
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(cycle)
        else:
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(self.clr_iterations)
        
    def on_train_begin(self, logs=None):
        """Set the optimizer's learning rate when training starts."""
        logs = logs or {}

        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            # Training is being resumed: continue from the current position.
            K.set_value(self.model.optimizer.lr, self.clr())        
            
    def on_batch_end(self, epoch, logs=None):
        """Record the batch in `history` and set the next learning rate.

        The first positional argument is named `epoch` for backward
        compatibility; it is not used.
        """
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1

        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)
        
        K.set_value(self.model.optimizer.lr, self.clr())
    

def f1(y_true, y_pred):
    '''
    Batch-wise F1 metric built from backend ops.

    Adapted from
    https://stackoverflow.com/questions/43547402/how-to-calculate-f1-macro-in-keras

    Precision and recall are computed on the current batch only, from values
    clipped to [0, 1] and rounded; K.epsilon() guards every division.
    '''
    def _rounded_count(values):
        # Number of entries that round to 1 after clipping to [0, 1].
        return K.sum(K.round(K.clip(values, 0, 1)))

    hits = _rounded_count(y_true * y_pred)

    # recall: share of the actual positives that were predicted positive
    batch_recall = hits / (_rounded_count(y_true) + K.epsilon())
    # precision: share of the predicted positives that are actual positives
    batch_precision = hits / (_rounded_count(y_pred) + K.epsilon())

    return 2*((batch_precision*batch_recall)/(batch_precision+batch_recall+K.epsilon()))

In [None]:
def capsule(embedding_matrix, nb_words):
    """Build and compile the GRU + Capsule classifier.

    The Keras session is cleared first. The network is: frozen embedding ->
    spatial dropout -> bidirectional CuDNNGRU -> Capsule -> flatten ->
    dense(100, relu) -> dropout -> batch normalisation -> sigmoid unit.
    Input length and embedding width come from the module-level `max_len`
    and `embed_size`.

    :params embedding_matrix: weights for the (non-trainable) Embedding layer,
            nb_words: vocabulary size of the Embedding layer.
    :return the model, compiled with binary cross-entropy and Adam
    """
    K.clear_session()

    tokens = Input(shape=(max_len,))
    embedded = Embedding(nb_words, embed_size,
                         weights=[embedding_matrix], trainable=False)(tokens)
    embedded = SpatialDropout1D(rate=0.2)(embedded)

    gru = CuDNNGRU(100, return_sequences=True,
                   kernel_initializer=glorot_normal(seed=12300),
                   recurrent_initializer=orthogonal(gain=1.0, seed=10000))
    encoded = Bidirectional(gru)(embedded)

    capsules = Capsule(num_capsule=10, dim_capsule=10, routings=4,
                       share_weights=True)(encoded)
    features = Flatten()(capsules)

    hidden = Dense(100, activation="relu",
                   kernel_initializer=glorot_normal(seed=12300))(features)
    hidden = Dropout(0.12)(hidden)
    hidden = BatchNormalization()(hidden)

    probability = Dense(1, activation="sigmoid")(hidden)

    model = Model(inputs=tokens, outputs=probability)
    model.compile(loss='binary_crossentropy', optimizer=Adam())
    return model

In [None]:
def model_lstm_atten_capsule(embedding_matrix, nb_words):
    """Build and compile the LSTM/GRU model with attention and capsule heads.

    A frozen embedding feeds a bidirectional CuDNNLSTM, whose output feeds a
    bidirectional CuDNNGRU. Four feature vectors are concatenated:
      * the flattened Capsule output computed on the LSTM sequence,
      * Attention over the GRU sequence,
      * global average pooling of the GRU sequence,
      * global max pooling of the GRU sequence.
    A dense(16, relu) layer, dropout and a sigmoid unit produce the output.
    Input length and embedding width come from the module-level `max_len`
    and `embed_size`.

    :params embedding_matrix: weights for the (non-trainable) Embedding layer,
            nb_words: vocabulary size of the Embedding layer.
    :return the model, compiled with binary cross-entropy, 'adam' and the
            `f1` metric
    """
    tokens = Input(shape=(max_len,))
    embedded = Embedding(nb_words, embed_size,
                         weights=[embedding_matrix], trainable=False)(tokens)
    # (a SpatialDropout1D(0.1) on the embedding is intentionally left out)

    lstm_seq = Bidirectional(CuDNNLSTM(40, return_sequences=True))(embedded)
    gru_seq = Bidirectional(CuDNNGRU(40, return_sequences=True))(lstm_seq)

    # (no attention skip connection on the LSTM sequence; GRU sequence only)
    gru_attention = Attention(max_len)(gru_seq)

    lstm_capsules = Capsule(num_capsule=10, dim_capsule=10, routings=4,
                            share_weights=True)(lstm_seq)
    capsule_features = Flatten()(lstm_capsules)

    gru_avg = GlobalAveragePooling1D()(gru_seq)
    gru_max = GlobalMaxPooling1D()(gru_seq)

    merged = concatenate([capsule_features, gru_attention, gru_avg, gru_max])
    merged = Dense(16, activation="relu")(merged)
    merged = Dropout(0.3)(merged)
    probability = Dense(1, activation="sigmoid")(merged)

    model = Model(inputs=tokens, outputs=probability)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[f1])

    return model

Let us train this model using 5-fold stratified validation. Through trial and error, running 2 epochs seemed like a reasonable choice for this model, to prevent overfitting (note that the cell below currently sets `n_epochs = 5`).

In [None]:
# Train one model per stratified fold, keeping for every fold the training
# history, the validation predictions (with their labels) and the test
# predictions.
start_time = time.time()

uid = 1
version = 1
n_splits = 5
n_epochs = 5
batch_size = 1024

skf = StratifiedKFold(n_splits=n_splits, random_state=7418880, shuffle=True)
val_preds = defaultdict(list)
test_preds = {}
historyD = {}
for ii, (train_index, val_index) in enumerate(skf.split(train_X, train_y)):

    split_start_time = time.time()
    fold_key = 'fold{}'.format(ii + 1)

    X_train, X_val = train_X[train_index], train_X[val_index]
    y_train, y_val = train_y[train_index], train_y[val_index]

    # A fresh scheduler for every fold: CyclicLR keeps its iteration counter
    # between fit() calls (it only restarts from base_lr when the counter is
    # zero), so one shared instance would make every later fold start
    # mid-cycle with an already decayed 'triangular2' amplitude.
    clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                   step_size=2000., mode='triangular2')

    #model = capsule(embedding_matrix, nb_words)
    model = model_lstm_atten_capsule(embedding_matrix, nb_words)
    # The builder compiles with the f1 metric; recompiling swaps in accuracy
    # so that 'acc' / 'val_acc' are available in the history plotted below.
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    hist = model.fit(X_train, y_train, batch_size=batch_size, epochs=n_epochs,
                     validation_data=(X_val, y_val), callbacks=[clr], verbose=True)

    historyD[fold_key] = hist.history

    pred_val_y = model.predict([X_val], batch_size=batch_size, verbose=1)
    val_preds[fold_key] = [pred_val_y.ravel(), y_val]

    pred_test_y = model.predict([test_X], batch_size=batch_size, verbose=1)
    test_preds[fold_key] = pred_test_y.ravel()

    # Training curves: accuracy first, then loss.
    for train_key, val_key, plot_title, y_label in (
            ('acc', 'val_acc', 'model accuracy', 'accuracy'),
            ('loss', 'val_loss', 'model loss', 'loss')):
        plt.plot(hist.history[train_key])
        plt.plot(hist.history[val_key])
        plt.title(plot_title)
        plt.ylabel(y_label)
        plt.xlabel('epoch')
        plt.legend(['train', 'test'], loc='upper left')
        plt.show()

    # Validation report at threshold 0.25 (strictly greater), with a plot.
    labels_025 = (pred_val_y > 0.25).astype(int)
    print("Classification Report- threshold 0.25")
    print(classification_report(y_val, labels_025, digits=5))
    cnf_matrix = confusion_matrix(y_val, labels_025)
    print(cnf_matrix)
    plot_confusion_matrix(cnf_matrix, classes=[0, 1])

    # Validation report at threshold 0.5 (greater or equal), printed only.
    labels_050 = (pred_val_y >= 0.5).astype(int)
    print("Classification Report- threshold 0.5")
    print(classification_report(y_val, labels_050, digits=5))
    cnf_matrix = confusion_matrix(y_val, labels_050)
    print(cnf_matrix)
    #plot_confusion_matrix(cnf_matrix,classes=[0,1])

    del model
    gc.collect()

    print("split: {} Time for training one model = {:.0f} s".format(ii, time.time()-split_start_time))
    print("Total time for training so far = {:.0f} s".format(time.time()-start_time))
# Optional persistence of the per-fold results (uses uid / version):
# with open('train_history_uid{}_v{}.pkl'.format(uid, version),'wb') as pklfile:
#     pickle.dump(historyD,pklfile)
# with open('val_preds_uid{}_v{}.pkl'.format(uid, version),'wb') as pklfile:
#     pickle.dump(val_preds,pklfile)
# with open('test_preds_uid{}_v{}.pkl'.format(uid, version),'wb') as pklfile:
#     pickle.dump(test_preds,pklfile)

print("Total time for training = {:.0f} s".format(time.time()-start_time))

So total training time is ~ 65 minutes.

For each fold, a validation prediction and a test prediction are made. The threshold value to use for each of these test predictions is  obtained by a threshold scan  that yields the maximum  F1 score on the validation set. Basically what others have been using. Ad-hoc, but works.

In separate kernels, I submitted the test prediction from each of the 5 folds. The public LB scores I got were 
0.662, 0.663, 0.664, 0.665, 0.665. An average of ~0.664. Now, how does the average prediction from these five folds perform?

In [None]:
# For every fold, scan thresholds 0.10 .. 0.50 (step 0.01) on the validation
# predictions, keep the threshold with the highest F1, and binarise that
# fold's test predictions with it. Each fold becomes one 0/1 column.
test01_df = pd.DataFrame()
for ii in range(len(val_preds)):
    # Same 1-based key as used for val_preds / test_preds, so the column
    # names line up with the fold numbers printed below.
    fold_key = 'fold{}'.format(ii + 1)
    y_preds, y_actual = val_preds[fold_key]
    y_preds = np.asarray(y_preds)

    threshL = []
    for thresh in np.arange(0.1, 0.51, 0.01):
        thresh = np.round(thresh, 2)
        # 0 below the threshold, 1 otherwise.
        y_01 = np.where(y_preds < thresh, 0, 1)
        threshL.append((metrics.f1_score(y_actual, y_01), thresh))

    # Highest F1 wins; ties go to the larger threshold (tuple ordering).
    best_F1, opt_thresh = max(threshL)
    print ("For fold {0}, best validation F1 of {1:.5f} at threshold {2:.2f}".format(ii+1, best_F1,opt_thresh))
    test01_df[fold_key] = np.where(np.asarray(test_preds[fold_key]) < opt_thresh, 0, 1)

In [None]:
# Sum the 0/1 fold columns per test row, then tabulate how often each
# row total occurs.
print(test01_df.sum(axis=1).value_counts())

So, the models disagree on (594+470+354+354) = 1772 samples.

A Pearson correlation coefficient between the predictions is also a useful number to know, to see how correlated the different folds are.

In [None]:
 # Pairwise Pearson correlation between the 0/1 prediction columns of the folds.
 # NOTE(review): this cell starts with a stray leading space; that would be an
 # IndentationError in a plain .py script - confirm it is tolerated where this runs.
 test01_df.corr(method='pearson')

We get a correlation coefficient of ~0.88 between the predictions from the various folds. Heavily correlated, but still some variance. So averaging these slightly different predictions should give us a better score than each of the individual predictions.

 Let us create the submission file:

In [None]:
# Majority vote over the per-fold 0/1 columns, written out as the submission.
out_df = pd.DataFrame({"qid": test_df["qid"].values})

# Half of the number of fold columns (2.5 for five folds) instead of a
# hard-coded constant, so the vote stays a majority vote if the number of
# folds changes. A row is predicted 1 unless its vote total is below that.
majority_cutoff = test01_df.shape[1] / 2.0
out_df['prediction'] = np.where(test01_df.sum(axis=1).values < majority_cutoff, 0, 1)
out_df.to_csv("submission.csv", index=False)

On submission, this should yield an LB score of 0.671 or thereabouts, which is a reasonable improvement of 0.007 or so. One should expect a similar improvement on other "single models" while using K-fold validation. Hope this kernel was useful and gives an idea of how to balance K-fold validation of a single model vs. adding additional models to the soup.