Inspired by:
* https://www.kaggle.com/sudalairajkumar/a-look-at-different-embeddings
* https://www.kaggle.com/shujian/single-rnn-with-4-folds-v1-9
* http://mlexplained.com/2018/01/13/weight-normalization-and-layer-normalization-explained-normalization-in-deep-learning-part-2/
* https://arxiv.org/abs/1607.06450
* https://github.com/keras-team/keras/issues/3878
* https://www.kaggle.com/lystdo/lstm-with-word2vec-embeddings
* https://www.kaggle.com/jhoward/improved-lstm-baseline-glove-dropout

(and other links in notebook)

Remark:
model overfits like hell...

In [None]:
import sys

import numpy as np # linear algebra
# Always print arrays in full instead of summarising them with "...".
# NaN is not a valid threshold (current NumPy raises ValueError for it);
# sys.maxsize is the documented way to disable summarisation.
np.set_printoptions(threshold=sys.maxsize)
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os

# Show what is available under ../input, drilling down to the folder that
# holds the GoogleNews word2vec file.
for input_dir in (
    "../input",
    "../input/embeddings",
    "../input/embeddings/GoogleNews-vectors-negative300",
):
    print(os.listdir(input_dir))

# Any results you write to the current directory are saved as output.

import gensim
from gensim.utils import simple_preprocess
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import Callback
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,f1_score,precision_recall_fscore_support,recall_score,precision_score
from keras import backend as K
from sklearn.utils import class_weight
import matplotlib.pyplot as plt

#https://www.kaggle.com/shujian/single-rnn-with-4-folds-v1-9
def threshold_search(y_true, y_proba):
    """Scan cut-offs 0.00, 0.01, ..., 0.99 and keep the one with the best F1.

    Args:
        y_true: ground-truth binary labels.
        y_proba: predicted scores; a sample is labelled positive when its
            score is strictly greater than the cut-off.

    Returns:
        dict with keys 'threshold' (best cut-off found) and 'f1' (its score).
        Both stay 0 when no cut-off reaches an F1 above 0.
    """
    best = {'threshold': 0, 'f1': 0}
    for step in range(100):
        candidate = step * 0.01
        f1 = f1_score(y_true=y_true, y_pred=y_proba > candidate)
        # '\r' rewrites the same console line so progress does not scroll.
        print('\rthreshold = %f | score = %f'%(candidate,f1),end='')
        # Strict comparison: on ties the lowest cut-off wins.
        if f1 > best['f1']:
            best = {'threshold': candidate, 'f1': f1}
    print('\nbest threshold is % f with score %f'%(best['threshold'],best['f1']))
    return best

In [None]:
# https://www.kaggle.com/jhoward/improved-lstm-baseline-glove-dropout
print('loading word2vec model...')
# Pretrained GoogleNews vectors stored in the binary word2vec format.
word2vec = gensim.models.KeyedVectors.load_word2vec_format('../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin', binary=True)
# One row per word in `vectors`; unlike the `vocab` attribute (removed in
# gensim 4) this works on both old and new gensim releases.
print('vocab:',len(word2vec.vectors))

In [None]:
# Load the labelled training questions.
df = pd.read_csv('../input/train.csv')
# Assign the filled column back rather than calling fillna(inplace=True) on a
# column selection, which can operate on a temporary copy in recent pandas.
df["question_text"] = df["question_text"].fillna("_##_")
# Longest question measured in characters (not tokens); only printed here.
max_len = df['question_text'].str.len().max()
print('max length of sequences:',max_len)
# df = df.sample(frac=0.1)

print('columns:',df.columns)
pd.set_option('display.max_columns',None)
print('df head:',df.head())
print('example of the question text values:',df['question_text'].head().values)
print('what values contains target:',df.target.unique())

print('Computing class weights....')
#https://datascience.stackexchange.com/questions/13490/how-to-set-class-weights-for-imbalanced-classes-in-keras
class_labels = np.unique(df.target.values)
# Keyword arguments: current scikit-learn no longer accepts these positionally.
balanced_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                     classes=class_labels,
                                                     y=df.target.values)
# Keras' `class_weight` argument expects a {class_index: weight} mapping; a
# bare array may be ignored or rejected depending on the Keras version.
class_weights = dict(zip(class_labels.tolist(), balanced_weights.tolist()))
print('class_weights:',class_weights)


In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# dimensionality of the embedding vectors
dim = 300
# maximum number of words kept in the vocabulary
num_words = 50000
# number of tokens every question is padded / truncated to
max_len = 100 

print('Fiting tokenizer')
## Tokenize the sentences
# The tokenizer is fitted on every row (before the split), so the training
# and hold-out parts share one vocabulary.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(df['question_text'])

print('spliting data')
# Stratify on the target so both parts keep the same class ratio, and fix the
# seed so the split (and therefore the reported scores) can be reproduced.
df_train,df_test = train_test_split(df, stratify=df['target'], random_state=42)

print('text to sequence')
x_train = tokenizer.texts_to_sequences(df_train['question_text'])
x_test = tokenizer.texts_to_sequences(df_test['question_text'])

print('pad sequence')
## Pad the sentences 
x_train = pad_sequences(x_train,maxlen=max_len)
x_test = pad_sequences(x_test, maxlen=max_len)

## Get the target values
y_train = df_train['target'].values
y_test = df_test['target'].values

print(x_train.shape)
print(y_train.shape)


In [None]:
# Mean / std over every component of the pretrained vectors. They parameterise
# the random rows used for words that have no pretrained vector.
all_embs = word2vec.vectors
emb_mean,emb_std = all_embs.mean(), all_embs.std()
print(emb_mean,emb_std)

print(num_words,' from ',len(tokenizer.word_index.items()))
# Tokenizer indices start at 1 (0 is reserved for padding), so a vocabulary of
# V words needs V + 1 rows; without the +1 the last word would fall outside
# the matrix whenever the vocabulary is smaller than the configured cap.
num_words = min(num_words, len(tokenizer.word_index) + 1)
embedding_matrix = np.random.normal(emb_mean, emb_std, (num_words, dim))

# embedding_matrix = np.zeros((num_words, dim))
# Number of in-range words that keep their random initialisation.
count = 0
for word, i in tokenizer.word_index.items():
    # Skip (rather than stop at) out-of-range indices so the result does not
    # depend on the iteration order of word_index.
    if i>=num_words:
        continue
    # Membership test and item lookup work on both gensim 3.x and 4.x, unlike
    # the `vocab` attribute and `word_vec` method.
    if word in word2vec:
        embedding_matrix[i] = word2vec[word]
    else:
        count += 1
print('embedding matrix size:',embedding_matrix.shape)
print('Number of words not in vocab:',count)

In [None]:
from keras.layers import Dense, Input,Embedding, Dropout, Activation, CuDNNLSTM,BatchNormalization
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras.callbacks import Callback,EarlyStopping
from keras.engine import Layer
from keras.initializers import Ones, Zeros
import keras.backend as K
from keras import regularizers
from keras import constraints

# https://arxiv.org/abs/1607.06450
# https://github.com/keras-team/keras/issues/3878
class LayerNormalization(Layer):
    """Normalise inputs along the last axis, then apply a learned gain and bias."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        # One gain and one bias per entry of the last (feature) axis.
        feature_shape = input_shape[-1:]
        self.gain = self.add_weight(name='gain',
                                    shape=feature_shape,
                                    initializer=Ones(),
                                    trainable=True)
        self.bias = self.add_weight(name='bias',
                                    shape=feature_shape,
                                    initializer=Zeros(),
                                    trainable=True)
        super().build(input_shape)

    def call(self, x, **kwargs):
        centered = x - K.mean(x, axis=-1, keepdims=True)
        # Epsilon keeps the division finite when the std is zero.
        spread = K.std(x, axis=-1, keepdims=True) + K.epsilon()
        return self.gain * centered / spread + self.bias

    def compute_output_shape(self, input_shape):
        # Normalisation is element-wise: the shape is unchanged.
        return input_shape

#model looks to be from here: https://www.kaggle.com/CVxTz/keras-bidirectional-lstm-baseline-lb-0-069

    
# Bidirectional LSTM classifier: token ids -> frozen pretrained embeddings ->
# BiLSTM -> max over time -> small dense layer -> sigmoid probability.
inp = Input(shape=(max_len,))
# classic embedding layer with pretrained, frozen weights
x = Embedding(num_words, dim, weights=[embedding_matrix], trainable=False)(inp)
# x = LayerNormalization()(x)
# return_sequences=True yields one output vector per time step (not only the
# last one); the pooling layer below reduces them to a single vector.
x = Bidirectional(CuDNNLSTM(64, 
                            return_sequences=True, 
#                             kernel_regularizer = regularizers.l2(0.001),
#                             recurrent_regularizer = regularizers.l2(0.001),
#                             kernel_constraint=constraints.MaxNorm(axis=-1), 
#                             recurrent_constraint=constraints.MaxNorm(axis=-1)
                           ))(x)
# limitations of CuDNN https://www.reddit.com/r/MLQuestions/comments/9an2y0/keras_cudnnlstm_is_it_worth_the_drawbacks/
# According to http://mlexplained.com/2018/01/13/weight-normalization-and-layer-normalization-explained-normalization-in-deep-learning-part-2/
# layer normalization can be used here, but batch normalization cannot.
# x = LayerNormalization()(x)
# Take the maximum over the time axis so questions of any length end up as
# one fixed-size feature vector.
x = GlobalMaxPool1D()(x)
# Small hidden layer between the pooled features and the output unit.
x = Dense(16, activation="relu")(x)
# x = LayerNormalization()(x)
x = Dropout(0.1)(x)
# Single sigmoid unit: probability of the positive class.
x = Dense(1, activation="sigmoid")(x)

model = Model(inputs=inp, outputs=x)

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() would only add a stray "None" line.
model.summary()

# When committing the kernel to the competition this section has to be
# commented out; otherwise the running time exceeds 2h on GPU.
history = model.fit(x_train,y_train, 
                      batch_size=512, 
                      validation_split=0.2,
                      class_weight=class_weights,
                      epochs=100,
                      # overfits rather soon, so stop after 2 epochs without
                      # improvement of the monitored validation loss
                      # NOTE(review): the model keeps the weights of the last
                      # epoch, not the best one; consider
                      # restore_best_weights=True where Keras supports it.
                      callbacks=[EarlyStopping(patience=2)])

# Older Keras releases log accuracy under 'acc' / 'val_acc', newer ones under
# 'accuracy' / 'val_accuracy'; pick whichever key this run produced.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'

_, ax = plt.subplots(1, 2, figsize=(12, 6))
ax[0].plot(history.history['loss'], label='loss')
ax[0].plot(history.history['val_loss'], label='val_loss')
ax[0].legend()
ax[0].set_title('loss')

ax[1].plot(history.history[acc_key], label='acc')
ax[1].plot(history.history['val_' + acc_key], label='val_acc')
ax[1].legend()
ax[1].set_title('acc')

plt.show()

In [None]:
# Evaluate the training split first, then the held-out split: predict, tune
# the decision threshold on that same split, binarise and print the report.
# After the loop `search_result` and `y_pred` hold the held-out split values.
for report_header, features, labels in (
        ('RESULTS ON TRAINING SET:\n', x_train, y_train),
        ('RESULTS ON TEST SET:\n', x_test, y_test)):
    y_pred = model.predict(features,batch_size=1024, verbose=1)
    search_result = threshold_search(labels, y_pred)
    print(search_result)
    y_pred = (y_pred>search_result['threshold']).astype(int)

    print(report_header,classification_report(labels,y_pred))

# Results

## without regul , non trainable emb
    979591/979591 [==============================] - 50s 51us/step
    threshold = 0.990000 | score = 0.000099
    best threshold is  0.330000 with score 0.725006
    {'threshold': 0.33, 'f1': 0.7250061303106289}
    RESULTS ON TRAINING SET:              precision    recall  f1-score   support

              0       0.98      0.98      0.98    918850
              1       0.70      0.75      0.73     60741

    avg / total       0.97      0.96      0.97    979591

    326531/326531 [==============================] - 16s 50us/step
    threshold = 0.990000 | score = 0.000000
    best threshold is  0.280000 with score 0.655881
    {'threshold': 0.28, 'f1': 0.6558810668998688}
    RESULTS ON TEST SET:              precision    recall  f1-score   support

              0       0.98      0.97      0.98    306462
              1       0.61      0.71      0.66     20069

    avg / total       0.96      0.95      0.96    326531

## without regul , non trainable emb, with Dropout 0.1
    979591/979591 [==============================] - 49s 50us/step
    threshold = 0.990000 | score = 0.000000
    best threshold is  0.460000 with score 0.727626
    {'threshold': 0.46, 'f1': 0.7276255725700638}
    RESULTS ON TRAINING SET:
                  precision    recall  f1-score   support

              0       0.98      0.98      0.98    918850
              1       0.70      0.76      0.73     60741

    avg / total       0.97      0.96      0.97    979591

    326531/326531 [==============================] - 16s 50us/step
    threshold = 0.990000 | score = 0.000000
    best threshold is  0.420000 with score 0.656067
    {'threshold': 0.42, 'f1': 0.65606690715254}
    RESULTS ON TEST SET:
                  precision    recall  f1-score   support

              0       0.98      0.97      0.98    306462
              1       0.61      0.71      0.66     20069

    avg / total       0.96      0.95      0.96    326531


## without regul , trainable emb, with Dropout 0.1

    979591/979591 [==============================] - 49s 50us/step
    threshold = 0.990000 | score = 0.031973
    best threshold is  0.460000 with score 0.826382
    {'threshold': 0.46, 'f1': 0.8263816852821267}
    RESULTS ON TRAINING SET:
                  precision    recall  f1-score   support

              0       0.99      0.99      0.99    918850
              1       0.81      0.85      0.83     60741

    avg / total       0.98      0.98      0.98    979591

    326531/326531 [==============================] - 16s 50us/step
    threshold = 0.990000 | score = 0.016393
    best threshold is  0.400000 with score 0.629146
    {'threshold': 0.4, 'f1': 0.629146426092991}
    RESULTS ON TEST SET:
                  precision    recall  f1-score   support

              0       0.98      0.97      0.97    306462
              1       0.59      0.68      0.63     20069

    avg / total       0.95      0.95      0.95    326531


## without regul , trainable emb

    979591/979591 [==============================] - 50s 51us/step
    threshold = 0.990000 | score = 0.009536
    best threshold is  0.390000 with score 0.840622
    {'threshold': 0.39, 'f1': 0.8406221710147314}
    RESULTS ON TRAINING SET:              precision    recall  f1-score   support

              0       0.99      0.99      0.99    918850
              1       0.84      0.84      0.84     60741

    avg / total       0.98      0.98      0.98    979591

    326531/326531 [==============================] - 16s 50us/step
    threshold = 0.990000 | score = 0.004374
    best threshold is  0.250000 with score 0.631671
    {'threshold': 0.25, 'f1': 0.6316712025462428}
    RESULTS ON TEST SET:              precision    recall  f1-score   support

              0       0.98      0.97      0.97    306462
              1       0.59      0.68      0.63     20069

    avg / total       0.95      0.95      0.95    326531

## with regularizers, non trainable emb
    
    kernel_regularizer = regularizers.l2(0.001),
    recurrent_regularizer = regularizers.l2(0.001)
    
    best threshold is  0.250000 with score 0.635888
             precision    recall  f1-score   support

          0       0.98      0.97      0.97    306130
          1       0.59      0.69      0.64     20401

    avg / total       0.95      0.95      0.95    326531

## with layernorm, without regularizers, non trainable emb

    326531/326531 [==============================] - 21s 65us/step
    threshold = 0.990000 | score = 0.000000
    best threshold is  0.240000 with score 0.652420
                 precision    recall  f1-score   support

              0       0.98      0.97      0.97    306130
              1       0.60      0.71      0.65     20401

    avg / total       0.96      0.95      0.95    326531
    
But graphs show overfit.
    

## with layernorm, with regularizers, non trainable emb

       979591/979591 [==============================] - 64s 65us/step
    threshold = 0.990000 | score = 0.000000
    best threshold is  0.260000 with score 0.650079
    {'threshold': 0.26, 'f1': 0.6500788180385126}
    RESULTS ON TRAINING SET:              precision    recall  f1-score   support

              0       0.98      0.97      0.98    919182
              1       0.61      0.70      0.65     60409

    avg / total       0.96      0.95      0.96    979591

    326531/326531 [==============================] - 21s 64us/step
    threshold = 0.990000 | score = 0.000000
    best threshold is  0.260000 with score 0.642098
    {'threshold': 0.26, 'f1': 0.6420979986197378}
    RESULTS ON TEST SET:              precision    recall  f1-score   support

              0       0.98      0.97      0.97    306130
              1       0.60      0.68      0.64     20401

    avg / total       0.96      0.95      0.95    326531
    
    Doesn't overfit.
    
## with layernorm, with regularizers,  trainable emb

    979591/979591 [==============================] - 63s 65us/step
    threshold = 0.990000 | score = 0.000000
    best threshold is  0.420000 with score 0.734747
    {'threshold': 0.42, 'f1': 0.7347472422512937}
    RESULTS ON TRAINING SET:              precision    recall  f1-score   support

              0       0.99      0.98      0.98    919182
              1       0.70      0.78      0.73     60409

    avg / total       0.97      0.97      0.97    979591

    326531/326531 [==============================] - 21s 65us/step
    threshold = 0.990000 | score = 0.000000
    best threshold is  0.420000 with score 0.650728
    {'threshold': 0.42, 'f1': 0.6507277969200478}
    RESULTS ON TEST SET:              precision    recall  f1-score   support

              0       0.98      0.97      0.98    306130
              1       0.62      0.68      0.65     20401

    avg / total       0.96      0.95      0.96    326531
    
Overfits immediately, within 2 epochs.

## with layernorm, with regularizers,  trainable emb and constraints MaxNorm

    979591/979591 [==============================] - 64s 65us/step
    threshold = 0.990000 | score = 0.000000
    best threshold is  0.360000 with score 0.760183
    {'threshold': 0.36, 'f1': 0.7601830919533802}
    RESULTS ON TRAINING SET:              precision    recall  f1-score   support

              0       0.99      0.98      0.98    919182
              1       0.73      0.79      0.76     60409

    avg / total       0.97      0.97      0.97    979591

    326531/326531 [==============================] - 21s 65us/step
    threshold = 0.990000 | score = 0.000000
    best threshold is  0.290000 with score 0.643731
    {'threshold': 0.29, 'f1': 0.6437309901993917}
    RESULTS ON TEST SET:              precision    recall  f1-score   support

              0       0.98      0.97      0.97    306130
              1       0.60      0.70      0.64     20401

    avg / total       0.96      0.95      0.95    326531
    
Overfits. No improvement on the test set.

## with layernorm, without regularizers, trainable emb

    979591/979591 [==============================] - 64s 65us/step
    threshold = 0.990000 | score = 0.014331
    best threshold is  0.430000 with score 0.823383
    {'threshold': 0.43, 'f1': 0.8233833284318531}
    RESULTS ON TRAINING SET:              precision    recall  f1-score   support

              0       0.99      0.99      0.99    919182
              1       0.81      0.83      0.82     60409

    avg / total       0.98      0.98      0.98    979591

    326531/326531 [==============================] - 21s 65us/step
    threshold = 0.990000 | score = 0.005083
    best threshold is  0.340000 with score 0.649846
    {'threshold': 0.34, 'f1': 0.6498460683780468}
    RESULTS ON TEST SET:              precision    recall  f1-score   support

              0       0.98      0.97      0.98    306130
              1       0.62      0.69      0.65     20401

    avg / total       0.96      0.95      0.95    326531
    
    Crazy results on train.
    
## with layernorm, without regularizers, trainable emb, maxlen = 200

    979591/979591 [==============================] - 125s 128us/step
    threshold = 0.990000 | score = 0.000000
    best threshold is  0.580000 with score 0.820049
    {'threshold': 0.58, 'f1': 0.8200489795918368}
    RESULTS ON TRAINING SET:              precision    recall  f1-score   support

              0       0.99      0.99      0.99    918924
              1       0.81      0.83      0.82     60667

    avg / total       0.98      0.98      0.98    979591

    326531/326531 [==============================] - 41s 127us/step
    threshold = 0.990000 | score = 0.000000
    best threshold is  0.490000 with score 0.644622
    {'threshold': 0.49, 'f1': 0.6446220093490224}
    RESULTS ON TEST SET:              precision    recall  f1-score   support

              0       0.98      0.97      0.97    306388
              1       0.60      0.69      0.64     20143

    avg / total       0.96      0.95      0.95    326531
    
## with layernorm, without regularizers, trainable emb, maxlen = 300

In [None]:
# Fit the final model on all labelled data.
# NOTE(review): this continues training the already fitted `model` object; it
# does not start from freshly initialised weights.
print('text to sequence')
x = tokenizer.texts_to_sequences(df['question_text'])

print('pad sequence')
## Pad the sentences 
x = pad_sequences(x,maxlen=max_len)

## Get the target values
y = df['target'].values

print('fiting final model...')
history = model.fit(x,y, batch_size=512, epochs=6,class_weight=class_weights)

# Older Keras releases log accuracy under 'acc', newer ones under 'accuracy';
# pick whichever key this run produced.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'

_, ax = plt.subplots(1, 2, figsize=(12, 6))
ax[0].plot(history.history['loss'], label='loss')
ax[0].legend()
ax[0].set_title('loss')

ax[1].plot(history.history[acc_key], label='acc')
ax[1].legend()
ax[1].set_title('acc')

plt.show()

# NOTE(review): the threshold is tuned on the same data the model was just
# trained on, and is the one used later for the submission.
y_pred = model.predict(x,batch_size=1024, verbose=1)
search_result = threshold_search(y, y_pred)
y_pred = y_pred>search_result['threshold']
y_pred = y_pred.astype(int)

print(classification_report(y,y_pred))


In [None]:
# Build the submission file from the unlabelled competition test set.
print('Loading test data...')
df_final = pd.read_csv('../input/test.csv')
# Assign the filled column back rather than calling fillna(inplace=True) on a
# column selection, which can operate on a temporary copy in recent pandas.
df_final["question_text"] = df_final["question_text"].fillna("_##_")

# Same tokenizer and padding length as used for training.
x_final=tokenizer.texts_to_sequences(df_final['question_text'])
x_final = pad_sequences(x_final,maxlen=max_len)

# Same batch size as the other predict calls in this notebook.
y_pred = model.predict(x_final, batch_size=1024)
# Binarise with the most recently tuned threshold.
y_pred = y_pred > search_result['threshold']
y_pred = y_pred.astype(int)
print(y_pred[:5])

df_subm = pd.DataFrame()
df_subm['qid'] = df_final.qid
# The model ends in a single output unit, so predictions come back with one
# column per row; flatten to 1-D before storing them as a DataFrame column.
df_subm['prediction'] = y_pred.ravel()
print(df_subm.head())
df_subm.to_csv('submission.csv', index=False)