### Inspired by:
* https://www.kaggle.com/sudalairajkumar/a-look-at-different-embeddings
* https://www.kaggle.com/shujian/single-rnn-with-4-folds-v1-9
* http://mlexplained.com/2018/01/13/weight-normalization-and-layer-normalization-explained-normalization-in-deep-learning-part-2/
* https://arxiv.org/abs/1607.06450
* https://github.com/keras-team/keras/issues/3878
* https://www.kaggle.com/lystdo/lstm-with-word2vec-embeddings
* https://www.kaggle.com/jhoward/improved-lstm-baseline-glove-dropout
* https://www.kaggle.com/aquatic/entity-embedding-neural-net
* https://www.kaggle.com/hireme/fun-api-keras-f1-metric-cyclical-learning-rate
* https://ai.google/research/pubs/pub46697
* https://blog.openai.com/quantifying-generalization-in-reinforcement-learning/
* https://www.kaggle.com/rasvob/let-s-try-clr-v3


In [None]:
import numpy as np # linear algebra
import sys
np.set_printoptions(threshold=sys.maxsize)
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Print the contents of the data directory, the embeddings directory and the
# word2vec sub-directory, one listing per line.
for _listed_dir in (
    "../input",
    "../input/embeddings",
    "../input/embeddings/GoogleNews-vectors-negative300",
):
    print(os.listdir(_listed_dir))

# Any results you write to the current directory are saved as output.

import gensim
from gensim.utils import simple_preprocess
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import Callback
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,f1_score,precision_recall_fscore_support,recall_score,precision_score
from keras import backend as K
from sklearn.utils import class_weight
import matplotlib.pyplot as plt

import tensorflow as tf

# SEED = 2018

# np.random.seed(SEED)
# tf.set_random_seed(SEED)

#https://www.kaggle.com/shujian/single-rnn-with-4-folds-v1-9
def threshold_search(y_true, y_proba):
    """Scan the cut-offs 0.00, 0.01, ..., 0.99 and keep the best F1 score.

    For every cut-off, ``y_proba > cutoff`` is scored against ``y_true`` with
    ``f1_score``. Progress is printed on a single, carriage-return-refreshed
    line, followed by a summary line.

    Returns a dict ``{'threshold': ..., 'f1': ...}``. The first cut-off that
    reaches the maximum wins because only a strictly higher score replaces
    the current best; if no score exceeds 0, both values stay 0.
    """
    winner = {'threshold': 0, 'f1': 0}
    for step in range(100):
        cutoff = step * 0.01
        current = f1_score(y_true=y_true, y_pred=y_proba > cutoff)
        print('\rthreshold = %f | score = %f'%(cutoff,current),end='')
        if current > winner['f1']:
            winner = {'threshold': cutoff, 'f1': current}
    print('\nbest threshold is % f with score %f'%(winner['threshold'],winner['f1']))
    return winner

In [None]:
# Load the training questions. Missing question text is replaced by the
# placeholder "_##_".
df = pd.read_csv('../input/train.csv')
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the in-place form operates on an intermediate object, which
# pandas warns about and which does not update ``df`` under Copy-on-Write.
df["question_text"] = df["question_text"].fillna("_##_")
# NOTE: this is the longest question measured in characters. ``max_len`` is
# re-assigned later in the file before it is used for padding.
max_len = df['question_text'].map(len).max()
print('max length of sequences:',max_len)
# df = df.sample(frac=0.1)

# Quick look at the training frame.
print('columns:',df.columns)
pd.set_option('display.max_columns',None)
print('df head:',df.head())
print('example of the question text values:',df['question_text'].head().values)
print('what values contains target:',df.target.unique())

# Load the test questions with the same missing-value handling.
print('Loading test data...')
df_final = pd.read_csv('../input/test.csv')
df_final["question_text"] = df_final["question_text"].fillna("_##_")

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# --- Settings for the text representation -----------------------------------
dim = 300          # length of every word vector
num_words = 75966  # vocabulary cap handed to the Tokenizer
max_len = 100      # maxlen handed to pad_sequences for every question

print('Fiting tokenizer')
# The vocabulary is fitted on the train and test questions together.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts([*df['question_text'], *df_final['question_text']])

print('text to sequence')
x_train = tokenizer.texts_to_sequences(df['question_text'])

print('pad sequence')
# Bring every training sequence to the common length max_len.
x_train = pad_sequences(x_train, maxlen=max_len)

# Labels, in the same row order as x_train.
y_train = df['target'].values

print(x_train.shape)
print(y_train.shape)

# The test questions go through the same tokenizer and padding.
x_test = pad_sequences(
    tokenizer.texts_to_sequences(df_final['question_text']),
    maxlen=max_len,
)

print('Test data loaded:',x_test.shape)

In [None]:
print('Glove ... ')
def get_coefs(word, *arr):
    """Turn the fields of one embedding line into a ``(word, vector)`` pair.

    ``word`` is returned unchanged; all remaining positional arguments are
    converted together into one float32 NumPy array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Parse the GloVe text file into {word: float32 vector}. The file is opened in
# a ``with`` block so the handle is closed as soon as parsing finishes, and it
# is decoded explicitly as UTF-8 instead of the platform default encoding.
with open('../input/embeddings/glove.840B.300d/glove.840B.300d.txt', encoding='utf8') as glove_file:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in glove_file)

# np.stack requires a real sequence; a dict view is rejected by recent NumPy.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean,emb_std = all_embs.mean(), all_embs.std()
print(len(all_embs))


word_index = tokenizer.word_index
# num_words = min(num_words, len(word_index))
# Start from random rows drawn with the mean/std of the pre-trained vectors;
# rows of words that have a pre-trained vector are overwritten below.
embedding_matrix_glov = np.random.normal(emb_mean, emb_std, (num_words, dim))
count=0
for word, i in word_index.items():
    # Skip (rather than stop at) indices outside the matrix so the result does
    # not depend on the iteration order of word_index.
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix_glov[i] = embedding_vector
    else:
        # In-range word with no pre-trained vector: keeps its random row.
        count += 1
print('embedding matrix size:',embedding_matrix_glov.shape)
print('Number of words not in vocab:',count)

# Drop the parsed embeddings before the next large allocation.
del embeddings_index,all_embs
import gc
gc.collect()

In [None]:
print('Para...')
EMBEDDING_FILE = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
def get_coefs(word, *arr):
    """Build a ``(word, vector)`` pair from the fields of one embedding line.

    The first field is passed through as the word; every following field is
    packed into a single float32 NumPy array.
    """
    coefficients = np.asarray(arr, dtype='float32')
    return word, coefficients
# Parse the paragram text file into {word: float32 vector}. Undecodable bytes
# are ignored and lines of 100 characters or fewer are skipped. The ``with``
# block closes the file handle as soon as parsing finishes.
with open(EMBEDDING_FILE, encoding="utf8", errors='ignore') as para_file:
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in para_file if len(o)>100)

# np.stack requires a real sequence; a dict view is rejected by recent NumPy.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean,emb_std = all_embs.mean(), all_embs.std()
print(len(all_embs))


word_index = tokenizer.word_index
# num_words = min(num_words, len(word_index))
# Start from random rows drawn with the mean/std of the pre-trained vectors;
# rows of words that have a pre-trained vector are overwritten below.
embedding_matrix_para = np.random.normal(emb_mean, emb_std, (num_words, dim))
count=0
for word, i in word_index.items():
    # Skip (rather than stop at) indices outside the matrix so the result does
    # not depend on the iteration order of word_index.
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix_para[i] = embedding_vector
    else:
        # In-range word with no pre-trained vector: keeps its random row.
        count += 1
# Report the shape of the matrix built in this cell.
print('embedding matrix size:',embedding_matrix_para.shape)
print('Number of words not in vocab:',count)

# Drop the parsed embeddings before the next large allocation.
del embeddings_index,all_embs
import gc
gc.collect()

In [None]:
# Element-wise average of the GloVe and paragram matrices; the result is the
# single embedding matrix used to initialise the model's Embedding layer.
matrixes = [embedding_matrix_glov, embedding_matrix_para]

matrix = np.stack(matrixes).mean(axis=0)

# Remove the module-level names of the two source matrices.
del embedding_matrix_glov, embedding_matrix_para
import gc
gc.collect()

In [None]:
from keras.layers import Dense, Input,Embedding, Dropout, Activation, CuDNNLSTM,BatchNormalization,concatenate,SpatialDropout1D
from keras.layers import Bidirectional, GlobalMaxPool1D, Concatenate, GlobalAveragePooling1D,Average,Conv1D,GlobalMaxPooling1D,AlphaDropout
from keras.layers import MaxPooling1D,UpSampling1D,RepeatVector,LSTM,TimeDistributed,Flatten, CuDNNGRU, Add
from keras.models import Model
from keras.callbacks import Callback,EarlyStopping,ModelCheckpoint, ReduceLROnPlateau
from keras.engine import Layer
from keras.initializers import Ones, Zeros
import keras.backend as K
from keras import regularizers
from keras import constraints
from keras import optimizers
from keras import initializers
from sklearn.model_selection import KFold, StratifiedKFold
import warnings

def get_model(trainable=False):
    """Build and compile the binary question classifier.

    Layers, in order: an input of ``max_len`` token ids; an Embedding of
    shape ``(num_words, dim)`` initialised from the module-level ``matrix``;
    a bidirectional CuDNNLSTM with 128 units that also returns its states;
    the concatenation of the two returned hidden states; Dense(128, relu);
    Dense(1, sigmoid).

    Args:
        trainable: passed to the Embedding layer; ``False`` keeps the
            pre-trained vectors frozen.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy, the Adam
        optimizer and the accuracy metric.
    """
    tokens = Input(shape=(max_len,))
    embedded = Embedding(num_words, dim, weights=[matrix], trainable=trainable)(tokens)

    # The wrapper returns the sequence output followed by four state tensors;
    # only the two hidden states are used further on.
    encoder = Bidirectional(CuDNNLSTM(128, return_sequences=True, return_state=True))
    _, hidden_fwd, _, hidden_bwd, _ = encoder(embedded)

    features = concatenate([hidden_fwd, hidden_bwd])
    features = Dense(128, activation="relu")(features)
    probability = Dense(1, activation='sigmoid')(features)

    classifier = Model(inputs=tokens, outputs=probability)
    classifier.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return classifier

# Stratified 4-fold training: for every fold a fresh model is trained, its best
# checkpoint is reloaded, a decision threshold is searched on the fold's
# validation part, and predictions are collected for the validation rows
# (train_meta) and for the test set (test_meta).

# Batch size used for both model.fit and model.predict below.
batch_size = 512
print('Batch size = ',batch_size)

# Passed to EarlyStopping(patience=...) for every fold.
patience = 2

# Best fold so far as {'model': checkpoint file, 'f1': validation F1};
# stays None until the first fold has finished.
best_model=None
# Maps each fold's checkpoint file name to the F1 found by threshold_search.
all_results = {}

# train_meta: one prediction per training row, written when the row is in the
#             validation part of a fold.
# test_meta:  running average of the test predictions over all folds.
train_meta = np.zeros(y_train.shape)
test_meta = np.zeros(x_test.shape[0])

# Materialised as a list because len(splits) is needed inside the loop.
splits = list(StratifiedKFold(n_splits=4, shuffle=True, random_state=14).split(x_train, y_train))

for idx, (train_idx, valid_idx) in enumerate(splits):
    print('----'+str(idx)+'-----')
    # Slice out this fold's training and validation parts.
    X_train1 = x_train[train_idx]
    y_train1 = y_train[train_idx]
    X_val = x_train[valid_idx]
    y_val = y_train[valid_idx]
    # One checkpoint file per fold; only the best epoch is kept on disk.
    model_file = 'model_'+str(idx)+'.h5'
    modelcheck = ModelCheckpoint(model_file,save_best_only=True)
    stop = EarlyStopping(patience=patience)
    
    # Fresh model for every fold (embeddings frozen: trainable defaults to False).
    model = get_model()
    
    # epochs=100 is only an upper bound; the EarlyStopping callback ends training.
    history = model.fit(X_train1,y_train1, 
                      batch_size=batch_size, 
                      validation_data=(X_val,y_val),
                      epochs=100,
                      callbacks=[modelcheck,stop],
                      verbose=2)
    # Disabled second stage: fine-tuning with a trainable embedding layer.
#     print('Pretraining finished, unfreezing embeddings layer...')
    
#     model = get_model(trainable=True)
#     model.load_weights(model_file)    
    
#     modelcheck = ModelCheckpoint(model_file,save_best_only=True)
#     stop = EarlyStopping(patience=patience)
#     metrics = Metrics()
    
#     history = model.fit(X_train1,y_train1, 
#                       batch_size=batch_size, 
#                       validation_data=(X_val,y_val),
#                       epochs=1,
#                       #overfits rather soon
#                       callbacks=[modelcheck,stop],
#                       verbose=2)
    
    
    print('training finished...')

    # Restore the weights of the best checkpoint saved for this fold.
    model.load_weights(model_file)

    # Predict the validation part and store the flattened predictions at the
    # positions of this fold's validation rows.
    y_pred = model.predict(X_val,batch_size=batch_size, verbose=1)
    train_meta[valid_idx] = y_pred.reshape(-1)

    # Find the F1-maximising threshold on this fold, then binarise with it.
    search_result = threshold_search(y_val, y_pred)
    print(search_result)
    y_pred = y_pred>search_result['threshold']
    y_pred = y_pred.astype(int)

    print('RESULTS ON VALIDATION SET:\n',classification_report(y_val,y_pred))

    all_results[model_file] = search_result['f1']    
    
    # Predict the test set; dividing by the number of folds makes test_meta the
    # mean prediction once all folds have run.
    y_pred = model.predict(x_test,batch_size=batch_size, verbose=1)
    test_meta += y_pred.reshape(-1) / len(splits)
    
    # Remember the fold with the highest validation F1.
    if best_model is None or best_model['f1']  < search_result['f1']:
        best_model={'model':model_file,'f1':search_result['f1']}
    
    
# Summary: per-fold F1 scores followed by the best fold.
print('-'*80)
print(all_results)
print('-'*80)
print(best_model)
print('-'*80)

In [None]:
#submission
# Pick one decision threshold using the predictions stored in train_meta for
# all training rows, then apply it to the fold-averaged test predictions.
search_result = threshold_search(y_train, train_meta)
print(search_result)

df_subm = pd.DataFrame()
df_subm['qid'] = df_final.qid
# Cast the boolean mask to integers so the CSV holds 0/1 labels instead of the
# strings "True"/"False"; this matches the int cast applied to the per-fold
# validation predictions.
df_subm['prediction'] = (test_meta > search_result['threshold']).astype(int)
print(df_subm.head())
df_subm.to_csv('submission.csv', index=False)