## Inspired by:
* https://www.kaggle.com/sudalairajkumar/a-look-at-different-embeddings
* https://www.kaggle.com/shujian/single-rnn-with-4-folds-v1-9
* http://mlexplained.com/2018/01/13/weight-normalization-and-layer-normalization-explained-normalization-in-deep-learning-part-2/
* https://arxiv.org/abs/1607.06450
* https://github.com/keras-team/keras/issues/3878
* https://www.kaggle.com/lystdo/lstm-with-word2vec-embeddings
* https://www.kaggle.com/jhoward/improved-lstm-baseline-glove-dropout
* https://www.kaggle.com/aquatic/entity-embedding-neural-net
* https://www.kaggle.com/hireme/fun-api-keras-f1-metric-cyclical-learning-rate
* https://ai.google/research/pubs/pub46697
* https://blog.openai.com/quantifying-generalization-in-reinforcement-learning/
* https://www.goodreads.com/book/show/33986067-deep-learning-with-python



In [None]:
import sys

import numpy as np # linear algebra
# Always print arrays in full. NumPy rejects a NaN threshold with a
# ValueError, so use sys.maxsize, the value its documentation recommends
# for disabling summarisation.
np.set_printoptions(threshold=sys.maxsize)
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# The input data lives under "../input/". Show what is available there,
# then drill down into the embeddings folder and the word2vec sub-folder.

import os
for input_dir in ("../input",
                  "../input/embeddings",
                  "../input/embeddings/GoogleNews-vectors-negative300"):
    print(os.listdir(input_dir))

# Any results you write to the current directory are saved as output.

import gensim
from gensim.utils import simple_preprocess
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import Callback
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,f1_score,precision_recall_fscore_support,recall_score,precision_score
from keras import backend as K
from sklearn.utils import class_weight
import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf

# Single seed shared by the TensorFlow and NumPy random number generators.
SEED = 2018

tf.set_random_seed(SEED)
np.random.seed(SEED)

#https://www.kaggle.com/shujian/single-rnn-with-4-folds-v1-9
def threshold_search(y_true, y_proba):
    """Scan cut-offs 0.00, 0.01, ..., 0.99 and keep the one with the best F1.

    For every cut-off the probabilities are binarised with
    ``y_proba > cut-off`` and scored with ``f1_score`` against ``y_true``.
    Progress is written on a single console line. On ties the lowest
    cut-off wins because only a strictly better score replaces the best.

    Returns a dict with keys ``'threshold'`` and ``'f1'``.
    """
    top_cut, top_f1 = 0, 0
    for step in range(100):
        cut = step * 0.01
        current_f1 = f1_score(y_true=y_true, y_pred=y_proba > cut)
        print('\rthreshold = %f | score = %f'%(cut,current_f1),end='')
        if current_f1 > top_f1:
            top_cut, top_f1 = cut, current_f1
    print('\nbest threshold is % f with score %f'%(top_cut,top_f1))
    return {'threshold': top_cut, 'f1': top_f1}

In [None]:
df = pd.read_csv('../input/train.csv')
# Assign the filled column back: calling fillna(inplace=True) on a column
# selection is not guaranteed to write through to the parent frame.
df["question_text"] = df["question_text"].fillna("_##_")
# Length here is the number of characters per question.
max_len = df['question_text'].str.len().max()
print('max length of sequences:',max_len)
# Uncomment to iterate quickly on a 10% sample:
# df = df.sample(frac=0.1)

print('columns:',df.columns)
pd.set_option('display.max_columns',None)
print('df head:',df.head())
print('example of the question text values:',df['question_text'].head().values)
print('what values contains target:',df.target.unique())

print('Computing class weights....')
#https://datascience.stackexchange.com/questions/13490/how-to-set-class-weights-for-imbalanced-classes-in-keras
# Keyword arguments: newer scikit-learn releases reject positional
# `classes` / `y`, while older releases accept the same keywords.
# NOTE(review): class_weights is only printed here; no model.fit call
# visible in this notebook passes it on -- confirm whether that is intended.
class_weights = class_weight.compute_class_weight(class_weight='balanced',
                                                  classes=np.unique(df.target.values),
                                                  y=df.target.values)
print('class_weights:',class_weights)


In [None]:
import spacy
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
import time

# Shared filters: English stop words (as a set) and the ASCII punctuation string.
stops=set(stopwords.words('english'))
punctuations = string.punctuation

# spaCy pipeline used by cleanup_text; regex word tokenizer used by delstopwords.
nlp = spacy.load('en_core_web_sm')
regex_tokenizer = RegexpTokenizer(r'\w+')


# Clean up text by removing personal pronouns, stopwords, and punctuation
def cleanup_text(doc):
    """Return `doc` as a space-joined string of cleaned, lower-cased lemmas.

    The text is run through the spaCy pipeline with the parser, NER and
    tagger components disabled. Tokens whose lemma is '-PRON-' are skipped;
    the remaining lemmas are lower-cased and stripped, then dropped when they
    are stop words or occur inside the punctuation string (a substring test,
    so empty tokens are dropped as well).
    """
    parsed = nlp(doc, disable=['parser', 'ner','tagger'])
    kept = []
    for token in parsed:
        lemma = token.lemma_
        if lemma == '-PRON-':
            continue
        word = lemma.lower().strip()
        if word in stops or word in punctuations:
            continue
        kept.append(word)
    return ' '.join(kept)

def delstopwords(sentence):
    """Return `sentence` lower-cased with stop words and punctuation removed.

    The sentence is split with the regex word tokenizer; each piece is
    lower-cased and stripped, and is kept only if it is non-empty, not a
    stop word and not found inside the punctuation string.
    """
    kept = []
    for piece in regex_tokenizer.tokenize(sentence):
        word = piece.lower().strip()
        if not word or word in stops or word in punctuations:
            continue
        kept.append(word)
    return " ".join(kept)

def _time_cleaner(cleaner, sample_size=1000):
    """Time `cleaner` on the first `sample_size` questions and extrapolate to the whole frame."""
    started = time.time()
    df["question_text"].head(sample_size).apply(cleaner)
    took = time.time()-started
    print('Time it took:',took)
    print('Estimated time:',len(df) * took/sample_size)


# Side-by-side preview of both cleaners on the first five questions.
a = df["question_text"].head(5).apply(cleanup_text)
b = df["question_text"].head(5).apply(delstopwords)
print('ORIGINAL:\n',df["question_text"].head(5).values)
print('SPACY:\n',a.values)
print('NTLK:\n',b.values)

# Benchmark each cleaner on a 1000-row sample before committing to one.
print('Spacy conversion....')
_time_cleaner(cleanup_text)

print('NTLK conversion...')
_time_cleaner(delstopwords)


print('Using spacy for conversion....')
t_start = time.time()

# Replace the raw questions with their cleaned form for everything below.
df["question_text"] = df["question_text"].apply(cleanup_text)
print('example of the question text values:',df['question_text'].head().values)

took = time.time()-t_start
print('Time it took:',took)

# Character count of every cleaned question.
len_series = df['question_text'].str.len()
max_len = len_series.max()
# Despite its name, avg_len is the 89th percentile of the length, not the mean.
# NOTE(review): these lengths are in characters, yet avg_len is later turned
# into max_len and used as the padded number of tokens -- confirm that a
# character-based value is intended there.
avg_len = len_series.quantile(0.89)
print('max length of sequences:',max_len)
print('avg length of sequences:',avg_len)

# Density plot of the length distribution.
sns.set(color_codes=True)
plt.figure(figsize=(20, 8))
sns.kdeplot(len_series)
plt.show()


In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

#dim of vectors
dim = 100
# max words in vocab
num_words = 50000
# length every question is padded / truncated to
max_len = int(avg_len)

# Split first so that the held-out rows cannot influence the vocabulary.
print('spliting data')
df_train,df_test = train_test_split(df, random_state=1)

print('Fiting tokenizer')
## Tokenize the sentences
# Fit on the training split only: fitting on the full frame would leak
# word statistics from the hold-out rows into the word index.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(df_train['question_text'])

print('text to sequence')
x_train = tokenizer.texts_to_sequences(df_train['question_text'])
x_test = tokenizer.texts_to_sequences(df_test['question_text'])

print('pad sequence')
## Pad the sentences
x_train = pad_sequences(x_train,maxlen=max_len)
x_test = pad_sequences(x_test, maxlen=max_len)

## Get the target values
y_train = df_train['target'].values
y_test = df_test['target'].values

print(x_train.shape)
print(y_train.shape)


In [None]:
from keras.layers import Dense, Input,Embedding, Dropout, Activation, CuDNNLSTM,BatchNormalization,concatenate,SpatialDropout1D
from keras.layers import Bidirectional, GlobalMaxPool1D, Concatenate, GlobalAveragePooling1D,Average,Conv1D,GlobalMaxPooling1D,AlphaDropout
from keras.layers import MaxPooling1D,UpSampling1D,RepeatVector,LSTM,TimeDistributed,Flatten,Add, Lambda, Dot
from keras.models import Model
from keras.callbacks import Callback,EarlyStopping
from keras.engine import Layer
from keras.initializers import Ones, Zeros
import keras.backend as K
from keras import regularizers
from keras import constraints
from keras import optimizers

def f1(y_true, y_pred):
    '''
    Batch-wise F1 score built from batch-wise precision and recall.

    Adapted from
    https://stackoverflow.com/questions/43547402/how-to-calculate-f1-macro-in-keras

    Precision is the share of predicted positives that are true positives,
    recall the share of actual positives that are true positives. The
    backend epsilon is added to every denominator to avoid division by zero.
    '''
    def _rounded_count(tensor):
        # Clip to [0, 1], round, and sum the result.
        return K.sum(K.round(K.clip(tensor, 0, 1)))

    hits = _rounded_count(y_true * y_pred)
    batch_precision = hits / (_rounded_count(y_pred) + K.epsilon())
    batch_recall = hits / (_rounded_count(y_true) + K.epsilon())
    return 2*((batch_precision*batch_recall)/(batch_precision+batch_recall+K.epsilon()))

# https://ai.google/research/pubs/pub46697
adam = optimizers.Adam()
# Read the optimizer's learning rate once and reuse it below.
learning_rate = K.eval(adam.lr)
print('LR:',learning_rate)
# 0.001 = learning rate in adam
# optimal batch size ~ eps *N, where eps = learning rate and N = training size
batch_size = int(x_train.shape[0]*learning_rate)
print('Batch size = ',batch_size)

# https://blog.openai.com/quantifying-generalization-in-reinforcement-learning/
# Every trainable layer carries the same L2 penalty (0.0001); max_len doubles
# as the width of the convolution, recurrent and dense layers.
inp1 = Input(shape=(max_len,))
x = Embedding(num_words, dim,trainable = True,embeddings_regularizer=regularizers.l2(0.0001))(inp1)
x = SpatialDropout1D(0.1)(x)
# filter
x = Conv1D(max_len,kernel_size=5,kernel_regularizer=regularizers.l2(0.0001))(x)
x = MaxPooling1D()(x)
x = SpatialDropout1D(0.1)(x)
# memory
x = Bidirectional(CuDNNLSTM(int(max_len/2), return_sequences=True,kernel_regularizer=regularizers.l2(0.0001)))(x)
x = GlobalMaxPool1D()(x)
x = Dropout(0.1)(x)
x = BatchNormalization()(x)
x = Dense(max_len, activation='relu',kernel_regularizer=regularizers.l2(0.0001))(x)
x = Dropout(0.2)(x)
x = BatchNormalization()(x)
# single sigmoid unit: probability of the positive class
x = Dense(1, activation='sigmoid',kernel_regularizer=regularizers.l2(0.0001))(x)

model = Model(inputs=inp1, outputs=x)

model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy',f1])

# summary() prints the table itself and returns None, so it is not wrapped
# in print() (that would emit an extra "None" line).
model.summary()
patience = 10
# if you have a lot of data, you can use task specific embeddings
# 20% of the training rows are held out for validation; training stops once
# the monitored validation quantity has not improved for `patience` epochs.
# NOTE(review): the class_weights computed earlier are not passed to fit()
# here -- confirm whether that is intended.
history = model.fit(x_train,y_train,
                      batch_size=batch_size,
                      validation_split=0.2,
                      epochs=100,
                      #overfits rather soon
                      callbacks=[EarlyStopping(patience=patience)])

print('training finished...')

In [None]:
# One panel per tracked quantity, each showing the training curve and its
# validation counterpart.
_, ax = plt.subplots(1, 3, figsize=(20, 8))
for panel, metric in zip(ax, ('loss', 'acc', 'f1')):
    val_metric = 'val_' + metric
    panel.plot(history.history[metric], label=metric)
    panel.plot(history.history[val_metric], label=val_metric)
    panel.legend()
    panel.set_title(metric)

plt.show()

In [None]:
import datetime
print(datetime.datetime.now())

# Same routine for the training split and then the hold-out split: predict
# probabilities, search the best F1 threshold on that very split, binarise
# and print a classification report. After the loop `search_result` and
# `y_pred` hold the values computed for the hold-out split.
for report_title, features, labels in (
        ('RESULTS ON TRAINING SET:\n', x_train, y_train),
        ('RESULTS ON TEST SET:\n', x_test, y_test)):
    y_pred = model.predict(features,batch_size=batch_size, verbose=1)
    search_result = threshold_search(labels, y_pred)
    print(search_result)
    y_pred = (y_pred>search_result['threshold']).astype(int)

    print(report_title,classification_report(labels,y_pred))

In [None]:
print('fiting final model...')
from keras.models import clone_model

# Train for as many epochs as it took to reach the lowest validation loss in
# the previous run. With early stopping this equals len(loss) - patience,
# and it is also correct when training ran through every epoch without stopping.
n_epochs = int(np.argmin(history.history['val_loss'])) + 1

# Start from freshly initialised weights. Calling fit() on the existing model
# would continue training the already-fitted network instead of retraining
# it on all training rows for n_epochs.
model = clone_model(model)
model.compile(loss='binary_crossentropy', optimizer=optimizers.Adam(), metrics=['accuracy',f1])
history = model.fit(x_train,y_train, batch_size=batch_size, epochs=n_epochs)

print('fitting on full data done...')

In [None]:
# Training curves of the final fit (no validation data in this run).
_, ax = plt.subplots(1, 2, figsize=(12, 6))
for panel, metric in zip(ax, ('loss', 'f1')):
    panel.plot(history.history[metric], label=metric)
    panel.legend()
    panel.set_title(metric)

plt.show()

# Re-tune the decision threshold on the hold-out split and report on it.
y_pred = model.predict(x_test,batch_size=batch_size, verbose=1)
search_result = threshold_search(y_test, y_pred)
y_pred = (y_pred>search_result['threshold']).astype(int)

print(classification_report(y_test,y_pred))

In [None]:
#submission
print('Loading test data...')
df_final = pd.read_csv('../input/test.csv')
# Apply exactly the preprocessing used on the training questions (fill
# missing text, then cleanup_text). The tokenizer was fitted on cleaned
# text, so feeding it raw questions would not match what the model saw.
df_final["question_text"] = df_final["question_text"].fillna("_##_").apply(cleanup_text)

x_final=tokenizer.texts_to_sequences(df_final['question_text'])
x_final = pad_sequences(x_final,maxlen=max_len)

# Binarise with the threshold found by the most recent threshold search.
y_pred = model.predict(x_final,batch_size=batch_size,verbose=1)
y_pred = (y_pred > search_result['threshold']).astype(int)
print(y_pred[:5])

df_subm = pd.DataFrame()
df_subm['qid'] = df_final.qid
# predict() returns one column per output unit; flatten to a 1-D column.
df_subm['prediction'] = y_pred.ravel()
print(df_subm.head())
df_subm.to_csv('submission.csv', index=False)