In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import gc
import os
import re
os.environ['OMP_NUM_THREADS'] = '2'
from sklearn.metrics import f1_score, roc_auc_score

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_colwidth', -1)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, GroupShuffleSplit
from keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau
from keras.preprocessing import text, sequence

In [None]:
# Read only the question text and the label from the training file; the test file is read in full.
train_df = pd.read_csv("../input/train.csv",usecols=['question_text','target'])
test_df = pd.read_csv("../input/test.csv")
# Hold out 0.1% of the training rows as a validation frame; the fixed random_state keeps the split repeatable.
train_df, val_df = train_test_split(train_df, test_size=0.001, random_state = 200)

In [None]:
# Absolute number of rows per target value in the training split.
train_df['target'].value_counts()

In [None]:
# Relative frequency of each target value in the validation split.
val_df['target'].value_counts(normalize = True)

In [None]:
%%time
# Matches every character that is not an ASCII letter, a digit or '$'.
regex = re.compile(r'[^a-zA-Z0-9\$]')


def clear_digit(m):
    """Return capture groups 1 and 3 of match ``m`` joined together.

    Everything else the pattern matched (groups 2, 4, 5, ...) is dropped.
    """
    return m.group(1) + m.group(3)


def clear_dot_name(m):
    """Return capture groups 1 and 3 of match ``m`` followed by one space."""
    return m.group(1) + m.group(3) + " "


def sep_digit_leter(m):
    """Return capture groups 1 and 2 of match ``m`` separated by one space."""
    return m.group(1) + " " + m.group(2)


# Literal replacements applied first, in this exact order. "'s " and "'d " are
# swapped for letter-only placeholder words so that they survive the later
# removal of non-alphanumeric characters.
_EARLY_LITERALS = (
    ('’', "'"),
    ('$', ' $ '),
    ("'s ", ' sssssssss '),
    ("i'm ", 'i am '),
    ("I'm ", 'I am '),
    ("'re ", ' are '),
    ("'ve ", ' have '),
    ("won't ", 'will not '),
    ("n't ", ' not '),
    ("'ll ", ' will '),
    ("'d ", ' ddddddddd '),
    ('e.g.', ' '),
)

# Literal replacements applied after the [math] substitution.
_TITLE_LITERALS = (
    ('B.Tech', ' BS '),
    ('M.Tech', ' MS '),
    ('Mr. ', ' Mr '),
    ('Mrs. ', ' Mrs '),
    ('Ms. ', ' Ms '),
)

# Literal replacements applied on the lower-cased text: the placeholders are
# turned back into punctuation tokens, then a few vocabulary variants are merged.
_LATE_LITERALS = (
    (' aaaaaaaaa ', ' . '),
    (' bbbbbbbbb ', ' , '),
    (' ccccccccc ', ' ? '),
    (' vvvvvvvvv ', ' ! '),
    (' sssssssss ', " 's "),
    (' ddddddddd ', " 'd "),
    ('quorans', 'quora'),
    ('quoran', 'quora'),
    ('qoura', 'quora'),
    ('cryptocurrencies', 'bitcoin'),
    ('redmi', 'phone'),
    ('oneplus', 'phone'),
    ('lenovo', 'laptop'),
)

# Raw strings keep the backslashes as regex escapes instead of (invalid)
# Python string escapes; the compiled patterns are the same as before.
_MATH_RE = re.compile(r'\[math\].*?math\]')
_LINK_RE = re.compile(r"(http|Http|www\.).*?( |$)")
_THOUSANDS_RE = re.compile(r"([0-9])(,)([0-9])")
_DIGIT_LETTER_RE = re.compile(r"([0-9])([a-z])")
_LETTER_DIGIT_RE = re.compile(r"([a-z])([0-9])")
# NOTE(review): group 4 (a lowercase letter right after the dot) is matched but
# not re-emitted by clear_digit, e.g. 'A.b' becomes 'A' - confirm this is intended.
_INITIALS_RE = re.compile(r"([A-Z])(\.)([A-Z]{0,1})([a-z]{0,1})(\.{0,1})")
_DOTS_RE = re.compile(r'\.+')
_COMMAS_RE = re.compile(r',+')


def _preprocess_one(x):
    """Apply the complete cleaning pipeline to a single string and return it."""
    for old, new in _EARLY_LITERALS:
        x = x.replace(old, new)

    x = _MATH_RE.sub(' equation ', x)
    for old, new in _TITLE_LITERALS:
        x = x.replace(old, new)

    x = _LINK_RE.sub(' link ', x)
    x = _THOUSANDS_RE.sub(clear_digit, x)          # '5,000' -> '5000'
    x = _DIGIT_LETTER_RE.sub(sep_digit_leter, x)   # '5kg'   -> '5 kg'
    x = _LETTER_DIGIT_RE.sub(sep_digit_leter, x)   # 'a5'    -> 'a 5'
    x = _INITIALS_RE.sub(clear_digit, x)           # 'U.S.'  -> 'US'

    # Protect sentence punctuation with placeholder words before stripping
    # every remaining non-alphanumeric character.
    x = _DOTS_RE.sub(' aaaaaaaaa ', x)
    x = _COMMAS_RE.sub(' bbbbbbbbb ', x)
    x = x.replace('?', ' ccccccccc ')
    x = x.replace('!', ' vvvvvvvvv ')
    x = regex.sub(' ', x)

    x = x.replace(' US ', ' USA ')
    x = x.lower()
    for digit in '0123456789':
        x = x.replace(digit, '#')

    for old, new in _LATE_LITERALS:
        x = x.replace(old, new)
    return x


def preprocessing(X):
    """Clean an iterable of question strings and return the results as a list.

    Each string goes through the same ordered steps: contractions are expanded,
    ``[math]...math]`` spans become ``equation``, links become ``link``, digits
    and letters are separated, punctuation ('.', ',', '?', '!', "'s", "'d") is
    kept as stand-alone tokens, every other non-alphanumeric character except
    '$' is replaced by a space, the text is lower-cased, every digit becomes
    '#', and a few vocabulary variants are merged.

    Parameters
    ----------
    X : iterable of str
        Raw texts (a list or a pandas Series both work).

    Returns
    -------
    list of str
        One cleaned string per input element, in input order.
    """
    # One pass over the data: every string runs through the whole pipeline,
    # instead of rebuilding the full list once per replacement step.
    return [_preprocess_one(x) for x in X]

#train_df['question_text'].iloc[37859:37860] ['question_text'] = "What are Loy Machedo's thoughts on evil spirit?"

# Store the cleaned version of every question in a new 'transform' column of each frame.
train_df['transform'] = preprocessing(train_df['question_text'])
val_df['transform'] = preprocessing(val_df['question_text'])
test_df['transform'] = preprocessing(test_df['question_text'])

#Out: 'abdE'

In [None]:
#train_df['transform'] = train_df['transform'].apply(lambda x: " ".join([i if (len(i) <= 2 or i.upper() != i) else "something" for i in x.split() ]).lower() )
#test_df['transform'] = test_df['transform'].apply(lambda x: " ".join([i if (len(i) <= 2 or i.upper() != i) else "something" for i in x.split()]).lower() )
#val_df['transform'] = val_df['transform'].apply(lambda x: " ".join([i if (len(i) <= 2 or i.upper() != i) else "something" for i in x.split() ]).lower())

In [None]:
# Preview the first rows, including the new 'transform' column.
train_df.head()

In [None]:
# Pull the cleaned texts and the labels out of the frames as NumPy arrays.
X_train, X_val, X_test = (
    frame['transform'].values for frame in (train_df, val_df, test_df)
)
y_train, y_val = (frame['target'].values for frame in (train_df, val_df))


In [None]:
%%time
# Every sequence is padded or truncated to this many tokens.
maxlen = 50

# Only tab and newline are passed as filter characters; the vocabulary is fitted on the training texts only.
tokenizer = text.Tokenizer(filters='\t\n')
tokenizer.fit_on_texts(list(X_train))

# tokenizer = text.Tokenizer(filters='\t\n',oov_token=set(list(oov[0].values)))
# tokenizer.fit_on_texts(list(X_train))
# Replace the text arrays by lists of integer word ids (the X_* names are rebound here).
X_train = tokenizer.texts_to_sequences(X_train)
X_val = tokenizer.texts_to_sequences(X_val)
X_test = tokenizer.texts_to_sequences(X_test)

# Pad at the end and cut at the end, so the first `maxlen` tokens of each text are kept.
x_train = sequence.pad_sequences(X_train, maxlen=maxlen, padding='post',truncating='post')
x_val = sequence.pad_sequences(X_val, maxlen=maxlen, padding='post', truncating='post')
x_test = sequence.pad_sequences(X_test, maxlen=maxlen, padding='post', truncating='post')

In [None]:
word_index = tokenizer.word_index
# One 300-dimensional float32 row per word id, plus one extra row so the
# largest id is a valid row index; rows stay zero until a vector is loaded.
embedding_matrix1 = np.zeros((max(word_index.values()) + 1, 300), dtype='float32')
# Second matrix with the same shape and dtype, also all zeros.
embedding_matrix2 = np.zeros_like(embedding_matrix1)
#embedding_matrix3 = np.zeros((max(list(word_index.values())) + 1, 300), dtype = 'float32')
len(word_index)

In [None]:
# embdedding setup
# Source https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html
#embeddings_index = {}
# Copy the GloVe vector of every word that occurs in word_index into embedding_matrix1.
# The context manager closes the file even if a line fails to parse; the encoding is
# given explicitly (as in the Paragram cell) instead of relying on the platform default.
with open('../input/embeddings/glove.840B.300d/glove.840B.300d.txt', encoding="utf8") as f:
    for line in tqdm(f):
        values = line.split(" ")
        # Diagnostic marker for unexpectedly short lines; such a line is still processed below.
        if (len(values) < 200):
            print("a")
        word = values[0]
        if word not in word_index:
            continue
        embedding_matrix1[word_index[word]] = np.asarray(values[1:], dtype='float32')

In [None]:
# Copy the Paragram vector of every word that occurs in word_index into embedding_matrix2.
# Undecodable bytes are dropped (errors='ignore'); the context manager closes the file
# even if a line fails to parse.
with open('../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt', encoding="utf8", errors='ignore') as f:
    for line in tqdm(f):
        values = line.split(" ")
        word = values[0]
        if word in word_index:
            embedding_matrix2[word_index[word]] = np.asarray(values[1:], dtype='float32')

In [None]:
#tmp = embedding_matrix1.sum(axis=1)
# Table of the vocabulary: column 0 holds the word, column 1 its integer id.
tmp = pd.DataFrame(list(tokenizer.word_index.items()))
# Flag words whose embedding row sums to exactly zero. Row 0 of each matrix is skipped with [1:];
# this assumes word_index lists the ids 1..V in ascending order - TODO confirm.
tmp[2] = (embedding_matrix1.sum(axis=1)==0)[1:]
tmp[3] = (embedding_matrix2.sum(axis=1)==0)[1:]
#tmp[4] = (embedding_matrix3.sum(axis=1)==0)[1:]
#a = tmp[tmp[2]][0][:1000].values
#a1 = tmp[tmp[2]][0][:1000].index

# Keep at most the first 4000 words that are flagged in either matrix.
tmp = tmp[tmp[2] | tmp[3]][:4000]
name = tmp[0].values
indexes = tmp[1].values
# Zero the rows of the selected words in both matrices, so a vector present in only one of them is discarded.
embedding_matrix1[indexes] = 0
embedding_matrix2[indexes] = 0
#embedding_matrix3[indexes] = 0


In [None]:
%%time
# Build a second vocabulary: words listed in `name` get consecutive ids starting at 1,
# every other word is mapped to 0.
# `name` is a NumPy array, so `word in name` would scan the whole array for each
# vocabulary entry; a set gives the same answer with a constant-time lookup.
name_lookup = set(name)
token_name = {}
cnt = 1
for i in tokenizer.word_index:
    if i in name_lookup:
        token_name[i] = cnt
        cnt += 1
    else:
        token_name[i] = 0

# From here on the tokenizer encodes texts with the new ids.
tokenizer.word_index = token_name

In [None]:
# Labels as plain arrays.
y_train = train_df['target'].values
y_val = val_df['target'].values

# Encode the cleaned texts again, this time with the word index that is
# currently installed on the tokenizer.
X_train = tokenizer.texts_to_sequences(train_df['transform'].values)
X_val = tokenizer.texts_to_sequences(val_df['transform'].values)
X_test = tokenizer.texts_to_sequences(test_df['transform'].values)

# Same padding and truncation settings as for the first set of sequences.
x_train_name = sequence.pad_sequences(
    X_train, maxlen=maxlen, padding='post', truncating='post')
x_val_name = sequence.pad_sequences(
    X_val, maxlen=maxlen, padding='post', truncating='post')
x_test_name = sequence.pad_sequences(
    X_test, maxlen=maxlen, padding='post', truncating='post')

In [None]:
# Keep both padded id sequences next to each training row (one array per cell),
# so they can be selected together with the target when the frame is filtered later.
train_df['token'] = list(x_train)
train_df['token_name'] = list(x_train_name)

In [None]:
# Constant weight table with one 600-wide row per id of the `name` vocabulary (plus id 0).
# Every row is 1 except row 0, which is 0; build_model loads it into a frozen Embedding
# and multiplies it with the trainable embedding, so id 0 contributes a zero vector.
mask_zeros = np.ones((name.shape[0] + 1, 600))
mask_zeros[0] = 0

In [None]:
from keras.models import Sequential
from keras.models import Model, load_model
from keras.layers import CuDNNGRU, CuDNNLSTM, Dense, Bidirectional, Input, SpatialDropout1D,Embedding, \
        BatchNormalization, GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate, Conv1D, Multiply, Add

In [None]:
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints
from keras import backend as K

from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints
from keras import backend as K

class Attention(Layer):
    """Attention pooling over axis 1 of a 3D input.

    A score is computed for every step from a learned weight vector (plus an
    optional per-step bias), passed through tanh and exp, normalised over the
    steps, and used to form a weighted sum of the input along axis 1.
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """Store the configuration.

        step_dim: number of steps the scores are reshaped to in ``call``;
            presumably it has to equal the length of input axis 1 - verify against callers.
        W_regularizer, b_regularizer: resolved through ``regularizers.get``.
        W_constraint, b_constraint: resolved through ``constraints.get``.
        bias: whether a per-step bias weight is created and added to the scores.
        **kwargs: forwarded to ``Layer.__init__``.
        """
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # Real value is set in build() from the last input dimension.
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the weights: W with one entry per feature, b with one entry per step."""
        assert len(input_shape) == 3

        # NOTE(review): the shape is passed as first positional argument while `name` is a
        # keyword; this looks like the older Keras add_weight call form - confirm it is
        # accepted by the installed Keras version.
        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return None, i.e. no mask is handed to the following layers."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Flatten all steps to rows, project every row onto W, then fold the
        # scores back into one row of step_dim scores per sample.
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # Masked steps get weight 0 before normalisation.
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # Normalise over the steps; epsilon guards against division by zero.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        # Add a trailing axis so the weights broadcast over the feature axis.
        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Output shape is (input_shape[0], features_dim)."""
        return input_shape[0],  self.features_dim

In [None]:
class GlobalMinPooling1D(Layer):
    """Layer that reduces its input with ``K.min`` along axis 1."""

    def compute_output_shape(self, input_shape):
        """Keep entries 0 and 2 of the input shape; axis 1 is reduced away."""
        first_dim = input_shape[0]
        last_dim = input_shape[2]
        return (first_dim, last_dim)

    def call(self, x, mask=None):
        """Return the minimum of ``x`` over axis 1; ``mask`` is not used."""
        pooled = K.min(x, axis=1)
        return pooled

    def compute_mask(self, input, input_mask=None):
        """Return None so that no mask reaches the following layers."""
        return None

#     def compute_output_shape(self, input_shape):
#         #return input_shape[0], input_shape[-1]
#         return input_shape[0],  self.features_dim
    


In [None]:
class GlobalSumPooling1D(Layer):
    """Layer that reduces its input with ``K.sum`` along axis 1."""

    def compute_output_shape(self, input_shape):
        """Keep entries 0 and 2 of the input shape; axis 1 is reduced away."""
        first_dim = input_shape[0]
        last_dim = input_shape[2]
        return (first_dim, last_dim)

    def call(self, x, mask=None):
        """Return the sum of ``x`` over axis 1; ``mask`` is not used."""
        pooled = K.sum(x, axis=1)
        return pooled

    def compute_mask(self, input, input_mask=None):
        """Return None so that no mask reaches the following layers."""
        return None

#     def compute_output_shape(self, input_shape):
#         #return input_shape[0], input_shape[-1]
#         return input_shape[0],  self.features_dim
    


In [None]:
from keras.layers import Dropout
from keras.initializers import he_uniform
def build_model(input_layer, input_layer_name,  embedding_matrix):
    """Wire the network and return its output tensor.

    Parameters
    ----------
    input_layer : Keras tensor
        Word ids indexing the rows of ``embedding_matrix``.
    input_layer_name : Keras tensor
        Word ids of the ``name`` vocabulary (0 for every other word).
    embedding_matrix : numpy.ndarray
        Frozen pretrained vectors; it is loaded into a 600-wide Embedding,
        so it must have 600 columns.

    Returns
    -------
    Keras tensor
        Sigmoid output of the final Dense(1) layer.

    Reads the module-level ``name`` and ``mask_zeros``.
    """
    # Frozen pretrained vectors.
    x1 = Embedding(embedding_matrix.shape[0], 600, weights=[embedding_matrix], trainable= False)(input_layer)
    # Trainable vectors for the words of the `name` vocabulary.
    x2 = Embedding(name.shape[0] + 1, 600,  trainable= True)(input_layer_name)
    # Constant 0/1 table: zeroes the trainable vector of id 0, leaves all others unchanged.
    x3 = Embedding(name.shape[0] + 1, 600,  weights=[mask_zeros], trainable= False)(input_layer_name)
    x = Multiply()([x2, x3])
    x = Add()([x1, x])

    x = Bidirectional(CuDNNLSTM(128, kernel_initializer=he_uniform(seed=0), return_sequences=True))(x)
    x = SpatialDropout1D(0.2)(x)
    y = Bidirectional(CuDNNGRU(128,kernel_initializer=he_uniform(seed=0), return_sequences=True))(x)

    # Only average and max pooling feed the classifier. The min-pooling branch that used
    # to be built here was never connected to the output and has been removed.
    a = GlobalAveragePooling1D()(y)
    b = GlobalMaxPooling1D()(y)
    x = concatenate([a, b])
    x = Dense(32, activation="relu",kernel_initializer=he_uniform(seed=0))(x)
    x = Dense(1, activation="sigmoid",kernel_initializer=he_uniform(seed=0))(x)
    return x

In [None]:
def threshold_search(y_true, y_proba):
    """Scan the cut-offs 0.00, 0.01, ..., 0.99 and keep the one with the highest F1.

    ``y_proba > cutoff`` is used as the predicted label. A later cut-off only
    replaces the current best when its score is strictly higher. Returns a dict
    with the keys 'threshold' and 'f1' (both 0 if no cut-off scores above 0).
    """
    search_result = {'threshold': 0, 'f1': 0}
    candidates = [step * 0.01 for step in range(100)]
    for cutoff in tqdm(candidates):
        current = f1_score(y_true=y_true, y_pred=y_proba > cutoff)
        if current > search_result['f1']:
            search_result['threshold'] = cutoff
            search_result['f1'] = current
    return search_result

In [None]:


# Replace the shuffled index left by the train/validation split with a fresh 0..n-1 index (in place).
train_df.reset_index(drop = True, inplace = True)
val_df.reset_index(drop= True, inplace = True)

In [None]:
# Split the training rows by label, keeping only the two id sequences and the target.
positive = train_df[train_df['target'] == 1][['token','token_name','target']]
negative = train_df[train_df['target'] == 0][['token','token_name','target']]

In [None]:
# Train four models and store the test predictions of model C in test_df[C].
for C in range(4):
    # Models 0 and 1 stack the matrices as [matrix1 | matrix2], models 2 and 3 as [matrix2 | matrix1].
    if C < 2:
        embedding_matrix = np.concatenate([embedding_matrix1, embedding_matrix2], axis=1)
    else:
        embedding_matrix = np.concatenate([embedding_matrix2, embedding_matrix1], axis=1)

    # Warm-up set: a random half of the negatives (seed depends on C) plus all positives.
    # The other half (neg2) is not used, so no second frame is built from it.
    neg1, neg2 = train_test_split(negative, test_size = 0.5, random_state = C*100)
    df1 = pd.concat([neg1, positive], ignore_index=True)

    # Input length follows the padding length used when the sequences were built.
    input_layer = Input((maxlen,), name="i1")
    input_layer_name = Input((maxlen, ), name = "i2")
    output_layer = build_model(input_layer, input_layer_name, embedding_matrix)
    model = Model([input_layer, input_layer_name], output_layer)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    # Epoch 1: sub-sampled data, small batches.
    model.fit({"i1": np.stack(df1['token'].values), "i2": np.stack(df1['token_name'].values)},
              df1['target'], batch_size=128, verbose=2, shuffle=True,
              epochs=1, validation_data=({"i1": x_val, "i2": x_val_name}, val_df['target']))

    # Epoch 2: the full training set.
    model.fit({"i1": x_train, "i2": x_train_name}, train_df['target'], batch_size=256, verbose=2, shuffle=True,
              epochs=1, validation_data=({"i1": x_val, "i2": x_val_name}, val_df['target']))

    # NOTE(review): layers[2] is presumably the trainable `name` embedding created in
    # build_model; the position depends on how Keras orders the layers - confirm.
    model.layers[2].trainable = False
    # Compiling again makes the changed `trainable` flag take effect.
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    # Epoch 3: the full training set again, with that layer frozen.
    model.fit({"i1": x_train, "i2": x_train_name}, train_df['target'], batch_size=512, verbose=2, shuffle=True,
              epochs=1, validation_data=({"i1": x_val, "i2": x_val_name}, val_df['target']))

    test_df[C] = model.predict({"i1": x_test, "i2": x_test_name}, batch_size=1024).flatten()
    #best_search = threshold_search(val_df['target'].values, val_df[C].values)
    #print(best_search)

In [None]:
#val_df['preds'] = (val_df[0] + val_df[1] + val_df[2] + val_df[3])/4
# Average the four per-model prediction columns written by the training loop.
test_df['preds'] = (test_df[0] + test_df[1] + test_df[2] + test_df[3])/4

In [None]:
# Turn the averaged score into a 0/1 label with a fixed cut-off of 0.35.
# The builtin `int` is used because the `np.int` alias was deprecated in NumPy 1.20
# and removed in 1.24; both name the same type.
y_te = (test_df['preds'] > 0.35).astype(int)

# Write one row per test question: its id and the predicted label.
submit_df = pd.DataFrame({"qid": test_df["qid"], "prediction": y_te})
submit_df.to_csv("submission.csv", index=False)