In [None]:
import tensorflow as tf

# A short-lived session is opened only to ask TensorFlow which devices it
# can see; it is closed again as soon as the `with` block exits.
with tf.Session() as device_probe:
    devices = device_probe.list_devices()

# Bare expression so the notebook renders the device list as the cell output.
devices

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc
import time
from tqdm import tqdm
import math
import pickle

from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers
from keras import backend as K
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, concatenate
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.optimizers import Adam
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn import metrics

class Attention(Layer):
    """Attention pooling over the step axis of a 3-D tensor.

    Every timestep is scored with a learned vector ``W`` (plus an optional
    per-step bias ``b``), the scores are squashed with ``tanh``, turned into
    positive weights with ``exp`` and normalised so they sum to one along the
    step axis. The layer returns the weighted sum of the timesteps.

    Input:  3-D tensor ``(batch, step_dim, features)``.
    Output: 2-D tensor ``(batch, features)``.

    Args:
        step_dim: Number of timesteps of the input (size of axis 1).
        W_regularizer, b_regularizer: Optional regularizers for ``W`` / ``b``.
        W_constraint, b_constraint: Optional constraints for ``W`` / ``b``.
        bias: Whether to add the per-step bias before the ``tanh``.
        **kwargs: Forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        # Run the base constructor first: it initialises `supports_masking`
        # itself (to False in Keras 2.x), so assigning the flag before this
        # call would be silently overwritten.
        super(Attention, self).__init__(**kwargs)
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0  # filled in by build() once the input shape is known

    def build(self, input_shape):
        """Create the scoring vector ``W`` and, if enabled, the bias ``b``."""
        assert len(input_shape) == 3

        # Pass `shape` by keyword; the positional-shape form only works
        # through Keras' legacy-signature compatibility shim.
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            # One bias value per timestep.
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zeros',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """The step axis is reduced away, so no mask is propagated."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over the step axis."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Score each timestep: flatten to (batch * steps, features), project
        # onto W, then fold back to (batch, steps).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # Zero the weight of masked timesteps before normalising.
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # epsilon guards against division by zero when every step is masked.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Output keeps the batch axis and the feature axis only."""
        return input_shape[0],  self.features_dim

    def get_config(self):
        """Return the constructor arguments so a saved model can be rebuilt."""
        config = super(Attention, self).get_config()
        config.update({
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        })
        return config

def load_and_prec(maxlen=300, max_features=None):
    """Load the train/test CSVs and turn the questions into padded id sequences.

    Steps: read ``../input/train.csv`` and ``../input/test.csv``, hold out 20%
    of the training rows for validation, fit a Keras ``Tokenizer`` on the
    training questions only, convert all three splits to integer sequences,
    left-pad/truncate them to ``maxlen`` and shuffle train and validation.

    Args:
        maxlen: Length every sequence is padded/truncated to.
        max_features: Optional vocabulary cap forwarded to
            ``Tokenizer(num_words=...)``. With the default ``None`` every word
            seen in training keeps its id, so ids can be as large as the
            vocabulary size. Pass the row count of the embedding table here
            to guarantee that no id falls outside it.

    Returns:
        ``(train_X, val_X, test_X, train_y, val_y, word_index)`` where the
        ``*_X`` values are 2-D integer arrays of width ``maxlen``, the ``*_y``
        values are the ``target`` columns, and ``word_index`` is the
        tokenizer's full word -> id mapping (not truncated by
        ``max_features``).

    Side effects:
        Seeds NumPy's *global* random generator with 2018. This is kept on
        purpose: later code that draws from ``np.random`` inherits the seed.
    """
    train_df = pd.read_csv("../input/train.csv")
    test_df = pd.read_csv("../input/test.csv")
    print("Train shape : ",train_df.shape)
    print("Test shape : ",test_df.shape)

    ## split to train and val
    train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=2018)

    ## create training, validation, and test sets
    # Missing questions are replaced by a placeholder token; a NaN would
    # otherwise reach the tokenizer as a float and crash it.
    print('Creating train, val, and test sets')
    train_X = train_df["question_text"].fillna("_##_").values
    val_X = val_df["question_text"].fillna("_##_").values
    test_X = test_df["question_text"].fillna("_##_").values

    ## Tokenize the sentences
    # The tokenizer is fitted on the training split only, so validation and
    # test words never influence the vocabulary.
    print('Tokenizing the sentences')
    tokenizer = Tokenizer(num_words=max_features)
    tokenizer.fit_on_texts(list(train_X))
    train_X = tokenizer.texts_to_sequences(train_X)
    val_X = tokenizer.texts_to_sequences(val_X)
    test_X = tokenizer.texts_to_sequences(test_X)

    ## Pad the sentences
    print('Padding Sequences for input datasets')
    train_X = pad_sequences(train_X, maxlen=maxlen, padding='pre')
    val_X = pad_sequences(val_X, maxlen=maxlen, padding='pre')
    test_X = pad_sequences(test_X, maxlen=maxlen, padding='pre')

    ## Get the target values
    train_y = train_df['target'].values
    val_y = val_df['target'].values

    ## Shuffling the data
    # The same permutation is applied to features and labels of a split so
    # rows stay aligned.
    print('Shuffling training and validation sets')
    np.random.seed(2018)
    trn_idx = np.random.permutation(len(train_X))
    val_idx = np.random.permutation(len(val_X))
    train_X = train_X[trn_idx]
    val_X = val_X[val_idx]
    train_y = train_y[trn_idx]
    val_y = val_y[val_idx]

    print('Complete - Returning Values')
    return train_X, val_X, test_X, train_y, val_y, tokenizer.word_index


def load_all_embeddings(vocabulary,max_features,embed_size):
    """Build one embedding matrix by averaging every pre-trained embedding found.

    Each entry of ``../input/embeddings/`` whose name contains ``glove``,
    ``wiki``, ``paragram`` or ``Google`` is loaded into its own
    ``(nb_words, embed_size)`` matrix; rows of words missing from that
    embedding keep a random initialisation. The matrices are then averaged
    element-wise. Other directory entries are skipped with a message.

    Args:
        vocabulary: Mapping word -> integer id. It is expected to be a Keras
            ``Tokenizer.word_index``, whose ids start at 1 (row 0 is left for
            the padding id).
        max_features: Upper bound on the number of rows of the matrix; words
            with an id at or above the row count are ignored.
        embed_size: Width of every embedding vector.

    Returns:
        ``numpy.ndarray`` of shape ``(nb_words, embed_size)`` with
        ``nb_words = min(max_features, len(vocabulary) + 1)``.

    Raises:
        ValueError: If no supported embedding is found in the directory.
    """
    # Ids are 1-based, so covering every word of a vocabulary smaller than
    # max_features takes len(vocabulary) + 1 rows; sizing by len(vocabulary)
    # makes the highest id index past the end of the matrix.
    nb_words = min(max_features, len(vocabulary) + 1)
    known_sources = ('glove', 'wiki', 'paragram', 'Google')

    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')

    def read_vectors(path, min_line_len=0, **open_kwargs):
        # Parse a "word v1 v2 ..." text file into {word: vector}. Lines of at
        # most `min_line_len` characters are dropped (used to skip the short
        # header line). The file is closed as soon as parsing is done.
        with open(path, **open_kwargs) as handle:
            return dict(get_coefs(*line.split(" ")) for line in handle
                        if len(line) > min_line_len)

    def load_embedding(embed_file):
        # Load one embedding family, selected by substring of the entry name.
        if 'glove' in embed_file:
            embeddings_index = read_vectors(
                '../input/embeddings/glove.840B.300d/glove.840B.300d.txt')
        elif 'wiki' in embed_file:
            embeddings_index = read_vectors(
                '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec',
                min_line_len=100)
        elif 'paragram' in embed_file:
            embeddings_index = read_vectors(
                '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt',
                min_line_len=100, encoding="utf8", errors='ignore')
        elif 'Google' in embed_file:
            # gensim is only needed for the binary word2vec file, so it is
            # imported lazily.
            from gensim.models import KeyedVectors
            embeddings_index = KeyedVectors.load_word2vec_format(
                '../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin',
                binary=True)
            embedding_matrix = (np.random.rand(nb_words, embed_size) - 0.5) / 5.0
            for word, i in vocabulary.items():
                if i >= nb_words: continue
                if word in embeddings_index:
                    embedding_matrix[i] = embeddings_index.get_vector(word)

            del embeddings_index; gc.collect()
            return embedding_matrix
        else:
            raise ValueError('Unsupported embedding: %s' % embed_file)

        ## Read Embeddings for Specified File
        # np.stack needs a real sequence; a dict view is rejected by current
        # NumPy releases.
        all_embs = np.stack(list(embeddings_index.values()))
        emb_mean,emb_std = all_embs.mean(), all_embs.std()
        del all_embs

        ## Initialize embedding matrix with random values based on -
        ## the mean and std (Not all words are contained in every pre-train)
        embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
        for word, i in vocabulary.items():
            if i >= nb_words: continue
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None: embedding_matrix[i] = embedding_vector

        del embeddings_index; gc.collect()
        return embedding_matrix

    #Loop over embeddings and create output file
    embed_list = os.listdir('../input/embeddings/')
    embedding_matrices = []
    for embed in embed_list:
        # Stray files in the directory are reported and ignored instead of
        # aborting the whole load.
        if not any(key in embed for key in known_sources):
            print('Skipping unrecognised entry %s' % embed)
            continue
        embedding_matrices.append(load_embedding(embed))
        print('Loaded %s' % embed)

    if not embedding_matrices:
        raise ValueError('No supported embeddings found in ../input/embeddings/')

    #Average embedding matrice
    average_embedding_matrix = np.mean(embedding_matrices,axis=0)
    print(np.shape(average_embedding_matrix))

    return average_embedding_matrix

In [None]:
## Models
def joined_model(embedding_matrix, maxlen, max_features,embed_size):
    """Build and compile the four-branch recurrent classifier.

    All branches read the same frozen embedding of the token ids and their
    outputs are concatenated before a single sigmoid unit:

    1. three stacked bidirectional LSTMs with dropout, attention, Dense(128)
    2. two stacked bidirectional LSTMs followed by attention
    3. two stacked bidirectional LSTMs, global max pooling, Dense(256)
    4. one bidirectional GRU, average + max pooling, Dense(64), dropout

    Args:
        embedding_matrix: Initial (non-trainable) weights of the embedding layer.
        maxlen: Length of the input id sequences.
        max_features: Number of rows of the embedding layer.
        embed_size: Width of every embedding vector.

    Returns:
        The model, compiled with binary cross-entropy, the ``adam`` optimizer
        and the accuracy metric.
    """
    def bi_lstm(tensor):
        # Bidirectional CuDNN LSTM, 128 units per direction, full sequence out.
        return Bidirectional(CuDNNLSTM(128, return_sequences=True))(tensor)

    token_ids = Input(shape=(maxlen,))
    embedded = Embedding(max_features, embed_size,
                         weights=[embedding_matrix], trainable=False)(token_ids)

    # Branch 1: deep BiLSTM stack with dropout, then attention
    deep_att = bi_lstm(embedded)
    deep_att = Dropout(0.2)(deep_att)
    deep_att = bi_lstm(deep_att)
    deep_att = Dropout(0.2)(deep_att)
    deep_att = bi_lstm(deep_att)
    deep_att = Attention(maxlen)(deep_att)
    deep_att = Dense(128, activation="relu")(deep_att)

    # Branch 2: plain two-layer BiLSTM with attention
    plain_att = bi_lstm(embedded)
    plain_att = bi_lstm(plain_att)
    plain_att = Attention(maxlen)(plain_att)

    # Branch 3: two-layer BiLSTM reduced by global max pooling
    pooled_lstm = bi_lstm(embedded)
    pooled_lstm = bi_lstm(pooled_lstm)
    pooled_lstm = GlobalMaxPool1D()(pooled_lstm)
    pooled_lstm = Dense(256, activation="sigmoid")(pooled_lstm)

    # Branch 4: BiGRU summarised by both average and max pooling
    gru_seq = Bidirectional(CuDNNGRU(64, return_sequences=True))(embedded)
    avg_pool = GlobalAveragePooling1D()(gru_seq)
    max_pool = GlobalMaxPooling1D()(gru_seq)
    pooled_gru = concatenate([avg_pool, max_pool])
    pooled_gru = Dense(64, activation="relu")(pooled_gru)
    pooled_gru = Dropout(0.1)(pooled_gru)

    # Merge the four branch outputs and predict a single probability
    merged = concatenate([deep_att, plain_att, pooled_lstm, pooled_gru])
    probability = Dense(1, activation="sigmoid")(merged)

    model = Model(inputs=token_ids, outputs=probability)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

def train_pred(model, train_X, train_y, val_X, val_y, test_X, epochs=2):
    """Fit ``model``, reload its best checkpoint and predict on val and test.

    Training uses batches of 1024 and monitors ``val_loss``: the best epoch is
    written to ``model-checkpoint.hdf5`` and training stops early after one
    epoch without improvement. The checkpoint is loaded back before
    predicting. The validation F1 score is evaluated for every threshold from
    0.10 to 0.50 in steps of 0.01 and the best one is printed.

    Args:
        model: Compiled model exposing ``fit``, ``load_weights`` and ``predict``.
        train_X, train_y: Training inputs and labels.
        val_X, val_y: Validation inputs and labels.
        test_X: Test inputs.
        epochs: Maximum number of training epochs.

    Returns:
        ``(pred_val_y, pred_test_y, best_score)``: raw predictions for the
        validation and test inputs, and the best validation F1 score found
        (``0.0`` if no threshold scores above zero).
    """
    weights_path = "model-checkpoint.hdf5"
    training_callbacks = [
        ModelCheckpoint(weights_path, monitor='val_loss', verbose=1,
                        save_best_only=True, mode='min'),
        EarlyStopping(monitor="val_loss", mode="min", patience=1),
    ]

    model.fit(train_X, train_y,
              batch_size=1024,
              epochs=epochs,
              validation_data=(val_X, val_y),
              callbacks=training_callbacks)

    # Roll back to the epoch with the lowest validation loss.
    model.load_weights(weights_path)
    pred_val_y = model.predict([val_X], batch_size=256, verbose=0)

    # F1 for every candidate cut-off; the leading 0.0 is the floor that is
    # reported when nothing scores higher.
    candidate_scores = [
        metrics.f1_score(val_y, (pred_val_y > np.round(cutoff, 2)).astype(int))
        for cutoff in np.arange(0.1, 0.501, 0.01)
    ]
    best_score = max([0.0] + candidate_scores)

    print("Val F1 Score: {:.4f}".format(best_score))

    pred_test_y = model.predict([test_X], batch_size=1024, verbose=0)
    return pred_val_y, pred_test_y, best_score

## Get Model Data and Settings

In [None]:
## Text Input Settings
embed_size = 300      # width of each word vector
max_features = 75000  # number of rows in the embedding matrix
maxlen = 65           # number of tokens kept per question

# Tokenised and padded train / validation / test data plus the word -> id map.
# NOTE(review): load_and_prec(maxlen) builds its tokenizer without a
# vocabulary cap, so the returned id sequences may contain ids at or above
# max_features -- confirm the embedding lookup tolerates that.
(train_X, val_X, test_X,
 train_y, val_y, vocabulary) = load_and_prec(maxlen)

# One embedding matrix averaged over the available pre-trained embeddings
embed_matrix = load_all_embeddings(vocabulary, max_features, embed_size)

## Set Models, Fit, and Generate Predictions

In [None]:
# Build the multi-branch network on top of the averaged embedding matrix
model = joined_model(embed_matrix, maxlen, max_features, embed_size)

# Train for at most 8 epochs, then collect validation / test predictions
# together with the best validation F1 score
(pred_val_y,
 pred_test_y,
 best_score) = train_pred(model, train_X, train_y, val_X, val_y, test_X, epochs=8)

## Find Best Thresholds and Generate Scores

In [None]:
# Score every candidate cut-off between 0.10 and 0.50 (step 0.01) on the
# validation predictions.
thresholds = []
for thresh in (np.round(raw, 2) for raw in np.arange(0.1, 0.501, 0.01)):
    res = metrics.f1_score(val_y, (pred_val_y > thresh).astype(int))
    thresholds.append([thresh, res])
    print("F1 score at threshold {0} is {1}".format(thresh, res))

# Highest F1 first; the sort is stable, so among equal scores the lowest
# threshold stays in front.
thresholds = sorted(thresholds, key=lambda pair: pair[1], reverse=True)
best_thresh = thresholds[0][0]
print("Best threshold: ", best_thresh)

## Output Predictions

In [None]:
# Binarise into a new name: overwriting pred_test_y would destroy the raw
# probabilities, and re-running this cell after best_thresh changes would
# then silently keep the old labels.
# ravel() turns the (n, 1) model output into the flat vector a column expects.
pred_test_labels = (pred_test_y > best_thresh).astype(int).ravel()

test_df = pd.read_csv("../input/test.csv", usecols=["qid"])
assert len(test_df) == len(pred_test_labels), "one prediction per test row expected"

# Submission format: one row per question id with its 0/1 prediction
out_df = pd.DataFrame({"qid":test_df["qid"].values})
out_df['prediction'] = pred_test_labels
out_df.to_csv("submission.csv", index=False)