In [2]:
import argparse
import numpy as np 
import pandas as pd
import os
from collections import Counter
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import KFold, StratifiedKFold
from keras.preprocessing.sequence import pad_sequences
from collections import Counter , defaultdict
from keras.utils import Sequence
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.layers import Input, Embedding, Dense, Conv2D, MaxPool2D, concatenate
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D
from keras.models import Model
from keras.optimizers import Adam
from keras import backend as K
import tensorflow as tf
import Utils
from Architecture import lstm_gru_attn
from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
FOLDS = 3 #donot change
DATA_SPLIT_SEED = 1403 #donot change


In [3]:
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')    
def getxyfromdf(df, T): return pad_sequences(T.text_to_sequence(df.question_text.values), maxlen=config['seq_len'], padding='post'), df.target.values  if 'target' in df else None



In [5]:
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')    



class Tokenizer():  
    def __init__(self, max_vocab=400000):
        self.c = Counter()
        self.word2id, self.id2word = {"#pad#":0}  , ['#pad#']
        self.MAX_VOCAB = 400000
        return

    @staticmethod
    def tokenize(text):
        for each in "!\"#$%&()*+,-./:;<=>?@[\]^_`{|}~'’'":
            text = text.replace(each,'')        
        return text.split(' ')
        
    def fit_texts(self,texts, embeddings_index=None):
        for text in texts:
            self.c.update(self.tokenize(text))   
        if embeddings_index !=None: self.build_vocab(embeddings_index)
        
    def text_to_sequence(self,texts):
       return [[ self.word2id[x] if x in self.word2id else 0  for x in self.tokenize(text) ] for text in texts]
        
    def sequences_to_text(self,seqs):
        return [' '.join( self.id2word[i] for x in seq) for seq in seqs]

    def build_vocab(self, embedding_index):
        if embedding_index:
            all_embs = np.stack(embeddings_index.values())
            emb_mean,emb_std = all_embs.mean(), all_embs.std()
        else:
            emb_mean,emb_std = 0., 1.

        vocab_size = min(self.MAX_VOCAB, len(self.c))
        #self.embedding_matrix = np.random.normal( emb_mean, emb_std, (vocab_size, all_embs.shape[1]))
        self.embedding_matrix =  K.eval(tf.truncated_normal((vocab_size, 300), emb_mean, emb_std))
        self.unrepresented = []
        for ind,word in enumerate(sorted(self.c, key= lambda x: self.c[x], reverse=True)) :
            if ind > vocab_size: break

            self.word2id[word] = len(self.id2word)
            self.id2word.append(word)
            if word in embedding_index:
                self.embedding_matrix[ind] = embedding_index[word]
            elif word.lower() in embedding_index:
                self.embedding_matrix[ind] = embedding_index[word.lower()]
            else:
                self.unrepresented.append(word)
        return
################################
class Attention(Layer):
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        self.b = self.add_weight((input_shape[1],),
                                    initializer='zero',
                                    name='{}_b'.format(self.name),
                                    regularizer=self.b_regularizer,
                                    constraint=self.b_constraint) if self.bias else None

        self.built = True

    def compute_mask(self, input, input_mask=None): return None

    def call(self, x, mask=None):
        features_dim = self.features_dim
        step_dim = self.step_dim

        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias: eij += self.b
        eij = K.tanh(eij)
        a = K.exp(eij)
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim
####################  
def lstm_gru_attn(embedding_matrix):
    global config
    inp = Input(shape=(config['seq_len'],)) 
    x = Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1], weights = [embedding_matrix], trainable=True)(inp)    
    x = SpatialDropout1D(config['dropout'])(x)
    x = Bidirectional(CuDNNLSTM(config['internal_dim'], return_sequences=True))(x) 
    y = Bidirectional(CuDNNGRU(config['internal_dim'], return_sequences=True))(x)
    A1 = Attention(config['seq_len'])(x)
    A2 = Attention(config['seq_len'])(y)

    pool1 = GlobalAveragePooling1D()(y)
    pool2 = GlobalMaxPooling1D()(y)
    concat = concatenate([pool1, pool2, A1, A2])
    x = Dense(64, activation="relu")(concat)
    x = Dropout(config['dropout'])(x)
    output = Dense(1, activation='sigmoid')(x)

    opt = Adam(lr=config['lr'], decay=1e-6)

    model = Model(inputs=inp, outputs=output)
    model.compile(loss='binary_crossentropy', optimizer=opt)  
    print(model.summary())
    return model
    
################################

In [4]:
def one_round(model, model_dir, train_x, train_y, val_x, val_y, tx, epochs):
    callbacks = [   ReduceLROnPlateau(monitor='val_loss', factor=0.2,patience=1, min_lr=0.001),
                    ModelCheckpoint(filepath='%s/weights.{epoch:02d}-{val_loss:.3f}.hdf5' %model_dir, monitor='val_loss'),
                    Utils.Metrics()]
    #model.fit(train_x,train_y,validation_data = (val_x, val_y), batch_size=config['bs'], epochs=epochs, callbacks=callbacks)
    print('Averaging predictions...')        
    #averaging -outputs
    top_k = []
    for ck_file in os.listdir(model_dir):
        top_k.append((float(ck_file.split('-')[1][:5]), ck_file))
    top_k.sort()
    val_pred_y = np.zeros_like(val_y, dtype=float)
    test_y = np.zeros((tx.shape[0],1))
    print(top_k)
    CHKPOINTS = len(top_k) if len(top_k) <  config['top_k_checkpoints'] else config['top_k_checkpoints']
    for _,model_file in top_k[:config['top_k_checkpoints']]:
        model.load_weights(os.path.abspath(f'{model_dir}/{model_file}'))
        val_pred_y += model.predict(val_x).reshape(-1)
        test_y += model.predict(tx).reshape(-1)
    return val_pred_y/CHKPOINTS, test_y/CHKPOINTS


##################### HELPER FUNC ENDS ####################
def preprocess(train_df, test_df, embeddings_index):
    T = Tokenizer()
    T.fit_texts(train_df.question_text.values)
    T.fit_texts(test_df.question_text.values)
    T.build_vocab(embeddings_index)
    return T

def train_full(train_df, test_df, embeddings_index):
    global FOLDS, config
    T = preprocess(train_df, test_df, embeddings_index)
    train_X, train_Y = getxyfromdf(train_df, T)
    test_X, _ = getxyfromdf(test_df, T)      
    train_X = train_X[:1000]
    train_Y = train_Y[:1000]
    test_X = test_X[:1000]
    model = config['func'](T.embedding_matrix, **config)
    #Utils.search_lr(model,train_X, train_Y)
    print('Splitting stratified k fold')
    splits = list(StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=DATA_SPLIT_SEED).split(train_X, train_Y))    
    
    train_meta = np.zeros_like(train_Y)
    test_meta = np.zeros((test_X.shape[0],1))
    
    for idx, (train_idx, valid_idx) in enumerate(splits):
        tr_x = train_X[train_idx]
        tr_y = train_Y[train_idx]
        val_x = train_X[valid_idx]
        val_y = train_Y[valid_idx]       
        print(f"Running fold {idx}") 
        val_pred_y, test_y = one_round(model,config['model_dir'], tr_x, tr_y, val_x, val_y, test_X, 5)
        train_meta[valid_idx] = val_pred_y.reshape(-1)
        test_meta += test_y/FOLDS
    print('Getting best threshold and saving the test tsv file')
    #th = get_best_threshold(val_pred_y, val_y)
    train_df[f"targets_{config['func'].__name__}_avg_{config['top_k_checkpoints']}"] = train_meta
    train_df.to_csv("../output/train.csv", index_label ='qid')

    test_df[f"targets_{config['func'].__name__}_avg_{config['top_k_checkpoints']}"] = test_meta    
    test_df.to_csv("../output/test.csv", index_label ='qid')    
    return



In [6]:
################ CONFIGURATIONS #################

config = {}
config['embedding'] = 'None'
config['seq_len'] = 100
config['bs'] = 512
config['dropout'] = 0.25
config['internal_dim'] = 100
config['lr'] = 0.01
config['func'] = lstm_gru_attn #function is used as a variable
config['top_k_checkpoints'] = 3 #for checkpoint ensemble-averaging (maximum of epochs)


In [7]:
train_df = pd.read_csv("../output/train.csv")
test_df = pd.read_csv("../output/test.csv")

In [8]:
T = preprocess(train_df, test_df, {})


In [15]:
test_X, _ = getxyfromdf(test_df, T)

In [16]:
print(test_X[0])

[   202   1167   1480      6 301786    202   2494   1167   1300    100
      3  33116  48108   3237      5     20    807   2494   1480      2
      6     20   1167    330      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0
      0      0      0      0      0      0      0      0      0      0]


In [12]:
model = config['func'](T.embedding_matrix)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 100, 300)     92877300    input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, 100, 300)     0           embedding_1[0][0]                
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 100, 200)     321600      spatial_dropout1d_1[0][0]        
__________________________________________________________________________________________________
bidirectio

In [10]:
config['model_dir'] = '__'.join(f'{k}_{v}' if k != 'func' else f'{k}_{v.__name__}' for k,v in config.items())


In [11]:
print(config['model_dir'])

embedding_None__seq_len_100__bs_512__dropout_0.25__internal_dim_100__lr_0.01__func_lstm_gru_attn__top_k_checkpoints_3


In [27]:
config['bs'] = 512
def one_round(model, train_x, train_y, val_x, val_y, tx, epochs ):
    for e in range(epochs):
        model.fit(train_x,train_y,batch_size=config['bs'],epochs=1)
        val_pred_y = model.predict(val_x)
        print(f"Epoch {e} Aprox. f1 score {f1_score(val_y, (val_pred_y>0.33).astype(int))}")
    test_y = model.predict(tx)
    return val_pred_y, test_y

one_round(model, train_X[:-200], train_Y[:-200], train_X[-200:], train_Y[-200:],train_X[-200:], 1 )

Epoch 1/1
Epoch 0 Aprox. f1 score 0.5128205128205129


(array([[3.8250650e-03],
        [4.7505620e-01],
        [1.6708693e-02],
        [2.4986883e-01],
        [6.6158473e-02],
        [3.9272744e-02],
        [2.2624739e-02],
        [2.5330391e-04],
        [1.4266948e-01],
        [1.2996265e-04],
        [5.9139529e-05],
        [2.8617883e-03],
        [3.2672469e-02],
        [4.8926124e-01],
        [8.8804518e-05],
        [4.9981085e-04],
        [1.6471203e-02],
        [1.0198958e-01],
        [7.0575834e-03],
        [1.2649153e-03],
        [2.6241252e-03],
        [4.1813999e-02],
        [4.8926124e-01],
        [8.7199820e-04],
        [8.5546017e-02],
        [8.3202198e-03],
        [1.8514927e-04],
        [2.7373226e-02],
        [1.7099808e-03],
        [3.8175346e-04],
        [3.5912082e-02],
        [3.7833367e-02],
        [8.1265735e-04],
        [3.4139841e-03],
        [3.1649824e-02],
        [3.5779181e-01],
        [4.2155040e-03],
        [4.3278260e-04],
        [4.5811966e-02],
        [9.8791188e-03],


In [25]:
d = np.expand_dims(model.predict(test_X[:100]).reshape(-1), -1)
print(d.shape)
a = np.zeros((100,1))
a+=d
print(a.shape)
a+=d
print(a.shape)

(100, 1)
(100, 1)
(100, 1)


In [23]:
print(a/2)

[[0.48444149]
 [0.49389109]
 [0.48593161]
 [0.48696923]
 [0.4991526 ]
 [0.49707583]
 [0.52118498]
 [0.48951164]
 [0.50385362]
 [0.51378775]
 [0.5108884 ]
 [0.51923281]
 [0.51136273]
 [0.50487673]
 [0.50920314]
 [0.51349086]
 [0.49687502]
 [0.49227223]
 [0.51492447]
 [0.52305216]
 [0.48883221]
 [0.50302064]
 [0.49143434]
 [0.48148397]
 [0.48970875]
 [0.49274215]
 [0.50405902]
 [0.49849731]
 [0.48860711]
 [0.51239836]
 [0.5018838 ]
 [0.49873376]
 [0.50038904]
 [0.50494868]
 [0.49185887]
 [0.49737763]
 [0.48988467]
 [0.48634624]
 [0.48796317]
 [0.48749572]
 [0.49894869]
 [0.48984325]
 [0.49809527]
 [0.48978749]
 [0.49380562]
 [0.51031619]
 [0.47291368]
 [0.49224287]
 [0.48953071]
 [0.49444014]
 [0.50728899]
 [0.4991591 ]
 [0.49828416]
 [0.48398679]
 [0.48954305]
 [0.50335997]
 [0.49956635]
 [0.49433103]
 [0.4820722 ]
 [0.47481823]
 [0.49451485]
 [0.51256329]
 [0.47891903]
 [0.47831735]
 [0.50247973]
 [0.50335586]
 [0.48612651]
 [0.50956249]
 [0.47454956]
 [0.50688696]
 [0.49787948]
 [0.50