In [None]:
# Add the sibling src/ directory to the import path so project-local modules
# can be imported below (presumably where the `utils` package lives — confirm).
import sys
sys.path.append('../src/')

In [None]:
import io
import os
import random
import time
from argparse import Namespace
from collections import Counter
from pprint import pprint

import numpy as np
import pandas as pd
from nltk import TweetTokenizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from tqdm import tqdm

In [None]:
import keras
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation,Bidirectional
from keras.layers import Concatenate, Permute, Dot, Multiply,RepeatVector,add,Flatten,Lambda
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.initializers import glorot_uniform,TruncatedNormal
from keras.models import load_model
from keras import regularizers
import keras.backend as K
from keras.utils import to_categorical

In [None]:
import utils.hasoc2019 as hasoc_utils
import utils.preprocessing as preprocessor

### Opening the dataset

In [None]:
# Notebook-wide configuration.
# The fastText vectors live outside the repository, so their location can be
# supplied through the FASTTEXT_VEC_PATH environment variable; when it is not
# set, the original author's local path is used as the fallback.
args = Namespace(
    data_file = '../data/train/english_dataset.tsv',
    fast_text_loc = os.environ.get(
        'FASTTEXT_VEC_PATH',
        "/Users/cozek/Documents/MTech/3rd Sem/Project/crawl-300d-2M-subword/crawl-300d-2M-subword.vec",
    ),
)

In [None]:
# Load the training file through the project helper; later cells read the
# `text` and `task_1` columns of the result (presumably a pandas DataFrame).
data_df = hasoc_utils.open_data_as_df(args.data_file)

In [None]:
# Show the loaded data as the cell output for a quick visual sanity check.
data_df

# BiLSTM Model

In [None]:
def bilstm_model(input_shape, embedding_layer, num_classes=2, lstm_units=128, dense_units=256):
    """Build the stacked bidirectional-LSTM sentence classifier (uncompiled).

    Layer order:
        Input -> embedding_layer -> BiLSTM (full sequence) -> Dropout(0.5)
        -> BatchNorm -> BiLSTM (last step only) -> BatchNorm -> Dropout(0.2)
        -> Dense(dense_units) -> BatchNorm -> Dropout(0.5)
        -> Dense(num_classes) -> softmax

    Args:
        input_shape: shape tuple of one sample (batch axis excluded), passed
            to `Input`; the input tensor is declared as int32.
        embedding_layer: layer applied directly to the integer input tensor.
        num_classes: number of units in the softmax output (default 2).
        lstm_units: units per direction in both LSTM layers (default 128).
        dense_units: units of the hidden Dense layer (default 256).

    Returns:
        A `Model` mapping the integer input to class probabilities.
    """
    def batch_norm(tensor):
        # One place for the normalisation settings that were previously
        # repeated three times; every other argument is left at its default.
        return keras.layers.BatchNormalization(axis=-1, momentum=0.99, epsilon=0.001,
                                               center=True, scale=True)(tensor)

    def bilstm(return_sequences):
        # Forward and backward outputs are merged by element-wise product
        # (merge_mode='mul'); activation='relu' replaces the LSTM default.
        return Bidirectional(
            LSTM(lstm_units,
                 return_sequences=return_sequences,
                 kernel_initializer='glorot_normal',
                 recurrent_regularizer=keras.regularizers.l2(0.001),
                 activation='relu'),
            merge_mode='mul')

    sentence_indices = Input(input_shape, dtype='int32')
    embeddings = embedding_layer(sentence_indices)

    # First recurrent layer keeps the whole sequence for the second one.
    X = bilstm(return_sequences=True)(embeddings)
    X = Dropout(0.5)(X)
    X = batch_norm(X)

    # Second recurrent layer returns only its final output.
    X = bilstm(return_sequences=False)(X)
    X = batch_norm(X)
    X = Dropout(0.2)(X)

    X = Dense(dense_units)(X)
    X = batch_norm(X)
    X = Dropout(0.5)(X)

    X = Dense(num_classes)(X)
    X = Activation('softmax')(X)

    # Keras 2 keyword names are `inputs` / `outputs`; the singular
    # `input` / `output` spelling is the legacy Keras 1 form.
    model = Model(inputs=sentence_indices, outputs=X)

    return model

In [None]:
def load_embedding(fasttext_loc):
    """Load a fastText text-format ``.vec`` file into memory.

    The first line of the file must hold two integers (entry count and
    vector dimension); every following line is ``token v1 v2 ... vd``
    separated by single spaces.

    Args:
        fasttext_loc: path to the .vec file
    Returns:
        data : Dict[token:np.array] mapping each token to its float vector
        vocab : list of tokens in file order
    Raises:
        ValueError: if the header line is not two integers or a vector
            component cannot be parsed as a float.
    """
    vocab = []
    data = {}
    # The context manager guarantees the handle is closed even if parsing fails.
    with io.open(fasttext_loc, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # Header gives the number of entries; use it so the progress bar
        # can show a total. The dimension is not needed for parsing.
        n_words, _dim = map(int, fin.readline().split())
        for line in tqdm(fin, total=n_words):
            tokens = line.rstrip().split(' ')
            word = tokens[0]
            vocab.append(word)
            data[word] = np.array([float(value) for value in tokens[1:]])
    return data,vocab

def gen_word_index_maps(vocab, start_index=1):
    """Build token -> index and index -> token lookup tables.

    Indices start at ``start_index`` (default 1) so that index 0 stays free:
    ``sentences_to_indices`` fills padding and unknown tokens with 0, and
    ``create_embedding_layer`` allocates ``len(word_to_index) + 1`` rows, so
    row 0 remains an all-zero vector instead of aliasing the first token.

    A token that appears more than once in ``vocab`` is indexed only on its
    first occurrence, keeping the indices contiguous and the two maps exact
    inverses of each other.

    Args:
        vocab: sequence of tokens.
        start_index: index given to the first token (pass 0 for the old
            zero-based numbering).
    Returns:
        word_to_index : Dict[token:int]
        index_to_word : Dict[int:token]
    """
    word_to_index = {}
    index_to_word = {}
    next_index = start_index
    for word in tqdm(vocab, total=len(vocab)):
        if word in word_to_index:
            continue
        word_to_index[word] = next_index
        index_to_word[next_index] = word
        next_index += 1
    return word_to_index,index_to_word

def create_embedding_layer(word_to_vec_map, word_to_index ):
    """Create a frozen Keras ``Embedding`` layer holding pretrained vectors.

    Args:
        word_to_vec_map: Dict[token:np.array] of 1-D pretrained vectors.
        word_to_index: Dict[token:int]; row ``index`` of the weight matrix
            receives the vector of ``token``.
    Returns:
        A built, non-trainable ``Embedding`` layer whose weight matrix has
        one more row than ``word_to_index`` has entries (more if an index
        exceeds that); rows not named by any index stay all-zero.
    Raises:
        ValueError: if ``word_to_vec_map`` is empty.
        KeyError: if a token of ``word_to_index`` has no vector.
    """
    if not word_to_vec_map:
        raise ValueError("word_to_vec_map is empty; cannot infer the embedding dimension")

    # Infer the dimension from the data (most common vector length) rather
    # than from one hard-coded probe word that may not be in the vocabulary.
    length_counts = Counter(len(vector) for vector in word_to_vec_map.values())
    emb_dim = length_counts.most_common(1)[0][0]

    # One extra row beyond the vocabulary size; also large enough for the
    # highest index actually present in the mapping.
    highest_index = max(word_to_index.values(), default=0)
    vocab_length = max(len(word_to_index), highest_index) + 1

    emb_matrix = np.zeros((vocab_length,emb_dim))

    for word, index in tqdm(word_to_index.items(), total= len(word_to_index)):
        vector = np.asarray(word_to_vec_map[word])
        # Vectors with an unexpected length are zero-padded (or truncated)
        # to emb_dim; well-formed vectors are copied unchanged.
        width = min(vector.shape[0], emb_dim)
        emb_matrix[index, :width] = vector[:width]

    embedding_layer = Embedding(vocab_length, emb_dim, trainable = False)

    # The layer has to be built before its weights can be assigned.
    embedding_layer.build((None,))

    embedding_layer.set_weights([emb_matrix])

    return embedding_layer


def find_max_len(data):
    """Return the largest token count among the texts in ``data``.

    Each text is split with a ``TweetTokenizer``; the result is 0 when
    ``data`` yields no texts.
    """
    tokenizer = TweetTokenizer()
    token_counts = (len(tokenizer.tokenize(text)) for text in data)
    return max(token_counts, default=0)

In [None]:
# Parse the fastText .vec file into a token -> vector dict plus the token list.
# Every line of the file is read into memory, so this cell can take a while.
fasttext_model,vocab = load_embedding(args.fast_text_loc)

In [None]:
# Build the token <-> integer-id lookup tables from the fastText vocabulary.
word_to_index, index_to_word = gen_word_index_maps(vocab)

In [None]:
# Wrap the pretrained vectors in a frozen Embedding layer, then print one
# entry of its weight matrix (row 1, column 3) as a spot check.
embedding_layer = create_embedding_layer(fasttext_model, word_to_index)
print("weights[0][1][3] =", embedding_layer.get_weights()[0][1][3])

In [None]:
# Size the model input to the longest tokenized training text, then build the
# network and print its layer summary.
max_len = find_max_len(data_df.text)
model = bilstm_model( (max_len,), embedding_layer)
model.summary()

In [None]:
# Configure training: categorical cross-entropy loss, Adam with its default
# settings, and accuracy reported as the metric.
model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=keras.optimizers.Adam(),
              metrics=['accuracy'])

In [None]:
def sentences_to_indices(X, word_to_index, max_len):
    """Convert raw texts to a zero-padded matrix of vocabulary indices.

    Each text is split with a ``TweetTokenizer``; tokens beyond ``max_len``
    are dropped, and positions that are padding or hold a token missing from
    ``word_to_index`` are left at 0.

    Args:
        X: sized iterable of strings (list, array, or pandas Series — a
            Series is read in positional order, whatever its index labels).
        word_to_index: Dict[token:int].
        max_len: number of columns of the output.
    Returns:
        np.ndarray of shape (len(X), max_len) filled with indices
        (float dtype, as produced by ``np.zeros``).
    """
    tokenizer = TweetTokenizer()
    X_indices = np.zeros((len(X), max_len))
    # Iterate over the values instead of using X[i]: on a pandas Series X[i]
    # is a label lookup and fails (or misaligns) when the index is not 0..m-1.
    for row, sentence in enumerate(X):
        for col, token in enumerate(tokenizer.tokenize(sentence)[:max_len]):
            # Unknown tokens keep the padding value 0.
            X_indices[row, col] = word_to_index.get(token, 0)
    return X_indices

In [None]:
# Encode every training text as a fixed-length row of vocabulary indices.
X_train_indices = sentences_to_indices(data_df.text,word_to_index,max_len)

In [None]:
# Map the task_1 string labels to class ids and one-hot encode them into two
# columns for the softmax output.
# NOTE(review): a label other than 'NOT'/'HOF' would presumably map to NaN
# before to_categorical — confirm the column only holds these two values.
label_map = {
    'NOT' : 0,
    'HOF' : 1,
}
y_train_oh = to_categorical(data_df.task_1.map(label_map),2)

In [None]:
# Train for 30 epochs in batches of 64 on all encoded training rows; no
# validation data is passed here, so only training metrics are reported.
model.fit(x=X_train_indices, y=y_train_oh , epochs = 30,batch_size=64 ,verbose=1,)