In [None]:
import copy
import os
import random

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd

from nltk import word_tokenize

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe,FastText

In [None]:
import nltk
nltk.download('punkt')

In [None]:
from torch import nn, optim
from torch.autograd import Variable
from tensorboardX import SummaryWriter
from time import gmtime, strftime

In [None]:
# Seed every random number generator this notebook relies on so that a
# "Restart Kernel -> Run All" is as repeatable as possible.
SEED = 23
# Python's own RNG was previously left unseeded.
# NOTE(review): legacy torchtext iterators are believed to take their shuffle
# state from it - confirm for the installed version.
random.seed(SEED)
torch.manual_seed(SEED)
# Seed all CUDA devices, not only the current one (the model runs on params['gpu']).
torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
# Disable cuDNN auto-tuning, which may pick different kernels between runs.
torch.backends.cudnn.benchmark = False

In [None]:
# Hyper-parameters and run configuration shared by the cells below.
params = dict(
    batch_size=128,       # examples per batch for the train/dev/test iterators
    char_emb_dim=20,      # width of each character embedding
    char_emb_hs=50,       # hidden size of the character-level LSTM
    data='snli',
    dropout=0.1,          # drop probability used by BIMPM.dropout
    eps=15,               # number of training epochs
    gpu=2,                # CUDA device index; code checks `gpu > -1` before moving tensors
    hs=100,               # hidden size of the context and aggregation LSTMs
    lr=1e-3,              # learning rate handed to the optimizer
    max_sent_len=-1,      # truncate training sentences to this length; negative disables it
    num_perspective=20,   # number of matching perspectives (rows of W1 / W2)
    print_freq=500,       # evaluate on dev/test every this many training batches
    word_emb_dim=300,     # word-embedding width
    val_size=0.1,
    test_size=0.1,
    model_time=0,         # placeholder; overwritten with a timestamp before training
)

In [None]:
def SNLI_Data():
    """Load SNLI, build the vocabularies and create the batch iterators.

    Reads ``params['batch_size']`` and ``params['gpu']`` from the module-level
    configuration.

    Returns:
        tuple: ``(iterator, TEXT, LABEL)`` where ``iterator`` is a dict with
        the keys ``'train_iter'``, ``'dev_iter'`` and ``'test_iter'`` and
        ``TEXT`` / ``LABEL`` are the fields whose vocabularies were built here.
    """
    TEXT = data.Field(batch_first=True, tokenize=word_tokenize, lower=True)
    LABEL = data.Field(sequential=False, unk_token=None)

    train, dev, test = datasets.SNLI.splits(TEXT, LABEL)
    # The word vocabulary is built over all three splits; the label vocabulary
    # over the training split only.
    TEXT.build_vocab(train, dev, test, vectors=GloVe(name='840B', dim=300), unk_init=torch.Tensor.zero_)
    LABEL.build_vocab(train)

    # No custom sort_key is handed to the iterators. An unused lambda that read
    # `q1`/`q2` attributes (the batches here expose `premise`/`hypothesis`) was
    # removed from this spot.
    train_iter, dev_iter, test_iter = data.BucketIterator.splits(
        (train, dev, test),
        batch_sizes=[params['batch_size']] * 3,
        device=params['gpu'],
    )
    iterator = {
        'train_iter': train_iter,
        'dev_iter': dev_iter,
        'test_iter': test_iter,
    }

    return iterator, TEXT, LABEL

In [None]:
# Build the SNLI iterators and the TEXT / LABEL fields used by the rest of the notebook.
iterator, TEXT, LABEL = SNLI_Data()

In [None]:
def Build_Char_Vocab(TEXT):
    """Encode every vocabulary word as a fixed-length list of character ids.

    Args:
        TEXT: object exposing the word list as ``TEXT.vocab.itos``.

    Returns:
        tuple: ``(characterized_vectors, char_vocab, max_word_len)`` where
        ``characterized_vectors[i]`` holds the character ids of word ``i``
        right-padded with 0 to ``max_word_len``, ``char_vocab`` maps each
        character to its id (id 0 is reserved under the key ``''``) and
        ``max_word_len`` is the length of the longest entry in ``itos``.
        ``'<unk>'`` and ``'<pad>'`` are encoded as all zeros.
    """
    vocab_words = TEXT.vocab.itos
    max_word_len = max((len(token) for token in vocab_words), default=0)

    char_vocab = {'': 0}
    characterized_vectors = []
    for token in vocab_words:
        if token in ('<unk>', '<pad>'):
            char_ids = [0] * max_word_len
        else:
            # setdefault assigns the next free id the first time a character is seen.
            char_ids = [char_vocab.setdefault(ch, len(char_vocab)) for ch in token]
            char_ids += [0] * (max_word_len - len(token))
        characterized_vectors.append(char_ids)

    return characterized_vectors, char_vocab, max_word_len

In [None]:
# Pre-compute the character-id table for the whole word vocabulary.
characterized_vectors, char_vocab, max_word_len = Build_Char_Vocab(TEXT)

In [None]:
def characterize(batch):
    """Replace each word id in ``batch`` with its padded character-id list.

    Args:
        batch: tensor of word ids; it is copied to the CPU and converted to
            nested Python lists before the lookup.

    Returns:
        list: one list per row of ``batch``, each holding the
        ``characterized_vectors`` entry (module-level table) of every word id
        in that row.
    """
    word_id_rows = batch.data.cpu().numpy().astype(int).tolist()
    char_rows = []
    for row in word_id_rows:
        char_rows.append([characterized_vectors[word_id] for word_id in row])
    return char_rows

In [None]:
# Bundle everything the model and the train / eval loops need into one dict.
# NOTE(review): this rebinds the name `data`, which up to this cell refers to
# the `torchtext.data` module imported above. SNLI_Data() uses that module, so
# its cell cannot be re-run after this one without re-importing.
data = dict(
    TEXT=TEXT,
    LABEL=LABEL,
    iterator=iterator,
    characterized_vectors=characterized_vectors,
    char_vocab=char_vocab,
    max_word_len=max_word_len,
    word_vocab_size=len(TEXT.vocab),
    char_vocab_size=len(char_vocab),
    class_size=len(LABEL.vocab),
)

In [None]:
class BIMPM(nn.Module):
    """Sentence-pair classifier built from word + character embeddings and full matching.

    Pipeline (see ``forward``): frozen pre-trained word embeddings are
    concatenated with a character-LSTM encoding of every word, passed through
    a shared bidirectional context LSTM, compared across the two sentences
    with multi-perspective cosine "full matching", aggregated by a second
    bidirectional LSTM and classified by a two-layer feed-forward head.

    Args:
        params (dict): hyper-parameters; reads ``word_emb_dim``,
            ``char_emb_dim``, ``char_emb_hs``, ``hs``, ``num_perspective`` and
            ``dropout``.
        data (dict): dataset artefacts; reads ``TEXT`` (for
            ``vocab.vectors``), ``char_vocab_size``, ``word_vocab_size``,
            ``class_size`` and ``max_word_len``.
    """

    def __init__(self, params, data):
        super(BIMPM, self).__init__()

        self.params = params
        self.data = data

        # d: width of one word representation (word embedding + char-LSTM state).
        self.d = self.params['word_emb_dim'] + self.params['char_emb_hs']
        # l: number of matching perspectives.
        self.l = self.params['num_perspective']

        # Word representation layer. The word embedding is copied from the
        # pre-trained vectors and frozen; the character embedding is trained.
        self.char_emb = nn.Embedding(self.data['char_vocab_size'], self.params['char_emb_dim'], padding_idx=0)
        self.word_emb = nn.Embedding(self.data['word_vocab_size'], self.params['word_emb_dim'])
        self.word_emb.weight.data.copy_(data['TEXT'].vocab.vectors)
        self.word_emb.weight.requires_grad = False
        self.char_LSTM = nn.LSTM(
            input_size=self.params['char_emb_dim'],
            hidden_size=self.params['char_emb_hs'],
            batch_first=True,
            num_layers=1,
            bidirectional=False,
        )

        # Context representation layer (shared by premise and hypothesis).
        self.context_layer_LSTM = nn.LSTM(
            input_size=self.d,
            hidden_size=self.params['hs'],
            num_layers=1,
            bidirectional=True,
            batch_first=True,
        )

        # Matching layer: one weight row per perspective, for the forward (W1)
        # and backward (W2) directions.
        self.W1 = nn.Parameter(torch.rand(self.l, self.params['hs']))
        self.W2 = nn.Parameter(torch.rand(self.l, self.params['hs']))

        # Aggregation layer: consumes the concatenated fw/bw matching vectors.
        self.aggregation_layer_LSTM = nn.LSTM(
            input_size=self.l * 2,
            hidden_size=self.params['hs'],
            num_layers=1,
            bidirectional=True,
            batch_first=True,
        )

        # Prediction layer.
        self.pred_fc1 = nn.Linear(self.params['hs'] * 4, self.params['hs'] * 2)
        self.pred_fc2 = nn.Linear(self.params['hs'] * 2, self.data['class_size'])

        self.init_BIMPM()

    def Initialize_LSTM(self, LSTM, rev):
        """Re-initialise the layer-0 weights of ``LSTM`` in place and return it.

        Input weights get Kaiming-normal values, recurrent weights an
        orthogonal matrix and all biases zero. When ``rev == 1`` the reverse
        direction is initialised the same way.
        """
        nn.init.kaiming_normal_(LSTM.weight_ih_l0)
        nn.init.constant_(LSTM.bias_ih_l0, val=0)
        nn.init.orthogonal_(LSTM.weight_hh_l0)
        nn.init.constant_(LSTM.bias_hh_l0, val=0)

        if rev == 1:
            nn.init.kaiming_normal_(LSTM.weight_ih_l0_reverse)
            nn.init.constant_(LSTM.bias_ih_l0_reverse, val=0)
            nn.init.orthogonal_(LSTM.weight_hh_l0_reverse)
            nn.init.constant_(LSTM.bias_hh_l0_reverse, val=0)

        return LSTM

    def init_BIMPM(self):
        """Initialise all trainable weights (called once from ``__init__``)."""
        nn.init.uniform_(self.char_emb.weight, -0.005, 0.005)
        # Row 0 is the character padding id and must stay zero.
        self.char_emb.weight.data[0].fill_(0)
        # Row 0 of the word table gets small random values.
        nn.init.uniform_(self.word_emb.weight.data[0], -0.1, 0.1)
        self.char_LSTM = self.Initialize_LSTM(self.char_LSTM, 0)

        self.context_layer_LSTM = self.Initialize_LSTM(self.context_layer_LSTM, 1)

        nn.init.kaiming_normal_(self.W1)
        nn.init.kaiming_normal_(self.W2)

        self.aggregation_layer_LSTM = self.Initialize_LSTM(self.aggregation_layer_LSTM, 1)

        nn.init.uniform_(self.pred_fc1.weight, -0.005, 0.005)
        nn.init.constant_(self.pred_fc1.bias, val=0)
        nn.init.uniform_(self.pred_fc2.weight, -0.005, 0.005)
        nn.init.constant_(self.pred_fc2.bias, val=0)

    def dropout(self, v):
        """Apply dropout with ``params['dropout']``; a no-op in eval mode."""
        return F.dropout(v, p=self.params['dropout'], training=self.training)

    def _match_fn(self, v1, v2, w):
        """Multi-perspective cosine similarity between ``v1`` and ``v2``.

        Args:
            v1: tensor of shape (batch, seq_len, hidden).
            v2: tensor of shape (batch, seq_len, hidden), or (batch, hidden)
                in which case it is compared against every position of ``v1``.
            w: perspective weights of shape (l, hidden).

        Returns:
            Tensor of shape (batch, seq_len, l).
        """
        # (l, hidden) -> (1, 1, hidden, l) so that it broadcasts over batch and time.
        w = w.transpose(1, 0).unsqueeze(0).unsqueeze(0)
        if v2.dim() == 2:
            # Repeat the single vector along the time axis without copying it.
            v2 = v2.unsqueeze(1).expand(-1, v1.size(1), -1)
        # Broadcasting against w yields the same values as stacking l copies.
        v1 = w * v1.unsqueeze(3)
        v2 = w * v2.unsqueeze(3)
        return F.cosine_similarity(v1, v2, dim=2)

    def forward(self, **kwargs):
        """Compute class logits for a batch of sentence pairs.

        Keyword Args:
            p, h: word ids of premise / hypothesis, shape (batch, seq_len).
            char_p, char_h: character ids, shape (batch, seq_len, max_word_len).

        Returns:
            Tensor of shape (batch, class_size) with unnormalised scores.
        """
        p = self.word_emb(kwargs['p'])
        h = self.word_emb(kwargs['h'])

        seq_len_p = kwargs['char_p'].size(1)
        seq_len_h = kwargs['char_h'].size(1)

        # Use the configuration stored on the module; the previous code read a
        # notebook-global `data` here, which tied the model to kernel state.
        max_word_len = self.data['max_word_len']
        char_p = kwargs['char_p'].view(-1, max_word_len)
        char_h = kwargs['char_h'].view(-1, max_word_len)

        # The final hidden state of the char LSTM is the word's character encoding.
        _, (char_p, _) = self.char_LSTM(self.char_emb(char_p))
        _, (char_h, _) = self.char_LSTM(self.char_emb(char_h))

        char_p = char_p.view(-1, seq_len_p, self.params['char_emb_hs'])
        char_h = char_h.view(-1, seq_len_h, self.params['char_emb_hs'])

        p = torch.cat([p, char_p], dim=-1)
        h = torch.cat([h, char_h], dim=-1)

        p = self.dropout(p)
        h = self.dropout(h)

        con_p, _ = self.context_layer_LSTM(p)
        con_h, _ = self.context_layer_LSTM(h)

        con_p = self.dropout(con_p)
        con_h = self.dropout(con_h)

        con_p_fw, con_p_bw = torch.split(con_p, self.params['hs'], dim=-1)
        con_h_fw, con_h_bw = torch.split(con_h, self.params['hs'], dim=-1)

        # Full matching: every position of one sentence against the last
        # forward state / first backward state of the other sentence.
        mv_p_full_fw = self._match_fn(con_p_fw, con_h_fw[:, -1, :], self.W1)
        mv_p_full_bw = self._match_fn(con_p_bw, con_h_bw[:, 0, :], self.W2)
        mv_h_full_fw = self._match_fn(con_h_fw, con_p_fw[:, -1, :], self.W1)
        mv_h_full_bw = self._match_fn(con_h_bw, con_p_bw[:, 0, :], self.W2)

        mv_p = torch.cat([mv_p_full_fw, mv_p_full_bw], dim=2)
        mv_h = torch.cat([mv_h_full_fw, mv_h_full_bw], dim=2)

        mv_p = self.dropout(mv_p)
        mv_h = self.dropout(mv_h)

        _, (agg_p_last, _) = self.aggregation_layer_LSTM(mv_p)
        _, (agg_h_last, _) = self.aggregation_layer_LSTM(mv_h)

        # (2, batch, hs) -> (batch, 2 * hs) per sentence, then concatenate both.
        x = torch.cat(
            [agg_p_last.permute(1, 0, 2).contiguous().view(-1, self.params['hs'] * 2),
             agg_h_last.permute(1, 0, 2).contiguous().view(-1, self.params['hs'] * 2)], dim=1)
        x = self.dropout(x)

        # torch.tanh replaces the deprecated F.tanh.
        x = torch.tanh(self.pred_fc1(x))
        x = self.dropout(x)
        x = self.pred_fc2(x)

        return x


In [None]:
def test(model, params, data, mode=1):
    """Evaluate ``model`` on the dev split (``mode == 0``) or the test split (any other ``mode``).

    The model is switched to eval mode and left there; callers that keep
    training must call ``model.train()`` afterwards.

    Args:
        model: the network to evaluate; called with the keyword arguments
            ``p``, ``h``, ``char_p`` and ``char_h``.
        params (dict): reads ``gpu`` (tensors are moved to that CUDA device
            when it is greater than -1).
        data (dict): reads ``data['iterator']['dev_iter' | 'test_iter']``.
        mode (int): 0 selects the dev split, anything else the test split.

    Returns:
        tuple: ``(loss, acc)`` - the sum of the per-batch cross-entropy losses
        and the fraction of correctly classified examples (``0.0`` when the
        split is empty).
    """
    split = 'dev_iter' if mode == 0 else 'test_iter'
    use_gpu = params['gpu'] > -1

    criterion = nn.CrossEntropyLoss()
    model.eval()
    loss, correct, size = 0, 0, 0

    # Gradients are never needed here; skipping them saves memory and time.
    with torch.no_grad():
        for batch in iter(data['iterator'][split]):
            s1, s2, label = batch.premise, batch.hypothesis, batch.label

            char_p = torch.LongTensor(characterize(s1))
            char_h = torch.LongTensor(characterize(s2))

            # Move inputs AND labels together, and only when a GPU is configured.
            if use_gpu:
                s1 = s1.cuda(params['gpu'])
                s2 = s2.cuda(params['gpu'])
                char_p = char_p.cuda(params['gpu'])
                char_h = char_h.cuda(params['gpu'])
                label = label.cuda(params['gpu'])

            pred = model(p=s1, h=s2, char_p=char_p, char_h=char_h)

            loss += criterion(pred, label).item()

            _, pred = pred.max(dim=1)
            correct += (pred == label).sum().item()
            size += len(pred)

    acc = correct / size if size else 0.0
    return loss, acc



In [None]:
def train(params, data):
    """Train a fresh BIMPM model on the training split.

    Every ``params['print_freq']`` batches the model is scored on the dev and
    test splits; whenever the dev accuracy improves, the weights are written
    to ``../models/BIBPM_snli.pt``. The dev loss is additionally recorded at
    the end of every epoch.

    Args:
        params (dict): reads ``gpu``, ``lr``, ``eps``, ``max_sent_len`` and
            ``print_freq`` (plus whatever the model itself reads).
        data (dict): reads ``data['iterator']['train_iter']`` and is passed on
            to the model and the evaluation helper.

    Returns:
        tuple: ``(train_loss, val_loss)`` - per-epoch lists with the summed
        training batch losses and the dev loss measured after each epoch.
    """
    checkpoint_path = '../models/BIBPM_snli.pt'
    use_gpu = params['gpu'] > -1

    model = BIMPM(params, data)
    if use_gpu:
        model.cuda(params['gpu'])

    # Frozen parameters (requires_grad == False) are kept out of the optimizer.
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.AdamW(parameters, lr=params['lr'])
    criterion = nn.CrossEntropyLoss()

    model.train()
    loss = 0  # running loss since the last periodic report
    max_dev_acc, max_test_acc = 0, 0

    iterator = data['iterator']['train_iter']
    train_loss = []
    val_loss = []

    # A bounded for-loop: a negative 'eps' now means "no epochs" instead of
    # an endless `while epochs != 0`.
    for epoch in range(1, params['eps'] + 1):
        epoch_loss = 0
        print("Epoch: %s" % epoch)
        for i, batch in enumerate(iterator):
            s1, s2, label = batch.premise, batch.hypothesis, batch.label

            if params['max_sent_len'] >= 0:
                # Slicing past the end is a no-op, so no length check is needed.
                s1 = s1[:, :params['max_sent_len']]
                s2 = s2[:, :params['max_sent_len']]

            char_p = torch.LongTensor(characterize(s1))
            char_h = torch.LongTensor(characterize(s2))

            # Inputs and labels are moved together, and only when a GPU is configured.
            if use_gpu:
                s1 = s1.cuda(params['gpu'])
                s2 = s2.cuda(params['gpu'])
                char_p = char_p.cuda(params['gpu'])
                char_h = char_h.cuda(params['gpu'])
                label = label.cuda(params['gpu'])

            optimizer.zero_grad()
            pred = model(p=s1, h=s2, char_p=char_p, char_h=char_h)
            batch_loss = criterion(pred, label)
            batch_loss_value = batch_loss.item()
            loss += batch_loss_value
            epoch_loss += batch_loss_value
            batch_loss.backward()
            optimizer.step()

            if (i + 1) % params['print_freq'] == 0:
                dev_loss, dev_acc = test(model, params, data, mode=0)
                test_loss, test_acc = test(model, params, data)

                print('train loss: ',loss, 'dev loss: ',dev_loss,'test loss: ', test_loss, 'dev acc: ', dev_acc , 'test acc: ', test_acc)

                if dev_acc > max_dev_acc:
                    max_dev_acc = dev_acc
                    max_test_acc = test_acc
                    # Create the directory the checkpoint is actually written to
                    # (previously an unrelated 'saved_models' folder was created).
                    os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
                    torch.save(model.state_dict(), checkpoint_path)
                loss = 0
                # The evaluation helper leaves the model in eval mode.
                model.train()

        train_loss.append(epoch_loss)

        dev_loss, dev_acc = test(model, params, data, mode=0)
        val_loss.append(dev_loss)
        model.train()

    print('max dev acc:', max_dev_acc, 'max test acc: ', max_test_acc)
    return train_loss, val_loss

In [None]:
# Record the start time (UTC, HH:MM:SS) in the shared config, then run the
# full training loop and keep the per-epoch loss curves for plotting.
params['model_time'] = strftime('%H:%M:%S', gmtime())
print('training start!')
train_loss, val_loss = train(params, data)

print('training finished!')


In [None]:
from sklearn.metrics import fbeta_score, precision_recall_fscore_support, f1_score,confusion_matrix,plot_confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt 

In [None]:
def plot_loss(train_loss, val_loss):
    """Plot the training loss and the validation loss as two separate figures.

    Args:
        train_loss: sequence with one training-loss value per epoch.
        val_loss: sequence with one validation-loss value per epoch.
    """
    for series, y_label in ((train_loss, 'train-loss'), (val_loss, 'val-loss')):
        # The x axis is derived from the series itself, so the function no
        # longer depends on the global `params['eps']` matching its length.
        epochs = range(1, len(series) + 1)
        plt.plot(epochs, series)
        plt.ylabel(y_label)
        plt.xlabel('epoch')
        plt.show()

In [None]:
# Show the per-epoch training and validation loss curves returned by train().
plot_loss(train_loss, val_loss)

In [None]:
def load_model(params, data):
    """Rebuild a BIMPM model and restore the weights from ``../models/BIBPM_snli.pt``.

    Args:
        params (dict): model hyper-parameters; ``gpu`` selects the CUDA device
            (values of -1 or lower keep the model on the CPU).
        data (dict): dataset artefacts forwarded to the model constructor.

    Returns:
        The model with the stored weights loaded, moved to ``params['gpu']``
        when that is greater than -1.
    """
    model = BIMPM(params, data)
    # Deserialize onto the CPU first so that a checkpoint written from a GPU
    # also loads on a machine without that device; the model is moved to the
    # configured GPU afterwards.
    state_dict = torch.load('../models/BIBPM_snli.pt', map_location='cpu')
    model.load_state_dict(state_dict)

    if params['gpu'] > -1:
        model.cuda(params['gpu'])

    return model

In [None]:
def test1(model, params, data):
    """Evaluate ``model`` on the test split and also return per-example outputs.

    The model is switched to eval mode and left there.

    Args:
        model: the network to evaluate; called with the keyword arguments
            ``p``, ``h``, ``char_p`` and ``char_h``.
        params (dict): reads ``gpu`` (tensors are moved to that CUDA device
            when it is greater than -1).
        data (dict): reads ``data['iterator']['test_iter']``.

    Returns:
        tuple: ``(loss, acc, preds, actual)`` where ``loss`` is the sum of the
        per-batch cross-entropy losses, ``acc`` the fraction of correct
        predictions (``0.0`` for an empty split), ``preds`` a list with one
        CPU logits tensor per example and ``actual`` a list with one CPU label
        tensor per example, both in iteration order.
    """
    use_gpu = params['gpu'] > -1

    criterion = nn.CrossEntropyLoss()
    model.eval()
    loss, correct, size = 0, 0, 0

    actual = list()
    preds = list()

    # Gradients are never needed here; skipping them saves memory and time.
    with torch.no_grad():
        for batch in iter(data['iterator']['test_iter']):
            s1, s2, label = batch.premise, batch.hypothesis, batch.label

            char_p = torch.LongTensor(characterize(s1))
            char_h = torch.LongTensor(characterize(s2))

            # Move inputs AND labels together, and only when a GPU is configured.
            if use_gpu:
                s1 = s1.cuda(params['gpu'])
                s2 = s2.cuda(params['gpu'])
                char_p = char_p.cuda(params['gpu'])
                char_h = char_h.cuda(params['gpu'])
                label = label.cuda(params['gpu'])

            pred = model(p=s1, h=s2, char_p=char_p, char_h=char_h)

            # Keep both lists on the CPU so later comparisons and metric
            # functions never see device-resident tensors.
            actual.extend(label.detach().cpu())
            preds.extend(pred.detach().cpu())

            loss += criterion(pred, label).item()

            _, pred = pred.max(dim=1)
            correct += (pred == label).sum().item()
            size += len(pred)

    acc = correct / size if size else 0.0

    return loss, acc, preds, actual


In [None]:
# Restore the saved checkpoint and collect loss, accuracy, per-example logits
# and gold labels on the test split.
model = load_model(params, data)
loss , acc , preds, actual = test1(model, params, data)

In [None]:
def decode_sentences(sen):
    """Turn a sequence of word ids back into text, skipping ``<pad>`` tokens.

    Each id is looked up in ``TEXT.vocab.itos``. Every kept token is followed
    by one space, so a non-empty result ends with a trailing space.
    """
    itos = TEXT.vocab.itos
    tokens = (itos[word_id] for word_id in sen)
    return "".join(token + " " for token in tokens if token != '<pad>')

In [None]:
# Decode every test example back to text, in the order the test iterator
# yields it, so the lists line up index-by-index with the collected outputs.
iterator = iter(data['iterator']['test_iter'])
sentences1 = []
sentences2 = []

for batch in iterator:
    s1, s2 = batch.premise, batch.hypothesis
    # Iterating a 2-D tensor walks its rows, i.e. one sentence at a time.
    sentences1.extend(decode_sentences(row) for row in s1)
    sentences2.extend(decode_sentences(row) for row in s2)

In [None]:
# Reduce each example's logits to the index of its highest-scoring class.
temp = preds
predicted_classes = [torch.argmax(logits) for logits in temp]

In [None]:
# Print every test example whose predicted label differs from the gold label.
for i, (gold, guess) in enumerate(zip(actual, predicted_classes)):
    if gold != guess:
        print("premise: ",sentences1[i],"hypothesis: ",sentences2[i],"actual_label: " ,gold.tolist() ,"predicted_label: ",guess.tolist())
        print()

In [None]:
# Macro-averaged precision, recall and F1 over the test predictions.
precision, recall, f1_measure, _ = precision_recall_fscore_support(actual, predicted_classes, average='macro')

for caption, score in (("Precision:", precision), ("Recall:", recall), ("f1_score:", f1_measure)):
    print(caption, score)

In [None]:
# Confusion matrix for the test split (first argument: gold labels, second: predictions).
conf = confusion_matrix(actual, predicted_classes)
print("Confusion Matrix:\n",conf)

In [None]:
# Render the confusion matrix as a heatmap; the trailing ';' hides the Axes repr.
sns.heatmap(conf, annot=True); # annot=True writes each count inside its cell