In [26]:
import json
import random
import numpy as np
from collections import Counter
import pickle as pickle
import scipy.stats
import time

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

## Load Data and Embeddings

In [46]:
unwanted_chars = ['\\','.',',','/','\'s']
start = ['<null>']

label_dict = {'neutral':0,'contradiction':1,'entailment':2}

def load_data(path): #load SNLI words
    '''
    Constructs 4 dictionaries with the same key values across the dictionaries
    '''
    #data = []
    excluded = 0
    hypothesis = {}
    premise = {}
    label = {}
    label_enc = {}
    with open(path, 'r') as f:
        #need this so indexing is continuous when sentences are skipped over
        idx = 0
        for i,line in enumerate(f):
            obj = json.loads(line)
            #skip these rows per readme
            if obj["gold_label"] == '-':
                excluded += 1
            else:
                label[idx] = obj["gold_label"]
                label_enc[idx] = label_dict[obj["gold_label"]]
                premise[idx] = obj["sentence1"]
                hypothesis[idx] = obj["sentence2"]
                idx += 1
    print('%s excluded' %excluded)
    return hypothesis, premise, label, label_enc

def load_embeddings(path,words_to_load,emb_dim): #load pre-trained GloVe embeddings
    with open(path) as f:
        loaded_embeddings = np.zeros((words_to_load, emb_dim))
        words = {}
        idx2words = {}
        ordered_words = []
        for i, line in enumerate(f):
            if i >= words_to_load: 
                break
            s = line.split()
            loaded_embeddings[i, :] = np.asarray(s[1:])
            words[s[0]] = i
            idx2words[i] = s[0]
            ordered_words.append(s[0])

    return loaded_embeddings, words, idx2words, ordered_words
            
def add_tokens(idx_mapping, embeddings, emb_dim):
    '''
    This function increases the index of the word to index mapping for GloVe so that
    0: padding index
    1: unk
    2: BoS
    '''
    words_cnt = Counter(idx_mapping)
    increment = Counter(dict.fromkeys(words, 3))
    words_cnt = words_cnt + increment
    words_cnt['<PAD_IDX>'] = 0
    words_cnt['<UNK>'] = 1
    words_cnt['<BoS>'] = 2
    
    #insert embeddings for tokens
    '''
    TO DO: FIX INITILIZATION
    '''
    #<BoS>
    print(embeddings.shape)
    embed = np.insert(embeddings,[0],np.random.rand(300),axis=0)
    print(embed.shape)
    #<UNK>
    embed = np.insert(embed,[0],np.random.rand(300),axis=0)
    print(embed.shape)
    #<PAD_IDX>
    embed = np.insert(embed,[0],np.zeros(300),axis=0)
    print(embed.shape)
    
    return words_cnt, embed

def clean_words(text_list): # Removes characters and makes all words lowercase
    for i,word in enumerate(text_list):
        for ch in unwanted_chars:
            if ch in text_list[i]:
                text_list[i] = text_list[i].replace(ch,'')
            text_list[i] = text_list[i].lower()

def tokenize(text_dict, idx_mapping, pad_len):
    '''
    text_dict: dictionary with index as key, sentence as value
    returns dictionary with the index as key, sentenece mapped to index as value, and padded to pad_len
    
    QUESTION: How should we choose pad_len?  Should we truncate or should we set to the max length of 
    premise and hypothesis?
    '''
    tokenized_data = {}
    for i in range(len(text_dict.keys())):
        text_list = text_dict[i].split()
        clean_words(text_list)
        text_idx = []
        for word in text_list:
            try:
                text_idx.append(idx_mapping[word])
            except KeyError:
                #UNK token
                text_idx.append(1)
                continue
        #insert BoS token
        text_idx.insert(0,2)
        if len(text_idx) > pad_len:
            text_idx = text_idx[:pad_len]
        text_idx = np.concatenate((text_idx,np.zeros(max(pad_len-len(text_idx),0))))
                                    
        tokenized_data[i] = np.array(text_idx).astype(int)
    return tokenized_data

In [None]:
#for lisa's laptop
vocab_size = 50000
emb_dim = 300
num_classes = 3
learning_rate = .001
path = '/Users/Lisa/Documents/Grad School/DS-GA 1101/data/'
glove_path = path+'glove.6B/glove.6B.300d.txt'
text_path = path+'snli_1.0/snli_1.0_train.jsonl'

In [31]:
vocab_size = 50000
emb_dim = 300
num_classes = 3
#do we use learning rate anywhere?
learning_rate = .001
glove_path = 'glove/glove.6B.300d.txt'
text_path = 'snli_1.0/snli_1.0_train.jsonl'

In [32]:
hypothesis, premise, label, label_enc = load_data(text_path)

785 excluded


In [50]:
embeddings, words, idx2words, ordered_words = load_embeddings(glove_path, vocab_size, emb_dim)

In [51]:
#modifies embeddings, words, idx2words in place to add tokens
words, embeddings = add_tokens(words, embeddings, emb_dim)
idx2words = {v:k for k,v in words.items()}

(50000, 300)
(50001, 300)
(50002, 300)
(50003, 300)


In [52]:
h_len = 10
p_len = 15
h_idx = tokenize(hypothesis, words, h_len)
p_idx = tokenize(premise, words, p_len)

In [55]:
fp = path+'pickles/'
with open(fp+'h_idx.pt', 'wb') as f:
    pickle.dump(h_idx, f)
with open(fp+'p_idx.pt', 'wb') as f:
    pickle.dump(p_idx, f)

## Create Batches

In [56]:
#sample indices
idx_list = np.array((0,1,2,3))

In [57]:
h_test_batch = torch.LongTensor(4, h_len)
p_test_batch = torch.LongTensor(4, p_len)
l_test_batch = torch.LongTensor(4,1)

In [58]:
#test batch
for i in idx_list:
    h_test_batch[i] = torch.from_numpy(h_idx[i])
    p_test_batch[i] = torch.from_numpy(p_idx[i])
    l_test_batch[i] = label_enc[i]
h_test_batch = h_test_batch.long()
p_test_batch = p_test_batch.long()
l_test_batch = Variable(l_test_batch)

In [59]:
class DecomposableAttention(nn.Module):
    '''
    Starting with premise (a), we see if the hypothesis (b) is an 
    entailment, a contradiction, or neutral.
    '''
    def __init__(self, glove_emb, batch_size, hidden_size, h_len, p_len, num_classes, dropout=0.2):
        super(DecomposableAttention, self).__init__()
        self.glove = glove_emb
        self.num_embeddings = glove_emb.shape[0]
        self.embedding_dim = glove_emb.shape[1]
        self.batch_size = batch_size
        self.compare_dim = 2*self.embedding_dim
        self.aggregate_dim = 2*self.compare_dim
        self.h_len = h_len
        self.p_len = p_len
        
        self.embed = nn.Embedding(self.num_embeddings, self.embedding_dim)
        
        '''
        Should the linear layers have a bias?  How many layers should we have in our feed forward network?
        Yes?
        '''
        
        self.F_a = nn.Linear(self.embedding_dim*self.p_len,self.embedding_dim*self.p_len)
        self.F_b = nn.Linear(self.embedding_dim*self.h_len,self.embedding_dim*self.h_len)
        
        self.G_a = nn.Linear(self.embedding_dim*2,self.compare_dim)
        self.G_b = nn.Linear(self.embedding_dim*2,self.compare_dim)
        
        self.H = nn.Linear(self.compare_dim,self.aggregate_dim)
        self.output = nn.Linear(self.aggregate_dim,3)
       
        self.init_weights()
    
    def _mlp_layers(self, input_dim, output_dim):
        mlp_layers = []
        mlp_layers.append(nn.Dropout(p=0.2))
        mlp_layers.append(nn.Linear(
            input_dim, output_dim, bias=True))
        mlp_layers.append(nn.ReLU())
        mlp_layers.append(nn.Dropout(p=0.2))
        mlp_layers.append(nn.Linear(
            output_dim, output_dim, bias=True))
        mlp_layers.append(nn.ReLU())   
        #sequential runs all the layers in order
        return nn.Sequential(*mlp_layers)  
    
    def forward(self, hypothesis, premise, label):
        start_time = time.time()
        '''
        Embedding layer
        max length = max length of of hypothesis/premise (respectively) in batch
        Input dim: batch size x max length
        Output dim: batch size x max length x embedding dimensions
        '''
        p_embedded = self.embed(Variable(premise))
        h_embedded = self.embed(Variable(hypothesis))
        
        '''
        Relu layer (F from paper)
        max length = max length of of hypothesis/premise (respectively) in batch
        Input dim: batch size x max length x embedding dimensions
        Output dim: batch size x max length x embedding dimensions
        '''
        F_a = self.F_a(p_embedded.view(self.batch_size,-1))
        F_b = self.F_b(h_embedded.view(self.batch_size,-1))
        F_a = F.relu(F_a).view(self.batch_size,-1,self.embedding_dim)
        F_b = F.relu(F_b).view(self.batch_size,-1,self.embedding_dim)
        #E dim: batch_size x max len of hypothesis x max len of premise
        #transpose function swaps second and third axis so that F_b is batch size x embedding dim x len premise
        E = torch.matmul(F_a,torch.transpose(F_b,1,2))  
        
        '''
        Attention! 
        Given E, we reweight using the softmax and store in W_beta, W_alpha
        W_beta dim: batch_size x len(hypothesis) x embedding dimensions
        W_alpha dim: batch_size x len(premise) x embedding dimensions
        '''
        W_beta = Variable(torch.Tensor(self.batch_size,self.p_len,self.embedding_dim))
        W_alpha = Variable(torch.Tensor(self.batch_size,self.h_len,self.embedding_dim))
        '''
        TO DO: vectorize this with softmax on dimension (should be in next release of pytorch)
        '''
        for i in range(self.batch_size):
            for j in range(F_b.size()[1]):
                W_beta[i,j] = torch.mm(F.softmax(E[i,j]).view(1,-1),h_embedded[i]).data
            for k in range(F_a.size()[1]):
                W_alpha[i,j] = torch.mm(F.softmax(E[i,:,j]).view(1,-1),p_embedded[i]).data
        
        '''
        Compare
        Open items:
        1) Check that we're concatenating along the right dimensions.  Based on AllenNLP and libowen, 
            concatenated input should be batch size x len(hypothesis/premise) x (2 * embedding dim)
        
        Output:
        v1 dim: batch_size x len(hypothesis) x compare_dim
        v2 dim: batch_size x len(premise) x compare_dim
        '''
        #dim: batch size x len(hypotheis/premise) x (2* embedding dim)
        cat_p_beta = torch.cat((p_embedded,W_beta),2)
        cat_h_alpha = torch.cat((h_embedded,W_alpha),2)
        G_a = self.G_a(cat_p_beta.view(-1,2*self.embedding_dim)).view(self.batch_size,-1,self.compare_dim)
        G_b = self.G_b(cat_h_alpha.view(-1,2*self.embedding_dim)).view(self.batch_size,-1,self.compare_dim)
        
        v_a = F.relu(G_a).view(self.batch_size,-1,self.compare_dim)
        v_b = F.relu(G_b).view(self.batch_size,-1,self.compare_dim)
        
        '''
        Aggregate
        Given:
        v_a = output of relu activation on the concatenation of a (premise) and beta
        v_b = output of relu activation on the concatenation of b (hypothesis) and alpha
        '''
        v1 = torch.sum(v_a, dim=1)
        v2 = torch.sum(v_b, dim=1)
        H = F.relu(torch.cat((v1,v2),1))
        out = F.softmax(self.output(H))
        
        print("runtime for single batch: %s seconds" % (time.time() - start_time))
        return out
    
    
    def init_weights(self):
        self.embed.weight.data.copy_(torch.from_numpy(self.glove))

In [60]:
num_classes = 3
dropout_rate = .2
batch_size = 4
hidden_size = 100
h_len = 10
p_len = 15
da = DecomposableAttention(embeddings,batch_size,hidden_size,\
                           h_len,p_len,num_classes,dropout=dropout_rate)

In [61]:
optimizer = torch.optim.Adam(da.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

In [73]:
'''This should go in the training loop once our batch iterator is working:'''
optimizer.zero_grad()
output = da(h_test_batch,p_test_batch,l_test_batch)
loss = criterion(output,l_test_batch.view(-1))
loss.backward()
optimizer.step()

runtime for single batch: 0.02220892906188965 seconds


In [74]:
loss

Variable containing:
 0.6252
[torch.FloatTensor of size 1]