In [201]:
import json
import random
import numpy as np
from collections import Counter
import pickle as pickle

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

## Load Data and Embeddings

In [178]:
unwanted_chars = ['\\','.',',','/','\'s']
start = ['<null>']

label_dict = {'neutral':0,'contradiction':-1,'entailment':1}

def load_data(path): #load SNLI words
    '''
    Constructs 4 dictionaries with the same key values across the dictionaries
    '''
    #data = []
    excluded = 0
    hypothesis = {}
    premise = {}
    label = {}
    label_enc = {}
    with open(path, 'r') as f:
        #need this so indexing is continuous when setences are skipped over
        idx = 0
        for i,line in enumerate(f):
            obj = json.loads(line)
            #skip these rows per readme
            if obj["gold_label"] == '-':
                excluded += 1
            else:
                label[idx] = obj["gold_label"]
                label_enc[idx] = label_dict[obj["gold_label"]]
                premise[idx] = obj["sentence1"]
                hypothesis[idx] = obj["sentence2"]
                idx += 1
    print('%s excluded' %excluded)
    return hypothesis, premise, label, label_enc

def load_embeddings(path,number_of_words,emb_dim): #load pre-trained GloVe embeddings
    words_to_load = number_of_words

    with open(path) as f:
        loaded_embeddings = np.zeros((words_to_load, emb_dim))
        words = {}
        idx2words = {}
        ordered_words = []
        for i, line in enumerate(f):
            if i >= words_to_load: 
                break
            s = line.split()
            loaded_embeddings[i, :] = np.asarray(s[1:])
            words[s[0]] = i
            idx2words[i] = s[0]
            ordered_words.append(s[0])

    return loaded_embeddings, words, idx2words, ordered_words

def clean_words(text_list): # Removes characters and makes all words lowercase
    for i,word in enumerate(text_list):
        for ch in unwanted_chars:
            if ch in text_list[i]:
                text_list[i] = text_list[i].replace(ch,'')
            text_list[i] = text_list[i].lower()
            
def add_tokens(idx_mapping, embeddings, emb_dim):
    '''
    This function increases the index of the word to index mapping for GloVe so that
    0: padding index
    1: unk
    2: BoS
    '''
    words_cnt = Counter(idx_mapping)
    increment = Counter(dict.fromkeys(words, 3))
    words_cnt = words_cnt + increment
    words_cnt['<PAD_IDX>'] = 0
    words_cnt['<UNK>'] = 1
    words_cnt['<BoS>'] = 2
    
    #insert embeddings for tokens
    '''
    TO DO: FIX INITILIZATION
    '''
    #<BoS>
    embed = np.insert(embeddings,[0],np.random.rand(300),axis=0)
    #<UNK>
    embed = np.insert(embeddings,[0],np.random.rand(300),axis=0)
    #<PAD_IDX>
    embed = np.insert(embeddings,[0],np.zeros(300),axis=0)
    
    return words_cnt, embed

def clean_words(text_list): # Removes characters and makes all words lowercase
    for i,word in enumerate(text_list):
        for ch in unwanted_chars:
            if ch in text_list[i]:
                text_list[i] = text_list[i].replace(ch,'')
            text_list[i] = text_list[i].lower()

def tokenize(text_dict, idx_mapping):
    '''
    text_dict: dictionary with index as key, sentence as value
    returns dictionary with the index as key, setenece mapped to index as value
    '''
    tokenized_data = {}
    for i in range(len(text_dict.keys())):
        text_list = text_dict[i].split()
        clean_words(text_list)
        text_idx = []
        for word in text_list:
            try:
                text_idx.append(idx_mapping[word])
            except KeyError:
                #UNK token
                text_idx.append(1)
                continue
        #insert BoS token
        text_idx.insert(0,2)
        tokenized_data[i] = np.array(text_idx)
    return tokenized_data

In [179]:
test_h = {0:hypothesis[0]}

In [38]:
vocab_size = 50000
emb_dim = 300
glove_path = '/Users/Lisa/Documents/Grad School/DS-GA 1101/data/glove.6B/glove.6B.300d.txt'
text_path = '/Users/Lisa/Documents/Grad School/DS-GA 1101/data/snli_1.0/snli_1.0_train.jsonl'

In [150]:
hypothesis, premise, label, label_enc = load_data(text_path)

785 excluded


In [190]:
embeddings, words, idx2words, ordered_words = load_embeddings(glove_path, vocab_size, emb_dim)

In [None]:
#modifies embeddings, words, idx2words in place to add tokens
words, embeddings = add_tokens(words, embeddings, emb_dim)
idx2words = {v:k for k,v in words.items()}

In [195]:
h_idx = tokenize(hypothesis, words)
p_idx = tokenize(premise, words)

In [202]:
fp = '/Users/Lisa/Documents/Grad School/DS-GA 1101/data/pickles/'
with open(fp+'h_idx.pt', 'wb') as f:
    pickle.dump(h_idx, f)
with open(fp+'p_idx.pt', 'wb') as f:
    pickle.dump(p_idx, f)

## Create Batches

In [252]:
#sample indices
idx_list = np.array((0,1,2,3))

In [272]:
h_test_batch = torch.LongTensor(4, h_max_len)
p_test_batch = torch.LongTensor(4, p_max_len)
p_max_len

12

In [283]:
#pad batch
h_max_len = max([len(h_idx[x]) for x in idx_list])
p_max_len = max([len(p_idx[x]) for x in idx_list])
h_test_batch = torch.Tensor(4, h_max_len)
p_test_batch = torch.Tensor(4, p_max_len)
p_max_len
for i in range(4):
    h_test_batch[i] = torch.from_numpy(\
                    np.concatenate((h_idx[idx_list[i]],np.zeros(max(h_max_len-len(h_idx[idx_list[i]]),0))))\
                                    )
    p_test_batch[i] = torch.from_numpy(\
                    np.concatenate((p_idx[idx_list[i]],np.zeros(max(p_max_len-len(p_idx[idx_list[i]]),0))))\
                                    )
h_test_batch=h_test_batch.long()
p_test_batch=p_test_batch.long()

In [440]:
h_test_batch


     2     10    902     17    791     29   2870     13     10    994
     2     10    902     17     25     10  19304   7490     32      0
     2     10    902     17  13080     16     10   2870      0      0
     2     42     35   8784     25     47   1111      0      0      0
[torch.LongTensor of size 4x10]

In [424]:
#TBD
class DecomposableAttention(nn.Module):
    
    def __init__(self, batch_size, glove_emb, hidden_dim, num_labels):
        super(DecomposableAttention, self).__init__()
        self.glove = glove_emb
        self.num_embeddings = glove_emb.shape[0]
        self.embedding_dim = glove_emb.shape[1]
        
        
        self.embed = nn.Embedding(self.num_embeddings, self.embedding_dim)
        
        self.a_linear = nn.Linear(self.embedding_dim,self.embedding_dim)
        self.b_linear = nn.Linear(self.embedding_dim,self.embedding_dim)
        self.init_weights()
        
    def forward(self, hypothesis, premise):
        
        '''
        Embedding layer
        max length = max length of of hypothesis/premise (respectively) in batch
        Input dim: batch size x max length
        Output dim: batch size x max length x embedding dimensions
        '''
        h_embedded = self.embed(Variable(hypothesis))
        p_embedded = self.embed(Variable(premise))
        
        
        '''
        Relu layer (F from paper)
        max length = max length of of hypothesis/premise (respectively) in batch
        Input dim: batch size x (max length*embedding dimensions)
        Output dim: batch size x (max length*embedding dimensions)
        
        CHECK THAT THE RESHAPING IS CORRECT
        '''
        #input is reshaped to batch_size x (max length*embedding dimensions)
        #this is equivalent to having each 300 length embedding appended horizontally for each word
        F_a = F.relu(h_embedded.view(batch_size,-1))
        F_b = F.relu(p_embedded.view(batch_size,-1))
        #E dim: (max length of hypothesis*embedding dimensions) x (max length of premise*embedding dimensions)
        E = torch.mm(torch.transpose(F_a,0,1),F_b)
        
        '''
        TO DO: ATTENTION STEP
        we can use probs use torch.nn.Softmax
        '''
        
        return F_a, F_b, E

    def init_weights(self):
        self.embed.weight.data.copy_(torch.from_numpy(self.glove))

In [425]:
da = DecomposableAttention(4,embeddings,1,3)

In [426]:
F_a, F_b, E = da.forward(h_test_batch,p_test_batch)

In [427]:
F_a

Variable containing:
 0.0000  0.0000  0.1317  ...   0.1720  0.0000  0.0000
 0.0000  0.0000  0.1317  ...   0.0000  0.0000  0.0000
 0.0000  0.0000  0.1317  ...   0.0000  0.0000  0.0000
 0.0000  0.0000  0.1317  ...   0.0000  0.0000  0.0000
[torch.FloatTensor of size 4x3000]

In [439]:
E

Variable containing:
  0.0000   0.0000   0.0000  ...    0.0000   0.0000   0.0000
  0.0000   0.0000   0.0000  ...    0.0000   0.0000   0.0000
  0.0000   0.0000   0.0694  ...    0.0000   0.3091   0.0808
           ...               ⋱              ...            
  0.0000   0.0000   0.0227  ...    0.0000   0.1346   0.0352
  0.0000   0.0000   0.0000  ...    0.0000   0.0000   0.0000
  0.0000   0.0000   0.0000  ...    0.0000   0.0000   0.0000
[torch.FloatTensor of size 3000x3600]

In [437]:
em_test.view(4,-1)[3][300]

Variable containing:
-0.1329
[torch.FloatTensor of size 1]

In [438]:
em_test[3][1]

Variable containing:
-1.3292e-01
 1.6985e-01
-1.4360e-01
-8.8722e-02
 7.9510e-02
-1.4212e-01
-2.4209e-02
-2.6291e-01
-7.4814e-02
-2.3600e+00
 3.4830e-01
-9.1722e-02
-5.3906e-02
 3.0418e-01
-1.3286e-01
 5.0341e-03
-1.5056e-01
 2.3562e-03
 6.8321e-02
 3.4246e-01
 3.9891e-01
 5.8813e-01
 6.0618e-02
-1.9871e-01
-4.0465e-01
-1.0706e-01
-5.9312e-03
-6.4842e-01
 1.9080e-01
-1.7630e-01
 9.2407e-02
 3.8685e-01
-3.1085e-01
-3.2574e-01
-1.6823e+00
 2.5336e-01
-2.4647e-01
-1.0874e-01
 7.6402e-03
 3.3880e-01
-5.9736e-02
-8.5940e-01
-8.0964e-02
-2.2981e-01
 1.7709e-01
 8.2094e-02
 7.4416e-01
 3.6873e-01
 1.3740e-01
 2.9408e-01
 1.0647e-01
-1.3246e-01
 1.2134e-01
-1.4273e-01
-5.3270e-01
 6.4936e-01
 4.9657e-01
 3.0029e-01
 6.7226e-01
 1.8005e-01
 8.8050e-01
 3.8144e-02
-8.7140e-02
 7.6400e-01
-1.2107e-01
-4.2809e-01
-1.2588e-01
 8.8377e-04
 1.0596e-01
-3.0802e-01
 2.2887e-01
-2.5468e-01
-3.6484e-01
 7.4524e-01
-1.5217e-01
-5.5619e-02
 1.2049e-01
 3.9876e-01
-2.1991e-01
-1.8444e-01
-9.0398e-02
 1.4077

In [7]:
def eval_iter(source):
    #tbd
    return None
    
def data_iter(source, batch_size):
    dataset_size = len(source)
    start = -1 * batch_size
    order = list(range(dataset_size))
    random.shuffle(order)

    while True:
        start += batch_size
        if start > dataset_size - batch_size:
            # Start another epoch.
            start = 0
            random.shuffle(order)   
        batch_indices = order[start:start + batch_size]
        yield [source[index] for index in batch_indices]

In [8]:
"""Note that the Decomposable attention paper selected batch_size=4"""
data_iter = data_iter(train_data,4)

In [None]:
#TBD
class DecomposableAttention(nn.Module):
    
    def __init__(self, input_size, embedding_dim, hidden_dim, num_labels):
        super(DecomposableAttention, self).__init__()
        #self.dropout = nn.Dropout(p=0.5)
            
        self.a_linear = nn.Linear(embedding_dim,embedding_dim)
        self.b_linear = nn.Linear(embedding_dim,embedding_dim)
        
    def forward(self, x):
        # Pass the input through your layers in order
        
        a_relu = F.relu(self.a_linear)
        b_relu = F.relu(self.b_linear)
        
        return 

    def init_weights(self):
        '''tbd'''
        initrange = 0.1
        lin_layers = [self.linear_1, self.linear_2]
             
        for layer in lin_layers:
            layer.weight.data.uniform_(-initrange, initrange)
            if layer in lin_layers:
                layer.bias.data.fill_(0)

In [101]:
a_bar = Variable(torch.Tensor(train_data[1432]['premise']))
b_bar = Variable(torch.Tensor(train_data[1432]['hypothesis']))

a_linear = nn.Linear(300,300)
b_linear = nn.Linear(300,300)

a_relu = F.relu(a_linear(a_bar))
b_relu = F.relu(b_linear(b_bar))

e_ij = torch.matmul(a_relu,torch.transpose(b_relu,0,1))

In [203]:
def attention_weights(e,a_input,b_input):
    ''' Part 3.1
    Takes weight matrix e_ij, a_bar, and b_bar as inputs and returns 
    attention weight matrices alpha and beta.
    The jth row in alpha aligns with the jth word in sentence b (the hypothesis).
    The ith row in beta aligns with the ith word in sentence a (the premise)'''
    len_a = b_input.data.numpy().shape[0]
    len_b = a_input.data.numpy().shape[0]
    
    alphas = []
    betas = []
    for i in range(len_a):
        alphas.append(torch.sum(a_input*torch.transpose(torch.exp(e_ij[:,i])/ \
                      torch.sum(torch.exp(e_ij),dim=0)[i].view(-1,1),0,1),dim=0))
    for j in range(len_b):
        betas.append(torch.sum(b_bar*torch.transpose(torch.exp(e_ij[i,:])/ \
                    torch.sum(torch.exp(e_ij),dim=1)[i].view(-1,1),0,1),dim=0))
    
    alpha = torch.stack(alphas)
    beta = torch.stack(betas)
    
    return alpha, beta

In [None]:
 self.linear_1 = nn.Linear(embedding_dim, hidden_dim) 
        self.linear_2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear_3 = nn.Linear(hidden_dim, num_labels)
        self.init_weights()
        
    def forward(self, x):
        # Pass the input through your layers in order

        out = F.relu(self.linear_1(out))
        out = F.relu(self.linear_2(out))