In [201]:
import json
import random
import numpy as np
from collections import Counter
import pickle as pickle

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

## Load Data and Embeddings

In [178]:
unwanted_chars = ['\\','.',',','/','\'s']
start = ['<null>']

label_dict = {'neutral':0,'contradiction':-1,'entailment':1}

def load_data(path): #load SNLI words
    '''
    Constructs 4 dictionaries with the same key values across the dictionaries
    '''
    #data = []
    excluded = 0
    hypothesis = {}
    premise = {}
    label = {}
    label_enc = {}
    with open(path, 'r') as f:
        #need this so indexing is continuous when setences are skipped over
        idx = 0
        for i,line in enumerate(f):
            obj = json.loads(line)
            #skip these rows per readme
            if obj["gold_label"] == '-':
                excluded += 1
            else:
                label[idx] = obj["gold_label"]
                label_enc[idx] = label_dict[obj["gold_label"]]
                premise[idx] = obj["sentence1"]
                hypothesis[idx] = obj["sentence2"]
                idx += 1
    print('%s excluded' %excluded)
    return hypothesis, premise, label, label_enc

def load_embeddings(path,number_of_words,emb_dim): #load pre-trained GloVe embeddings
    words_to_load = number_of_words

    with open(path) as f:
        loaded_embeddings = np.zeros((words_to_load, emb_dim))
        words = {}
        idx2words = {}
        ordered_words = []
        for i, line in enumerate(f):
            if i >= words_to_load: 
                break
            s = line.split()
            loaded_embeddings[i, :] = np.asarray(s[1:])
            words[s[0]] = i
            idx2words[i] = s[0]
            ordered_words.append(s[0])

    return loaded_embeddings, words, idx2words, ordered_words

def clean_words(text_list): # Removes characters and makes all words lowercase
    for i,word in enumerate(text_list):
        for ch in unwanted_chars:
            if ch in text_list[i]:
                text_list[i] = text_list[i].replace(ch,'')
            text_list[i] = text_list[i].lower()
            
def add_tokens(idx_mapping, embeddings, emb_dim):
    '''
    This function increases the index of the word to index mapping for GloVe so that
    0: padding index
    1: unk
    2: BoS
    '''
    words_cnt = Counter(idx_mapping)
    increment = Counter(dict.fromkeys(words, 3))
    words_cnt = words_cnt + increment
    words_cnt['<PAD_IDX>'] = 0
    words_cnt['<UNK>'] = 1
    words_cnt['<BoS>'] = 2
    
    #insert embeddings for tokens
    '''
    TO DO: FIX INITILIZATION
    '''
    #<BoS>
    embed = np.insert(embeddings,[0],np.random.rand(300),axis=0)
    #<UNK>
    embed = np.insert(embeddings,[0],np.random.rand(300),axis=0)
    #<PAD_IDX>
    embed = np.insert(embeddings,[0],np.zeros(300),axis=0)
    
    return words_cnt, embed

def clean_words(text_list): # Removes characters and makes all words lowercase
    for i,word in enumerate(text_list):
        for ch in unwanted_chars:
            if ch in text_list[i]:
                text_list[i] = text_list[i].replace(ch,'')
            text_list[i] = text_list[i].lower()

def tokenize(text_dict, idx_mapping):
    '''
    text_dict: dictionary with index as key, sentence as value
    returns dictionary with the index as key, setenece mapped to index as value
    '''
    tokenized_data = {}
    for i in range(len(text_dict.keys())):
        text_list = text_dict[i].split()
        clean_words(text_list)
        text_idx = []
        for word in text_list:
            try:
                text_idx.append(idx_mapping[word])
            except KeyError:
                #UNK token
                text_idx.append(1)
                continue
        #insert BoS token
        text_idx.insert(0,2)
        tokenized_data[i] = np.array(text_idx)
    return tokenized_data

In [179]:
test_h = {0:hypothesis[0]}

In [38]:
vocab_size = 50000
emb_dim = 300
glove_path = '/Users/Lisa/Documents/Grad School/DS-GA 1101/data/glove.6B/glove.6B.300d.txt'
text_path = '/Users/Lisa/Documents/Grad School/DS-GA 1101/data/snli_1.0/snli_1.0_train.jsonl'

In [150]:
hypothesis, premise, label, label_enc = load_data(text_path)

785 excluded


In [190]:
embeddings, words, idx2words, ordered_words = load_embeddings(glove_path, vocab_size, emb_dim)

In [None]:
#modifies embeddings, words, idx2words in place to add tokens
words, embeddings = add_tokens(words, embeddings, emb_dim)
idx2words = {v:k for k,v in words.items()}

In [195]:
h_idx = tokenize(hypothesis, words)
p_idx = tokenize(premise, words)

In [202]:
fp = '/Users/Lisa/Documents/Grad School/DS-GA 1101/data/pickles/'
with open(fp+'h_idx.pt', 'wb') as f:
    pickle.dump(h_idx, f)
with open(fp+'p_idx.pt', 'wb') as f:
    pickle.dump(p_idx, f)

## Create Batches

In [252]:
#sample indices
idx_list = np.array((0,1,2,3))

In [272]:
h_test_batch = torch.LongTensor(4, h_max_len)
p_test_batch = torch.LongTensor(4, p_max_len)
p_max_len

12

In [283]:
#pad batch
h_max_len = max([len(h_idx[x]) for x in idx_list])
p_max_len = max([len(p_idx[x]) for x in idx_list])
h_test_batch = torch.Tensor(4, h_max_len)
p_test_batch = torch.Tensor(4, p_max_len)
p_max_len
for i in range(4):
    h_test_batch[i] = torch.from_numpy(\
                    np.concatenate((h_idx[idx_list[i]],np.zeros(max(h_max_len-len(h_idx[idx_list[i]]),0))))\
                                    )
    p_test_batch[i] = torch.from_numpy(\
                    np.concatenate((p_idx[idx_list[i]],np.zeros(max(p_max_len-len(p_idx[idx_list[i]]),0))))\
                                    )
h_test_batch=h_test_batch.long()
p_test_batch=p_test_batch.long()

In [440]:
h_test_batch


     2     10    902     17    791     29   2870     13     10    994
     2     10    902     17     25     10  19304   7490     32      0
     2     10    902     17  13080     16     10   2870      0      0
     2     42     35   8784     25     47   1111      0      0      0
[torch.LongTensor of size 4x10]

In [560]:
#TBD
class DecomposableAttention(nn.Module):
    
    def __init__(self, batch_size, glove_emb, hidden_dim, num_labels):
        super(DecomposableAttention, self).__init__()
        self.glove = glove_emb
        self.num_embeddings = glove_emb.shape[0]
        self.embedding_dim = glove_emb.shape[1]
        self.batch_size = batch_size
        
        
        self.embed = nn.Embedding(self.num_embeddings, self.embedding_dim)
        
        self.a_linear = nn.Linear(self.embedding_dim,self.embedding_dim)
        self.b_linear = nn.Linear(self.embedding_dim,self.embedding_dim)
        self.init_weights()
        
    def forward(self, hypothesis, premise):
        
        '''
        Embedding layer
        max length = max length of of hypothesis/premise (respectively) in batch
        Input dim: batch size x max length
        Output dim: batch size x max length x embedding dimensions
        '''
        h_embedded = self.embed(Variable(hypothesis))
        p_embedded = self.embed(Variable(premise))
        
        
        '''
        Relu layer (F from paper)
        max length = max length of of hypothesis/premise (respectively) in batch
        Input dim: batch size x max length x embedding dimensions
        Output dim: batch size x max length x embedding dimensions
        '''
        F_a = F.relu(h_embedded.view(self.batch_size,-1)).view(self.batch_size,-1,self.embedding_dim)
        F_b = F.relu(p_embedded.view(self.batch_size,-1)).view(self.batch_size,-1,self.embedding_dim)
        #E dim: batch_size x max len of hypothesis x max len of premise
        E = Variable(torch.Tensor(4,F_a.size()[1],F_b.size()[1]))
        
        '''
        Attention! 
        Given E, we reweight using the softmax and store in W_beta, W_alpha
        W_beta dim: batch_size x len(hypothesis) x embedding dimensions
        W_alpha dim: batch_size x len(premise) x embedding dimensions
        '''
        W_beta = Variable(torch.Tensor(self.batch_size,F_a.size()[1],self.embedding_dim))
        W_alpha = Variable(torch.Tensor(self.batch_size,F_b.size()[1],self.embedding_dim))
        for i in range(self.batch_size):
            E[i] = torch.mm(F_a[i],torch.transpose(F_b[i],0,1))
            for j in range(F_a.size()[1]):
                W_beta[i,j] = torch.mm(F.softmax(E[i,j]).view(1,-1),p_embedded[i]).data
            for k in range(F_b.size()[1]):
                W_alpha[i,j] = torch.mm(F.softmax(E[i,:,j]).view(1,-1),h_embedded[i]).data

        return E, W_beta, W_alpha

    def init_weights(self):
        self.embed.weight.data.copy_(torch.from_numpy(self.glove))

In [561]:
da = DecomposableAttention(4,embeddings,1,3)

In [562]:
E, W_beta, W_alpha = da.forward(h_test_batch,p_test_batch)

In [566]:
W_beta[1]

Variable containing:
-1.0659e-01  1.1331e-01 -2.8385e-02  ...  -2.0881e-01 -3.8104e-01  2.8343e-01
-1.2761e-03  3.6513e-01 -7.7368e-02  ...  -1.8361e-01 -7.6515e-01  3.9204e-01
-1.9189e-01 -1.9580e-01  1.1768e-01  ...  -3.5758e-01 -1.9245e-01 -2.0331e-01
                ...                   ⋱                   ...                
-9.8946e-02  2.8964e-01 -2.0800e-01  ...  -3.5109e-01  1.0225e-01  1.1507e-01
-1.2077e-02  3.2002e-01 -1.0270e-01  ...  -2.1910e-01 -5.8389e-01  2.9213e-01
-1.9678e-02  9.0183e-02 -1.1415e-01  ...  -2.8583e-01 -2.1692e-01  9.3908e-02
[torch.FloatTensor of size 10x300]

In [528]:
W_beta = Variable(torch.Tensor(4,F_a.size()[1],300)

In [544]:
W_beta

Variable containing:
( 0 ,.,.) = 
  0.0000e+00  2.5244e-29  0.0000e+00  ...   4.5744e-41  2.3946e-38  4.5744e-41
  0.0000e+00  0.0000e+00  0.0000e+00  ...   4.5744e-41  6.8430e-37  1.4013e-45
  1.4013e-45  4.5744e-41  5.6052e-45  ...   0.0000e+00  2.3953e-38  4.5744e-41
                 ...                   ⋱                   ...                
  2.8665e+32  6.7331e+22  7.2065e+31  ...   1.4013e-45  3.7697e+12  1.4013e-45
  7.8405e+20  1.4013e-45  3.7328e+12  ...   1.4013e-45  3.7361e+12  1.4013e-45
  3.7343e+12  1.4013e-45  3.7456e+12  ...   1.4013e-45  3.7309e+12  1.4013e-45

( 1 ,.,.) = 
  3.7456e+12  1.4013e-45  3.7435e+12  ...   1.4013e-45  3.7371e+12  1.4013e-45
  3.7692e+12  1.4013e-45  3.7476e+12  ...   1.4013e-45  3.7333e+12  1.4013e-45
  3.7457e+12  1.4013e-45  3.7398e+12  ...   1.4013e-45  7.7666e+20  1.4013e-45
                 ...                   ⋱                   ...                
 -3.6893e+19  1.6657e+00  0.0000e+00  ...   0.0000e+00  0.0000e+00  0.0000e+00
  0.

In [559]:
E = Variable(torch.Tensor(4,F_a.size()[1],F_b.size()[1]))
W_beta = Variable(torch.Tensor(4,F_a.size()[1],300))
W_alpha = Variable(torch.Tensor(4,F_b.size()[1],300))
for i in range(4):
    E[i] = torch.mm(F_a[i],torch.transpose(F_b[i],0,1))
    for j in range(F_a.size()[1]):
        W_beta[i,j] = torch.mm(F.softmax(E[i,j]).view(1,-1),p_embedded[i]).data
    for k in range(F_b.size()[1]):
        W_alpha[i,j] = torch.mm(F.softmax(E[i,:,j]).view(1,-1),h_embedded[i]).data

In [509]:
b_1 = torch.mm(F.softmax(E[1][1]).view(1,-1),p_embedded[1])

In [546]:
F_a.size()

torch.Size([4, 10, 300])

In [553]:
E[2,1]

Variable containing:
1.00000e-43 *
  0.4484
  1.5975
  1.3593
  1.5414
  1.4433
  1.4153
  0.5605
  0.9809
  1.3312
  1.3593
  0.6446
  1.6115
[torch.FloatTensor of size 12]

In [558]:
E[1,:,1]

Variable containing:
1.00000e-43 *
  1.2051
  1.3873
  0.9809
  1.3032
  1.4714
  0.5745
  0.8548
  1.3312
  1.6255
  1.3733
[torch.FloatTensor of size 10]

In [7]:
def eval_iter(source):
    #tbd
    return None
    
def data_iter(source, batch_size):
    dataset_size = len(source)
    start = -1 * batch_size
    order = list(range(dataset_size))
    random.shuffle(order)

    while True:
        start += batch_size
        if start > dataset_size - batch_size:
            # Start another epoch.
            start = 0
            random.shuffle(order)   
        batch_indices = order[start:start + batch_size]
        yield [source[index] for index in batch_indices]

In [8]:
"""Note that the Decomposable attention paper selected batch_size=4"""
data_iter = data_iter(train_data,4)

In [None]:
#TBD
class DecomposableAttention(nn.Module):
    
    def __init__(self, input_size, embedding_dim, hidden_dim, num_labels):
        super(DecomposableAttention, self).__init__()
        #self.dropout = nn.Dropout(p=0.5)
            
        self.a_linear = nn.Linear(embedding_dim,embedding_dim)
        self.b_linear = nn.Linear(embedding_dim,embedding_dim)
        
    def forward(self, x):
        # Pass the input through your layers in order
        
        a_relu = F.relu(self.a_linear)
        b_relu = F.relu(self.b_linear)
        
        return 

    def init_weights(self):
        '''tbd'''
        initrange = 0.1
        lin_layers = [self.linear_1, self.linear_2]
             
        for layer in lin_layers:
            layer.weight.data.uniform_(-initrange, initrange)
            if layer in lin_layers:
                layer.bias.data.fill_(0)

In [101]:
a_bar = Variable(torch.Tensor(train_data[1432]['premise']))
b_bar = Variable(torch.Tensor(train_data[1432]['hypothesis']))

a_linear = nn.Linear(300,300)
b_linear = nn.Linear(300,300)

a_relu = F.relu(a_linear(a_bar))
b_relu = F.relu(b_linear(b_bar))

e_ij = torch.matmul(a_relu,torch.transpose(b_relu,0,1))

In [203]:
def attention_weights(e,a_input,b_input):
    ''' Part 3.1
    Takes weight matrix e_ij, a_bar, and b_bar as inputs and returns 
    attention weight matrices alpha and beta.
    The jth row in alpha aligns with the jth word in sentence b (the hypothesis).
    The ith row in beta aligns with the ith word in sentence a (the premise)'''
    len_a = b_input.data.numpy().shape[0]
    len_b = a_input.data.numpy().shape[0]
    
    alphas = []
    betas = []
    for i in range(len_a):
        alphas.append(torch.sum(a_input*torch.transpose(torch.exp(e_ij[:,i])/ \
                      torch.sum(torch.exp(e_ij),dim=0)[i].view(-1,1),0,1),dim=0))
    for j in range(len_b):
        betas.append(torch.sum(b_bar*torch.transpose(torch.exp(e_ij[i,:])/ \
                    torch.sum(torch.exp(e_ij),dim=1)[i].view(-1,1),0,1),dim=0))
    
    alpha = torch.stack(alphas)
    beta = torch.stack(betas)
    
    return alpha, beta

In [None]:
 self.linear_1 = nn.Linear(embedding_dim, hidden_dim) 
        self.linear_2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear_3 = nn.Linear(hidden_dim, num_labels)
        self.init_weights()
        
    def forward(self, x):
        # Pass the input through your layers in order

        out = F.relu(self.linear_1(out))
        out = F.relu(self.linear_2(out))