In [1]:
import json
import random
import numpy as np
from collections import Counter

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

## Load Data and Embeddings

In [178]:
# Substrings that clean_words strips out of every token.
unwanted_chars = ['\\', '.', ',', '/', "'s"]

# NOTE(review): this module-level `start` appears unused in the cells shown here.
start = ['<null>']

# Integer code assigned to each SNLI gold label by load_data.
label_dict = dict(neutral=0, contradiction=-1, entailment=1)

def load_data(path): #load SNLI sentence pairs
    '''
    Reads an SNLI .jsonl file (one JSON object per line) and builds four
    dictionaries that share the same consecutive integer keys (0, 1, 2, ...).

    Rows whose "gold_label" is '-' are skipped (per the SNLI readme) and
    counted; the count is printed once the file has been read.

    path: location of the .jsonl file

    Returns (hypothesis, premise, label, label_enc):
        hypothesis -- key -> "sentence2" text
        premise    -- key -> "sentence1" text
        label      -- key -> gold label string
        label_enc  -- key -> integer code looked up in label_dict

    Raises KeyError if a gold label is not a key of label_dict.
    '''
    hypothesis, premise, label, label_enc = {}, {}, {}, {}
    excluded = 0
    with open(path, 'r') as f:
        for line in f:
            obj = json.loads(line)
            gold = obj["gold_label"]
            if gold == '-':
                excluded += 1
                continue
            # The next key is the number of rows kept so far, so the keys stay
            # continuous even when sentences are skipped over.
            idx = len(label)
            label[idx] = gold
            label_enc[idx] = label_dict[gold]
            premise[idx] = obj["sentence1"]
            hypothesis[idx] = obj["sentence2"]
    print('%s excluded' % excluded)
    return hypothesis, premise, label, label_enc

def load_embeddings(path,number_of_words,emb_dim): #load pre-trained GloVe embeddings
    '''
    Loads the first `number_of_words` vectors from a GloVe-style text file, where
    each line is a token followed by `emb_dim` whitespace-separated floats.

    path: location of the embeddings text file (read as UTF-8)
    number_of_words: maximum number of lines (words) to read
    emb_dim: dimensionality of each vector

    Returns (loaded_embeddings, words, idx2words, ordered_words):
        loaded_embeddings -- float array of shape (number_of_words, emb_dim);
                             row i is the vector on line i of the file. Rows
                             stay all-zero if the file has fewer lines than
                             number_of_words.
        words             -- token -> row index
        idx2words         -- row index -> token
        ordered_words     -- tokens in file order

    Raises ValueError if a line does not hold a token plus emb_dim numbers.
    '''
    loaded_embeddings = np.zeros((number_of_words, emb_dim))
    words = {}
    idx2words = {}
    ordered_words = []

    # Decode explicitly as UTF-8 so that non-ASCII tokens load the same way
    # regardless of the platform's default encoding.
    with open(path, encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i >= number_of_words:
                break
            fields = line.split()
            if len(fields) <= emb_dim:
                raise ValueError(
                    'line %d of %s does not contain a token followed by %d values'
                    % (i + 1, path, emb_dim))
            # The vector is always the LAST emb_dim fields; whatever precedes it
            # is the token, which may itself contain spaces in some releases.
            token = ' '.join(fields[:-emb_dim])
            loaded_embeddings[i, :] = np.asarray(fields[-emb_dim:], dtype=float)
            words[token] = i
            idx2words[i] = token
            ordered_words.append(token)

    return loaded_embeddings, words, idx2words, ordered_words

def clean_words(text_list, chars=None): # Removes characters and makes all words lowercase
    '''
    Normalises the tokens of text_list IN PLACE: every token is lowercased and
    then every occurrence of each unwanted substring is deleted from it.

    text_list: list of string tokens; it is modified and nothing is returned
    chars: optional iterable of substrings to delete. Defaults to the
           module-level unwanted_chars. Matching is done against the
           lowercased token.

    A token can end up as the empty string (e.g. a lone '.'); it is kept in
    the list so that the list length never changes.
    '''
    if chars is None:
        chars = unwanted_chars
    for i, word in enumerate(text_list):
        # Lowercase once, outside the per-substring loop, so tokens are
        # lowercased even when there is nothing to strip.
        word = word.lower()
        for ch in chars:
            word = word.replace(ch, '')
        text_list[i] = word
            
def add_tokens(idx_mapping, embeddings, emb_dim):
    '''
    Reserves the first three vocabulary indices for special tokens and shifts
    every existing word index up by 3 so that
    0: padding index (<PAD_IDX>)
    1: unk (<UNK>)
    2: BoS (<BoS>)

    idx_mapping: dict mapping word -> row index into `embeddings`
    embeddings: array of shape (n_words, emb_dim)
    emb_dim: dimensionality of each embedding vector

    Returns (words_idx, embed):
        words_idx -- NEW plain dict with word -> shifted index plus the three
                     special tokens. A plain dict is returned (not a Counter)
                     so that looking up an unknown word raises KeyError rather
                     than silently yielding 0, the padding index.
        embed     -- NEW array of shape (n_words + 3, emb_dim) whose rows line
                     up with words_idx: row 0 is all zeros, rows 1 and 2 are
                     random, and rows 3.. are the original embeddings.

    Neither input is modified.
    '''
    n_special = 3

    # Shift the caller's mapping (not a global) so that index i -> i + 3.
    words_idx = {word: idx + n_special for word, idx in idx_mapping.items()}
    words_idx['<PAD_IDX>'] = 0
    words_idx['<UNK>'] = 1
    words_idx['<BoS>'] = 2

    # TODO: fix initialization -- <UNK> and <BoS> are drawn uniformly from
    # [0, 1), which may not match the scale of the pre-trained vectors.
    special_rows = np.vstack([
        np.zeros(emb_dim),        # <PAD_IDX>
        np.random.rand(emb_dim),  # <UNK>
        np.random.rand(emb_dim),  # <BoS>
    ])
    # Stack all three special rows on top of the ORIGINAL matrix in one step,
    # so the row count grows by exactly 3, matching the index shift.
    embed = np.vstack([special_rows, embeddings])

    return words_idx, embed

# NOTE(review): clean_words is also defined earlier in this same cell; this later
# definition is the one in effect once the cell has run. Consider keeping one copy.
def clean_words(text_list, chars=None): # Removes characters and makes all words lowercase
    '''
    Normalises the tokens of text_list IN PLACE: every token is lowercased and
    then every occurrence of each unwanted substring is deleted from it.

    text_list: list of string tokens; it is modified and nothing is returned
    chars: optional iterable of substrings to delete. Defaults to the
           module-level unwanted_chars. Matching is done against the
           lowercased token.

    A token can end up as the empty string (e.g. a lone '.'); it is kept in
    the list so that the list length never changes.
    '''
    if chars is None:
        chars = unwanted_chars
    for i, word in enumerate(text_list):
        # Lowercase once, outside the per-substring loop, so tokens are
        # lowercased even when there is nothing to strip.
        word = word.lower()
        for ch in chars:
            word = word.replace(ch, '')
        text_list[i] = word

def tokenize(text_dict, idx_mapping):
    '''
    Converts every sentence in text_dict into an array of vocabulary indices.

    text_dict: dictionary with index as key, sentence (string) as value
    idx_mapping: word -> vocabulary index; may be a plain dict or a Counter

    Each sentence is split on whitespace, normalised with clean_words, and
    mapped word by word. Words missing from idx_mapping get the UNK index (1),
    and the BoS index (2) is placed at the front of every sequence.

    Returns a dictionary with the same keys as text_dict, each mapped to a
    numpy array of indices.
    '''
    unk_idx, bos_idx = 1, 2
    tokenized_data = {}
    # Iterate over the actual keys so non-contiguous keys work too.
    for key, sentence in text_dict.items():
        text_list = sentence.split()
        clean_words(text_list)
        # Use .get() rather than [] + except KeyError: indexing a Counter with
        # a missing key returns 0 (the padding index) and never raises, so
        # unknown words would silently become padding instead of UNK.
        text_idx = [bos_idx] + [idx_mapping.get(word, unk_idx) for word in text_list]
        tokenized_data[key] = np.array(text_idx)
    return tokenized_data

In [179]:
# One-example subset of the hypotheses (key 0 only).
# NOTE(review): `hypothesis` is assigned by the load_data cell further down, so
# this cell fails when the notebook is run top-to-bottom on a fresh kernel.
test_h = {0:hypothesis[0]}

In [38]:
# Number of GloVe vectors to load and their dimensionality.
vocab_size = 50000
emb_dim = 300

# Single place to change when the data lives somewhere else.
DATA_DIR = '/Users/Lisa/Documents/Grad School/DS-GA 1101/data'

# The GloVe file name is derived from emb_dim so the two cannot drift apart.
glove_path = '%s/glove.6B/glove.6B.%dd.txt' % (DATA_DIR, emb_dim)
text_path = '%s/snli_1.0/snli_1.0_train.jsonl' % DATA_DIR

In [150]:
# Parse the SNLI training file into four dicts keyed by the same integers:
# hypothesis text, premise text, label string and encoded label.
hypothesis, premise, label, label_enc = load_data(text_path)

785 excluded


In [190]:
# Load the first vocab_size GloVe vectors plus the word <-> row-index lookups.
embeddings, words, idx2words, ordered_words = load_embeddings(glove_path, vocab_size, emb_dim)

In [None]:
# Add the <PAD_IDX>/<UNK>/<BoS> tokens. add_tokens returns new objects, which are
# rebound to the existing names `words` and `embeddings`; idx2words is then
# rebuilt as the inverse of the new mapping.
# NOTE(review): this cell is not idempotent -- running it a second time shifts
# the word indices by 3 again. Re-run the load_embeddings cell first.
words, embeddings = add_tokens(words, embeddings, emb_dim)
idx2words = {v:k for k,v in words.items()}

In [195]:
# Map every hypothesis / premise sentence to an array of vocabulary indices
# (each array starts with the BoS index).
h_idx = tokenize(hypothesis, words)
p_idx = tokenize(premise, words)

In [196]:
# Echo the tokenized premises for a visual sanity check.
# NOTE(review): this displays the entire dict; consider showing only a few entries.
p_idx

{0: array([    2,    10,   902,    16,    10,  2870, 11073,    77,    10,
         2324,   138,  7353]),
 1: array([    2,    10,   902,    16,    10,  2870, 11073,    77,    10,
         2324,   138,  7353]),
 2: array([    2,    10,   902,    16,    10,  2870, 11073,    77,    10,
         2324,   138,  7353]),
 3: array([   2,  274, 8784,    8, 8888,   25, 3537]),
 4: array([   2,  274, 8784,    8, 8888,   25, 3537]),
 5: array([   2,  274, 8784,    8, 8888,   25, 3537]),
 6: array([    2,    10,  1609,    17,  6864,    16, 36483,     9,     3,
          702,     6,    10,   642,  1644]),
 7: array([    2,    10,  1609,    17,  6864,    16, 36483,     9,     3,
          702,     6,    10,   642,  1644]),
 8: array([    2,    10,  1609,    17,  6864,    16, 36483,     9,     3,
          702,     6,    10,   642,  1644]),
 9: array([   2,   32, 1578,  303, 6022,   20,   29, 3203, 6991,   25,   10,
         360, 1804,    9,   10, 3427, 2858,  113, 1245,    9, 4132, 7064,
        6165

In [197]:
# Echo the raw premise sentences to compare against the tokenized arrays.
# NOTE(review): this displays the entire dict; consider showing only a few entries.
premise

{0: 'A person on a horse jumps over a broken down airplane.',
 1: 'A person on a horse jumps over a broken down airplane.',
 2: 'A person on a horse jumps over a broken down airplane.',
 3: 'Children smiling and waving at camera',
 4: 'Children smiling and waving at camera',
 5: 'Children smiling and waving at camera',
 6: 'A boy is jumping on skateboard in the middle of a red bridge.',
 7: 'A boy is jumping on skateboard in the middle of a red bridge.',
 8: 'A boy is jumping on skateboard in the middle of a red bridge.',
 9: 'An older man sits with his orange juice at a small table in a coffee shop while employees in bright colored shirts smile in the background.',
 10: 'An older man sits with his orange juice at a small table in a coffee shop while employees in bright colored shirts smile in the background.',
 11: 'An older man sits with his orange juice at a small table in a coffee shop while employees in bright colored shirts smile in the background.',
 12: 'Two blond women are hug

## Create Batches

In [7]:
def eval_iter(source):
    """Placeholder for the evaluation-set iterator.

    Not implemented yet (tbd): `source` is ignored and None is returned.
    """
    return None
    
def data_iter(source, batch_size):
    """Endless generator of shuffled mini-batches drawn from `source`.

    source: container indexable by the integers 0 .. len(source) - 1
    batch_size: number of examples per batch

    The index order is shuffled once up front and again each time an epoch
    ends. An epoch ends as soon as fewer than batch_size unseen examples
    remain, so a trailing partial batch is dropped rather than yielded.
    The generator never terminates; the caller decides when to stop.
    """
    n_examples = len(source)
    permutation = list(range(n_examples))
    random.shuffle(permutation)

    # Largest offset from which a full batch can still be cut.
    last_full_start = n_examples - batch_size
    cursor = 0
    while True:
        if cursor > last_full_start:
            # Start another epoch.
            cursor = 0
            random.shuffle(permutation)
        yield [source[k] for k in permutation[cursor:cursor + batch_size]]
        cursor += batch_size

In [8]:
"""Note that the Decomposable attention paper selected batch_size=4"""
# Create the endless training batch generator.
# NOTE(review): this rebinds the name `data_iter` from the generator function to
# a generator object, so the cell cannot be run twice without re-running the
# definition cell. `train_data` is not defined in any cell shown here -- confirm
# where it comes from.
data_iter = data_iter(train_data,4)

In [None]:
#TBD
class DecomposableAttention(nn.Module):
    '''
    Work-in-progress "attend" step of the decomposable attention model.

    Each sentence is passed through its own linear layer followed by a ReLU,
    and the two projected sentences are multiplied together to give the
    unnormalised alignment scores, mirroring the prototype cell that computes
    e_ij outside the class.

    input_size, hidden_dim and num_labels are accepted but not used yet.
    '''

    def __init__(self, input_size, embedding_dim, hidden_dim, num_labels):
        super(DecomposableAttention, self).__init__()
        #self.dropout = nn.Dropout(p=0.5)

        # Separate projections for the two sentences.
        self.a_linear = nn.Linear(embedding_dim,embedding_dim)
        self.b_linear = nn.Linear(embedding_dim,embedding_dim)

    def forward(self, x):
        '''
        x: pair (a_bar, b_bar) of embedded sentences whose last dimension is
           embedding_dim -- assumes (length, embedding_dim) tensors as in the
           prototype cell; TODO confirm once batching is decided.

        Returns the score matrix relu(a_linear(a_bar)) @ relu(b_linear(b_bar))^T,
        with one row per position of a_bar and one column per position of b_bar.
        '''
        a_bar, b_bar = x

        # Apply the layers to the inputs (the modules themselves are not tensors).
        a_relu = F.relu(self.a_linear(a_bar))
        b_relu = F.relu(self.b_linear(b_bar))

        return torch.matmul(a_relu, torch.transpose(b_relu, -2, -1))

    def init_weights(self):
        '''Re-initialises both projections: weights uniform in [-0.1, 0.1], biases 0.'''
        initrange = 0.1

        # Only layers that actually exist on this module are listed.
        for layer in (self.a_linear, self.b_linear):
            layer.weight.data.uniform_(-initrange, initrange)
            layer.bias.data.fill_(0)

In [101]:
# Prototype of the attend step on a single training example.
# NOTE(review): `train_data` is not defined in any cell shown here; presumably each
# entry holds already-embedded 'premise' / 'hypothesis' matrices with 300 columns,
# since nn.Linear(300, 300) is applied to them directly -- confirm.
a_bar = Variable(torch.Tensor(train_data[1432]['premise']))
b_bar = Variable(torch.Tensor(train_data[1432]['hypothesis']))

# Separate, freshly (randomly) initialised projections for each sentence.
a_linear = nn.Linear(300,300)
b_linear = nn.Linear(300,300)

a_relu = F.relu(a_linear(a_bar))
b_relu = F.relu(b_linear(b_bar))

# Unnormalised alignment scores: a_relu multiplied by the transpose of b_relu
# (for 2-D inputs, entry [i, j] pairs row i of a_relu with row j of b_relu).
e_ij = torch.matmul(a_relu,torch.transpose(b_relu,0,1))

In [203]:
def attention_weights(e,a_input,b_input):
    ''' Part 3.1
    Computes the soft-aligned sub-phrases from the unnormalised score matrix.

    e: score matrix of shape (len_a, len_b); e[i, j] scores word i of
       sentence a (the premise) against word j of sentence b (the hypothesis)
    a_input: embedded sentence a, shape (len_a, dim)
    b_input: embedded sentence b, shape (len_b, dim)

    Returns (alpha, beta):
        alpha -- shape (len_b, dim). The jth row aligns with the jth word in
                 sentence b: it is the average of the rows of a_input weighted
                 by the softmax of column j of e.
        beta  -- shape (len_a, dim). The ith row aligns with the ith word in
                 sentence a: it is the average of the rows of b_input weighted
                 by the softmax of row i of e.

    Only the three arguments are used; no notebook-level variables are read.
    '''
    # Normalise over the words of a (dim 0): every column of the weights sums to 1.
    weights_over_a = F.softmax(e, dim=0)
    # Normalise over the words of b (dim 1): every row of the weights sums to 1.
    weights_over_b = F.softmax(e, dim=1)

    # softmax is used instead of exp / sum(exp) to avoid overflow on large scores.
    alpha = torch.matmul(torch.transpose(weights_over_a, 0, 1), a_input)
    beta = torch.matmul(weights_over_b, b_input)

    return alpha, beta

In [None]:
 self.linear_1 = nn.Linear(embedding_dim, hidden_dim) 
        self.linear_2 = nn.Linear(hidden_dim, hidden_dim)
        self.linear_3 = nn.Linear(hidden_dim, num_labels)
        self.init_weights()
        
    def forward(self, x):
        # Pass the input through your layers in order

        out = F.relu(self.linear_1(out))
        out = F.relu(self.linear_2(out))