# PS3: Neural Networks for Classification and Natural Language Inference

In [30]:
import json
import csv
import os
import glob

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F

from sklearn.metrics import f1_score, precision_score, recall_score

import numpy as np

The purpose of this task is to gain an understanding of training neural networks. Likewise, you will get to learn about the pytorch framework.

## Submission Instructions

After completing the exercises below, generate a pdf of the code **with** outputs. After that create a zip file containing both the completed exercise and the generated PDF. You are **required** to check the PDF to make sure all the code **and** outputs are clearly visible and easy to read. If your code goes off the page, you should reduce the line size. I generally recommend not going over 80 characters.

Finally, name the zip file using a combination of your the assigment and your name, e.g., ps3_rios.zip

## PART I: Data Cleaning (10 points)

Load the "surnames.csv" file to train a LSTM to predict nationality based on surname. You will need to transform the data from a list of strings to a list of indexes. For example, the following data

```
Anthony
John
David
```

should be transformed into a list of lists.

```
[[0, 1, 2, 3, 4, 1, 5],
 [6, 4, 3, 1],
 [7, 8, 9, 10, 11]]
```

Next, you will need zero-pad all examples to be the same size.

```
[[0, 1, 2, 3, 4, 1, 5],
 [6, 4, 3, 1, 0, 0, 0],
 [7, 8, 9, 10, 11, 0, 0]]
```

Finally, everything will be converted into numpy arrays.

In [31]:
def access_data(path):
    ret = {'X_train':[], 'y_train':[], 'X_dev':[], 'y_dev':[], 'X_test':[], 'y_test':[], 'len':[]}
    with open(path, 'r', encoding='utf-8') as f:
        reader = csv.reader(f,dialect='excel')
        for i, row in enumerate(reader):
#             if i > 5:
#                 break
#             print(row)
            if row[0] == 'train':
                ret['X_train'].append(row[1])
                ret['y_train'].append(row[2])
            if row[0] == 'test':
                ret['X_test'].append(row[1])
                ret['y_test'].append(row[2])
            if row[0] == 'dev':
                ret['X_dev'].append(row[1])
                ret['y_dev'].append(row[2])
            ret['len'].append(len(row[1]))
    return ret
    
def load_data(path):
    
    results = access_data(path)
    class2index = {}
    for class_name in set(results['y_train']+results['y_dev']+results['y_test']):
        class2index[class_name] = len(class2index)
    index2class = {ind:class_name for class_name,ind in class2index.items()}
    return results, class2index, index2class
    
    
def update_mappings(X, x2index, index2x, map_elements=True):
    if map_elements:
        xs = set([element for x in X for element in x])
    else:
        xs = set([x for x in X])
    for x in xs:
#         print("\nx:",x)
#         print(len(x2index))
        x2index[x] = len(x2index)
        index2x[len(index2x)] = x
    
        
def convert_to_index_map(X, x2index, map_element=True):
    index_mappings = []
    for x in X:
        if map_element:
            index_map = [x2index[element] if element in x2index else 0 for element in x]
        else:
            if x in x2index:
                index_map = x2index[x]
            else:
#                 print('S')
                index_map = 0
        index_mappings.append(index_map)
    return index_mappings

In [32]:
char2index = {'<PAD>': 0}
index2char = {0: '<PAD>'}
class2index = {} # stores the class index pairs.
index2class = {}
doc_lengths = [] # Stores the lengths of all docs (train, test and dev)
X_train = [] # stores an 
y_train = [] # stores an index to the correct class
X_dev = []
y_dev = []
X_test = []
y_test = []
X_train_len = [] # Stores the length of each training name
X_test_len = [] # ... length of each test name
X_dev_len = [] # ... length of each dev name

# Write code to load data here.
dataset_filename = 'surnames.csv'
datadir = 'data'
dataset_path = os.path.join(datadir, dataset_filename)
data_names = ['X_train', 'X_dev', 'X_test']


            
    
    
    
data, class2index, index2class = load_data(dataset_path)
X_train, y_train, X_dev, y_dev, X_test, y_test = data['X_train'],data['y_train'],data['X_dev'],data['y_dev'] \
                                                 ,data['X_test'],data['y_test']
doc_lengths = data['len']
d = [X_train, X_dev, X_test]
dl = [X_train_len, X_dev_len, X_test_len]
[dl[i].append(len(x)) for i in range(len(d)) for x in d[i]]
update_mappings(X_train, char2index, index2char)
# print(class2index)
# update_mappings(y_train, class2index, index2class, map_elements=False)
print(char2index)
print(class2index)
print(index2class)

X_train_nums, X_dev_nums = convert_to_index_map(X_train, char2index),convert_to_index_map(X_dev, char2index)
X_test_nums = convert_to_index_map(X_test, char2index)
y_train = convert_to_index_map(y_train, class2index, map_element=False)
y_dev = convert_to_index_map(y_dev, class2index, map_element=False)
y_test = convert_to_index_map(y_test, class2index, map_element=False)

{'<PAD>': 0, 'q': 1, 'g': 2, 's': 3, 'S': 4, 'Ś': 5, 'a': 6, ',': 7, 'n': 8, 'ö': 9, 'M': 10, 'e': 11, 'ü': 12, 'I': 13, 'ñ': 14, 'U': 15, 'ą': 16, 'x': 17, 'á': 18, 'o': 19, 'à': 20, '-': 21, 'L': 22, 'z': 23, 'Á': 24, 'J': 25, 'w': 26, 'ê': 27, ' ': 28, 't': 29, 'ł': 30, 'E': 31, 'ä': 32, 'F': 33, 'H': 34, 'ß': 35, 'u': 36, 'm': 37, 'b': 38, 'ó': 39, 'X': 40, 'v': 41, 'O': 42, 'j': 43, 'i': 44, 'l': 45, 'T': 46, 'B': 47, 'í': 48, 'V': 49, 'ż': 50, 'Q': 51, 'ú': 52, 'G': 53, 'Y': 54, 'y': 55, 'k': 56, 'C': 57, '/': 58, 'ì': 59, 'é': 60, 'c': 61, 'f': 62, 'p': 63, 'h': 64, 'd': 65, 'Z': 66, 'N': 67, 'W': 68, "'": 69, 'A': 70, 'õ': 71, 'ã': 72, 'r': 73, 'K': 74, 'ò': 75, 'ń': 76, 'è': 77, 'ù': 78, 'D': 79, 'R': 80, 'P': 81, '1': 82}
{'italian': 0, 'dutch': 1, 'chinese': 2, 'spanish': 3, 'japanese': 4, 'russian': 5, 'german': 6, 'french': 7, 'korean': 8, 'polish': 9, 'scottish': 10, 'czech': 11, 'greek': 12, 'portuguese': 13, 'english': 14, 'irish': 15, 'vietnamese': 16, 'arabic': 17}
{0

In [33]:
from collections import Counter


print(len(X_dev))
print(len(X_dev_len))
print(len(doc_lengths))
print(X_train_nums[1])
print(X_train[1])
print(y_train[0])
cnt = Counter()
cnt.update(y_train)
print(cnt)
cnt = [val for key, val in sorted(cnt.items(), key=lambda x: x[0])]
class_weights = torch.FloatTensor(cnt)/sum(cnt)
print(class_weights)

3060
3060
20074
[81, 73, 44, 56, 6, 23, 61, 64, 44, 56, 19, 41]
Prikazchikov
17
Counter({5: 7050, 14: 2713, 17: 1507, 4: 770, 6: 532, 0: 526, 11: 380, 1: 223, 3: 213, 7: 203, 2: 199, 15: 174, 12: 154, 9: 103, 10: 73, 8: 66, 13: 57, 16: 57})
tensor([0.0351, 0.0149, 0.0133, 0.0142, 0.0513, 0.4700, 0.0355, 0.0135, 0.0044,
        0.0069, 0.0049, 0.0253, 0.0103, 0.0038, 0.1809, 0.0116, 0.0038, 0.1005])


In [34]:
# PADDING

max_seq_len = max(doc_lengths)
len_to_pad = len(max(X_train, key=lambda x: len(x)))
print('longest in training set:', len_to_pad)
X_train_eq_size = []
X_dev_eq_size = []
X_test_eq_size = []

def pad_example(ex, len_to_pad, pad):
    padded = ex[:len_to_pad] +[pad]*(len_to_pad -len(ex))
    return padded
# Write code to append data to code here
for x in X_train_nums:
    X_train_eq_size.append(pad_example(x,len_to_pad, 0))
    
for x in X_dev_nums:
    X_dev_eq_size.append(pad_example(x,len_to_pad, 0))
    
for x in X_test_nums:
    X_test_eq_size.append(pad_example(x,len_to_pad, 0))
    
print(len(X_dev))
X_train = np.array(X_train_eq_size)
X_dev = np.array(X_dev_eq_size)
X_test = np.array(X_test_eq_size)
print(len(X_dev))
print(X_dev[0])
y_train = np.array(y_train)
y_dev = np.array(y_dev)
y_test = np.array(y_test)

X_train_len = np.array(X_train_len)
X_dev_len = np.array(X_dev_len)
X_test_len = np.array(X_test_len)

idx = np.argsort(X_dev_len)[::-1]
X_dev = X_dev[idx]
y_dev = y_dev[idx]
X_dev_len = X_dev_len[idx]

idx = np.argsort(X_test_len)[::-1]
X_test = X_test[idx]
y_test = y_test[idx]
X_test_len = X_test_len[idx]

doc_lengths = np.array(doc_lengths)
print(X_train.shape)

longest in training set: 20
3060
3060
[33 36 73 45 19  8  2  0  0  0  0  0  0  0  0  0  0  0  0  0]
(15000, 20)


In [35]:
print(np.unique(y_train, axis=0, return_counts=True))
print(X_dev.shape)
print(np.unique(X_dev,  return_counts=True))
print(X_dev)
print(X_dev[1])

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17]), array([ 526,  223,  199,  213,  770, 7050,  532,  203,   66,  103,   73,
        380,  154,   57, 2713,  174,   57, 1507], dtype=int64))
(3060, 20)
(array([ 0,  1,  2,  3,  4,  5,  6,  8,  9, 10, 11, 13, 14, 15, 17, 18, 19,
       20, 21, 22, 23, 25, 26, 28, 29, 31, 32, 33, 34, 35, 36, 37, 38, 39,
       40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 51, 52, 53, 54, 55, 56, 57,
       60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 73, 74, 75, 79, 80, 81]), array([39281,     2,   333,   999,   216,     1,  2312,  1395,     1,
         230,  1573,    35,     1,    20,     9,     1,  1625,     2,
           4,   104,   205,   118,   131,    15,   716,    66,     1,
          91,   178,     1,   673,   404,   330,     4,     6,   842,
          54,    80,  1488,   880,   173,   251,     2,   117,     6,
           2,   179,    59,   503,   870,    89,     2,   389,   199,
         132,  1019,   465,    78,   108

## PART II: Classification (25 points)

In [36]:
class LSTM(nn.Module):
    def __init__(self, nb_layers, word2index, class2index, nb_lstm_units=100,
                 embedding_dim=3, batch_size=3, bidirectional=False):
        super(LSTM, self).__init__()
        self.vocab = word2index
        self.tags = class2index

        self.nb_layers = nb_layers
        self.nb_lstm_units = nb_lstm_units
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size

        self.nb_tags = len(self.tags)
        self.bidirectional = bidirectional
        self.num_directions = 2 if self.bidirectional else 1
        print('num_directions:', self.num_directions)
        # build actual NN
        self.__build_model()

    def __build_model(self):
        nb_vocab_words = len(self.vocab)

        padding_idx = self.vocab['<PAD>']
        self.word_embedding = nn.Embedding(
            num_embeddings=nb_vocab_words,
            embedding_dim=self.embedding_dim,
            padding_idx=padding_idx
        )

        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.nb_lstm_units,
            num_layers=self.nb_layers,
            batch_first=True
        )
        
        if self.bidirectional:
            
            self.lstm_back = nn.LSTM(
                input_size=self.embedding_dim,
                hidden_size=self.nb_lstm_units,
                num_layers=self.nb_layers,
                batch_first=True
            )

        self.hidden_to_tag = nn.Linear(self.nb_lstm_units*self.nb_layers*self.num_directions, self.nb_tags)
        
        self.logsoftmax = nn.LogSoftmax(dim=1)
        self.softmax = nn.Softmax(dim=1)
        self.inference = False

    def init_hidden(self, X, bidirectional=False):
#         if bidirectional:
#             h0 = torch.zeros(self.nb_layers, X.size(0), self.nb_lstm_units*self.num_directions).float()
#             c0 = torch.zeros(self.nb_layers, X.size(0), self.nb_lstm_units*self.num_directions).float()
#         else:
            # Initial ht (hidden state) and ct (context)
        h0 = torch.zeros(self.nb_layers, X.size(0), self.nb_lstm_units).float()
        c0 = torch.zeros(self.nb_layers, X.size(0), self.nb_lstm_units).float()
        return (h0,c0)

    def forward(self, X, X_lengths):
        # reset the LSTM hidden state. Must be done before you run a new batch.
        # Otherwise the LSTM will treat
        # a new batch as a continuation of a sequence
        self.hidden = self.init_hidden(X)
        self.hidden_back = self.init_hidden(X)
        
        batch_size, seq_len = X.shape
        
        # ---------------------
        # 1. embed the input
        # Dim transformation: (batch_size, seq_len) -> (batch_size, seq_len, embedding_dim)
#         print(X[:128,:])
#         print(X[:128,:].size())
        
        X = self.word_embedding(X)
        if self.bidirectional:
            X = torch.cat((X,torch.flip(X,[1])), 2)
            

        # ---------------------
        # 2. Run through RNN
        # TRICK 2 ********************************
        # Dim transformation: (batch_size, seq_len, embedding_dim) -> (batch_size, seq_len, nb_lstm_units)

    
        # pack_padded_sequence so that padded items in the sequence won't be shown to the LSTM
        if self.bidirectional:
            X_for = torch.nn.utils.rnn.pack_padded_sequence(X[:,:,:self.embedding_dim], X_lengths, batch_first=True)
            X_back = torch.nn.utils.rnn.pack_padded_sequence(X[:,:,self.embedding_dim:], X_lengths, batch_first=True)
        else:
            X = torch.nn.utils.rnn.pack_padded_sequence(X, X_lengths, batch_first=True)
        
        
        # now run through LSTM
        # X contains the padded sequence output and ht contains the final hidden states
        if self.bidirectional:
            X_for, (ht_for, ct_for) = self.lstm(X_for, self.hidden)
#             X_back, (ht_back, ct_back) = self.lstm(X_back, self.hidden_back)
            X_back, (ht_back, ct_back) = self.lstm_back(X_back, self.hidden) 
            ht = torch.cat((ht_for, ht_back), 2)
        else:
            X, (ht, ct) = self.lstm(X, self.hidden)
        
#         print('hidden state shape:', ht.size())
        
        # Reshape to use the final state from each lstm layer
        out = ht.view(ht.size(1), self.nb_lstm_units*self.nb_layers*self.num_directions)

        # pass final states to output layer
        out = self.hidden_to_tag(out)
        
        # Use logsoftmax for training and softmax for testing
        if not self.inference:
            Y_hat = self.logsoftmax(out)
        else:
            Y_hat = self.softmax(out)

        return Y_hat

In [47]:
num_layers = 1
epochs = 15
batch_size = 16
lstm_unit_size = 128
embedding_size = 128
prints_per_epoch = 12
print_iter = len(y_train)//batch_size//prints_per_epoch
bidirectional = False

m = LSTM(num_layers, char2index, class2index, nb_lstm_units = lstm_unit_size,
         embedding_dim = embedding_size, batch_size = batch_size, bidirectional=bidirectional)

criterion = nn.NLLLoss()#size_average=False,weight=1/class_weights
optim = torch.optim.Adam(m.parameters(), lr=0.01)

indeces = np.arange(X_train.shape[0])     
print('y_train shape:', y_train.shape)
print('Iterations per epoch:', y_train.shape[0] // batch_size)
print('Number of prints per epoch:',y_train.shape[0] //batch_size // print_iter)

dev_idx = np.argsort(np.array(X_dev_len))[::-1]
np_X_dev = torch.tensor(X_dev[dev_idx]).long()
np_X_dev_len = torch.tensor(X_dev_len[dev_idx]).long() 

for epoch in range(epochs):
    np.random.shuffle(indeces)
    x_train = X_train[indeces]
    y_train2 = y_train[indeces]
    x_lens = X_train_len[indeces]

    np_x_sorted_lens = np.array(x_lens)[np.argsort(np.array(x_lens))[::-1]]
    current_batch = 0
    for iteration in range(y_train2.shape[0] // batch_size):
        
        batch_lengths = x_lens[current_batch: current_batch + batch_size]
        lengths = np.array(batch_lengths)
        idx = np.argsort(lengths)[::-1]
        batch_lengths = batch_lengths[idx]
        batch_lengths = torch.tensor(batch_lengths).long()
        
        
        batch_x = x_train[current_batch: current_batch + batch_size]
        batch_x = batch_x[idx]
        batch_x = torch.tensor(batch_x).long()
        
        batch_y = y_train2[current_batch: current_batch + batch_size]
        batch_y = batch_y[idx]
        batch_y = torch.tensor(batch_y).long()
        
        current_batch += batch_size
                        
        optim.zero_grad()
        if len(batch_x) > 0:
            batch_pred = m(batch_x, batch_lengths)
            
            loss = criterion(batch_pred, batch_y)
            loss.backward()
            optim.step()

        if iteration % print_iter == 0:
            with torch.no_grad():
                m.train(False)
                m.inference = True
                train_batch_pred = np.array(m(batch_x, batch_lengths)).argmax(axis=1)
                train_mic_f1 = f1_score(batch_y, train_batch_pred, average='micro')
                train_mac_f1 = f1_score(batch_y, train_batch_pred, average='macro')
          
                dev_pred = np.array(m(np_X_dev, np_X_dev_len)).argmax(axis=1)
                
                #There is a bug in the shuffling here
                dev_batch_y = y_dev[dev_idx]
                dev_mic_f1 = f1_score(dev_batch_y, dev_pred, average='micro')
                dev_mac_f1 = f1_score(dev_batch_y, dev_pred, average='macro')
#                 precision = precision_score(batch_y, batch_pred, average='micro')
#                 recall = recall_score(batch_y, batch_pred, average='micro')
                print(f'\ttraining loss {loss.item():.3f}\titeraton: { iteration}\tepoch {epoch} ')
                print('\tTrain:')      
                print(f'\t\tmicro f1 { train_mic_f1:.3f} macro f1 {train_mac_f1:.3f}')
                print('\tDev:')      
                print(f'\t\tmicro f1 { dev_mic_f1:.3f} macro f1 {dev_mac_f1:.3f}\n')
#                 print('\tPrediction counts:')
#                 uniques = np.unique(train_batch_pred, axis=0, return_counts=True)
#                 print('\tpred indeces')
#                 print(uniques[0])
#                 print(uniques[0][0])
#                 print(len(uniques[0]))
#                 print('\t\t',end='')
#                 [print(str(uniques[0][i])+'  ', end='') for i in range(len(uniques[0]))]
#                 print()
#                 print('\tpred counts')
#                 print('\t\t',end='')
#                 [print(str(uniques[1][i])+'  ', end='') for i in range(len(uniques[0]))]
#                 print()
#                 print('\ttrue indeces')
#                 print('\t\t',end='')
#                 true_unqs = np.unique(batch_y, axis=0, return_counts=True)
#                 [print(str(true_unqs[0][i])+'  ', end='') for i in range(len(true_unqs[0]))]
#                 print()
#                 print('\ttrue counts')
#                 print('\t\t',end='')
#                 [print(str(true_unqs[1][i])+'  ', end='') for i in range(len(true_unqs[0]))]
#                 print()
#                 
                m.train(True)
                m.inference = False
        
        
    
    with torch.no_grad():
        m.train(False)
        m.inference = False
        np_full_lens = np.array(x_lens
        full_idx = np.argsort()[::-1]
        raw_pred = m(torch.tensor(x_train[full_idx]).long(),torch.tensor(x_lens[full_idx]).long() )
        train_pred = np.array(raw_pred).argmax(axis=1)
        train_mic_f1 = f1_score(y_train2, train_pred, average='micro')
        print(train_pred[:10])
        print(y_train2[:10])
        train_mac_f1 = f1_score(y_train2, train_pred, average='macro')
        loss = criterion(raw_pred, torch.tensor(y_train2).long())

        
        dev_pred = np.array(m(np_X_dev, np_X_dev_len)).argmax(axis=1)

        dev_batch_y = y_dev[dev_idx]
        dev_mic_f1 = f1_score(y_dev, dev_pred, average='micro')
        dev_mac_f1 = f1_score(y_dev, dev_pred, average='macro')
#                 precision = precision_score(batch_y, batch_pred, average='micro')
#                 recall = recall_score(batch_y, batch_pred, average='micro')
        print('EPOCH SUMMARY:')
        print(f'training loss {loss.item():.3f}\titeraton: { iteration}\tepoch {epoch} ')
        print('Train:')      
        print(f'\tmicro f1 { train_mic_f1:.3f} macro f1 {train_mac_f1:.3f}')
        print('Dev:')      
        print(f'\tmicro f1 { dev_mic_f1:.3f} macro f1 {dev_mac_f1:.3f}\n')
#                 print('\tPrediction counts:')
#                 uniques = np.unique(train_batch_pred, axis=0, return_counts=True)
#                 print('\tpred indeces')
#                 print(uniques[0])
#                 print(uniques[0][0])
#                 print(len(uniques[0]))
#                 print('\t\t',end='')
#                 [print(str(uniques[0][i])+'  ', end='') for i in range(len(uniques[0]))]
#                 print()
#                 print('\tpred counts')
#                 print('\t\t',end='')
#                 [print(str(uniques[1][i])+'  ', end='') for i in range(len(uniques[0]))]
#                 print()
#                 print('\ttrue indeces')
#                 print('\t\t',end='')
#                 true_unqs = np.unique(batch_y, axis=0, return_counts=True)
#                 [print(str(true_unqs[0][i])+'  ', end='') for i in range(len(true_unqs[0]))]
#                 print()
#                 print('\ttrue counts')
#                 print('\t\t',end='')
#                 [print(str(true_unqs[1][i])+'  ', end='') for i in range(len(true_unqs[0]))]
#                 print()
#                 
        m.train(True)
        m.inference = False

num_directions: 1
y_train shape: (15000,)
Iterations per epoch: 937
Number of prints per epoch: 12
	training loss 2.932	iteraton: 0	epoch 0 
	Train:
		micro f1 0.812 macro f1 0.682
	Dev:
		micro f1 0.411 macro f1 0.053

	training loss 1.148	iteraton: 78	epoch 0 
	Train:
		micro f1 0.812 macro f1 0.764
	Dev:
		micro f1 0.651 macro f1 0.230

	training loss 0.966	iteraton: 156	epoch 0 
	Train:
		micro f1 0.750 macro f1 0.438
	Dev:
		micro f1 0.702 macro f1 0.267

	training loss 1.787	iteraton: 234	epoch 0 
	Train:
		micro f1 0.438 macro f1 0.152
	Dev:
		micro f1 0.715 macro f1 0.280

	training loss 0.947	iteraton: 312	epoch 0 
	Train:
		micro f1 0.812 macro f1 0.747
	Dev:
		micro f1 0.722 macro f1 0.298

	training loss 1.689	iteraton: 390	epoch 0 
	Train:
		micro f1 0.500 macro f1 0.216
	Dev:
		micro f1 0.737 macro f1 0.283

	training loss 1.029	iteraton: 468	epoch 0 
	Train:
		micro f1 0.688 macro f1 0.318
	Dev:
		micro f1 0.730 macro f1 0.297

	training loss 0.626	iteraton: 546	epoch 0 

KeyboardInterrupt: 

Answer the following questions below:

1. What was the micro and macro F1 on the test and dev sets?

They have a lot of variation, but both hover around the high 40s for micro, then in the teens for the macro f1
2. Implement a bidirectional LSTM model. You will need to modify the hidden states and self.lstm variables. Does it work better? 
No, it didn't
3. Experiments with the various hyperparameters (hidden state size, learning rate, etc.). What hyperparemeters result in the best performance? 
Many of them seemed similar. The original ones were among the best

## PART III: Natural Language Inference (25 points)

Natural language inference is the task of determining whether a "hypothesis" is true (entailment), false (contradiction), or undetermined (neutral) given a "premise"[1, 2]. This task has been known to perform well for zero-shot classification[3].

Example:

| Premise | Label | Hypothesis |
| ------- | ----- | ---------- |
| A man inspects the uniform of a figure in some East Asian country. | contradiction | The man is sleeping. |
| An older and younger man smiling | neutral | Two men are smiling and laughing at the cats playing on the floor. |
| A soccer game with multiple males playing. | entailment | Some men are playing a sport. |

Your task is to load and train a model on the "multinli_1.0_train.jsonl" dataset and evaluate on "multinli_1.0_dev_matched.jsonl" using accuracy.

I am leaving this task relativley open. One solution is to modify the LSTM code above to pass two documents through a LSTM model and return the last hidden state for each. Next, concatenate the two vectors, then pass it through a softmax layer. Finally, train using the same forumlate as Part I.

**NOTE:** You do not need to train until convergence. You can train for only an epoch or 2 max; train less if it takes to long. I simply want to see that it runs and is learning.


[1] Williams, Adina, Nikita Nangia, and Samuel Bowman. "A Broad-Coverage Challenge Corpus for Sentence Understanding through Inference." Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers). 2018.

[2] Bowman, Samuel R., et al. "A large annotated corpus for learning natural language inference." Proceedings of the 2015 Conference on Empirical Methods in Natural Language Processing. 2015.

[3] Yin, Wenpeng, Jamaal Hay, and Dan Roth. "Benchmarking Zero-shot Text Classification: Datasets, Evaluation and Entailment Approach." Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP). 2019.

In [14]:
# COPY AND EDIT CODE HERE
import json

train_path = 'data/multinli_1.0_train.jsonl'

def load_data(path):
    with open(path, 'r') as f:
        data = f.readlines()
        
    X = []
    Y = []
    for d in data:
    #     print(d)
        d_dict = json.loads(d)
    #     print(d_dict)
    #     print(type(d_dict))
        X.append((d_dict['sentence1'], d_dict['sentence2']))
        Y.append(d_dict['gold_label'])
    return X, Y

trainX, trainY = load_data(train_path)

In [15]:
from nltk.tokenize import word_tokenize

# This takes like 5-10 minutes to run
# flipped = [a for a in zip(*trainX)]
trainX = [[word_tokenize(sent) for sent in a] for a in trainX]

In [16]:
trainX_lens = [ [len(t) for t in tup] for tup in  trainX]
max_len = max([ max(a) for a in zip(*trainX_lens)])
print(max_len)

401


In [21]:
print(len(trainX))

392702


In [22]:
def pad_example(ex, len_to_pad, pad):
    padded = [ex[0][:len_to_pad] +[pad]*(len_to_pad -len(ex[0])),ex[1][:len_to_pad] +[pad]*(len_to_pad -len(ex[1]))]
    return padded
# Write code to append data to code here
# for x in X_train_nums:
#     X_train_eq_size.append(pad_example(x,max_len, 0))
    
# for x in X_dev_nums:
#     X_dev_eq_size.append(pad_example(x,max_len, 0))
    
# for x in X_test_nums:
#     X_test_eq_size.append(pad_example(x,max_len, 0))

def indecize(sent, tok2index):
    indeces = []
    for tok in sent:
        if tok in tok2index:
            indeces.append(tok2index[tok])
        else:
            tok2index[tok] = len(tok2index)
            indeces.append(tok2index[tok])
    return indeces
    
def indecize_trainX(train_toked, tok2index):
    train_indeces = []
    for tup in train_toked:
        train_row = []
        for t in tup:
            train_row.append(indecize(t, tok2index))
        train_indeces.append(train_row)
    return train_indeces
            
    
    
    
tok2index = {'<PAD>': 0}
index2tok = {0: '<PAD>'}
class2index = {} # stores the class index pairs.
index2class = {}
# train_toked = [t for t in zip(*train_toked)]
trainX_nums = indecize_trainX(trainX, tok2index)

In [23]:
print(len(trainX_nums))

392702


In [24]:
# print(t[0][1])
# trainX_nums = [x for x in zip(*trainX_nums)]
print(len(trainX_nums))
print(len(tok2index))
trainX_eq_size = []
# Write code to append data to code here
for x in trainX_nums:
#     print(len(x[0]), len(x[1]))
    new_x = pad_example(x,max_len, 0)
#     print(len(new_x[0]), len(new_x[1]))
    trainX_eq_size.append(pad_example(x,max_len, 0))
    


392702
101901


In [25]:
# for a, b in trainX_eq_size:
#     if len(a) != max_len or len(b) != max_len:
#         print('ding!', len(a), len(b))

392702


In [29]:

class LSTM(nn.Module):
    def __init__(self, nb_layers, word2index, class2index, nb_lstm_units=100,
                 embedding_dim=3, batch_size=3, bidirectional=False):
        super(LSTM, self).__init__()
        self.vocab = word2index
        self.tags = class2index

        self.nb_layers = nb_layers
        self.nb_lstm_units = nb_lstm_units
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size

        self.nb_tags = len(self.tags)
        self.bidirectional = bidirectional
        self.num_directions = 2 if self.bidirectional else 1
        print('num_directions:', self.num_directions)
        # build actual NN
        self.__build_model()

    def __build_model(self):
        nb_vocab_words = len(self.vocab)

        padding_idx = self.vocab['<PAD>']
        self.word_embedding = nn.Embedding(
            num_embeddings=nb_vocab_words,
            embedding_dim=self.embedding_dim,
            padding_idx=padding_idx
        )

        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.nb_lstm_units,
            num_layers=self.nb_layers,
            batch_first=True
        )
        
        if self.bidirectional:
            
            self.lstm_back = nn.LSTM(
                input_size=self.embedding_dim,
                hidden_size=self.nb_lstm_units,
                num_layers=self.nb_layers,
                batch_first=True
            )

        self.hidden_to_tag = nn.Linear(self.nb_lstm_units*self.nb_layers*self.num_directions*2, self.nb_tags)
        
        self.logsoftmax = nn.LogSoftmax(dim=1)
        self.softmax = nn.Softmax(dim=1)
        self.inference = False

    def init_hidden(self, X, bidirectional=False):
#         if bidirectional:
#             h0 = torch.zeros(self.nb_layers, X.size(0), self.nb_lstm_units*self.num_directions).float()
#             c0 = torch.zeros(self.nb_layers, X.size(0), self.nb_lstm_units*self.num_directions).float()
#         else:
            # Initial ht (hidden state) and ct (context)
        h0 = torch.zeros(self.nb_layers, X.size(0), self.nb_lstm_units).float()
        c0 = torch.zeros(self.nb_layers, X.size(0), self.nb_lstm_units).float()
        return (h0,c0)
    
    def embed_sent(self, X, X_lengths):
        self.hidden = self.init_hidden(X)
        self.hidden_back = self.init_hidden(X)
        
        batch_size, seq_len = X.shape
        
        # ---------------------
        # 1. embed the input
        # Dim transformation: (batch_size, seq_len) -> (batch_size, seq_len, embedding_dim)
#         print(X[:128,:])
#         print(X[:128,:].size())
        
        X = self.word_embedding(X)
        if self.bidirectional:
            X = torch.cat((X,torch.flip(X,[1])), 2)
            

        # ---------------------
        # 2. Run through RNN
        # TRICK 2 ********************************
        # Dim transformation: (batch_size, seq_len, embedding_dim) -> (batch_size, seq_len, nb_lstm_units)

    
        # pack_padded_sequence so that padded items in the sequence won't be shown to the LSTM
        if self.bidirectional:
            X_for = torch.nn.utils.rnn.pack_padded_sequence(X[:,:,:self.embedding_dim], X_lengths, batch_first=True)
            X_back = torch.nn.utils.rnn.pack_padded_sequence(X[:,:,self.embedding_dim:], X_lengths, batch_first=True)
        else:
            X = torch.nn.utils.rnn.pack_padded_sequence(X, X_lengths, batch_first=True)
        
        
        # now run through LSTM
        # X contains the padded sequence output and ht contains the final hidden states
        if self.bidirectional:
            X_for, (ht_for, ct_for) = self.lstm(X_for, self.hidden)
            X_back, (ht_back, ct_back) = self.lstm(X_back, self.hidden_back)
#             X_back, (ht_back, ct_back) = self.lstm_back(X_back, self.hidden) 
            ht = torch.cat((ht_for, ht_back), 2)
        else:
            X, (ht, ct) = self.lstm(X, self.hidden)
        
#         print('hidden state shape:', ht.size())
        
        # Reshape to use the final state from each lstm layer
        out = ht.view(ht.size(1), self.nb_lstm_units*self.nb_layers*self.num_directions)
        return out
        
    def forward(self, X, X_lengths):
        # reset the LSTM hidden state. Must be done before you run a new batch.
        # Otherwise the LSTM will treat
        # a new batch as a continuation of a sequence
        column0 = embed_sent(X[0], X_lengths[0])
        column1 = embed_sent(X[1], X_lengths[1])
        tot_col = np.concatenate((column0, column1), axis=1)

        # pass final states to output layer
        
        out = self.hidden_to_tag(tot_col)
        
        # Use logsoftmax for training and softmax for testing
        if not self.inference:
            Y_hat = self.logsoftmax(out)
        else:
            Y_hat = self.softmax(out)

        return Y_hat

In [27]:
class2index = {}
print(len(trainX_eq_size))
for class_type in trainY:
    if class_type not in class2index:
        class2index[class_type] = len(class2index)
train_Y = convert_to_index_map(trainY, class2index, map_element=False)
print(train_Y[:100])
print(class2index)
train_Y = np.array(train_Y)
print(train_Y[0])


392702
[0, 1, 1, 1, 0, 1, 0, 1, 2, 2, 1, 2, 0, 0, 2, 1, 1, 2, 2, 0, 1, 1, 2, 1, 1, 0, 2, 0, 2, 1, 1, 1, 2, 0, 2, 1, 1, 2, 2, 2, 1, 0, 1, 0, 1, 1, 2, 2, 0, 1, 1, 1, 2, 0, 0, 0, 2, 0, 2, 0, 0, 1, 2, 1, 0, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 1, 2, 0, 1, 0, 2, 2, 2, 1, 0, 1, 0, 1, 2, 0, 0, 2, 2, 1, 1, 2, 0, 1, 2, 1]
{'neutral': 0, 'entailment': 1, 'contradiction': 2}
0


In [28]:
train_X = np.array(trainX_eq_size)
print(train_X.shape)
print(train_X[0][0])
for ex in trainX_eq_size[:1000]:
    for i, sent in enumerate(ex):
        if len(sent) != 401:
            print(len(sent), i)
print(train_X.shape)

(392702, 2, 401)
[ 1  2  3  4  5  6  7  8  9 10 11 12  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0 

In [None]:
num_layers = 1
epochs = 15
batch_size = 16
lstm_unit_size = 64
embedding_size = 64
prints_per_epoch = 12



print_iter = len(train_Y)//batch_size//prints_per_epoch
bidirectional = False

m = LSTM(num_layers, tok2index, class2index, nb_lstm_units = lstm_unit_size,
         embedding_dim = embedding_size, batch_size = batch_size, bidirectional=bidirectional)

criterion = nn.NLLLoss() #size_average=False,weight=1/class_weights
optim = torch.optim.Adam(m.parameters(), lr=0.01)

indeces = np.arange(train_X.shape[0])     
print('y_train shape:', train_Y.shape)
print('Iterations per epoch:', train_Y.shape[0] // batch_size)
print('Number of prints per epoch:',print_iter)

# np_X_dev = torch.tensor(X_dev).long()
# np_X_dev_len = torch.tensor(np.array(X_dev_len)[np.argsort(np.array(X_dev_len))[::-1]] ).long()

for epoch in range(epochs):
    np.random.shuffle(indeces)
    train_x = X_train[indeces]
    train_y = train_Y[indeces]
    lens_x = trainX_lens[indeces]

    np_x_sorted_lens = np.array(lens_x)[np.argsort(np.array(lens_x))[::-1]]
    current_batch = 0
    for iteration in range(y_train2.shape[0] // batch_size):
        
        batch_lengths = lens_x[current_batch: current_batch + batch_size]
        lengths = np.array(batch_lengths)
        idx = np.argsort(lengths)[::-1]
        batch_lengths = batch_lengths[idx]
        batch_lengths = torch.tensor(batch_lengths).long()
        
        
        batch_x = train_x[current_batch: current_batch + batch_size]
        batch_x = batch_x[idx]
        batch_x = torch.tensor(batch_x).long()
        
        batch_y = y_train2[current_batch: current_batch + batch_size]
        batch_y = batch_y[idx]
        batch_y = torch.tensor(batch_y).long()
        
        current_batch += batch_size
                        
        optim.zero_grad()
        if len(batch_x) > 0:
            batch_pred = m(batch_x, batch_lengths)
            
            loss = criterion(batch_pred, batch_y)
            loss.backward()
            optim.step()

        if iteration % print_iter == 0:
            with torch.no_grad():
                m.train(False)
                m.inference = True
                train_batch_pred = np.array(m(batch_x, batch_lengths)).argmax(axis=1)
                train_mic_f1 = f1_score(batch_y, train_batch_pred, average='micro')
                train_mac_f1 = f1_score(batch_y, train_batch_pred, average='macro')
          
                batch_pred = np.array(m(np_X_dev, np_X_dev_len)).argmax(axis=1)
                
                dev_batch_y = y_dev
                dev_mic_f1 = f1_score(dev_batch_y, batch_pred, average='micro')
                dev_mac_f1 = f1_score(dev_batch_y, batch_pred, average='macro')
#                 precision = precision_score(batch_y, batch_pred, average='micro')
#                 recall = recall_score(batch_y, batch_pred, average='micro')
                print(f'\ttraining loss {loss.item():.3f}\titeraton: { iteration}\tepoch {epoch} ')
                print('\tTrain:')      
                print(f'\t\tmicro f1 { train_mic_f1:.3f} macro f1 {train_mac_f1:.3f}')
                print('\tDev:')      
                print(f'\t\tmicro f1 { dev_mic_f1:.3f} macro f1 {dev_mac_f1:.3f}\n')
#                 print('\tPrediction counts:')
#                 uniques = np.unique(train_batch_pred, axis=0, return_counts=True)
#                 print('\tpred indeces')
#                 print(uniques[0])
#                 print(uniques[0][0])
#                 print(len(uniques[0]))
#                 print('\t\t',end='')
#                 [print(str(uniques[0][i])+'  ', end='') for i in range(len(uniques[0]))]
#                 print()
#                 print('\tpred counts')
#                 print('\t\t',end='')
#                 [print(str(uniques[1][i])+'  ', end='') for i in range(len(uniques[0]))]
#                 print()
#                 print('\ttrue indeces')
#                 print('\t\t',end='')
#                 true_unqs = np.unique(batch_y, axis=0, return_counts=True)
#                 [print(str(true_unqs[0][i])+'  ', end='') for i in range(len(true_unqs[0]))]
#                 print()
#                 print('\ttrue counts')
#                 print('\t\t',end='')
#                 [print(str(true_unqs[1][i])+'  ', end='') for i in range(len(true_unqs[0]))]
#                 print()
#                 
                m.train(True)
                m.inference = False
        
        
    
    with torch.no_grad():
        m.train(False)
        m.inference = False
        raw_pred = m(torch.tensor(np.array(x_train)).long(),torch.tensor(np_x_sorted_lens).long() )
        train_pred = np.array(raw_pred.long()).argmax(axis=1)
        train_mic_f1 = f1_score(y_train, train_pred, average='micro')
        train_mac_f1 = f1_score(y_train, train_pred, average='macro')
        loss = criterion(raw_pred, torch.tensor(y_train2).long())

        
        dev_pred = np.array(m(np_X_dev, np_X_dev_len)).argmax(axis=1)

        dev_batch_y = y_dev
        dev_mic_f1 = f1_score(y_dev, dev_pred, average='micro')
        dev_mac_f1 = f1_score(y_dev, dev_pred, average='macro')
#                 precision = precision_score(batch_y, batch_pred, average='micro')
#                 recall = recall_score(batch_y, batch_pred, average='micro')
        print('EPOCH SUMMARY:')
        print(f'training loss {loss.item():.3f}\titeraton: { iteration}\tepoch {epoch} ')
        print('Train:')      
        print(f'\tmicro f1 { train_mic_f1:.3f} macro f1 {train_mac_f1:.3f}')
        print('Dev:')      
        print(f'\tmicro f1 { dev_mic_f1:.3f} macro f1 {dev_mac_f1:.3f}\n')
#                 print('\tPrediction counts:')
#                 uniques = np.unique(train_batch_pred, axis=0, return_counts=True)
#                 print('\tpred indeces')
#                 print(uniques[0])
#                 print(uniques[0][0])
#                 print(len(uniques[0]))
#                 print('\t\t',end='')
#                 [print(str(uniques[0][i])+'  ', end='') for i in range(len(uniques[0]))]
#                 print()
#                 print('\tpred counts')
#                 print('\t\t',end='')
#                 [print(str(uniques[1][i])+'  ', end='') for i in range(len(uniques[0]))]
#                 print()
#                 print('\ttrue indeces')
#                 print('\t\t',end='')
#                 true_unqs = np.unique(batch_y, axis=0, return_counts=True)
#                 [print(str(true_unqs[0][i])+'  ', end='') for i in range(len(true_unqs[0]))]
#                 print()
#                 print('\ttrue counts')
#                 print('\t\t',end='')
#                 [print(str(true_unqs[1][i])+'  ', end='') for i in range(len(true_unqs[0]))]
#                 print()
#                 
        m.train(True)
        m.inference = False

1. Describe your solution.

**ANSWER HERE**

## EXTRA CREDIT 1 (10 points)

Modify the LSTM model to train a language model, then write code to generate new text from the model. Do not forget to mask the loss function when training the language model to handle the different lengths of the sequences. Use the "en-ud-train.upos.tsv" dataset.

Generate 10 examples from your model.

In [None]:
# COPY AND EDIT CODE HERE