In [31]:
import torch
import torch.nn as nn
from torchtext import data
import torch.optim as optim

import pandas as pd
import numpy as np
import sys
from collections import Counter

#Reproducing same results
seed = 2020

#Set the seed to be fixed
torch.manual_seed(seed)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 

In [32]:
train_df = pd.read_csv('../DATA/SICK/SICK train val.csv')
test_df = pd.read_csv('../DATA/SICK/SICK test.csv')

#To show that there is imabalance in both datasets
print(Counter(train_df['entailment_judgment']))
print(Counter(test_df['entailment_judgment']))

Counter({'NEUTRAL': 2818, 'ENTAILMENT': 1443, 'CONTRADICTION': 739})
Counter({'NEUTRAL': 2793, 'ENTAILMENT': 1414, 'CONTRADICTION': 720})


In [33]:
train_class_count = torch.tensor([v for k,v in Counter(train_df['entailment_judgment']).items()])
test_class_count = torch.tensor([v for k,v in Counter(test_df['entailment_judgment']).items()])

class_loss_weights = {
    'train' : train_class_count.float()/torch.max(train_class_count),
    'test' : test_class_count.float()/torch.max(test_class_count)
}

In [34]:
class_loss_weights

{'train': tensor([1.0000, 0.5121, 0.2622]),
 'test': tensor([1.0000, 0.5063, 0.2578])}

In [35]:
train_label = data.LabelField(dtype=torch.int64, batch_first=True)
train_sentAB = data.Field(tokenize='spacy', include_lengths=True, batch_first=True)

test_label = data.LabelField(dtype=torch.int64, batch_first=True)
test_sentAB = data.Field(tokenize='spacy', include_lengths=True, batch_first=True)

train_fields = [(None, None), (None, None), (None, None), (None, None), (None, None),
                ('train_label', train_label), ('train_sentAB', train_sentAB)]
test_fields = [(None, None), (None, None), (None, None), (None, None), (None, None),
               ('test_label', test_label), ('test_sentAB', test_sentAB)]

train_dataset = data.TabularDataset(path='../DATA/SICK/SICK train val.csv', format='CSV', 
                                    fields=train_fields, skip_header=True)
test_dataset = data.TabularDataset(path='../DATA/SICK/SICK test.csv', format='CSV', 
                                    fields=test_fields, skip_header=True)

In [36]:
print(vars(train_dataset.examples[0]))

{'train_label': '0', 'train_sentAB': ['A', 'group', 'of', 'kids', 'is', 'playing', 'in', 'a', 'yard', 'and', 'an', 'old', 'man', 'is', 'standing', 'in', 'the', 'background', '<', 'sep', '>', 'A', 'group', 'of', 'boys', 'in', 'a', 'yard', 'is', 'playing', 'and', 'a', 'man', 'is', 'standing', 'in', 'the', 'background']}


In [37]:
print("Size of training set {}".format(len(train_dataset)))
#Example of training data
print(vars(train_dataset.examples[0]))

Size of training set 5000
{'train_label': '0', 'train_sentAB': ['A', 'group', 'of', 'kids', 'is', 'playing', 'in', 'a', 'yard', 'and', 'an', 'old', 'man', 'is', 'standing', 'in', 'the', 'background', '<', 'sep', '>', 'A', 'group', 'of', 'boys', 'in', 'a', 'yard', 'is', 'playing', 'and', 'a', 'man', 'is', 'standing', 'in', 'the', 'background']}


In [38]:
print("Size of test set {}".format(len(test_dataset)))
#Example of training data
print(vars(test_dataset.examples[0]))

Size of test set 4927
{'test_label': '0', 'test_sentAB': ['There', 'is', 'no', 'boy', 'playing', 'outdoors', 'and', 'there', 'is', 'no', 'man', 'smiling', '<', 'sep', '>', 'A', 'group', 'of', 'kids', 'is', 'playing', 'in', 'a', 'yard', 'and', 'an', 'old', 'man', 'is', 'standing', 'in', 'the', 'background']}


In [39]:
train_sentAB.build_vocab(train_dataset, min_freq=1, vectors_cache="Vectors/",
                         vectors="glove.6B.300d")

test_sentAB.build_vocab(test_dataset, min_freq=1, vectors_cache="Vectors/",
                        vectors="glove.6B.300d")

train_label.build_vocab(train_dataset)
test_label.build_vocab(test_dataset)
# score.build_vocab(training_data)

In [40]:
#Example of word is matched to index from embeddings
for word in train_dataset.examples[1].train_sentAB[:10]:
    print((word, train_sentAB.vocab.stoi[word]))

('A', 4)
('group', 66)
('of', 16)
('children', 83)
('is', 2)
('playing', 18)
('in', 10)
('the', 8)
('house', 525)
('and', 11)


In [41]:
print(len(train_sentAB.vocab))

print(len(test_sentAB.vocab))

2298
2273


In [42]:
print(train_label.vocab.stoi)
print(test_label.vocab.stoi)

defaultdict(None, {'0': 0, '1': 1, '2': 2})
defaultdict(None, {'0': 0, '1': 1, '2': 2})


In [43]:
TRAIN_BATCH_SIZE = 8
TEST_BATCH_SIZE = 13

train_iterator = data.BucketIterator(train_dataset, TRAIN_BATCH_SIZE,
                                     sort_key=lambda x : x.train_sentAB,
                                     device=device,
                                     shuffle=True)

test_iterator = data.BucketIterator(test_dataset, TEST_BATCH_SIZE,
                                     sort_key=lambda x : x.test_sentAB,
                                     device=device,
                                     shuffle=False)

In [44]:
class RNNModel(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_dim,
                 output_dim, num_layers, bidirectional,dropout_rate):
        super().__init__()
        
        #Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        #More regularization
#         self.dropout1 = nn.Dropout(p=dropout_rate)
        
        #LSTM layer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,
                            num_layers=num_layers,
                            bidirectional=bidirectional,
#                             dropout=dropout_rate,
                            batch_first=True)
        
        #More regularization
        self.dropout2 = nn.Dropout(p=dropout_rate)
        
        #Full connected layer
        self.fc = nn.Linear(2 * hidden_dim, output_dim)
        
    def forward(self, text, text_lengths):
        
        #text = [batch size, max sentence length in batch]
        embedded = self.embedding(text)
        
        #Regularize!
#         reg_embedded = self.dropout1(embedded)
        
        #pack the batch sentences to max length
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, 
                                                            batch_first=True, enforce_sorted=False)
        
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        
        inputs = self.dropout2(hidden)
        
        #Direct values are given to CE loss for loss calculation
        #LogSoftmax is used for inference
        outputs = self.fc(inputs)
    
        return outputs

In [45]:
vocab_size = len(train_sentAB.vocab)
embedding_dim = 300
num_hidden_nodes = 32
num_output_nodes = 3
num_layers = 1
bidirectional = True
dropout_rate = 0.4

#Create the model
model = RNNModel(vocab_size, embedding_dim, num_hidden_nodes,
                 num_output_nodes, num_layers, bidirectional,dropout_rate)
model.to(device)

model

RNNModel(
  (embedding): Embedding(2298, 300)
  (lstm): LSTM(300, 32, batch_first=True, bidirectional=True)
  (dropout2): Dropout(p=0.4, inplace=False)
  (fc): Linear(in_features=64, out_features=3, bias=True)
)

In [16]:
#Trying transfer learning here
# for name, params in model.named_parameters():
#     if 'embedding' in name:
#         params.requires_grad = False

In [46]:
#Set pretrained word embedding as weights for embedding layer
embeddings = train_sentAB.vocab.vectors
model.embedding.weight.data.copy_(embeddings)

print("Size of embedding matrix {}".format(embeddings.size()))

num_elements = 0
for params in model.parameters():
    if params.requires_grad:
        num_elements += params.numel()
    
print("Number of trainable elements in the model {}".format(num_elements))

Size of embedding matrix torch.Size([2298, 300])
Number of trainable elements in the model 775099


In [23]:
#Optimizer and Loss
# optimizer = optim.Adam(model.parameters(), lr=1e-2)
optimizer = optim.SGD(model.parameters(), lr=1e-4, momentum=0.9)
criterion = nn.CrossEntropyLoss()

# scheduler = optim.lr_scheduler.CyclicLR(optimizer, base_lr=1e-3,
#                                         max_lr=1e-2, step_size_up=1250, mode='triangular2')
# scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=6, gamma=0.1)
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[8])

#Softmax layer only used to get probabilties
softmax = nn.LogSoftmax(dim=1)


In [19]:
#Training loop

def train():
    iterations = len(train_dataset) / TRAIN_BATCH_SIZE

    model.train()

    running_loss = 0
    running_correct = 0
    phase = "train"
#     criterion = nn.CrossEntropyLoss(weight=class_loss_weights[phase])
    
    for batch_idx, items in enumerate(train_iterator):

        sys.stdout.write('\r')
        sys.stdout.write("{} Iteration :{}/{}"
                            .format(phase, batch_idx + 1, iterations))

        optimizer.zero_grad()

        #Get the text and length of sentences
        text, text_lengths = items.train_sentAB
        text = text.to(device)
        text_lengths = text_lengths.to(device)

        #Get labels of each batch
        labels = items.train_label
        labels = labels.to(device)

        #Predictions are in size [1, ..]
        outputs = model(text, text_lengths).squeeze()

        #Calculate loss
        loss = criterion(outputs, items.train_label)
        
        #Convert predictions to probabilities
        probabilites = softmax(outputs)
        #Give's the index of the node with the highest probability
        predictions = torch.argmax(probabilites, dim=1)
        correct = (predictions == labels).float()

        loss.backward()   
        optimizer.step()
        
        #Learning rate scheduler to improve convergence
#         scheduler.step()

        #Save the loss details for each epoch
        running_loss += loss.item()
        running_correct += correct.sum().item()

    epoch_loss = running_loss / len(train_iterator)
    epoch_accuracy = running_correct / len(train_dataset)
    
    return epoch_loss, epoch_accuracy

In [20]:
#Evaluate the results

def evaluate():
    iterations = len(test_dataset) / TEST_BATCH_SIZE

    model.eval()

    running_loss = 0
    running_correct = 0
    phase = "test"
#     criterion = nn.CrossEntropyLoss(weight=class_loss_weights[phase])
    
    with torch.no_grad():
        for batch_idx, items in enumerate(test_iterator):

            sys.stdout.write('\r')
            sys.stdout.write("{} Iteration :{}/{}"
                                .format(phase, batch_idx + 1, iterations))


            #Get the text and length of sentences
            text, text_lengths = items.test_sentAB
            text = text.to(device)
            text_lengths = text_lengths.to(device)

            #Get labels of each batch
            labels = items.test_label
            labels = labels.to(device)

            #Predictions are in size [1, ..]
            outputs = model(text, text_lengths).squeeze()

            #Calculate loss
            loss = criterion(outputs, items.test_label)

            #Convert predictions to probabilities
            probabilites = softmax(outputs)
            #Give's the index of the node with the highest probability
            predictions = torch.argmax(probabilites, dim=1)
            correct = (predictions == labels).float()

            #Save the loss details for each epoch
            running_loss += loss.item()
            running_correct += correct.sum().item()

        epoch_loss = running_loss / len(test_iterator)
        epoch_accuracy = running_correct / len(test_dataset)
    
    return epoch_loss, epoch_accuracy

In [34]:

num_epochs = 6

for epoch in range(num_epochs):
    
    print("Epoch :{}/{}".format(epoch + 1, num_epochs))
    
    print(optimizer.param_groups[0]["lr"])
    
    train_loss, train_acc = train() 
    print()
    
#     scheduler.step()
    
    test_loss, test_acc = evaluate() 
    print()
    
    print("Train Loss :{:.3f} | Train Accuracy :{:.3f} | "\
          "Eval Loss :{:.3f} | Eval Accuracy :{:.3f}".format(train_loss, train_acc, test_loss, test_acc))

Epoch :1/6
0.0001
train Iteration :625/625.0
test Iteration :379/379.0
Train Loss :0.914 | Train Accuracy :0.574 | Eval Loss :0.949 | Eval Accuracy :0.565
Epoch :2/6
0.0001
train Iteration :625/625.0
test Iteration :379/379.0
Train Loss :0.909 | Train Accuracy :0.577 | Eval Loss :0.949 | Eval Accuracy :0.565
Epoch :3/6
0.0001
train Iteration :625/625.0
test Iteration :379/379.0
Train Loss :0.912 | Train Accuracy :0.578 | Eval Loss :0.949 | Eval Accuracy :0.565
Epoch :4/6
0.0001
train Iteration :625/625.0
test Iteration :379/379.0
Train Loss :0.910 | Train Accuracy :0.580 | Eval Loss :0.949 | Eval Accuracy :0.565
Epoch :5/6
0.0001
train Iteration :625/625.0
test Iteration :379/379.0
Train Loss :0.912 | Train Accuracy :0.575 | Eval Loss :0.949 | Eval Accuracy :0.565
Epoch :6/6
0.0001
train Iteration :625/625.0
test Iteration :379/379.0
Train Loss :0.909 | Train Accuracy :0.578 | Eval Loss :0.949 | Eval Accuracy :0.565


In [35]:
#For saving the model
# torch.save({ "optimizer" : optimizer.state_dict(),
#              "scheduler" : scheduler.state_dict(),
#              "model":model.state_dict(),
#              "epoch" : 20}, '../Models/bilstm_task1_fn.pth')

In [47]:
checkpoint = torch.load('../Models/bilstm_task1_fn.pth')
model.load_state_dict(checkpoint["model"])

<All keys matched successfully>

In [25]:
def prediction():
    iterations = len(test_dataset) / TEST_BATCH_SIZE

    model.eval()

    _predictions = []
    
    with torch.no_grad():
        for batch_idx, items in enumerate(test_iterator):

            sys.stdout.write('\r')
            sys.stdout.write("{} Iteration :{}/{}"
                                .format("test", batch_idx + 1, iterations))


            #Get the text and length of sentences
            text, text_lengths = items.test_sentAB
            text = text.to(device)
            text_lengths = text_lengths.to(device)

            #Get labels of each batch
            labels = items.test_label
            labels = labels.to(device)

            #Predictions are in size [1, ..]
            outputs = model(text, text_lengths).squeeze()

            #Convert predictions to probabilities
            probabilites = softmax(outputs)
            #Give's the index of the node with the highest probability
            predictions = torch.argmax(probabilites, dim=1)

            #Save the predictions in a list
            _predictions += list(predictions.cpu().squeeze().numpy())
            
    return _predictions    

In [26]:
predictions = prediction()

test Iteration :379/379.0

In [29]:
test_df = pd.read_csv('../Data/SICK/SICK test.csv')
result_df = pd.DataFrame(columns=['pair_ID', 'entailment_judgment','label','prediction'])

result_df['pair_ID'] = test_df['pair_ID']
result_df['entailment_judgment'] = test_df['entailment_judgment']
result_df['label'] = test_df['label']
result_df['prediction'] = predictions

In [30]:
result_df.to_csv('../Data/SICK/result.csv', index=False)

In [55]:
activation = nn.LogSoftmax(dim=1)

model.eval()
with torch.no_grad():
    for idx, items in enumerate(test_iterator):
            #Get the text and length of sentences
            text, text_lengths = items.test_sentAB
            text = text.to(device)
            text_lengths = text_lengths.to(device)

            #Get labels of each batch
            labels = items.test_label
            labels = labels.to(device)
            
            #Predictions are in size [1, ..]
            outputs = model(text, text_lengths).squeeze()
            softmax_output = activation(outputs)
            pred = torch.argmax(softmax_output, dim=1)
            
            print((softmax_output, pred))
            
            if idx == 1:
                break


(tensor([[-0.5118, -1.4192, -1.8408],
        [-0.4908, -1.1667, -2.5711],
        [-0.5027, -1.2054, -2.3486],
        [-0.4768, -1.2158, -2.4920],
        [-0.4756, -1.2178, -2.4940],
        [-0.8929, -1.5737, -0.9590],
        [-0.5291, -1.1897, -2.2392],
        [-0.7620, -1.5137, -1.1610],
        [-0.7517, -1.5167, -1.1744],
        [-0.4264, -1.2431, -2.8364],
        [-0.5558, -1.3162, -1.8436],
        [-0.4689, -1.1897, -2.6596],
        [-0.5049, -1.3165, -2.0529]]), tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))
(tensor([[-0.6405, -1.4067, -1.4783],
        [-0.4954, -1.3200, -2.0912],
        [-0.4891, -1.3153, -2.1335],
        [-0.4621, -1.2235, -2.5793],
        [-0.5291, -1.2354, -2.1191],
        [-0.3858, -1.3515, -2.7933],
        [-0.5276, -1.3893, -1.8280],
        [-0.3846, -1.3449, -2.8357],
        [-0.6256, -1.3382, -1.5958],
        [-0.5667, -1.4725, -1.5934],
        [-0.4426, -1.3415, -2.3416],
        [-0.7428, -1.7278, -1.0598],
        [-0.8336, -1.7

In [None]:
# eval_loss, eval_acc = evaluate()
# eval_loss, eval_acc

In [4]:
# #Preprocessing the data
# df = pd.read_csv('../Data/SICK/SICK val.txt', sep="\t")

# label = []

# for val in df['entailment_judgment']:
#     if val == "CONTRADICTION":
#         label.append(2)
#     elif val == "ENTAILMENT":
#         label.append(1)
#     else:
#         label.append(0)

# df['label'] = label
# input_sent = []

# for idx, row in df.iterrows():
#     input_sent.append(row['sentence_A'] + ' <sep> ' + row['sentence_B'])

# df['sentAB'] = input_sent

# df.to_csv('../DATA/SICK/SICK val.csv', index=False)

In [5]:
# train_df = pd.read_csv('../Data/SICK/SICK train.csv')
# val_df = pd.read_csv('../Data/SICK/SICK val.csv')

# train_val_df = pd.concat([train_df, val_df], axis=0)
# train_val_df.reset_index(inplace=True)
# train_val_df.drop(columns='index', inplace=True)

# train_val_df.to_csv('../Data/SICK/SICK train val.csv', index=False)