In [106]:
import torch
import torch.nn as nn
from torchtext import data
import torch.optim as optim

import pandas as pd
import numpy as np
import sys
from scipy.stats import pearsonr
from collections import Counter

#Reproducing same results
seed = 2020

#Set the seed to be fixed
torch.manual_seed(seed)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 

In [107]:
# How to make a regression training set
#Fields for each set of data
train_score = data.Field(dtype=torch.float, use_vocab=False, batch_first=True, 
                         is_target=True, sequential=False)
train_sentAB = data.Field(tokenize='spacy', include_lengths=True, batch_first=True)

test_score = data.Field(dtype=torch.float, use_vocab=False, batch_first=True, 
                         is_target=True, sequential=False)
test_sentAB = data.Field(tokenize='spacy', include_lengths=True, batch_first=True)

train_fields = [(None, None), (None, None), (None, None), ('train_score', train_score),
                (None, None), (None, None), ('train_sentAB', train_sentAB)]
test_fields = [(None, None), (None, None), (None, None), ('test_score', test_score),
               (None, None), (None, None), ('test_sentAB', test_sentAB)]

#Complete datasets of train and test
train_dataset = data.TabularDataset(path='../DATA/SICK/SICK train val.csv', format='CSV', 
                                    fields=train_fields, skip_header=True)
test_dataset = data.TabularDataset(path='../DATA/SICK/SICK test.csv', format='CSV', 
                                    fields=test_fields, skip_header=True)


In [108]:
TRAIN_BATCH_SIZE = 8
TEST_BATCH_SIZE = 13

train_sentAB.build_vocab(train_dataset, min_freq=1, vectors_cache="Vectors/",
                         vectors="glove.6B.300d")

test_sentAB.build_vocab(test_dataset, min_freq=1, vectors_cache="Vectors/",
                        vectors="glove.6B.300d")

train_iterator = data.BucketIterator(train_dataset, TRAIN_BATCH_SIZE,
                                     device=device,
                                     shuffle=True)

test_iterator = data.BucketIterator(test_dataset, TEST_BATCH_SIZE,
                                     device=device,
                                     shuffle=False)

In [109]:
#Example of word is matched to index from embeddings
for word in train_dataset.examples[2].train_sentAB[:10]:
    print((word, train_sentAB.vocab.stoi[word]))

('The', 14)
('young', 45)
('boys', 109)
('are', 12)
('playing', 18)
('outdoors', 174)
('and', 11)
('the', 8)
('man', 9)
('is', 2)


In [110]:
print(len(train_sentAB.vocab))

print(len(test_sentAB.vocab))

2298
2273


In [111]:
class RNNModel(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_dim,
                 output_dim, num_layers, bidirectional,dropout_rate):
        super().__init__()
        
        #Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        #More regularization
#         self.dropout1 = nn.Dropout(p=dropout_rate)
        
        #LSTM layer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,
                            num_layers=num_layers,
                            bidirectional=bidirectional,
#                             dropout=dropout_rate,
                            batch_first=True)
        
        #More regularization
        self.dropout2 = nn.Dropout(p=dropout_rate)
        
        #Full connected layer
        self.fc = nn.Linear(2 * hidden_dim, output_dim)
        
        #Activation function
        self.act_fn = nn.ReLU()
        
    def forward(self, text, text_lengths):
        
        #text = [batch size, max sentence length in batch]
        embedded = self.embedding(text)
        
        #Regularize!
#         reg_embedded = self.dropout1(embedded)
        
        #pack the batch sentences to max length
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, 
                                                            batch_first=True, enforce_sorted=False)
        
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        
        inputs = self.dropout2(hidden)
        
        #Direct values are given to CE loss for loss calculation
        #LogSoftmax is used for inference
        outputs = self.fc(inputs)
        
        outputs = self.act_fn(outputs)
    
        return outputs

In [112]:
vocab_size = len(train_sentAB.vocab)
embedding_dim = 300
num_hidden_nodes = 32
num_output_nodes = 1
num_layers = 1
bidirectional = True
dropout_rate = 0.4

#Create the model
model = RNNModel(vocab_size, embedding_dim, num_hidden_nodes,
                 num_output_nodes, num_layers, bidirectional,dropout_rate)
model.to(device)

model

RNNModel(
  (embedding): Embedding(2298, 300)
  (lstm): LSTM(300, 32, batch_first=True, bidirectional=True)
  (dropout2): Dropout(p=0.4, inplace=False)
  (fc): Linear(in_features=64, out_features=1, bias=True)
  (act_fn): ReLU()
)

In [113]:
#Set pretrained word embedding as weights for embedding layer
embeddings = train_sentAB.vocab.vectors
model.embedding.weight.data.copy_(embeddings)

print("Size of embedding matrix {}".format(embeddings.size()))

num_elements = 0
for params in model.parameters():
    if params.requires_grad:
        num_elements += params.numel()
    
print("Number of trainable elements in the model {}".format(num_elements))

Size of embedding matrix torch.Size([2298, 300])
Number of trainable elements in the model 774969


In [114]:
#Optimizer and Loss
# optimizer = optim.Adam(model.parameters(), lr=1e-2)
optimizer = optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)
criterion = nn.MSELoss()

# scheduler = optim.lr_scheduler.CyclicLR(optimizer, base_lr=1e-3,
#                                         max_lr=1e-2, step_size_up=1250, mode='triangular2')
# scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=6, gamma=0.1)
scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[3])

#Softmax layer only used to get probabilties
softmax = nn.LogSoftmax(dim=1)

In [115]:
#Train the model

def train(dataset, iterator, optimizer, scheduler, criterion):
    iterations = len(dataset) / TRAIN_BATCH_SIZE

    model.train()

    running_loss = 0
    running_correct = 0
    phase = "train"
    predictions = []
    targets = []
    
    for batch_idx, items in enumerate(iterator):

        sys.stdout.write('\r')
        sys.stdout.write("{} Iteration :{}/{}"
                            .format(phase, batch_idx + 1, iterations))

        optimizer.zero_grad()

        #Get the text and length of sentences
        text, text_lengths = items.train_sentAB
        text = text.to(device)
        text_lengths = text_lengths.to(device)

        #Get scores of each batch
        scores = items.train_score
        scores = scores.to(device)

        #Predictions are in size [1, ..]
        outputs = model(text, text_lengths).squeeze()

        #Calculate loss
        loss = criterion(outputs, scores)

        #Get the prediction outputs and targets into a list for eval function
        predictions += list(outputs.detach().cpu().numpy())
        targets += list(scores.cpu().numpy())

        loss.backward()   
        optimizer.step()
        
        #Learning rate scheduler to improve convergence
#         scheduler.step() 

        #Save the loss details for each epoch
        running_loss += loss.item()

    epoch_loss = running_loss / len(iterator)
    
    #Numpy arrays are more memory efficient
    return epoch_loss, np.array(predictions), np.array(targets)

In [116]:
#Evaluate the model on the training and test set
def evaluate(dataset, iterator, optimizer, criterion):
    iterations = len(test_dataset) / TEST_BATCH_SIZE

    model.eval()

    running_loss = 0
    running_correct = 0
    phase = "test"
    predictions = []
    targets = []
    
    with torch.no_grad():
        for batch_idx, items in enumerate(iterator):

            sys.stdout.write('\r')
            sys.stdout.write("{} Iteration :{}/{}"
                                .format(phase, batch_idx + 1, iterations))

            #Get the text and length of sentences
            text, text_lengths = items.test_sentAB
            text = text.to(device)
            text_lengths = text_lengths.to(device)

            #Get labels of each batch
            scores = items.test_score
            scores = scores.to(device)

            #Predictions are in size [1, ..]
            outputs = model(text, scores).squeeze()

            #Calculate loss
            loss = criterion(outputs, scores)
            
            #Get the prediction outputs and targets into a list for eval function
            predictions += list(outputs.detach().cpu().numpy())
            targets += list(scores.cpu().numpy())
        
            #Save the loss details for each epoch
            running_loss += loss.item()


        epoch_loss = running_loss / len(test_iterator)

    
    return epoch_loss, np.array(predictions), np.array(targets)

In [117]:
num_epochs = 8

for epoch in range(num_epochs):
    
    print("Epoch :{}/{}".format(epoch + 1, num_epochs))
    
    print(optimizer.param_groups[0]["lr"])
    
    train_loss, train_pred, train_targets = train(train_dataset, train_iterator, 
                                                  optimizer, scheduler, criterion) 
    print()
    
    scheduler.step()
    
    test_loss, test_pred, test_targets = evaluate(test_dataset, test_iterator, optimizer, criterion) 
    print()
    
    
    print("Train Loss :{:.3f} | PC (Train) :{:.3f} | "\
          "Eval Loss :{:.3f} | PC (Test) :{:.3f}".format(train_loss, 
                                                         pearsonr(train_pred, train_targets)[0], 
                                                         test_loss, 
                                                         pearsonr(test_pred, test_targets)[0]))

Epoch :1/8
0.01
train Iteration :625/625.0
test Iteration :379/379.0
Train Loss :1.198 | PC (Train) :0.011 | Eval Loss :1.027 | PC (Test) :0.022
Epoch :2/8
0.01
train Iteration :625/625.0
test Iteration :379/379.0
Train Loss :1.047 | PC (Train) :0.112 | Eval Loss :1.038 | PC (Test) :0.036
Epoch :3/8
0.01
train Iteration :625/625.0
test Iteration :379/379.0
Train Loss :1.014 | PC (Train) :0.191 | Eval Loss :0.993 | PC (Test) :0.196
Epoch :4/8
0.001
train Iteration :625/625.0
test Iteration :379/379.0
Train Loss :0.954 | PC (Train) :0.299 | Eval Loss :1.005 | PC (Test) :0.123
Epoch :5/8
0.001
train Iteration :625/625.0
test Iteration :379/379.0
Train Loss :0.946 | PC (Train) :0.312 | Eval Loss :1.004 | PC (Test) :0.129
Epoch :6/8
0.001
train Iteration :625/625.0
test Iteration :379/379.0
Train Loss :0.938 | PC (Train) :0.323 | Eval Loss :1.010 | PC (Test) :0.113
Epoch :7/8
0.001
train Iteration :625/625.0
test Iteration :379/379.0
Train Loss :0.928 | PC (Train) :0.338 | Eval Loss :1.014 

In [105]:
#Save the model. Comment immediately after saving
torch.save({ "optimizer" : optimizer.state_dict(),
             "scheduler" : scheduler.state_dict(),
             "model":model.state_dict(),
             "epoch" : 8}, '../Models/bilstm_task2_fn.pth')