In [1]:
import torch
import torch.nn as nn
from torchtext import data
import torch.optim as optim

import pandas as pd
import sys

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 

In [2]:
# sentA = data.Field(tokenize='spacy', include_lengths=True, batch_first=True)
# sentB = data.Field(tokenize='spacy', include_lengths=True, batch_first=True)
# score = data.LabelField(dtype=torch.float, batch_first=True)
train_label = data.LabelField(dtype=torch.int64, batch_first=True)
train_sentAB = data.Field(tokenize='spacy', include_lengths=True, batch_first=True)

test_label = data.LabelField(dtype=torch.int64, batch_first=True)
test_sentAB = data.Field(tokenize='spacy', include_lengths=True, batch_first=True)

In [3]:
train_fields = [(None, None), (None, None), (None, None), 
                (None, None), ('train_label', train_label), ('train_sentAB', train_sentAB)]
test_fields = [(None, None), (None, None), (None, None), 
               (None, None), ('test_label', test_label), ('test_sentAB', test_sentAB)]

train_dataset = data.TabularDataset(path='../DATA/SICK/SICK train.csv', format='CSV', 
                                    fields=train_fields, skip_header=True)
test_dataset = data.TabularDataset(path='../DATA/SICK/SICK val.csv', format='CSV', 
                                    fields=test_fields, skip_header=True)

In [4]:
print("Size of training set {}".format(len(train_dataset)))
#Example of training data
print(vars(train_dataset.examples[0]))

Size of training set 4500
{'train_label': '0', 'train_sentAB': ['A', 'group', 'of', 'kids', 'is', 'playing', 'in', 'a', 'yard', 'and', 'an', 'old', 'man', 'is', 'standing', 'in', 'the', 'background', '<', 'sep', '>', 'A', 'group', 'of', 'boys', 'in', 'a', 'yard', 'is', 'playing', 'and', 'a', 'man', 'is', 'standing', 'in', 'the', 'background']}


In [5]:
print("Size of test set {}".format(len(test_dataset)))
#Example of training data
print(vars(test_dataset.examples[0]))

Size of test set 500
{'test_label': '2', 'test_sentAB': ['The', 'young', 'boys', 'are', 'playing', 'outdoors', 'and', 'the', 'man', 'is', 'smiling', 'nearby', '<', 'sep', '>', 'There', 'is', 'no', 'boy', 'playing', 'outdoors', 'and', 'there', 'is', 'no', 'man', 'smiling']}


In [6]:
train_sentAB.build_vocab(train_dataset, min_freq=3, vectors_cache="Vectors/",
                         vectors="glove.6B.100d")

test_sentAB.build_vocab(test_dataset, min_freq=3, vectors_cache="Vectors/",
                        vectors="glove.6B.100d")

train_label.build_vocab(train_dataset)
test_label.build_vocab(test_dataset)
# score.build_vocab(training_data)

In [7]:
print(len(train_sentAB.vocab))

print(len(test_sentAB.vocab))
# sentAB.vocab.stoi   #Print dictionary of words and indices
# sentAB.vocab.vectors  #Print vectors assigned to each word based on index

1697
439


In [8]:
print(train_label.vocab.stoi)
print(test_label.vocab.stoi)

defaultdict(None, {'0': 0, '1': 1, '2': 2})
defaultdict(None, {'0': 0, '1': 1, '2': 2})


In [9]:
BATCH_SIZE = 4

train_iterator = data.BucketIterator(train_dataset, BATCH_SIZE,
                                     sort_key=lambda x : x.train_sentAB,
                                     device=device,
                                     shuffle=True)

test_iterator = data.BucketIterator(test_dataset, BATCH_SIZE,
                                     sort_key=lambda x : x.test_sentAB,
                                     device=device,
                                     shuffle=True)

In [20]:
class RNNModel(nn.Module):
    
    def __init__(self, vocab_size, embedding_dim, hidden_dim,
                 output_dim, num_layers, bidirectional,dropout_rate):
        super().__init__()
        
        #Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        #LSTM layer
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,
                            num_layers=num_layers,
                            bidirectional=bidirectional,
                            dropout=dropout_rate,
                            batch_first=True)
        
        #Full connected layer
        self.fc = nn.Linear(2 * hidden_dim, output_dim)
        
        #Activation function
#         self.act_fn = nn.Softmax(dim=1)
        
    def forward(self, text, text_lengths):
        
        #text = [batch size, max sentence length in batch]
        embedded = self.embedding(text)
        
        #pack the batch sentences to max length
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, 
                                                            batch_first=True, enforce_sorted=False)
        
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        
        #
        fc_outputs = self.fc(hidden)
        
        #Calculate softmax
        #Softmax is inbuilt in CE loss
#         outputs = self.act_fn(fc_outputs)
        outputs = fc_outputs
    
        return outputs

In [27]:
vocab_size = len(train_sentAB.vocab)
embedding_dim = 100
num_hidden_nodes = 32
num_output_nodes = 3
num_layers = 2
bidirectional = True
dropout_rate = 0.2

#Create the model
model = RNNModel(vocab_size, embedding_dim, num_hidden_nodes,
                 num_output_nodes, num_layers, bidirectional,dropout_rate)
model.to(device)

model

RNNModel(
  (embedding): Embedding(1697, 100)
  (lstm): LSTM(100, 32, num_layers=2, batch_first=True, dropout=0.2, bidirectional=True)
  (fc): Linear(in_features=64, out_features=3, bias=True)
)

In [28]:
num_elements = 0
for params in model.parameters():
    if params.requires_grad:
        num_elements += params.numel()
    
print("Number of trainable elements in the model {}".format(num_elements))

#Set pretrained word embedding as weights for embedding layer
embeddings = train_sentAB.vocab.vectors
model.embedding.weight.data.copy_(embeddings)

print("Size of embedding matrix {}".format(embeddings.size()))

Number of trainable elements in the model 229287
Size of embedding matrix torch.Size([1697, 100])


In [29]:
#Optimizer and Loss
optimizer = optim.Adam(model.parameters(), lr=1e-2)
criterion = nn.CrossEntropyLoss()
#Softmax layer only used to get probabilties
softmax = nn.LogSoftmax(dim=1)
softmax.requires_grad = False

criterion.to(device)

CrossEntropyLoss()

In [30]:
#Training loop

def train():
    iterations = len(train_dataset) / BATCH_SIZE

    model.train()

    running_loss = 0
    running_correct = 0

    for batch_idx, items in enumerate(train_iterator):

        sys.stdout.write('\r')
        sys.stdout.write("{} Iteration :{}/{}"
                            .format("train", batch_idx + 1, iterations))

        optimizer.zero_grad()

        #Get the text and length of sentences
        text, text_lengths = items.train_sentAB
        text = text.to(device)
        text_lengths = text_lengths.to(device)

        #Get labels of each batch
        labels = items.train_label
        labels = labels.to(device)

        #Predictions are in size [1, ..]
        outputs = model(text, text_lengths).squeeze()

        #Calculate loss
        loss = criterion(outputs, items.train_label)
        
        #Convert predictions to probabilities
        probabilites = softmax(outputs)
        #Give's the index of the node with the highest probability
        predictions = torch.argmax(probabilites, dim=1)
        correct = (predictions == labels).float()

        loss.backward()   
        optimizer.step()

        #Save the loss details for each epoch
        running_loss += loss.item()
        running_correct += correct.sum().item()

    epoch_loss = running_loss / len(train_iterator)
    epoch_accuracy = running_correct / len(train_dataset)
    
    return epoch_loss, epoch_accuracy

In [31]:
#Evaluate the results

def evaluate():
    iterations = len(test_dataset) / BATCH_SIZE

    model.eval()

    running_loss = 0
    running_correct = 0

    with torch.no_grad():
        for batch_idx, items in enumerate(test_iterator):

            sys.stdout.write('\r')
            sys.stdout.write("{} Iteration :{}/{}"
                                .format("test", batch_idx + 1, iterations))


            #Get the text and length of sentences
            text, text_lengths = items.test_sentAB
            text = text.to(device)
            text_lengths = text_lengths.to(device)

            #Get labels of each batch
            labels = items.test_label
            labels = labels.to(device)

            #Predictions are in size [1, ..]
            outputs = model(text, text_lengths).squeeze()

            #Calculate loss
            loss = criterion(outputs, items.test_label)

            #Convert predictions to probabilities
            probabilites = softmax(outputs)
            #Give's the index of the node with the highest probability
            predictions = torch.argmax(probabilites, dim=1)
            correct = (predictions == labels).float()

            #Save the loss details for each epoch
            running_loss += loss.item()
            running_correct += correct.sum().item()

        epoch_loss = running_loss / len(test_iterator)
        epoch_accuracy = running_correct / len(test_dataset)
    
    return epoch_loss, epoch_accuracy

In [32]:

num_epochs = 4

for epoch in range(num_epochs):
    sys.stdout.write('\r')
    sys.stdout.write("Epoch :{}/{}"
                        .format(epoch + 1, num_epochs))
    
    train_loss, train_acc = train() 
    print()
    print("Train Loss :{}, Train Accuracy :{}".format(train_loss, train_acc))
    
    test_loss, test_acc = evaluate() 
    print("Eval Loss :{}, Eval Accuracy :{}".format(test_loss, test_acc))

train Iteration :1125/1125.0
Train Loss :0.9349760778215196, Train Accuracy :0.5675555555555556
test Iteration :125/125.0Eval Loss :0.9686191704273224, Eval Accuracy :0.564
train Iteration :1125/1125.0
Train Loss :0.8618767706553141, Train Accuracy :0.5848888888888889
test Iteration :125/125.0Eval Loss :1.0281346366405486, Eval Accuracy :0.54
train Iteration :1125/1125.0
Train Loss :0.8290792955160141, Train Accuracy :0.5951111111111111
test Iteration :125/125.0Eval Loss :1.0288572356700898, Eval Accuracy :0.574
train Iteration :1125/1125.0
Train Loss :0.7574913047684564, Train Accuracy :0.6171111111111112
test Iteration :125/125.0Eval Loss :1.1715995078086854, Eval Accuracy :0.544


In [None]:
eval_loss, eval_acc = evaluate()
eval_loss, eval_acc

In [None]:
#Preprocessing the data
df = pd.read_csv('../Data/SICK/SICK train.txt', sep="\t")

label = []

for val in df['entailment_judgment']:
    if val == "CONTRADICTION":
        label.append(2)
    elif val == "ENTAILMENT":
        label.append(1)
    else:
        label.append(0)

df['entailment_judgment'] = label
input_sent = []

for idx, row in df.iterrows():
    input_sent.append(row['sentence_A'] + ' <sep> ' + row['sentence_B'])

df['sentAB'] = input_sent

df.to_csv('../DATA/SICK/SICK train.csv', index=False)