In [1]:
# Seed the RNGs and pin cuDNN behaviour so that runs are comparable
# (needed when comparing against the experiment in part 2).
import torch
SEED = 1111
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
# Autotuning can pick different kernels from run to run; keep it off for reproducibility.
torch.backends.cudnn.benchmark = False

# Use the GPU when one is visible, otherwise fall back to the CPU so the
# notebook still runs (slowly) instead of failing on the first `.to(device)`.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# SNLI Bert
## Second Tutorial
https://towardsdatascience.com/fine-tuning-pre-trained-transformer-models-for-sentence-entailment-d87caf9ec9db
See the author's GitHub repository for the complete notebook. It was not consulted for this work; the Medium article was sufficient.
BERT in keras-tf: https://towardsdatascience.com/bert-in-keras-with-tensorflow-hub-76bcbc9417b

In [2]:
# define macros
# Name of the pre-trained checkpoint; used below for both the tokenizer and the classifier.
BERT_MODEL = 'bert-base-uncased'
# NOTE(review): this constant is not referenced by any other cell visible in this notebook.
MAX_SEQ_LENGTH = 100 # we don't need to enforce this now because SNLI is a relatively sanitized dataset where sentence lengths are reasonable

# Prepare data

## load the dataset

In [3]:
# !wget https://nlp.stanford.edu/projects/snli/snli_1.0.zip

In [4]:
import pandas as pd

#kaggle dataset format
# train_df = pd.read_csv('snli/snli_1.0_train.csv')
# val_df = pd.read_csv('snli/snli_1.0_dev.csv')
# test_df = pd.read_csv('snli/snli_1.0_test.csv')

# Official SNLI release: tab-separated text files.
# Only the label and the two sentence columns are read from each split.
train_df, val_df, test_df = [
    pd.read_csv(split_path, sep='\t', usecols=['gold_label', 'sentence1', 'sentence2'])
    for split_path in (
        './snli_1.0/snli_1.0_train.txt',
        './snli_1.0/snli_1.0_dev.txt',
        './snli_1.0/snli_1.0_test.txt',
    )
]

# Tiny subsets of each split, handy for smoke-testing the code quickly.
train_df_sample = train_df.iloc[:10]
val_df_sample = val_df.iloc[:3]
test_df_sample = test_df.iloc[:3]

train_df_sample.head()

Unnamed: 0,gold_label,sentence1,sentence2
0,neutral,A person on a horse jumps over a broken down a...,A person is training his horse for a competition.
1,contradiction,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette."
2,entailment,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse."
3,neutral,Children smiling and waving at camera,They are smiling at their parents
4,entailment,Children smiling and waving at camera,There are children present


## Clean the data

In [5]:
def _keep_labelled_pairs(df):
  """Return a new frame holding only usable rows of ``df``.

  A row is kept when its ``gold_label`` is one of 'entailment',
  'contradiction' or 'neutral' (a few samples carry a "-" label instead)
  and none of its columns is missing. The input frame is not modified, so
  re-running the cell is safe and no in-place mutation happens on a
  filtered copy (the pattern that can trigger pandas' SettingWithCopyWarning).
  """
  has_known_label = df['gold_label'].isin(['entailment', 'contradiction', 'neutral'])
  return df.loc[has_known_label].dropna()

# there are a few samples with "-" label. removing them, together with any empty values.
train_df = _keep_labelled_pairs(train_df)
val_df = _keep_labelled_pairs(val_df)
test_df = _keep_labelled_pairs(test_df)

# train_df = train_df[:1000]
# val_df = val_df[:100]
# test_df = test_df[:100]

## Prepare a dataset handler class

In [6]:

import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer
import pandas as pd

class MNLIDataBert(Dataset):
  """Tokenise premise/hypothesis DataFrames into BERT-ready ``TensorDataset`` splits.

  Each frame must provide the columns ``sentence1`` (premise), ``sentence2``
  (hypothesis) and ``gold_label``. Despite the class name, this notebook
  feeds it the SNLI frames loaded above.

  Args:
    train_df, val_df, test_df: pandas DataFrames for the three splits.
    max_len: upper bound on the encoded pair length, special tokens
      included. Longer pairs are truncated (see ``load_data``).

  Attributes set by the constructor:
    tokenizer: ``BertTokenizer`` loaded from ``BERT_MODEL``.
    label_dict / rev_label_dict: label name -> index and index -> label name.
    train_data / val_data / test_data: ``TensorDataset`` objects whose items
      are ``(token_ids, mask_ids, seg_ids, y)``.
  """

  def __init__(self, train_df, val_df, test_df, max_len=512):
    self.train_df = train_df
    self.val_df = val_df
    self.test_df = test_df
    self.max_len = max_len

    # Pre-trained BERT tokenizer used to encode the sentences.
    self.tokenizer = BertTokenizer.from_pretrained(BERT_MODEL, do_lower_case=True)
    self.train_data = None
    self.val_data = None
    self.test_data = None
    self.init_data()

  def init_data(self):
    """Build the label mapping and tokenise all three splits."""
    self.get_label_mapping()
    self.train_data = self.load_data(self.train_df, self.max_len)
    self.val_data = self.load_data(self.val_df, self.max_len)
    self.test_data = self.load_data(self.test_df, self.max_len)

  def get_label_mapping(self):
    """Create ``label_dict`` (name -> index) and its inverse ``rev_label_dict``."""
    self.label_dict = {'entailment': 0, 'contradiction': 1, 'neutral': 2}
    # The inverse mapping is needed later to feed label names to the classification report.
    self.rev_label_dict = {index: name for name, index in self.label_dict.items()}

  @staticmethod
  def _truncate_pair(premise_id, hypothesis_id, budget):
    """Trim the longer of the two id lists, one token at a time, until both fit in ``budget``."""
    premise_id, hypothesis_id = list(premise_id), list(hypothesis_id)
    budget = max(budget, 0)
    while len(premise_id) + len(hypothesis_id) > budget:
      if len(premise_id) > len(hypothesis_id):
        premise_id.pop()
      else:
        hypothesis_id.pop()
    return premise_id, hypothesis_id

  def load_data(self, df, max_len=512):
    """Encode every row of ``df`` and return a ``TensorDataset``.

    Each pair is laid out as ``[CLS] premise [SEP] hypothesis [SEP]``.
    Segment ids are 0 for ``[CLS] premise [SEP]`` and 1 for
    ``hypothesis [SEP]``; the attention mask is 1 on every real token.
    All three tensors are right-padded with 0 to the longest pair in ``df``.

    Args:
      df: DataFrame with ``sentence1``, ``sentence2`` and ``gold_label``.
      max_len: maximum encoded length including the three special tokens.
        Pairs that would exceed it are truncated longest-first, so an
        unusually long sentence cannot produce an over-long sequence.

    Returns:
      ``TensorDataset(token_ids, mask_ids, seg_ids, y)``.

    Raises:
      KeyError: if a ``gold_label`` value is not in ``label_dict``.
    """
    token_ids = []
    mask_ids = []
    seg_ids = []
    y = []

    for premise, hypothesis, label in zip(df['sentence1'], df['sentence2'], df['gold_label']):
      premise_id = self.tokenizer.encode(premise, add_special_tokens = False)
      hypothesis_id = self.tokenizer.encode(hypothesis, add_special_tokens = False)
      # Reserve three positions for [CLS] and the two [SEP] tokens.
      premise_id, hypothesis_id = self._truncate_pair(premise_id, hypothesis_id, max_len - 3)
      pair_token_ids = [self.tokenizer.cls_token_id] + premise_id + [self.tokenizer.sep_token_id] + hypothesis_id + [self.tokenizer.sep_token_id]
      premise_len = len(premise_id)
      hypothesis_len = len(hypothesis_id)

      segment_ids = torch.tensor([0] * (premise_len + 2) + [1] * (hypothesis_len + 1))  # sentence 0 and sentence 1
      attention_mask_ids = torch.tensor([1] * (premise_len + hypothesis_len + 3))  # 1 on real tokens; padding added below stays 0

      token_ids.append(torch.tensor(pair_token_ids))
      seg_ids.append(segment_ids)
      mask_ids.append(attention_mask_ids)
      y.append(self.label_dict[label])

    token_ids = pad_sequence(token_ids, batch_first=True)
    mask_ids = pad_sequence(mask_ids, batch_first=True)
    seg_ids = pad_sequence(seg_ids, batch_first=True)

    # Show one encoded example as a sanity check. Row 2 is used when it
    # exists; smaller splits fall back to their last row instead of failing.
    sample_idx = min(2, len(token_ids) - 1)
    print('token_ids: ', token_ids[sample_idx])
    print('seg_ids: ', seg_ids[sample_idx])
    print('mask_ids: ', mask_ids[sample_idx])

    y = torch.tensor(y)
    dataset = TensorDataset(token_ids, mask_ids, seg_ids, y)
    return dataset

  def get_data_loaders(self, batch_size=32, shuffle=True):
    """Return ``(train_loader, val_loader, test_loader)``.

    ``shuffle`` applies to the training loader only. Validation and test
    loaders always iterate in dataset order so evaluation is repeatable.
    """
    train_loader = DataLoader(
      self.train_data,
      shuffle=shuffle,
      batch_size=batch_size
    )

    val_loader = DataLoader(
      self.val_data,
      shuffle=False,
      batch_size=batch_size
    )

    test_loader = DataLoader(
      self.test_data,
      shuffle=False,
      batch_size=batch_size
    )

    return train_loader, val_loader, test_loader

In [7]:
# Tokenise the three cleaned splits and wrap them in batch loaders (default batch size).
mnli_dataset = MNLIDataBert(train_df, val_df, test_df)
train_loader, val_loader, test_loader = mnli_dataset.get_data_loaders()

# Label <-> index lookups, needed by the custom accuracy / report helpers below.
label_dict, rev_label_dict = mnli_dataset.label_dict, mnli_dataset.rev_label_dict

token_ids:  tensor([  101,  1037,  2711,  2006,  1037,  3586, 14523,  2058,  1037,  3714,
         2091, 13297,  1012,   102,  1037,  2711,  2003, 19350,  1010,  2006,
         1037,  3586,  1012,   102,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0])
seg_ids:

In [8]:
from transformers import BertForSequenceClassification, AdamW

# Load the pre-trained BERT_MODEL checkpoint with a classification head sized for 3 labels
# (num_labels=3 matches the three entries of label_dict) and move it to `device`.
model = BertForSequenceClassification.from_pretrained(BERT_MODEL, num_labels=3).to(device)
# Optimise all model parameters with learning rate 2e-5 and bias correction switched off.
# NOTE(review): `transformers.AdamW` is, to my knowledge, deprecated in newer transformers
# releases — confirm against the installed version before upgrading.
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [9]:
# to evaluate model for train and test. And also use classification report for testing

# helper function to calculate the batch accuracy
def multi_acc(y_pred, y_test):
  """Return the fraction of rows in ``y_pred`` whose arg-max equals ``y_test``.

  Args:
    y_pred: 2-D tensor of per-class scores (one row per sample).
    y_test: 1-D tensor of integer class indices, one per row of ``y_pred``.

  Returns:
    A 0-dim float tensor in [0, 1].
  """
  # arg-max over raw scores picks the same class as arg-max over their
  # log-softmax (it is a monotonic transform), so the softmax is skipped.
  # This also matches how evaluate_accuracy derives its predictions.
  acc = (y_pred.argmax(dim=1) == y_test).float().mean()
  return acc

# freeze model weights and measure validation / test 
def evaluate_accuracy(model, optimizer, data_loader, rev_label_dict):
  """Evaluate ``model`` on ``data_loader`` without updating any weights.

  Args:
    model: callable as ``model(token_ids, token_type_ids=..., attention_mask=...,
      labels=...)``; its output must expose the loss at index 0 and the
      logits at index 1.
    optimizer: not used during evaluation; accepted and returned unchanged
      so existing callers keep working.
    data_loader: iterable of ``(pair_token_ids, mask_ids, seg_ids, y)`` batches.
    rev_label_dict: maps class index -> label name for the report.

  Returns:
    ``(accuracy, loss, classification_report, model, optimizer)``. Accuracy
    and loss are averaged over samples, so a short final batch does not
    get extra weight.

  Raises:
    ZeroDivisionError: if ``data_loader`` yields no samples.

  Relies on the module-level ``device`` and on ``classification_report``
  having been imported before this function is called.
  """
  model.eval()
  total_loss = 0.0
  total_correct = 0
  total_seen = 0

  # Label names collected for the classification report.
  y_true = []
  y_pred = []

  with torch.no_grad():
    for pair_token_ids, mask_ids, seg_ids, y in data_loader:
      pair_token_ids = pair_token_ids.to(device)
      mask_ids = mask_ids.to(device)
      seg_ids = seg_ids.to(device)
      labels = y.to(device)

      outputs = model(pair_token_ids,
                      token_type_ids=seg_ids,
                      attention_mask=mask_ids,
                      labels=labels)
      # Positional access works for both tuple outputs and ModelOutput objects.
      loss, logits = outputs[0], outputs[1]

      batch_predictions = logits.argmax(dim=1)
      batch_size = labels.size(0)

      # Assumes the returned loss is the mean over the batch — TODO confirm for non-default loss settings.
      total_loss += loss.item() * batch_size
      total_correct += (batch_predictions == labels).sum().item()
      total_seen += batch_size

      y_pred.extend(rev_label_dict[int(p)] for p in batch_predictions.tolist())
      y_true.extend(rev_label_dict[int(t)] for t in labels.tolist())

  val_acc  = total_correct / total_seen
  val_loss = total_loss / total_seen
  cr = classification_report(y_true, y_pred)

  return val_acc, val_loss, cr, model, optimizer

In [10]:
import time
from sklearn.metrics import classification_report

EPOCHS = 5

def train(model, train_loader, val_loader, optimizer, rev_label_dict, epochs=None):
  """Fine-tune ``model`` and report train / validation metrics after each epoch.

  Args:
    model: callable as ``model(token_ids, token_type_ids=..., attention_mask=...,
      labels=...)``; its output must expose the loss at index 0 and the
      logits at index 1.
    train_loader: iterable of ``(pair_token_ids, mask_ids, seg_ids, y)`` batches.
    val_loader: batches passed to ``evaluate_accuracy`` after every epoch.
    optimizer: optimizer stepping the parameters of ``model``.
    rev_label_dict: class index -> label name, forwarded to ``evaluate_accuracy``.
    epochs: number of passes over ``train_loader``; ``None`` (default) uses
      the module-level ``EPOCHS`` as read at call time.

  Returns:
    A list with one dict per epoch holding ``train_loss``, ``train_acc``,
    ``val_loss`` and ``val_acc``. Training metrics are averaged over
    samples, so a short final batch does not get extra weight.

  Relies on the module-level ``device`` and on ``evaluate_accuracy``.
  """
  if epochs is None:
    epochs = EPOCHS

  history = []
  for epoch in range(epochs):
    start = time.time()
    model.train()
    total_train_loss = 0.0
    total_train_correct = 0
    total_train_seen = 0
    for pair_token_ids, mask_ids, seg_ids, y in train_loader:
      optimizer.zero_grad()
      pair_token_ids = pair_token_ids.to(device)
      mask_ids = mask_ids.to(device)
      seg_ids = seg_ids.to(device)
      labels = y.to(device)

      outputs = model(pair_token_ids,
                      token_type_ids=seg_ids,
                      attention_mask=mask_ids,
                      labels=labels)
      # Positional access works for both tuple outputs and ModelOutput objects.
      loss, logits = outputs[0], outputs[1]

      loss.backward()
      optimizer.step()

      batch_size = labels.size(0)
      # Assumes the returned loss is the mean over the batch — TODO confirm for non-default loss settings.
      total_train_loss += loss.item() * batch_size
      total_train_correct += (logits.argmax(dim=1) == labels).sum().item()
      total_train_seen += batch_size

    train_acc  = total_train_correct / total_train_seen
    train_loss = total_train_loss / total_train_seen

    val_acc, val_loss, *_ = evaluate_accuracy(model, optimizer, val_loader, rev_label_dict)
    end = time.time()
    # The reported wall-clock time covers the training pass plus the validation pass.
    hours, rem = divmod(end-start, 3600)
    minutes, seconds = divmod(rem, 60)

    print(f'Epoch {epoch+1}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {val_loss:.4f} val_acc: {val_acc:.4f}')
    print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))

    history.append({
      'train_loss': train_loss,
      'train_acc': train_acc,
      'val_loss': val_loss,
      'val_acc': val_acc,
    })

  return history

In [11]:
# Fine-tune for EPOCHS epochs. In the recorded run below each epoch
# (training pass plus validation pass) took roughly 65-73 minutes.
train(model, train_loader, val_loader, optimizer, rev_label_dict)

Epoch 1: train_loss: 0.3737 train_acc: 0.8592 | val_loss: 0.2667 val_acc: 0.9045
01:08:46.39
Epoch 2: train_loss: 0.2653 train_acc: 0.9043 | val_loss: 0.2663 val_acc: 0.9071
01:12:54.83
Epoch 3: train_loss: 0.2046 train_acc: 0.9274 | val_loss: 0.2755 val_acc: 0.9075
01:05:30.23
Epoch 4: train_loss: 0.1625 train_acc: 0.9434 | val_loss: 0.3097 val_acc: 0.9054
01:05:28.14
Epoch 5: train_loss: 0.1324 train_acc: 0.9543 | val_loss: 0.3199 val_acc: 0.9063
01:05:31.80


In [12]:
# Persist the fine-tuned weights (the state_dict only) to 'bert-nli.pt' in the working directory.
torch.save(model.state_dict(), 'bert-nli.pt')

In [13]:
def validate(model, test_loader, optimizer, rev_label_dict):
  """Run ``evaluate_accuracy`` on ``test_loader`` and print a short summary.

  Prints the test loss and accuracy, the elapsed wall-clock time as
  HH:MM:SS.ss, and the classification report.

  Returns:
    ``(test_loss, test_acc)``.
  """
  t0 = time.time()
  test_acc, test_loss, report, model, optimizer = evaluate_accuracy(model, optimizer, test_loader, rev_label_dict)
  elapsed = time.time() - t0

  # Split the elapsed seconds into hours / minutes / seconds for display.
  hours, leftover = divmod(elapsed, 3600)
  minutes, seconds = divmod(leftover, 60)

  print(f'Test_loss: {test_loss:.4f} test_acc: {test_acc:.4f}')
  print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))
  print(report)

  return test_loss, test_acc


# Reload the weights saved to 'bert-nli.pt', then evaluate on the test loader.
# validate() prints its own summary; the final print repeats loss and accuracy (as a percentage).
model.load_state_dict(torch.load('bert-nli.pt'))
test_loss, test_acc = validate(model, test_loader, optimizer, rev_label_dict)
print(f'Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')

Test_loss: 0.3315 test_acc: 0.9023
00:00:14.07
               precision    recall  f1-score   support

contradiction       0.93      0.93      0.93      3237
   entailment       0.93      0.89      0.91      3368
      neutral       0.85      0.88      0.87      3219

     accuracy                           0.90      9824
    macro avg       0.90      0.90      0.90      9824
 weighted avg       0.90      0.90      0.90      9824

Test Loss: 0.331 |  Test Acc: 90.23%


In [14]:
# TODO: Medium page.
# TODO: Make the code modular, e.g. a single evaluation routine called for train, dev and test. Rename `loss, prediction` to `loss, logits`. Add a wrapper that returns the basic predictions and logs them as well.