# Broken code. Refer only till tokenization step for clarity
https://www.analyticsvidhya.com/blog/2021/05/bert-for-natural-language-inference-simplified-in-pytorch/

# Jump to section SNLI Bert

In [1]:
import torch
SEED = 1111
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [2]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
len(tokenizer)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

30522

In [3]:
tokens = tokenizer.tokenize('Heyy There!! See some boys are playing in rain')
print(tokens)

['hey', '##y', 'there', '!', '!', 'see', 'some', 'boys', 'are', 'playing', 'in', 'rain']


In [4]:
indexes = tokenizer.convert_tokens_to_ids(tokens)
print(indexes)

[4931, 2100, 2045, 999, 999, 2156, 2070, 3337, 2024, 2652, 1999, 4542]


In [5]:
cls_token = tokenizer.cls_token
sep_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token
print(cls_token, sep_token, pad_token, unk_token)

[CLS] [SEP] [PAD] [UNK]


In [6]:
cls_token_idx = tokenizer.cls_token_id
sep_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id
print(cls_token_idx, sep_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [7]:
max_input_length = 256

In [8]:
def tokenize_bert(sentence):
    tokens = tokenizer.tokenize(sentence) 
    return tokens
def split_and_cut(sentence):
    tokens = sentence.strip().split(" ")
    tokens = tokens[:max_input_length]
    return tokens

def trim_sentence(sent):
    try:
        sent = sent.split()
        sent = sent[:128]
        return " ".join(sent)
    except:
        return sent

# Helper funcs for data preparation

In [10]:
#Get list of 0s 
def get_sent1_token_type(sent):
    try:
        return [0]* len(sent)
    except:
        return []
#Get list of 1s
def get_sent2_token_type(sent):
    try:
        return [1]* len(sent)
    except:
        return []
#combine from lists
def combine_seq(seq):
    return " ".join(seq)
#combines from lists of int
def combine_mask(mask):
    mask = [str(m) for m in mask]
    return " ".join(mask)

In [56]:
import pandas as pd
#load dataset
df_train = pd.read_csv('snli/snli_1.0_train.csv')
df_dev = pd.read_csv('snli/snli_1.0_dev.csv')
df_test = pd.read_csv('snli/snli_1.0_test.csv')

#Get neccesary columns
df_train = df_train[['gold_label','sentence1','sentence2']]
df_dev = df_dev[['gold_label','sentence1','sentence2']]
df_test = df_test[['gold_label','sentence1','sentence2']]

#Take small dataset
df_train = df_train[:80000]
df_dev = df_train[:8000]
df_test = df_train[:8000]

#Trim each sentence upto maximum length
df_train['sentence1'] = df_train['sentence1'].apply(trim_sentence)
df_train['sentence2'] = df_train['sentence2'].apply(trim_sentence)
df_dev['sentence1'] = df_dev['sentence1'].apply(trim_sentence)
df_dev['sentence2'] = df_dev['sentence2'].apply(trim_sentence)
df_test['sentence1'] = df_test['sentence1'].apply(trim_sentence)
df_test['sentence2'] = df_test['sentence2'].apply(trim_sentence)

#Add [CLS] and [SEP] tokens
df_train['sent1'] = '[CLS] ' + df_train['sentence1'] + ' [SEP] '
df_train['sent2'] = df_train['sentence2'] + ' [SEP]'
df_dev['sent1'] = '[CLS] ' + df_dev['sentence1'] + ' [SEP] '
df_dev['sent2'] = df_dev['sentence2'] + ' [SEP]'
df_test['sent1'] = '[CLS] ' + df_test['sentence1'] + ' [SEP] '
df_test['sent2'] = df_test['sentence2'] + ' [SEP]'

#Apply Bert Tokenizer for tokeinizing
df_train['sent1_t'] = df_train['sent1'].apply(tokenize_bert)
df_train['sent2_t'] = df_train['sent2'].apply(tokenize_bert)
df_dev['sent1_t'] = df_dev['sent1'].apply(tokenize_bert)
df_dev['sent2_t'] = df_dev['sent2'].apply(tokenize_bert)
df_test['sent1_t'] = df_test['sent1'].apply(tokenize_bert)
df_test['sent2_t'] = df_test['sent2'].apply(tokenize_bert)


#Get Topen type ids for both sentence
df_train['sent1_token_type'] = df_train['sent1_t'].apply(get_sent1_token_type)
df_train['sent2_token_type'] = df_train['sent2_t'].apply(get_sent2_token_type)
df_dev['sent1_token_type'] = df_dev['sent1_t'].apply(get_sent1_token_type)
df_dev['sent2_token_type'] = df_dev['sent2_t'].apply(get_sent2_token_type)
df_test['sent1_token_type'] = df_test['sent1_t'].apply(get_sent1_token_type)
df_test['sent2_token_type'] = df_test['sent2_t'].apply(get_sent2_token_type)

#Combine both sequences
df_train['sequence'] = df_train['sent1_t'] + df_train['sent2_t']
df_dev['sequence'] = df_dev['sent1_t'] + df_dev['sent2_t']
df_test['sequence'] = df_test['sent1_t'] + df_test['sent2_t']


#Get attention mask
df_train['attention_mask'] = df_train['sequence'].apply(get_sent2_token_type)
df_dev['attention_mask'] = df_dev['sequence'].apply(get_sent2_token_type)
df_test['attention_mask'] = df_test['sequence'].apply(get_sent2_token_type)

#Get combined token type ids for input
df_train['token_type'] = df_train['sent1_token_type'] + df_train['sent2_token_type']
df_dev['token_type'] = df_dev['sent1_token_type'] + df_dev['sent2_token_type']
df_test['token_type'] = df_test['sent1_token_type'] + df_test['sent2_token_type']

#Now make all these inputs as sequential data to be easily fed into torchtext Field.
df_train['sequence'] = df_train['sequence'].apply(combine_seq)
df_dev['sequence'] = df_dev['sequence'].apply(combine_seq)
df_test['sequence'] = df_test['sequence'].apply(combine_seq)
df_train['attention_mask'] = df_train['attention_mask'].apply(combine_mask)
df_dev['attention_mask'] = df_dev['attention_mask'].apply(combine_mask)
df_test['attention_mask'] = df_test['attention_mask'].apply(combine_mask)
df_train['token_type'] = df_train['token_type'].apply(combine_mask)
df_dev['token_type'] = df_dev['token_type'].apply(combine_mask)
df_test['token_type'] = df_test['token_type'].apply(combine_mask)
df_train = df_train[['gold_label', 'sequence', 'attention_mask', 'token_type']]
df_dev = df_dev[['gold_label', 'sequence', 'attention_mask', 'token_type']]
df_test = df_test[['gold_label', 'sequence', 'attention_mask', 'token_type']]
df_train = df_train.loc[df_train['gold_label'].isin(['entailment','contradiction','neutral'])]
df_dev = df_dev.loc[df_dev['gold_label'].isin(['entailment','contradiction','neutral'])]
df_test = df_test.loc[df_test['gold_label'].isin(['entailment','contradiction','neutral'])]

#Save prepared data as csv file
df_train.to_csv('snli/snli_1.0_train_analyticsvidhya.csv', index=False)
df_dev.to_csv('snli/snli_1.0_dev_analyticsvidhya.csv', index=False)
df_test.to_csv('snli/snli_1.0_test_analyticsvidhya.csv', index=False)

In [58]:
df_dev.head()

Unnamed: 0,gold_label,sequence,attention_mask,token_type
0,neutral,[CLS] a person on a horse jumps over a broken ...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1,0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
1,contradiction,[CLS] a person on a horse jumps over a broken ...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 ...,0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 ...
2,entailment,[CLS] a person on a horse jumps over a broken ...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1,0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
3,neutral,[CLS] children smiling and waving at camera [S...,1 1 1 1 1 1 1 1 1 1 1 1 1 1 1,0 0 0 0 0 0 0 0 1 1 1 1 1 1 1
4,entailment,[CLS] children smiling and waving at camera [S...,1 1 1 1 1 1 1 1 1 1 1 1 1,0 0 0 0 0 0 0 0 1 1 1 1 1


# SNLI Bert
## Second Tutorial
https://towardsdatascience.com/fine-tuning-pre-trained-transformer-models-for-sentence-entailment-d87caf9ec9db
Check his Github code for complete notebook. I never referred to it. Medium was enough.
BERT in keras-tf: https://towardsdatascience.com/bert-in-keras-with-tensorflow-hub-76bcbc9417b

In [8]:

import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pickle
import os
from transformers import BertTokenizer
import pandas as pd

class MNLIDataBert(Dataset):

  def __init__(self, train_df, val_df, test_df):
    self.label_dict = {'entailment': 0, 'contradiction': 1, 'neutral': 2}

    self.train_df = train_df
    self.val_df = val_df
    self.test_df = test_df

    self.base_path = '/content/'
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) # Using a pre-trained BERT tokenizer to encode sentences
    self.train_data = None
    self.val_data = None
    self.test_data = None
    self.init_data()

  def init_data(self):
    self.train_data = self.load_data(self.train_df)
    self.val_data = self.load_data(self.val_df)
    self.test_data = self.load_data(self.test_df)

  def load_data(self, df):
    MAX_LEN = 512
    token_ids = []
    mask_ids = []
    seg_ids = []
    y = []

    premise_list = df['sentence1'].to_list()
    hypothesis_list = df['sentence2'].to_list()
    label_list = df['gold_label'].to_list()

    for (premise, hypothesis, label) in zip(premise_list, hypothesis_list, label_list):
      premise_id = self.tokenizer.encode(premise, add_special_tokens = False)
      hypothesis_id = self.tokenizer.encode(hypothesis, add_special_tokens = False)
      pair_token_ids = [self.tokenizer.cls_token_id] + premise_id + [self.tokenizer.sep_token_id] + hypothesis_id + [self.tokenizer.sep_token_id]
      premise_len = len(premise_id)
      hypothesis_len = len(hypothesis_id)

      segment_ids = torch.tensor([0] * (premise_len + 2) + [1] * (hypothesis_len + 1))  # sentence 0 and sentence 1
      attention_mask_ids = torch.tensor([1] * (premise_len + hypothesis_len + 3))  # mask padded values

      token_ids.append(torch.tensor(pair_token_ids))
      seg_ids.append(segment_ids)
      mask_ids.append(attention_mask_ids)
      y.append(self.label_dict[label])
    
    token_ids = pad_sequence(token_ids, batch_first=True)
    mask_ids = pad_sequence(mask_ids, batch_first=True)
    seg_ids = pad_sequence(seg_ids, batch_first=True)
    y = torch.tensor(y)
    dataset = TensorDataset(token_ids, mask_ids, seg_ids, y)
    print(len(dataset))
    return dataset

  def get_data_loaders(self, batch_size=32, shuffle=True):
    train_loader = DataLoader(
      self.train_data,
      shuffle=shuffle,
      batch_size=batch_size
    )

    val_loader = DataLoader(
      self.val_data,
      shuffle=shuffle,
      batch_size=batch_size
    )

    test_loader = DataLoader(
      self.test_data,
      shuffle=shuffle,
      batch_size=batch_size
    )

    return train_loader, val_loader, test_loader

train_df = pd.read_csv('snli/snli_1.0_train.csv')
val_df = pd.read_csv('snli/snli_1.0_dev.csv')
test_df = pd.read_csv('snli/snli_1.0_test.csv')

# train_df = train_df[:1000]
# val_df = val_df[:100]
# test_df = test_df[:100]

train_df = train_df.loc[train_df['gold_label'].isin(['entailment','contradiction','neutral'])]
val_df = val_df.loc[val_df['gold_label'].isin(['entailment','contradiction','neutral'])]
test_df = test_df.loc[test_df['gold_label'].isin(['entailment','contradiction','neutral'])]

train_df.dropna(inplace=True)
val_df.dropna(inplace=True)
test_df.dropna(inplace=True)

mnli_dataset = MNLIDataBert(train_df, val_df, test_df)
train_loader, val_loader, test_loader = mnli_dataset.get_data_loaders()

36303
9831
9815


In [36]:
label_dict = mnli_dataset.label_dict
rev_label_dict = {label_dict[k]:k for k in label_dict.keys()}
rev_label_dict, label_dict

({0: 'entailment', 1: 'contradiction', 2: 'neutral'},
 {'entailment': 0, 'contradiction': 1, 'neutral': 2})

In [9]:
from transformers import BertForSequenceClassification, AdamW

device = torch.device('cuda')

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [10]:
def multi_acc(y_pred, y_test):
  acc = (torch.log_softmax(y_pred, dim=1).argmax(dim=1) == y_test).sum().float() / float(y_test.size(0))
  return acc

import time

EPOCHS = 5



def train(model, train_loader, val_loader, optimizer):  
  total_step = len(train_loader)

  for epoch in range(EPOCHS):
    start = time.time()
    model.train()
    total_train_loss = 0
    total_train_acc  = 0
    for batch_idx, (pair_token_ids, mask_ids, seg_ids, y) in enumerate(train_loader):
      optimizer.zero_grad()
      pair_token_ids = pair_token_ids.to(device)
      mask_ids = mask_ids.to(device)
      seg_ids = seg_ids.to(device)
      labels = y.to(device)

      loss, prediction = model(pair_token_ids, 
                             token_type_ids=seg_ids, 
                             attention_mask=mask_ids, 
                             labels=labels).values()

      acc = multi_acc(prediction, labels)

      loss.backward()
      optimizer.step()
      
      total_train_loss += loss.item()
      total_train_acc  += acc.item()

    train_acc  = total_train_acc/len(train_loader)
    train_loss = total_train_loss/len(train_loader)
    model.eval()
    total_val_acc  = 0
    total_val_loss = 0
    with torch.no_grad():
      for batch_idx, (pair_token_ids, mask_ids, seg_ids, y) in enumerate(val_loader):
        optimizer.zero_grad()
        pair_token_ids = pair_token_ids.to(device)
        mask_ids = mask_ids.to(device)
        seg_ids = seg_ids.to(device)
        labels = y.to(device)
        
        loss, prediction = model(pair_token_ids, 
                             token_type_ids=seg_ids, 
                             attention_mask=mask_ids, 
                             labels=labels).values()
        
        acc = multi_acc(prediction, labels)

        total_val_loss += loss.item()
        total_val_acc  += acc.item()

    val_acc  = total_val_acc/len(val_loader)
    val_loss = total_val_loss/len(val_loader)
    end = time.time()
    hours, rem = divmod(end-start, 3600)
    minutes, seconds = divmod(rem, 60)

    print(f'Epoch {epoch+1}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {val_loss:.4f} val_acc: {val_acc:.4f}')
    print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))

In [11]:
train(model, train_loader, val_loader, optimizer)

Epoch 1: train_loss: 0.5308 train_acc: 0.7828 | val_loss: 0.4308 val_acc: 0.8371
00:04:13.68
Epoch 2: train_loss: 0.2874 train_acc: 0.8954 | val_loss: 0.4141 val_acc: 0.8475
00:04:13.47
Epoch 3: train_loss: 0.1633 train_acc: 0.9441 | val_loss: 0.4863 val_acc: 0.8538
00:04:11.44
Epoch 4: train_loss: 0.0976 train_acc: 0.9672 | val_loss: 0.5690 val_acc: 0.8483
00:04:11.98
Epoch 5: train_loss: 0.0663 train_acc: 0.9778 | val_loss: 0.6344 val_acc: 0.8440
00:04:12.46


In [12]:
torch.save(model.state_dict(), 'bert-nli.pt')

In [70]:
from sklearn.metrics import classification_report

def evaluate(model, iterator):
  start = time.time()
  model.eval()
  total_val_acc  = 0
  total_val_loss = 0

  #for classification report
  y_true = []
  y_pred = []

  with torch.no_grad():
    for batch_idx, (pair_token_ids, mask_ids, seg_ids, y) in enumerate(iterator):
      optimizer.zero_grad()
      pair_token_ids = pair_token_ids.to(device)
      mask_ids = mask_ids.to(device)
      seg_ids = seg_ids.to(device)
      labels = y.to(device)
      
      loss, prediction = model(pair_token_ids, 
                            token_type_ids=seg_ids, 
                            attention_mask=mask_ids, 
                            labels=labels).values()
      
      acc = multi_acc(prediction, labels)

      total_val_loss += loss.item()
      total_val_acc  += acc.item()

      argmax_predictions = torch.argmax(prediction,dim=1).tolist()
      labels_list = labels.tolist()
      assert(len(labels_list)==len(argmax_predictions))
      for p in argmax_predictions: y_pred.append(rev_label_dict[int(p)])
      for l in labels_list: y_true.append(rev_label_dict[l])

  val_acc  = total_val_acc/len(val_loader)
  val_loss = total_val_loss/len(val_loader)
  end = time.time()
  hours, rem = divmod(end-start, 3600)
  minutes, seconds = divmod(rem, 60)

  print(f'Test_loss: {val_loss:.4f} test_acc: {val_acc:.4f}')
  print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))

  print(classification_report(y_true, y_pred))
  return val_loss, val_acc, y_true, y_pred

model.load_state_dict(torch.load('bert-nli.pt'))
test_loss, test_acc, y_true, y_pred = evaluate(model, test_loader)
print(f'Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')

Test_loss: 0.6332 test_acc: 0.8423
00:00:13.74
               precision    recall  f1-score   support

contradiction       0.85      0.88      0.86      3236
   entailment       0.90      0.83      0.86      3364
      neutral       0.79      0.82      0.81      3215

     accuracy                           0.85      9815
    macro avg       0.85      0.85      0.85      9815
 weighted avg       0.85      0.85      0.85      9815

Test Loss: 0.633 |  Test Acc: 84.23%


In [71]:
# TODO: Medium page.
# TODO: Make code modular. eg - evaluation called for dev and test adn train. Loss and prediction can become loss and logits. Another wrapper to return basic predictions/log them in it as well.