In [1]:
#seeding for comparing experiment in part 2
import torch
SEED = 1111
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda')

# SNLI Bert
## Second Tutorial
https://towardsdatascience.com/fine-tuning-pre-trained-transformer-models-for-sentence-entailment-d87caf9ec9db
Check his Github code for complete notebook. I never referred to it. Medium was enough.
BERT in keras-tf: https://towardsdatascience.com/bert-in-keras-with-tensorflow-hub-76bcbc9417b

In [2]:
# define macros
BERT_MODEL = 'bert-base-uncased'
MAX_SEQ_LENGTH = 100 # we dont need to enforce this now because snli is a relatively sanitized dataset where sentence lenghts are reasonable

# Prepare data

## load the dataset

In [3]:
# !wget https://nlp.stanford.edu/projects/snli/snli_1.0.zip

In [4]:
import pandas as pd

# custom reader needed to handle quotechars
def read_df_custom(file):
    header = 'doc     unit1_toks      unit2_toks      unit1_txt       unit2_txt       s1_toks s2_toks unit1_sent      unit2_sent      dir     nuc_children    sat_children    genre   u1_discontinuous        u2_discontinuous       u1_issent        u2_issent       u1_length       u2_length       length_ratio    u1_speaker      u2_speaker      same_speaker    u1_func u1_pos  u1_depdir       u2_func u2_pos  u2_depdir       doclen  u1_position      u2_position     percent_distance        distance        lex_overlap_words       lex_overlap_length      unit1_case      unit2_case      label'
    header = header.split()
    df = pd.DataFrame(columns=['unit1_txt', 'unit1_sent', 'unit2_txt', 'unit2_sent', 'dir', 'label'])
    file = open(file, 'r')

    rows = []
    count = 0 
    for line in file:
        line = line[:-1].split('\t')
        count+=1
        if count ==1: continue
        row = {}
        for column in ['unit1_txt', 'unit1_sent', 'unit2_txt', 'unit2_sent', 'dir', 'label']:
            index = header.index(column)
            row[column] = line[index]
        rows.append(row)

    df = pd.concat([df, pd.DataFrame.from_records(rows)])
    return df

# we only need specific columns
train_df = read_df_custom('./eng.rst.gum_train_enriched.rels')
test_df = read_df_custom('./eng.rst.gum_test_enriched.rels')
val_df = read_df_custom('./eng.rst.gum_dev_enriched.rels')
len(train_df)

13897

## Clean the data

In [5]:
#dropping any empty values
train_df.dropna(inplace=True)
val_df.dropna(inplace=True)
test_df.dropna(inplace=True)

# train_df = train_df[:1000]
# val_df = val_df[:100]
# test_df = test_df[:100]

## Prepare a dataset handler class

In [6]:

import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer
import pandas as pd

class MNLIDataBert(Dataset):

  def __init__(self, train_df, val_df, test_df):
    self.train_df = train_df
    self.val_df = val_df
    self.test_df = test_df

    self.tokenizer = BertTokenizer.from_pretrained(BERT_MODEL, do_lower_case=True) # Using a pre-trained BERT tokenizer to encode sentences
    self.train_data = None
    self.val_data = None
    self.test_data = None
    self.init_data()

  def init_data(self):
    self.get_label_mapping()
    self.train_data = self.load_data(self.train_df)
    self.val_data = self.load_data(self.val_df)
    self.test_data = self.load_data(self.test_df)

  def get_label_mapping(self):
    labels = {}
    labels_list = list(set(list(self.train_df['label'].unique()) + list(self.test_df['label'].unique()) + list(self.val_df['label'].unique())))
    for i in range(len(labels_list)):
        labels[labels_list[i]] = i
    self.label_dict = labels# {'entailment': 0, 'contradiction': 1, 'neutral': 2}
    # needed later for classification report object to generate precision and recall on test dataset
    self.rev_label_dict = {self.label_dict[k]:k for k in self.label_dict.keys()} 

  def load_data(self, df):
    MAX_LEN = 256 # dont need to enforce this now because snli is a sanitized dataset where sentence lenghts are reasonable. otherwise the beert model doesn't have enough parameters to handle long length sentences
    token_ids = []
    mask_ids = []
    seg_ids = []
    y = []

    premise_list = df['unit1_txt'].to_list()
    hypothesis_list = df['unit2_txt'].to_list()
    label_list = df['label'].to_list()

    for (premise, hypothesis, label) in zip(premise_list, hypothesis_list, label_list):
      premise_id = self.tokenizer.encode(premise, add_special_tokens = False, max_length=MAX_LEN, truncation=True)
      hypothesis_id = self.tokenizer.encode(hypothesis, add_special_tokens = False, max_length=MAX_LEN, truncation=True)
      pair_token_ids = [self.tokenizer.cls_token_id] + premise_id + [self.tokenizer.sep_token_id] + hypothesis_id + [self.tokenizer.sep_token_id]
      premise_len = len(premise_id)
      hypothesis_len = len(hypothesis_id)

      segment_ids = torch.tensor([0] * (premise_len + 2) + [1] * (hypothesis_len + 1))  # sentence 0 and sentence 1
      attention_mask_ids = torch.tensor([1] * (premise_len + hypothesis_len + 3))  # mask padded values

      token_ids.append(torch.tensor(pair_token_ids))
      seg_ids.append(segment_ids)
      mask_ids.append(attention_mask_ids)
      y.append(self.label_dict[label])
    
    token_ids = pad_sequence(token_ids, batch_first=True)
    mask_ids = pad_sequence(mask_ids, batch_first=True)
    seg_ids = pad_sequence(seg_ids, batch_first=True)

    y = torch.tensor(y)
    dataset = TensorDataset(token_ids, mask_ids, seg_ids, y)
    return dataset

  def get_data_loaders(self, batch_size=32, shuffle=True):
    train_loader = DataLoader(
      self.train_data,
      shuffle=shuffle,
      batch_size=batch_size
    )

    val_loader = DataLoader(
      self.val_data,
      shuffle=shuffle,
      batch_size=batch_size
    )

    test_loader = DataLoader(
      self.test_data,
      shuffle=shuffle,
      batch_size=batch_size
    )

    return train_loader, val_loader, test_loader

In [7]:
mnli_dataset = MNLIDataBert(train_df, val_df, test_df)

train_loader, val_loader, test_loader = mnli_dataset.get_data_loaders()
label_dict = mnli_dataset.label_dict # required by custom func to calculate accuracy, bert model
rev_label_dict = mnli_dataset.rev_label_dict # required by custom func to calculate accuracy

In [8]:
from transformers import BertForSequenceClassification, AdamW
from torch import optim

model = BertForSequenceClassification.from_pretrained(BERT_MODEL, num_labels=len(label_dict)).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.6, mode='max', patience=2, min_lr=5e-7, verbose=True)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [9]:
# to evaluate model for train and test. And also use classification report for testing

# helper function to calculate the batch accuracy
def multi_acc(y_pred, y_test):
  acc = (torch.log_softmax(y_pred, dim=1).argmax(dim=1) == y_test).sum().float() / float(y_test.size(0))
  return acc

# freeze model weights and measure validation / test 
def evaluate_accuracy(model, optimizer, data_loader, rev_label_dict):
  model.eval()
  total_val_acc  = 0
  total_val_loss = 0
  
  #for classification report
  y_true = []
  y_pred = []
  
  with torch.no_grad():
    for batch_idx, (pair_token_ids, mask_ids, seg_ids, y) in enumerate(data_loader):      
      optimizer.zero_grad()
      pair_token_ids = pair_token_ids.to(device)
      mask_ids = mask_ids.to(device)
      seg_ids = seg_ids.to(device)
      labels = y.to(device)
      
      loss, prediction = model(pair_token_ids, 
                            token_type_ids=seg_ids, 
                            attention_mask=mask_ids, 
                            labels=labels).values()
      
      acc = multi_acc(prediction, labels)

      total_val_loss += loss.item()
      total_val_acc  += acc.item()

      # log predictions for classification report
      argmax_predictions = torch.argmax(prediction,dim=1).tolist()
      labels_list = labels.tolist()
      assert(len(labels_list)==len(argmax_predictions))
      for p in argmax_predictions: y_pred.append(rev_label_dict[int(p)])
      for l in labels_list: y_true.append(rev_label_dict[l])

  val_acc  = total_val_acc/len(data_loader)
  val_loss = total_val_loss/len(data_loader)
  cr = classification_report(y_true, y_pred)
  
  return val_acc, val_loss, cr, model, optimizer

In [10]:
# ### HUGGINGFACE PLAYGROUND
# from transformers import Trainer
# class MyTrainer(Trainer):
#   def compute_loss(self,model, inputs, rev_label_dict):
#     pair_token_ids, seg_ids, mask_ids, labels = inputs
#     loss = model(pair_token_ids, 
#                             token_type_ids=seg_ids, 
#                             attention_mask=mask_ids, 
#                             labels=labels)#.values()
#     return loss #my_custom_loss(logits, labels)

In [11]:
### MODIFIED
import time
import traceback

from sklearn.metrics import classification_report

def EarlyStoppingCallbackCustomBasedOnLoss(val_loss, prev_loss, trigger_times, patience, model):
  # https://clay-atlas.com/us/blog/2021/08/25/pytorch-en-early-stopping/
  if val_loss > prev_loss:
    trigger_times += 1
    print('trigger times:', trigger_times)
    if trigger_times >= patience:
        print('Early stopping!\nStart to test process.')
        return model
  else:
    print('trigger times: 0')
    trigger_times = 0
  prev_loss = val_loss
  return prev_loss

EPOCHS = 100

def train(model, train_loader, val_loader, optimizer, scheduler, rev_label_dict):  
  for epoch in range(EPOCHS):
    start = time.time()
    model.train()
    total_train_loss = 0
    total_train_acc  = 0

    # logging for scheduler
    losses = []
    accuracies= []

    # Early stopping
    prev_loss = 100
    patience = 12
    trigger_times = 0

    for batch_idx, (pair_token_ids, mask_ids, seg_ids, y) in enumerate(train_loader):
      optimizer.zero_grad()
      pair_token_ids = pair_token_ids.to(device)
      mask_ids = mask_ids.to(device)
      seg_ids = seg_ids.to(device)
      labels = y.to(device)

      try:
        loss, prediction = model(pair_token_ids, 
                              token_type_ids=seg_ids, 
                              attention_mask=mask_ids, 
                              labels=labels).values()

        acc = multi_acc(prediction, labels)
        loss.backward()
        optimizer.step()
        total_train_loss += loss.item()
        total_train_acc  += acc.item()

        # log losses for scheduler
        losses.append(loss)
        accuracies.append(acc)
        mean_loss = sum(losses)/len(losses)
        scheduler.step(mean_loss)

       
        
      except Exception as e:
        print(traceback.format_exc())
        print('helpp')
        break

    train_acc  = total_train_acc/len(train_loader)
    train_loss = total_train_loss/len(train_loader)

    val_acc, val_loss, cr, model, optimizer = evaluate_accuracy(model, optimizer, val_loader, rev_label_dict)
    prev_loss = EarlyStoppingCallbackCustomBasedOnLoss(val_loss, prev_loss, trigger_times, patience, model)

    end = time.time()
    hours, rem = divmod(end-start, 3600)
    minutes, seconds = divmod(rem, 60)

    print(f'Epoch {epoch+1}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {val_loss:.4f} val_acc: {val_acc:.4f}')
    print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))

In [12]:
# ### WORKING CODE
# import time
# import traceback

# from sklearn.metrics import classification_report

# EPOCHS = 1

# def train(model, train_loader, val_loader, optimizer, rev_label_dict):  
#   for epoch in range(EPOCHS):
#     start = time.time()
#     model.train()
#     total_train_loss = 0
#     total_train_acc  = 0
#     for batch_idx, (pair_token_ids, mask_ids, seg_ids, y) in enumerate(train_loader):
#       optimizer.zero_grad()
#       pair_token_ids = pair_token_ids.to(device)
#       mask_ids = mask_ids.to(device)
#       seg_ids = seg_ids.to(device)
#       labels = y.to(device)

#       try:
#         loss, prediction = model(pair_token_ids, 
#                               token_type_ids=seg_ids, 
#                               attention_mask=mask_ids, 
#                               labels=labels).values()

#         acc = multi_acc(prediction, labels)
#         loss.backward()
#         optimizer.step()
#         total_train_loss += loss.item()
#         total_train_acc  += acc.item()
#         # print(seg_ids.shape, pair_token_ids.shape, mask_ids.shape)
#       except Exception as e:
#         print(labels)
#         print(seg_ids.shape, pair_token_ids.shape, mask_ids.shape)
#         print(pair_token_ids)
#         print(traceback.format_exc())
#         print('helpp')
#         break

#     train_acc  = total_train_acc/len(train_loader)
#     train_loss = total_train_loss/len(train_loader)

#     val_acc, val_loss, cr, model, optimizer = evaluate_accuracy(model, optimizer, val_loader, rev_label_dict)
#     end = time.time()
#     hours, rem = divmod(end-start, 3600)
#     minutes, seconds = divmod(rem, 60)

#     print(f'Epoch {epoch+1}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {val_loss:.4f} val_acc: {val_acc:.4f}')
#     print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))

In [13]:
import warnings
from sklearn.exceptions import DataConversionWarning
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    train(model, train_loader, val_loader, optimizer, scheduler, rev_label_dict)

Epoch 00005: reducing learning rate of group 0 to 1.2000e-05.
Epoch 00008: reducing learning rate of group 0 to 7.2000e-06.
Epoch 00011: reducing learning rate of group 0 to 4.3200e-06.
Epoch 00014: reducing learning rate of group 0 to 2.5920e-06.
Epoch 00017: reducing learning rate of group 0 to 1.5552e-06.
Epoch 00020: reducing learning rate of group 0 to 9.3312e-07.
Epoch 00023: reducing learning rate of group 0 to 5.5987e-07.
Epoch 00026: reducing learning rate of group 0 to 5.0000e-07.
trigger times: 0
Epoch 1: train_loss: 2.7123 train_acc: 0.2174 | val_loss: 2.7235 val_acc: 0.2109
00:06:01.59
trigger times: 0
Epoch 2: train_loss: 2.6379 train_acc: 0.2295 | val_loss: 2.6636 val_acc: 0.2261
00:06:01.36
trigger times: 0
Epoch 3: train_loss: 2.5749 train_acc: 0.2498 | val_loss: 2.5904 val_acc: 0.2597
00:06:01.16
trigger times: 0
Epoch 4: train_loss: 2.4988 train_acc: 0.2846 | val_loss: 2.4989 val_acc: 0.3285
00:06:01.16
trigger times: 0
Epoch 5: train_loss: 2.4177 train_acc: 0.3356 |

KeyboardInterrupt: 

In [None]:
# torch.save(model.state_dict(), 'bert-nli.pt')

In [14]:
def validate(model, test_loader, optimizer, rev_label_dict):
  start = time.time()
  test_acc, test_loss, cr, model, optimizer = evaluate_accuracy(model, optimizer, test_loader, rev_label_dict)
  end = time.time()
  hours, rem = divmod(end-start, 3600)
  minutes, seconds = divmod(rem, 60)

  print(f'Test_loss: {test_loss:.4f} test_acc: {test_acc:.4f}')
  print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))
  print(cr)

  return test_loss, test_acc


# model.load_state_dict(torch.load('bert-nli.pt'))
test_loss, test_acc = validate(model, test_loader, optimizer, rev_label_dict)
print(f'Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')

Test_loss: 1.5502 test_acc: 0.5702
00:00:03.77
              precision    recall  f1-score   support

  antithesis       0.67      0.05      0.10        38
 attribution       0.76      0.87      0.81       101
  background       0.21      0.15      0.17        96
       cause       0.42      0.55      0.48        38
circumstance       0.56      0.77      0.65        74
  concession       0.32      0.53      0.40        51
   condition       0.66      0.85      0.74        47
    contrast       0.38      0.23      0.29        56
 elaboration       0.65      0.72      0.68       531
  evaluation       0.38      0.43      0.40       115
    evidence       0.73      0.42      0.53        76
       joint       0.54      0.63      0.58       346
     justify       0.00      0.00      0.00        49
      manner       1.00      0.10      0.17        21
       means       0.87      0.76      0.81        17
  motivation       0.00      0.00      0.00        14
 preparation       0.60      0.67 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# TODO: Another wrapper to return basic predictions/log them in it as well.