In [89]:
#seeding for comparing experiment in part 2
import torch
SEED = 1111
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

device = torch.device('cuda')

# SNLI Bert
## Second Tutorial
https://towardsdatascience.com/fine-tuning-pre-trained-transformer-models-for-sentence-entailment-d87caf9ec9db
Check his Github code for complete notebook. I never referred to it. Medium was enough.
BERT in keras-tf: https://towardsdatascience.com/bert-in-keras-with-tensorflow-hub-76bcbc9417b

In [90]:
# define macros
BERT_MODEL = 'GroNLP/bert-base-dutch-cased' #'bert-base-uncased'
MAX_SEQ_LENGTH = 100 # we dont need to enforce this now because snli is a relatively sanitized dataset where sentence lenghts are reasonable

# Prepare data

## load the dataset

In [91]:
# !wget https://nlp.stanford.edu/projects/snli/snli_1.0.zip

In [92]:
import pandas as pd

# custom reader needed to handle quotechars
def read_df_custom(file):
    header = 'doc     unit1_toks      unit2_toks      unit1_txt       unit2_txt       s1_toks s2_toks unit1_sent      unit2_sent      dir     nuc_children    sat_children    genre   u1_discontinuous        u2_discontinuous       u1_issent        u2_issent       u1_length       u2_length       length_ratio    u1_speaker      u2_speaker      same_speaker    u1_func u1_pos  u1_depdir       u2_func u2_pos  u2_depdir       doclen  u1_position      u2_position     percent_distance        distance        lex_overlap_words       lex_overlap_length      unit1_case      unit2_case      label'
    header = header.split()
    df = pd.DataFrame(columns=['unit1_txt', 'unit1_sent', 'unit2_txt', 'unit2_sent', 'dir', 'label'])
    file = open(file, 'r')

    rows = []
    count = 0 
    for line in file:
        line = line[:-1].split('\t')
        count+=1
        if count ==1: continue
        row = {}
        for column in ['unit1_txt', 'unit1_sent', 'unit2_txt', 'unit2_sent', 'dir', 'label']:
            index = header.index(column)
            row[column] = line[index]
        rows.append(row)

    df = pd.concat([df, pd.DataFrame.from_records(rows)])
    return df

# we only need specific columns
train_df = read_df_custom('./processed/nld.rst.nldt_train_enriched.rels')
test_df = read_df_custom('./processed/nld.rst.nldt_test_enriched.rels')
val_df = read_df_custom('./processed/nld.rst.nldt_dev_enriched.rels')
len(train_df)

1608

## Clean the data

In [93]:
#dropping any empty values
train_df.dropna(inplace=True)
val_df.dropna(inplace=True)
test_df.dropna(inplace=True)

# train_df = train_df[:500]
# val_df = val_df[:50]
# test_df = test_df[:50]

## Prepare a dataset handler class

In [94]:

import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer
import pandas as pd

class MNLIDataBert(Dataset):

  def __init__(self, train_df, val_df, test_df):
    self.num_labels = -1
    self.train_df = train_df
    self.val_df = val_df
    self.test_df = test_df

    self.tokenizer = BertTokenizer.from_pretrained(BERT_MODEL, do_lower_case=True) # Using a pre-trained BERT tokenizer to encode sentences
    self.train_data = None
    self.val_data = None
    self.test_data = None
    self.init_data()

  def init_data(self):
    self.get_label_mapping()
    self.train_data = self.load_data(self.train_df)
    self.val_data = self.load_data(self.val_df)
    self.test_data = self.load_data(self.test_df)

  def get_label_mapping(self):
    labels = {}
    labels_list = list(set(list(self.train_df['label'].unique()) + list(self.test_df['label'].unique()) + list(self.val_df['label'].unique())))
    for i in range(len(labels_list)):
        labels[labels_list[i]] = i
    self.label_dict = labels# {'entailment': 0, 'contradiction': 1, 'neutral': 2}
    # needed later for classification report object to generate precision and recall on test dataset
    self.rev_label_dict = {self.label_dict[k]:k for k in self.label_dict.keys()} 
  
  def add_directionality(self, premise, hypothesis, dir):
    if dir == "1<2":
        hypothesis = '< ' + hypothesis + ' {'
    else:
        premise = '} ' + premise + ' >'
    return premise, hypothesis
    

  def load_data(self, df):
    MAX_LEN = 256 # dont need to enforce this now because snli is a sanitized dataset where sentence lenghts are reasonable. otherwise the beert model doesn't have enough parameters to handle long length sentences
    token_ids = []
    mask_ids = []
    seg_ids = []
    y = []

    premise_list = df['unit1_txt'].to_list()
    hypothesis_list = df['unit2_txt'].to_list()
    label_list = df['label'].to_list()
    dir_list = df['dir'].to_list()
    
    self.num_labels = max(self.num_labels, len(df['label'].unique()))

    for (premise, hypothesis, label, dir) in zip(premise_list, hypothesis_list, label_list, dir_list):
      premise, hypothesis = self.add_directionality(premise, hypothesis, dir)
      premise_id = self.tokenizer.encode(premise, add_special_tokens = False, max_length=MAX_LEN, truncation=True)
      hypothesis_id = self.tokenizer.encode(hypothesis, add_special_tokens = False, max_length=MAX_LEN, truncation=True)
      pair_token_ids = [self.tokenizer.cls_token_id] + premise_id + [self.tokenizer.sep_token_id] + hypothesis_id + [self.tokenizer.sep_token_id]
      premise_len = len(premise_id)
      hypothesis_len = len(hypothesis_id)

      segment_ids = torch.tensor([0] * (premise_len + 2) + [1] * (hypothesis_len + 1))  # sentence 0 and sentence 1
      attention_mask_ids = torch.tensor([1] * (premise_len + hypothesis_len + 3))  # mask padded values

      token_ids.append(torch.tensor(pair_token_ids))
      seg_ids.append(segment_ids)
      mask_ids.append(attention_mask_ids)
      y.append(self.label_dict[label])
    
    token_ids = pad_sequence(token_ids, batch_first=True)
    mask_ids = pad_sequence(mask_ids, batch_first=True)
    seg_ids = pad_sequence(seg_ids, batch_first=True)

    y = torch.tensor(y)
    dataset = TensorDataset(token_ids, mask_ids, seg_ids, y)
    return dataset

  def get_data_loaders(self, batch_size=16, shuffle=True):
    train_loader = DataLoader(
      self.train_data,
      shuffle=shuffle,
      batch_size=batch_size
    )

    val_loader = DataLoader(
      self.val_data,
      shuffle=shuffle,
      batch_size=batch_size
    )

    test_loader = DataLoader(
      self.test_data,
      shuffle=shuffle,
      batch_size=batch_size
    )

    return train_loader, val_loader, test_loader

In [95]:
mnli_dataset = MNLIDataBert(train_df, val_df, test_df)

train_loader, val_loader, test_loader = mnli_dataset.get_data_loaders(batch_size=64)
label_dict = mnli_dataset.label_dict # required by custom func to calculate accuracy, bert model
rev_label_dict = mnli_dataset.rev_label_dict # required by custom func to calculate accuracy

Downloading:   0%|          | 0.00/236k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/254 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/608 [00:00<?, ?B/s]

In [96]:
print(label_dict)

{'restatement-mn': 0, 'background': 1, 'purpose': 2, 'sequence': 3, 'nonvolitional-cause': 4, 'enablement': 5, 'joint': 6, 'concession': 7, 'volitional-cause': 8, 'motivation': 9, 'unconditional': 10, 'elaboration': 11, 'unless': 12, 'evidence': 13, 'restatement': 14, 'antithesis': 15, 'preparation': 16, 'conjunction': 17, 'justify': 18, 'contrast': 19, 'span': 20, 'volitional-result': 21, 'list': 22, 'evaluation': 23, 'solutionhood': 24, 'condition': 25, 'nonvolitional-result': 26, 'otherwise': 27, 'interpretation': 28, 'circumstance': 29, 'disjunction': 30, 'summary': 31, 'means': 32}


# Define the model

## load pretrained model

In [97]:
from transformers import BertForSequenceClassification, AdamW
from torch import optim

model = BertForSequenceClassification.from_pretrained(BERT_MODEL, num_labels=len(label_dict)).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.6, mode='max', patience=2, min_lr=5e-7, verbose=True)

Downloading:   0%|          | 0.00/417M [00:00<?, ?B/s]

Some weights of the model checkpoint at GroNLP/bert-base-dutch-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at GroNLP/bert-base

## define evaulation metric

In [98]:
# to evaluate model for train and test. And also use classification report for testing
import torch.nn as nn

# helper function to calculate the batch accuracy
def multi_acc(y_pred, y_test):
  acc = (torch.log_softmax(y_pred, dim=1).argmax(dim=1) == y_test).sum().float() / float(y_test.size(0))
  return acc

# freeze model weights and measure validation / test 
def evaluate_accuracy(model, optimizer, data_loader, rev_label_dict, is_training=True):
  model.eval()
  total_val_acc  = 0
  total_val_loss = 0
  
  #for classification report
  y_true = []
  y_pred = []
  
  with torch.no_grad():
    for batch_idx, (pair_token_ids, mask_ids, seg_ids, y) in enumerate(data_loader):      
      optimizer.zero_grad()
      pair_token_ids = pair_token_ids.to(device)
      mask_ids = mask_ids.to(device)
      seg_ids = seg_ids.to(device)
      labels = y.to(device)
      
      # loss, prediction = model(pair_token_ids, 
      #                       token_type_ids=seg_ids, 
      #                       attention_mask=mask_ids, 
      #                       labels=labels).values()
      # acc = multi_acc(prediction, labels)

      ############new code#####################

      outputs = model(pair_token_ids, 
                            token_type_ids=seg_ids, 
                            attention_mask=mask_ids)
      # probs = F.softmax(outputs, dim=1)
      # max_idx = torch.max(outputs, 1).indices
      # one_hot = F.one_hot(max_idx, outputs.shape[1])

      criterion = nn.CrossEntropyLoss()
      loss = criterion(outputs, labels)
      acc = multi_acc(outputs, labels)
      ########################################

      total_val_loss += loss.item()
      total_val_acc  += acc.item()

      # log predictions for classification report
      argmax_predictions = torch.argmax(outputs,dim=1).tolist()
      labels_list = labels.tolist()
      assert(len(labels_list)==len(argmax_predictions))
      for p in argmax_predictions: y_pred.append(rev_label_dict[int(p)])
      for l in labels_list: y_true.append(rev_label_dict[l])

  val_acc  = total_val_acc/len(data_loader)
  val_loss = total_val_loss/len(data_loader)
  cr = classification_report(y_true, y_pred)
  
  return val_acc, val_loss, cr, model, optimizer

In [99]:
# probs = torch.tensor([[1.0890e-04, 5.2673e-05, 1.2360e-04, 1.0418e-04, 1.2566e-04, 9.8514e-01,
#          5.3395e-05, 1.2905e-04, 4.4259e-05, 6.8244e-05, 7.6694e-05, 1.9960e-05,
#          1.2552e-03, 4.4366e-04, 5.7364e-05, 3.5897e-05, 1.8665e-04, 1.4762e-04,
#          2.0718e-04, 3.6333e-04, 1.4289e-04, 7.9607e-04, 1.4576e-04, 9.2251e-05,
#          8.8410e-05, 6.9524e-05, 6.2618e-03, 1.7346e-04, 3.2187e-03, 2.6895e-04], 
#          [1.1661e-03, 5.0136e-04, 6.6534e-03, 2.9997e-04, 1.7821e-02, 4.7819e-04,
#          3.6546e-04, 9.7448e-04, 1.7562e-03, 6.0838e-03, 3.8355e-04, 1.9341e-03,
#          4.0050e-04, 2.3111e-04, 9.4716e-04, 4.5940e-04, 9.0194e-01, 5.8271e-03,
#          3.8972e-04, 2.8722e-03, 5.6120e-04, 5.8885e-03, 2.2553e-04, 3.2046e-04,
#          2.2358e-02, 5.4926e-03, 5.0868e-03, 6.9310e-04, 3.8507e-03, 4.0405e-03]])
# max_idx = torch.max(probs, 1).indices
# F.one_hot(max_idx, probs.shape[1])
# one_hot = torch.FloatTensor(probs.shape)
# one_hot.zero_()
# one_hot.scatter_(0, max_idx, 1)
# one_hot
# assert sum(one_hot)==1
# max_idx

# [1.8954e-03, 2.5696e-04, 8.2484e-04, 2.2843e-04, 1.0369e-03, 4.5720e-01,
#          1.2688e-03, 1.1442e-03, 1.2277e-03, 1.5966e-03, 1.7590e-02, 2.4132e-04,
#          3.7562e-01, 7.7780e-04, 9.8553e-05, 5.1157e-04, 6.1281e-05, 2.0767e-03,
#          8.3567e-04, 2.0880e-03, 1.1388e-01, 4.5953e-03, 5.4112e-03, 2.9414e-04,
#          3.7741e-04, 6.8984e-04, 2.0752e-04, 5.5542e-03, 3.4503e-04, 2.0683e-03]
        # [1.0890e-04, 5.2673e-05, 1.2360e-04, 1.0418e-04, 1.2566e-04, 9.8514e-01,
        #  5.3395e-05, 1.2905e-04, 4.4259e-05, 6.8244e-05, 7.6694e-05, 1.9960e-05,
        #  1.2552e-03, 4.4366e-04, 5.7364e-05, 3.5897e-05, 1.8665e-04, 1.4762e-04,
        #  2.0718e-04, 3.6333e-04, 1.4289e-04, 7.9607e-04, 1.4576e-04, 9.2251e-05,
        #  8.8410e-05, 6.9524e-05, 6.2618e-03, 1.7346e-04, 3.2187e-03, 2.6895e-04]
        # [1.1661e-03, 5.0136e-04, 6.6534e-03, 2.9997e-04, 1.7821e-02, 4.7819e-04,
        #  3.6546e-04, 9.7448e-04, 1.7562e-03, 6.0838e-03, 3.8355e-04, 1.9341e-03,
        #  4.0050e-04, 2.3111e-04, 9.4716e-04, 4.5940e-04, 9.0194e-01, 5.8271e-03,
        #  3.8972e-04, 2.8722e-03, 5.6120e-04, 5.8885e-03, 2.2553e-04, 3.2046e-04,
        #  2.2358e-02, 5.4926e-03, 5.0868e-03, 6.9310e-04, 3.8507e-03, 4.0405e-03]

## define bert custom model

In [100]:
from transformers import BertModel, AutoTokenizer
import torch.nn as nn
class CustomBERTModel(nn.Module):
    #https://stackoverflow.com/questions/64156202/add-dense-layer-on-top-of-huggingface-bert-model
    def __init__(self, num_labels):
          super(CustomBERTModel, self).__init__()
          self.num_classes = num_labels+1 # zero indexed classes
          print('ASSIGN:', self.num_classes)
          self.bert = BertModel.from_pretrained(BERT_MODEL)
          ### New layers:
          self.linear1 = nn.Linear(768, 512)
          self.linear2 = nn.Linear(512, 256)
          self.linear3 = nn.Linear(256, 128)
          self.linear4 = nn.Linear(128, self.num_classes)
          self.act1 = nn.ReLU() # can i use the same activation object everywhere?
          self.act2 = nn.ReLU()
          self.act3 = nn.ReLU()
          self.drop = nn.Dropout(0.1) 

    def forward(self, pair_token_ids, token_type_ids, attention_mask):
        sequence_output, pooled_output = self.bert(input_ids=pair_token_ids, 
                        token_type_ids=token_type_ids, 
                        attention_mask=attention_mask).values()

        # sequence_output has the following shape: (batch_size, sequence_length, 768)
        linear1_output = self.linear1(sequence_output[:,0,:].view(-1,768)) ## extract the 1st token's embeddings
        linear1_output = self.act1(linear1_output)
        linear2_output = self.linear2(linear1_output)
        linear2_output = self.act2(linear2_output)
        linear3_output = self.linear3(linear2_output)
        linear3_output = self.act3(linear3_output)
        linear4_output = self.linear4(linear3_output)
        drop_output = self.drop(linear4_output)
        return drop_output# loss, outputs

# tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL)
model = CustomBERTModel(mnli_dataset.num_labels) # You can pass the parameters if required to have more flexible model
model.to(device) ## can be gpu
optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)

ASSIGN: 33


Some weights of the model checkpoint at GroNLP/bert-base-dutch-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at GroNLP/bert-base-dutch-cased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.

## define training regime

In [101]:
### MODIFIED
import time
import traceback
import torch.nn.functional as F
from typing import Optional, Iterable, Dict, Any
from EarlyStopperUtil import MetricTracker


from sklearn.metrics import classification_report

class EarlyStoppingCallbackCustom:
  def __init__(self, validation_metric='accuracy', patience=12):
    self.validation_metric = validation_metric
    self.patience = patience
    self.trigger_times = 0
    self.prev_loss = 1000 if self.validation_metric=='loss' else -1000
    
  def compare_and_trigger(self, current, previous):
    if self.validation_metric=='loss': return current > previous
    else: return current <= previous

  def callback_to_stop(self, val_loss):
    if self.compare_and_trigger(val_loss, self.prev_loss):
      self.trigger_times += 1
      print('trigger times:', self.trigger_times)
      if self.trigger_times >= self.patience:
          print('Early stopping!\nStart to test process.')
          return True
    else:
      if self.trigger_times!=0: print('Resetting trigertime:', self.trigger_times)
      print('trigger times: 0')
      self.trigger_times = 0
    self.prev_loss = val_loss
    return False

EPOCHS = 100

def train(model, train_loader, val_loader, optimizer, scheduler, rev_label_dict):  
  # EarlyStopper = EarlyStoppingCallbackCustom(validation_metric='accuracy')
  EarlyStopper = MetricTracker(patience=12, metric_name='+accuracy')

  for epoch in range(EPOCHS):
    start = time.time()
    model.train()
    total_train_loss = 0
    total_train_acc  = 0

    # logging for scheduler
    losses = []
    accuracies= []

    for batch_idx, (pair_token_ids, mask_ids, seg_ids, y) in enumerate(train_loader):
      optimizer.zero_grad()
      pair_token_ids = pair_token_ids.to(device)
      mask_ids = mask_ids.to(device)
      seg_ids = seg_ids.to(device)
      labels = y.to(device)

      ############new code#####################

      outputs = model(pair_token_ids, 
                            token_type_ids=seg_ids, 
                            attention_mask=mask_ids)
      # outputs = F.log_softmax(outputs, dim=1) # log prob
      # outputs = np.argmax(prob, axis=1) # preds
      # https://stackoverflow.com/questions/43672047/convert-probability-vector-into-target-vector-in-python
      # https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
      criterion = nn.CrossEntropyLoss()
      loss = criterion(outputs, labels)
      loss.backward()
      acc = multi_acc(outputs, labels)
      optimizer.step()
      ################old code#################

      # loss, prediction = model(pair_token_ids, 
      #                       token_type_ids=seg_ids, 
      #                       attention_mask=mask_ids, 
      #                       labels=labels).values()

      # acc = multi_acc(prediction, labels)
      # loss.backward()
      # optimizer.step()

      ########################################
      total_train_loss += loss.item()
      total_train_acc  += acc.item()

      # log losses for scheduler
      losses.append(loss)
      accuracies.append(acc)
      mean_loss = sum(losses)/len(losses)
      scheduler.step(mean_loss)


    train_acc  = total_train_acc/len(train_loader)
    train_loss = total_train_loss/len(train_loader)

    val_acc, val_loss, cr, model, optimizer = evaluate_accuracy(model, optimizer, val_loader, rev_label_dict)
    EarlyStopper.add_metric(val_acc)
    if EarlyStopper.should_stop_early(): break
    # if EarlyStopper.callback_to_stop(val_acc): break

    end = time.time()
    hours, rem = divmod(end-start, 3600)
    minutes, seconds = divmod(rem, 60)

    print(f'Epoch {epoch+1}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {val_loss:.4f} val_acc: {val_acc:.4f}')
    print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))

In [102]:
# ### WORKING CODE 2
# import time
# import traceback

# from sklearn.metrics import classification_report

# def EarlyStoppingCallbackCustomBasedOnLoss(val_loss, prev_loss, trigger_times, patience, model):
#   # https://clay-atlas.com/us/blog/2021/08/25/pytorch-en-early-stopping/
#   if val_loss > prev_loss:
#     trigger_times += 1
#     print('trigger times:', trigger_times)
#     if trigger_times >= patience:
#         print('Early stopping!\nStart to test process.')
#         return model
#   else:
#     print('trigger times: 0')
#     trigger_times = 0
#   prev_loss = val_loss
#   return prev_loss

# EPOCHS = 100

# def train(model, train_loader, val_loader, optimizer, scheduler, rev_label_dict):  
#   for epoch in range(EPOCHS):
#     start = time.time()
#     model.train()
#     total_train_loss = 0
#     total_train_acc  = 0

#     # logging for scheduler
#     losses = []
#     accuracies= []

#     # Early stopping
#     prev_loss = 100
#     patience = 12
#     trigger_times = 0

#     for batch_idx, (pair_token_ids, mask_ids, seg_ids, y) in enumerate(train_loader):
#       optimizer.zero_grad()
#       pair_token_ids = pair_token_ids.to(device)
#       mask_ids = mask_ids.to(device)
#       seg_ids = seg_ids.to(device)
#       labels = y.to(device)

#       try:
#         loss, prediction = model(pair_token_ids, 
#                               token_type_ids=seg_ids, 
#                               attention_mask=mask_ids, 
#                               labels=labels).values()

#         acc = multi_acc(prediction, labels)
#         loss.backward()
#         optimizer.step()
#         total_train_loss += loss.item()
#         total_train_acc  += acc.item()

#         # log losses for scheduler
#         losses.append(loss)
#         accuracies.append(acc)
#         mean_loss = sum(losses)/len(losses)
#         scheduler.step(mean_loss)

       
        
#       except Exception as e:
#         print(traceback.format_exc())
#         print('helpp')
#         break

#     train_acc  = total_train_acc/len(train_loader)
#     train_loss = total_train_loss/len(train_loader)

#     val_acc, val_loss, cr, model, optimizer = evaluate_accuracy(model, optimizer, val_loader, rev_label_dict)
#     prev_loss = EarlyStoppingCallbackCustomBasedOnLoss(val_loss, prev_loss, trigger_times, patience, model)

#     end = time.time()
#     hours, rem = divmod(end-start, 3600)
#     minutes, seconds = divmod(rem, 60)

#     print(f'Epoch {epoch+1}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {val_loss:.4f} val_acc: {val_acc:.4f}')
#     print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))

In [103]:
# ### WORKING CODE
# import time
# import traceback

# from sklearn.metrics import classification_report

# EPOCHS = 1

# def train(model, train_loader, val_loader, optimizer, rev_label_dict):  
#   for epoch in range(EPOCHS):
#     start = time.time()
#     model.train()
#     total_train_loss = 0
#     total_train_acc  = 0
#     for batch_idx, (pair_token_ids, mask_ids, seg_ids, y) in enumerate(train_loader):
#       optimizer.zero_grad()
#       pair_token_ids = pair_token_ids.to(device)
#       mask_ids = mask_ids.to(device)
#       seg_ids = seg_ids.to(device)
#       labels = y.to(device)

#       try:
#         loss, prediction = model(pair_token_ids, 
#                               token_type_ids=seg_ids, 
#                               attention_mask=mask_ids, 
#                               labels=labels).values()

#         acc = multi_acc(prediction, labels)
#         loss.backward()
#         optimizer.step()
#         total_train_loss += loss.item()
#         total_train_acc  += acc.item()
#         # print(seg_ids.shape, pair_token_ids.shape, mask_ids.shape)
#       except Exception as e:
#         print(labels)
#         print(seg_ids.shape, pair_token_ids.shape, mask_ids.shape)
#         print(pair_token_ids)
#         print(traceback.format_exc())
#         print('helpp')
#         break

#     train_acc  = total_train_acc/len(train_loader)
#     train_loss = total_train_loss/len(train_loader)

#     val_acc, val_loss, cr, model, optimizer = evaluate_accuracy(model, optimizer, val_loader, rev_label_dict)
#     end = time.time()
#     hours, rem = divmod(end-start, 3600)
#     minutes, seconds = divmod(rem, 60)

#     print(f'Epoch {epoch+1}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {val_loss:.4f} val_acc: {val_acc:.4f}')
#     print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))

In [104]:
import warnings
from sklearn.exceptions import DataConversionWarning
with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    train(model, train_loader, val_loader, optimizer, scheduler, rev_label_dict)

Epoch 00004: reducing learning rate of group 0 to 1.2000e-05.
Epoch 00007: reducing learning rate of group 0 to 7.2000e-06.
Epoch 00010: reducing learning rate of group 0 to 4.3200e-06.
Epoch 00013: reducing learning rate of group 0 to 2.5920e-06.
Epoch 00016: reducing learning rate of group 0 to 1.5552e-06.
Epoch 00019: reducing learning rate of group 0 to 9.3312e-07.
Epoch 00022: reducing learning rate of group 0 to 5.5987e-07.
Epoch 00025: reducing learning rate of group 0 to 5.0000e-07.
Epoch 1: train_loss: 3.2038 train_acc: 0.2308 | val_loss: 2.8509 val_acc: 0.2898
00:00:08.25
Epoch 2: train_loss: 2.8869 train_acc: 0.2476 | val_loss: 2.6128 val_acc: 0.3575
00:00:08.25
Epoch 3: train_loss: 2.6171 train_acc: 0.3215 | val_loss: 2.3724 val_acc: 0.4039
00:00:08.26
Epoch 4: train_loss: 2.2378 train_acc: 0.4038 | val_loss: 2.2289 val_acc: 0.4283
00:00:08.25
Epoch 5: train_loss: 1.9646 train_acc: 0.4712 | val_loss: 2.1999 val_acc: 0.4638
00:00:08.30
Epoch 6: train_loss: 1.7497 train_acc: 

In [105]:
torch.save(model.state_dict(), 'bert-disrpt_nld.pt')

# test

In [106]:
def validate(model, test_loader, optimizer, rev_label_dict):
  start = time.time()
  test_acc, test_loss, cr, model, optimizer = evaluate_accuracy(model, optimizer, test_loader, rev_label_dict)
  end = time.time()
  hours, rem = divmod(end-start, 3600)
  minutes, seconds = divmod(rem, 60)

  print(f'Test_loss: {test_loss:.4f} test_acc: {test_acc:.4f}')
  print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))
  print(cr)

  return test_loss, test_acc


# model.load_state_dict(torch.load('bert-nli.pt'))
test_loss, test_acc = validate(model, test_loader, optimizer, rev_label_dict)
print(f'Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')

Test_loss: 2.8523 test_acc: 0.4306
00:00:00.48
                      precision    recall  f1-score   support

          antithesis       0.00      0.00      0.00         2
          background       0.00      0.00      0.00         3
        circumstance       0.24      0.44      0.31        16
          concession       0.58      0.58      0.58        12
           condition       0.62      0.62      0.62         8
         conjunction       0.50      0.42      0.46        19
            contrast       0.33      0.14      0.20         7
         disjunction       0.67      0.50      0.57         4
         elaboration       0.60      0.68      0.64        95
          enablement       0.25      0.25      0.25         4
          evaluation       0.00      0.00      0.00         2
            evidence       0.20      0.17      0.18         6
      interpretation       0.11      0.20      0.14        10
               joint       0.00      0.00      0.00         3
             justify  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [107]:
# TODO: Another wrapper to return basic predictions/log them in it as well.