## Finetuning ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli NLI model
- This notebook is based on Prof. Mihai Surdeanu's textbook <a href="https://github.com/clulab/gentlenlp/blob/main/notebooks/chap13_classification_bert.ipynb">Gentle NLP, Chapter 13: Classification using BERT model</a>
- Modified for NLI evaluation and analysis over SICCK dataset
- Reference: <a href="https://huggingface.co/ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli">HuggingFace roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli </a>

- Author: Sushma Anand Akoju, Email: sushmaakoju@arizona.edu

In [1]:
!pip install datasets
!pip install transformers
!pip install sentencepiece
!pip install accelerate
!pip install 'transformers[torch]'

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m16.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.5/212.5 kB[0m [31m29.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
Collec

# Text Classification Using Transformer Networks (Deberta and Roberta)

Some initialization:

In [2]:
import random
import torch
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

# enable tqdm's pandas integration (progress bars for pandas operations)
tqdm.pandas()

# Toggle GPU usage here; CUDA is only selected when a GPU is actually available.
use_gpu = True

# Resolve the torch device: CUDA when requested and present, otherwise CPU.
if use_gpu and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'device: {device.type}')

device: cuda


In [3]:
from google.colab import drive
# Mount Google Drive under /content/drive/ so the data and output folders used
# below are reachable; force_remount=True requests a remount even if the mount
# point is already in use.
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [4]:
import os
import pandas as pd

In [5]:
# Relation name -> integer class id. Both entailment directions (FE and RE)
# share id 0.
# NOTE(review): id 0 is presumably the model's "entailment" class, since the
# prediction code in this notebook labels a forward prediction of 0 as "FE".
label2id_roberta = dict(
    Contradiction=2,
    Neutral=1,
    FE=0,
    RE=0,
)

Read the train/dev/test datasets and create a HuggingFace `Dataset` object:

## Load Cross validation sets

In [6]:
# One Excel workbook per cross-validation fold, plus the Drive folders the
# folds are read from (`path`) and results are written to (`output_path`).
filenames = [f"fold{fold_idx}.xlsx" for fold_idx in range(5)]
path = "/content/drive/MyDrive/Colab Notebooks/natural-logic/june12/data"
output_path = "/content/drive/MyDrive/Colab Notebooks/natural-logic/june12/randomseed/June23rd"

# Fail early if either folder is missing (e.g. Drive is not mounted).
for required_dir in (path, output_path):
    assert os.path.exists(required_dir), "%s does not exist!"%required_dir

In [7]:
def _load_sheet(workbook, sheet):
    """Read one sheet of a fold workbook, exposing `label4roberta` as `labels`."""
    frame = pd.read_excel(os.path.join(path, workbook), sheet_name=sheet)
    return frame.rename(columns={"label4roberta": 'labels'})


# Columns kept from every sheet.
columns = [
    'Premise', 'Hypothesis', 'labels', 'CompressedGT', 'Modifier Type',
    'Modifier', 'Premise/Hypothesis/Both', 'Part of Premise/Hypothesis Modified',
]

# Each fold is a dict holding its "train" and "test" frames.
folds = []
for i, file in enumerate(filenames):
    train = _load_sheet(file, "train")
    test = _load_sheet(file, "test")
    # Every fold is expected to have exactly this train/test size.
    assert len(train)==1043 and len(test) == 261, "Number of Training samples should be 1043 and # of test samples should be 261 in this fold %d"%i
    folds.append({"train": train[columns], "test": test[columns]})
assert len(folds) == 5, "The number of cross validation folds are not equal to 5!"

### Create data splits as premise → hypothesis and as hypothesis → premise, so that **Test** set predictions can be labelled as:
- Forward Entailment
- Reverse Entailment
- Neutral

In [8]:
def read_data(data):
    """Add a `text` column of the form "<Premise> [SEP] <Hypothesis>".

    The input frame is copied first, so the caller's DataFrame (which may be a
    column slice of a larger frame) is never modified and pandas does not emit
    a SettingWithCopyWarning.

    NOTE(review): " [SEP] " is inserted as plain text; confirm that the
    tokenizer used downstream treats it as a separator, otherwise consider
    passing premise and hypothesis to the tokenizer as a sentence pair.

    Args:
        data: DataFrame with string columns `Premise` and `Hypothesis`.

    Returns:
        A new DataFrame with the original columns plus `text`, where every
        backslash in `text` has been replaced by a space.
    """
    data = data.copy()
    # premise first, hypothesis second; backslashes are replaced by spaces
    combined = data['Premise'] + " [SEP] " + data['Hypothesis']
    data['text'] = combined.str.replace('\\', ' ', regex=False)
    return data

#### Reading hypothesis [SEP] premise is for detecting the Reverse Entailment in the predictions.

In [9]:
def read_data_reverse(data):
    """Add a `text` column of the form "<Hypothesis> [SEP] <Premise>".

    This is the reversed-order counterpart of the premise-first text builder.
    The input frame is copied first, so the caller's DataFrame (which may be a
    column slice of a larger frame) is never modified and pandas does not emit
    a SettingWithCopyWarning.

    Args:
        data: DataFrame with string columns `Premise` and `Hypothesis`.

    Returns:
        A new DataFrame with the original columns plus `text`, where every
        backslash in `text` has been replaced by a space.
    """
    data = data.copy()
    # hypothesis first, premise second; backslashes are replaced by spaces
    combined = data['Hypothesis'] + " [SEP] " + data['Premise']
    data['text'] = combined.str.replace('\\', ' ', regex=False)
    return data

### Compute metrics for validation and test

In [10]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

def compute_test_metrics(y_true, y_pred):
    """Return accuracy plus micro-averaged recall, F1 and precision.

    Args:
        y_true: ground-truth class ids.
        y_pred: predicted class ids.

    Returns:
        Dict with the keys 'accuracy', 'recall', 'f1' and 'precision'.
    """
    scores = {}
    scores['accuracy'] = accuracy_score(y_true, y_pred)
    scores['recall'] = recall_score(y_true, y_pred, average='micro')
    scores['f1'] = f1_score(y_true, y_pred, average='micro')
    scores['precision'] = precision_score(y_true, y_pred, average='micro')
    return scores


def compute_metrics(eval_pred):
    """Metrics callback for the Trainer.

    Takes the arg-max over the last axis of `eval_pred.predictions` as the
    predicted class and scores it against `eval_pred.label_ids`.

    Returns:
        Dict with the keys 'accuracy', 'recall', 'f1' and 'precision'.
    """
    predicted_ids = np.argmax(eval_pred.predictions, axis=-1)
    return compute_test_metrics(eval_pred.label_ids, predicted_ids)

### Evaluate on the **Test** set, deriving FE, RE and Neutral labels and computing scores

In [11]:
from sklearn.metrics import classification_report
def test_eval(trainer, ds, fold, model_name ):
  """Predict on the test split in both directions and derive relation labels.

  The forward split (`ds['test']`) and the reversed split (`ds['rev_test']`)
  are tokenized and scored with `trainer.predict`. The relation label for each
  example is derived as follows:
    - forward prediction 0 -> "FE"
    - forward prediction 2 -> "Contradiction"
    - otherwise            -> "RE" if the reversed prediction is 0, else "Neutral"

  Args:
    trainer: object exposing `predict(dataset)`, whose result has
      `predictions` and `label_ids`.
    ds: mapping with 'test' and 'rev_test' datasets that contain the columns
      `Premise`, `Hypothesis` and `text`.
    fold: fold identifier stored in the returned metrics.
    model_name: model identifier stored in the returned metrics.

  Returns:
    Tuple `(y_true, y_preds, res, labels)`: gold ids, forward predicted ids,
    the metrics dict (with 'fold' and 'model_name' added) and the list of
    derived relation labels.
  """
  raw_columns = ['Premise', 'Hypothesis', 'text']
  test_ds = ds['test'].map(tokenize, batched=True, remove_columns=raw_columns)
  rev_test_ds = ds['rev_test'].map(tokenize, batched=True, remove_columns=raw_columns)

  output = trainer.predict(test_ds)
  rev_scores = trainer.predict(rev_test_ds)

  y_true = output.label_ids
  y_preds = np.argmax(output.predictions, axis=-1)
  y_rev_score_preds = np.argmax(rev_scores.predictions, axis=-1)

  # Both splits are built from the same test rows, so the forward and reversed
  # predictions line up one-to-one.
  labels = []
  for forward_id, reverse_id in zip(y_preds, y_rev_score_preds):
    if forward_id == 0:
      labels.append("FE")
    elif forward_id == 2:
      labels.append("Contradiction")
    elif reverse_id == 0:
      # not entailed forward, but entailed when premise/hypothesis are swapped
      labels.append("RE")
    else:
      labels.append("Neutral")
  print(classification_report(y_true, y_preds, labels=[0, 1, 2]))

  res = compute_test_metrics(y_true, y_preds)
  res['fold'] = fold
  res['model_name'] = model_name
  return y_true, y_preds, res, labels

In [12]:
# model = AutoModelForSequenceClassification.from_pretrained('cross-encoder/nli-deberta-v3-base', num_labels=3)
# tokenizer = AutoTokenizer.from_pretrained('cross-encoder/nli-deberta-v3-base')

### Get **this_train**, **this_validation** & **this_test** set from a **this_fold**

In [13]:
def get_dataset(fold, model_name):
  """Build the train/validation/test/reversed-test `DatasetDict` for one fold.

  Args:
    fold: dict with "train" and "test" DataFrames containing at least the
      `Premise`, `Hypothesis` and `labels` columns.
    model_name: not used by this function; kept so existing callers keep
      working.

  Returns:
    Tuple `(ds, test_df, rev_test_df)`: the `DatasetDict` with the splits
    'train', 'validation', 'test' and 'rev_test', plus the forward and
    reversed test DataFrames.
  """
  columns = ['Premise', 'Hypothesis', 'labels']

  # Copy the column slices before handing them to the text-building helpers so
  # that nothing is ever written into a slice of the fold's own frames.
  train_df = read_data(fold["train"][columns].copy())
  test_df = read_data(fold["test"][columns].copy())
  rev_test_df = read_data_reverse(fold["test"][columns].copy())
  print(test_df.columns)

  # 90/10 train/validation split. No random_state is passed, so the split
  # depends on the global RNG state at call time.
  # NOTE(review): the training loop in this notebook calls set_seed(seed)
  # before this function; confirm that is what makes the split repeatable.
  train_df, eval_df = train_test_split(train_df, train_size=0.9)
  train_df = train_df.reset_index(drop=True)
  eval_df = eval_df.reset_index(drop=True)
  test_df = test_df.reset_index(drop=True)
  rev_test_df = rev_test_df.reset_index(drop=True)

  print(f'train rows: {len(train_df.index):,}')
  print(f'eval rows: {len(eval_df.index):,}')
  print(f'test rows: {len(test_df.index):,}')
  print(f'rev test rows: {len(rev_test_df.index):,}')

  ds = DatasetDict()
  ds['train'] = Dataset.from_pandas(train_df)
  ds['validation'] = Dataset.from_pandas(eval_df)
  ds['test'] = Dataset.from_pandas(test_df)
  ds['rev_test'] = Dataset.from_pandas(rev_test_df)

  print(ds)
  return ds, test_df, rev_test_df

### CustomTrainer with a weighted CrossEntropyLoss; we trained with both this custom Trainer and the default HuggingFace Trainer
- Note: we did not see any difference between the two

In [14]:

import torch
from torch import nn
from transformers import Trainer
from accelerate import Accelerator

# class CustomTrainer(Trainer):
#     def compute_loss(self, model, inputs, return_outputs=False):
#         labels = inputs.get("labels")
#         # forward pass
#         outputs = model(**inputs)
#         logits = outputs.get("logits")
#         # compute custom loss (suppose one has 3 labels with different weights)
#         loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0, 3.0]))
#         loss_fct.to('cuda')
#         loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
#         return (loss, outputs) if return_outputs else loss

## Tokenize & Train one model at a time for all folds

In [15]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer, set_seed
import time

def tokenize(examples):
    """Tokenize the `text` field of a batch with the module-level `tokenizer`.

    Truncation is enabled, so over-long inputs are cut by the tokenizer.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True)

def train(model_name, this_path, folds, seed):
  """Fine-tune `model_name` on every fold for each (epochs, batch size) setting.

  For each of the 2 epoch counts x 3 batch sizes, and for each fold, a fresh
  pretrained model is loaded, trained with the HuggingFace `Trainer`, and
  evaluated with `test_eval`. The fold's test frame, with predictions attached,
  is then written to `this_path`.

  Relies on the module-level `tokenizer` and `output_path`.

  Args:
    model_name: HuggingFace model id of the form "<org>/<name>".
    this_path: directory that receives the per-run prediction files.
    folds: list of dicts with "train" and "test" DataFrames.
    seed: seed passed to `set_seed` and to `TrainingArguments`.

  Returns:
    List with one metrics dict (as returned by `test_eval`) per trained run.
  """
  import gc

  epochs = [4, 8]
  batch_sizes = [8,16,32]
  m = model_name.split("/")[1]
  all_scores = []
  weight_decay = 0.01

  for num_epochs in epochs:
    for batch_size in batch_sizes:

      for i,fold in enumerate(folds):
        print("\n***********************************************************************************\n")
        print("\n**************** The number of epochs, batch_size and fold respectively are: ",num_epochs, batch_size, i,"************************\n")
        #discard model checkpoints that were cached.
        torch.cuda.empty_cache()
        time.sleep(60)

        #this set_seed is imported from transformers
        set_seed(seed)
        #load this pretrained model
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

        ds, test_df, rev_test_df = get_dataset(fold,model_name)
        train_ds = ds['train'].map(
          tokenize, batched=True,
          remove_columns=['Premise', 'Hypothesis', 'text'],
        )
        eval_ds = ds['validation'].map(
          tokenize,
          batched=True,
          remove_columns=['Premise', 'Hypothesis', 'text'],
        )

        training_args = TrainingArguments(
          output_dir=os.path.join(output_path,m+"_"+str(num_epochs)+str(batch_size)+"trainer"),
          log_level='error',
          num_train_epochs=num_epochs,
          per_device_train_batch_size=batch_size,
          per_device_eval_batch_size=batch_size,
          evaluation_strategy='epoch',
          weight_decay=weight_decay, seed = seed,
        )
        trainer = Trainer(
          model=model,
          args=training_args,
          train_dataset=train_ds,
          eval_dataset=eval_ds,
          compute_metrics=compute_metrics,
          tokenizer=tokenizer,
        )
        trainer.train()

        y_true, y_pred, results, labels = test_eval(trainer, ds, i, model_name )

        all_scores.append(results)
        fold["test"]["label"]= y_true
        fold["test"]["predictions"] = y_pred
        fold["test"]["text"] = test_df['text']
        fold["test"]["pred_labels"] =  labels
        # The content is written as CSV even though the name ends in ".xlsx";
        # the analysis cell reads these files back with pd.read_csv.
        filename = "five_"+m+"_"+str(num_epochs)+"_"+str(batch_size)+"_"+str(i)+"_"+str(seed)+"_test.xlsx"
        fold["test"].to_csv(os.path.join(this_path, filename))

        # Drop the references to this run's model and trainer so their memory
        # can be reclaimed before the next pretrained model is loaded, rather
        # than both runs being alive at the same time.
        del trainer, model
        gc.collect()
        torch.cuda.empty_cache()
  return all_scores

In [16]:
torch.cuda.get_device_name(0)

'Tesla T4'

### "ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli"

In [None]:
all_scores = []
predictions = []
model_name ="ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli"

# Per-model output folder: <output_path>/<model name without the org prefix>.
m = model_name.split("/")[1]
this_path = os.path.join(output_path, m)
os.makedirs(this_path, exist_ok=True)
assert os.path.exists(this_path), "%s Path does not exists!"%(this_path)

# The tokenizer does not depend on the seed, so it is loaded once instead of
# once per seed. It is read as a module-level name by `tokenize` and `train`.
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

# One full training sweep per seed; each call returns that seed's score list.
seeds = [12345, 34567, 56789, 98765, 76543]
for seed in seeds:
  all_scores.append(train(model_name, this_path, folds, seed))

In [20]:
all_scores_roberta = all_scores

In [None]:
all_scores_roberta

In [22]:
import pandas as pd
# import pycm
import matplotlib.pyplot as plt
import os
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, classification_report
import altair as alt

### Get Macro : F1, Precision, Recall scores and Accuracy

In [23]:
def get_classification_report(ytrue, ypred, model_name, mtype="", svo=""):
  """Print the classification report and return its macro-averaged summary.

  Args:
    ytrue: ground-truth labels.
    ypred: predicted labels.
    model_name: identifier copied into the result.
    mtype: modifier type copied into the result ("" when not applicable).
    svo: modified sentence part copied into the result ("" when not applicable).

  Returns:
    Dict with 'model_name', 'modifier_type', 'svo', the macro-averaged
    'f1_macro', 'precision_m', 'recall_m', and the overall 'acc'.
  """
  print(classification_report(ytrue, ypred))
  report = classification_report(ytrue, ypred, output_dict=True)
  macro = report['macro avg']
  return {
      "model_name": model_name,
      "modifier_type": mtype,
      "svo": svo,
      "f1_macro": macro['f1-score'],
      "precision_m": macro['precision'],
      "recall_m": macro['recall'],
      "acc": report['accuracy'],
  }

In [24]:
# Column names used by the analysis cells: ground-truth label and predicted
# label, then the two columns the scores are broken down by.
gt, pred = 'CompressedGT', 'pred_labels'
svo, mod_type_col = "Part of Premise/Hypothesis Modified", 'Modifier Type'

In [25]:
# Collect the full path of every prediction file written for this model.
model_folder = "roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli"
files = []
model_dir = os.path.join(output_path, model_folder)
assert os.path.exists(model_dir), "%s does not exist!" %model_dir
# The listing order is kept exactly as os.listdir returns it.
files = [os.path.join(model_dir, entry) for entry in os.listdir(model_dir)]
assert len(files) == 150, "Number of output files are not 150!"

In [26]:
len(files)

150

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
model_name = 'ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli'

finetuned_models_modtype_scores = []
# Maps (epochs, batch_size, seed, model_name) -> list of per-fold prediction
# frames. All key parts parsed from the filename are strings.
all = {}

# Prediction files are named
#   five_<model>_<epochs>_<batch_size>_<fold>_<seed>_test.xlsx
# so the run parameters start right after this prefix. The prefix is derived
# from the model name instead of a hard-coded character offset.
file_prefix = "five_" + model_name.split("/")[1] + "_"

for file in files:
  print(file)

  # The files contain CSV data despite their ".xlsx" suffix.
  df = pd.read_csv(file)
  df[pred] = df[pred].replace("Negation", "Contradiction")
  df[svo] = df[svo].str.lower().str.strip()

  basename = os.path.basename(file)
  assert basename.startswith(file_prefix), "%s does not start with %s" % (basename, file_prefix)
  epochs, batch_size, fold, seed = basename[len(file_prefix):].split("_")[0:4]

  print(epochs, batch_size, fold, seed )
  all.setdefault((epochs, batch_size, seed, model_name), []).append(df)

In [None]:
def _with_run_info(scores, key):
  """Attach the run identifiers of an `all` key to a score record.

  `key` is an (epochs, batch_size, seed, model_name) tuple. The record is
  updated in place and returned.
  """
  scores["num_epochs"] = key[0]
  scores["batch_size"] = key[1]
  scores["model_name"] = key[3]
  scores["seed"] = key[2]
  return scores


overall_scores = []
modifier_type_scores = []
svo_scores = []
for key in all.keys():
  # All folds of one (epochs, batch_size, seed, model) run, stacked row-wise.
  df = pd.concat(list(all[key]), axis=0)

  # Scores per modifier type.
  for modifier_type in df[mod_type_col].unique():
    subset = df[df[mod_type_col] == modifier_type]
    ytrue = subset[gt].to_list()
    ypred = subset[pred].to_list()
    res = get_classification_report(ytrue, ypred, model_name,modifier_type, "")
    modifier_type_scores.append(_with_run_info(res, key))

  # Scores per modified sentence part.
  for svo_type in df[svo].unique():
    subset = df[df[svo] == svo_type]
    ytrue = subset[gt].to_list()
    ypred = subset[pred].to_list()
    res2 = get_classification_report(ytrue, ypred, model_name, "", svo_type)
    svo_scores.append(_with_run_info(res2, key))

  # Scores over the whole run.
  ypred = df[pred]
  ytrue = df[gt]
  res1 = get_classification_report(ytrue, ypred, model_name, "", "")
  overall_scores.append(_with_run_info(res1, key))

In [29]:
# Write each score table to its own Excel workbook under output_path.
for score_records, workbook_name in (
    (overall_scores, "overall_scores_roberta.xlsx"),
    (svo_scores, "overall_svo_scores_roberta.xlsx"),
    (modifier_type_scores, "overall_modifier_scores_roberta.xlsx"),
):
    pd.DataFrame.from_records(score_records).to_excel(os.path.join(output_path, workbook_name))

In [30]:
# Regroup the per-run frames under (epochs, batch_size, seed), dropping the
# model name from the key; the folds of each run are stacked into one frame.
all_mod = {}
for k, v in all.items():
  epochs, batch_size, seed, model_name = k
  df = pd.concat(list(v), axis=0)
  all_mod.setdefault((epochs, batch_size, seed), []).append(df)

In [31]:
# Compare the predicted labels of runs that share (epochs, batch_size) but were
# trained with different seeds.
count = 0
unequal_runs = []
keys = list(all_mod.keys())

# Number of (epochs, batch_size) settings per seed: 2 epoch counts x 3 batch
# sizes. A key is paired with the key this many positions later.
# NOTE(review): this pairing assumes the keys are ordered seed by seed. The key
# order comes from the os.listdir listing, which has no guaranteed order; pairs
# whose (epochs, batch_size) differ are skipped silently by the check below.
configs_per_seed = 6

for k1, k2 in zip(keys, keys[configs_per_seed:]):
  if ",".join(k1[:2]) == ",".join(k2[:2]):
    l1 = pd.concat(list(all_mod[k1]), axis=0)['pred_labels']
    l2 = pd.concat(list(all_mod[k2]), axis=0)['pred_labels']
    res = "equal" if l1.equals(other=l2) else "not equal"
    count += 1 if res == "equal" else 0
    s = ",".join(k1[:2]) +":"+ k1[2]+":"+k2[2]
    if res != "equal":
      unequal_runs.append(s)
    print("For this epochs, batch_size, fold # and seed %s the labels are %s "%(s, res ))

# Entries are separated so the individual runs stay readable in the summary.
unequal = "; ".join(unequal_runs)
print("# of files between which prediction labels were equal are %d "%count)
print("The epochs, batch_size, fold # and seed that have unequal prediction labels are ",unequal)

For this epochs, batch_size, fold # and seed 4,8:12345:34567 the labels are not equal 
For this epochs, batch_size, fold # and seed 4,16:12345:34567 the labels are not equal 
For this epochs, batch_size, fold # and seed 4,32:12345:34567 the labels are not equal 
For this epochs, batch_size, fold # and seed 8,8:12345:34567 the labels are not equal 
For this epochs, batch_size, fold # and seed 8,16:12345:34567 the labels are not equal 
For this epochs, batch_size, fold # and seed 8,32:12345:34567 the labels are not equal 
For this epochs, batch_size, fold # and seed 4,8:34567:56789 the labels are not equal 
For this epochs, batch_size, fold # and seed 4,16:34567:56789 the labels are not equal 
For this epochs, batch_size, fold # and seed 4,32:34567:56789 the labels are not equal 
For this epochs, batch_size, fold # and seed 8,8:34567:56789 the labels are not equal 
For this epochs, batch_size, fold # and seed 8,16:34567:56789 the labels are not equal 
For this epochs, batch_size, fold # 