# CommonsenseQA

We use Hugging Face to obtain a pre-trained BERT model, together with a training library called `deeplib`.

In [1]:
!pip install git+https://github.com/deepditch/deep.lib.git --quiet
!pip install transformers --quiet
!pip install datasets --quiet

In [2]:
import math
import numpy as np
import datetime

import torch
from torch.optim.lr_scheduler import LambdaLR
from torch.utils.data import DataLoader

import transformers
from transformers import AutoModelForMultipleChoice, AutoTokenizer
from transformers.data.data_collator import DataCollator
from datasets import load_dataset

In [3]:
import deeplib
import deeplib.session
import deeplib.schedule
import deeplib.callbacks
import deeplib.validation
from deeplib.LR_Schedule.lr_scheduler import TorchOnBatchLRScheduleCallback

In [4]:
# Name of the pre-trained checkpoint; it is used to load both the tokenizer and
# the model. NOTE: the name `model` is rebound to the loaded network in the
# Train section, so after that point it no longer holds this string.
model = "bert-base-uncased"

## Prepare the Dataset

We have used the publicly available dataset from Hugging Face's `datasets` library. We have then implemented the BERT model.


We have prepared examples for our BERT model from the dataset: the question is tokenized and concatenated with each multiple-choice answer.
The BERT transformer encodes each question–answer pair, and the softmax function is applied over all answers to give us the result.


In [5]:
# Load the CommonsenseQA dataset; the result is indexed by split name
# (e.g. 'train', 'validation') further down.
dataset = load_dataset("commonsense_qa")



  0%|          | 0/3 [00:00<?, ?it/s]

In [6]:
# Tokenizer matching the checkpoint named by `model`.
tokenizers = AutoTokenizer.from_pretrained(model)
# Token length that every (question, choice) pair is padded or truncated to.
max_length = 128

# Each example in the dataset is padded to max length, so a collated mini-batch is a tensor of shape `(batch_size, N, 128)`, where N is the number of answer choices.

def commonsenseqa_features_conversion(example):
    """Tokenize one CommonsenseQA example into per-choice model inputs.

    The question is paired with every answer choice and each pair is encoded
    to `max_length` tokens (padded or truncated) with the module-level
    tokenizer.

    Args:
        example: a dataset row providing "question", "choices" (with a "text"
            list) and "answerKey".

    Returns:
        The tokenizer's encoding of all (question, choice) pairs, with an
        added "labels" entry holding the index of the correct choice.
    """
    choice_texts = example["choices"]["text"]
    question_copies = [example["question"]] * len(choice_texts)
    question_choice_pairs = list(zip(question_copies, choice_texts))

    features_set = tokenizers.batch_encode_plus(
        question_choice_pairs,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_token_type_ids=True,
        return_tensors='pt',
    )

    # Answer letters "A".."E" map to choice indices 0..4.
    labelstoid = dict(zip("ABCDE", range(5)))

    # Examples without an answer key (needed when testing the data) receive a
    # dummy label of 0.
    answer_key = example["answerKey"]
    features_set["labels"] = labelstoid[answer_key] if answer_key else 0
    return features_set


In [7]:
# Convert every example of every split into tokenized (question, choice)
# features, one example at a time, using 4 worker processes and bypassing any
# cached result.
dataset = dataset.map(
    commonsenseqa_features_conversion,
    batched=False,
    num_proc=4,
    load_from_cache_file=False,
)

        

#0:   0%|          | 0/2436 [00:00<?, ?ex/s]

#2:   0%|          | 0/2435 [00:00<?, ?ex/s]

#1:   0%|          | 0/2435 [00:00<?, ?ex/s]

#3:   0%|          | 0/2435 [00:00<?, ?ex/s]

        

#0:   0%|          | 0/306 [00:00<?, ?ex/s]

#1:   0%|          | 0/305 [00:00<?, ?ex/s]

#2:   0%|          | 0/305 [00:00<?, ?ex/s]

#3:   0%|          | 0/305 [00:00<?, ?ex/s]

        

#0:   0%|          | 0/285 [00:00<?, ?ex/s]

#1:   0%|          | 0/285 [00:00<?, ?ex/s]

#3:   0%|          | 0/285 [00:00<?, ?ex/s]

#2:   0%|          | 0/285 [00:00<?, ?ex/s]

In [8]:
#Preparing mini-batches by collation by stacking each input to the tensor.

class Collator():
    """Collate a list of per-example feature dicts into one mini-batch."""

    def __call__(self, features):
        """Stack the per-example inputs into batch tensors.

        Args:
            features: list of dicts, each with "labels", "input_ids",
                "attention_mask" and "token_type_ids".

        Returns:
            A `(minibatch, labels)` pair: `minibatch` maps "labels" and each
            input name to its batched tensor, and `labels` is the same long
            tensor stored under `minibatch["labels"]`.
        """
        labels_ = torch.tensor([example["labels"] for example in features], dtype=torch.long)

        def stack_field(key):
            # One tensor per example, stacked along a new leading batch axis.
            return torch.stack([torch.tensor(example[key]) for example in features])

        minibatch_ = {
            "labels": labels_,
            **{key: stack_field(key) for key in ('input_ids', 'attention_mask', 'token_type_ids')},
        }
        return minibatch_, labels_

In [9]:
# The training loader reshuffles every epoch. The validation loader keeps a
# fixed order: shuffling does not change the aggregate validation metrics and
# only makes the evaluation batches non-reproducible.
valid_loader = DataLoader(dataset['validation'], shuffle=False, batch_size=4, collate_fn=Collator(), num_workers=2)
train_loader = DataLoader(dataset['train'], shuffle=True, batch_size=4, collate_fn=Collator(), num_workers=2)

# Shape of the data backing each split.
print(valid_loader.dataset.data.shape)
print(train_loader.dataset.data.shape)


(1221, 9)
(9741, 9)


## Train

Fine-tuning the BERT model using the `deeplib` library.

In [10]:
# `model` holds the checkpoint name (a str) the first time this cell runs and
# is rebound to the loaded network below. When the cell is re-run, recover the
# checkpoint name from the loaded model's config instead of passing the model
# object itself to from_pretrained.
checkpoint_name = model if isinstance(model, str) else model.config.name_or_path
model = AutoModelForMultipleChoice.from_pretrained(checkpoint_name, return_dict=True) # Load the pre-trained model

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMultipleChoice: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForMultipleChoice were not initialized from the model checkpoint at bert-base-uncased and are newly

In [11]:
import matplotlib.pyplot as plt
# History of accuracy values, appended to as a side effect of
# MultipleChoiceAccuracy.metric().
accuracy = []
# History of loss values, appended to as a side effect of Loss_Wrapper().
losses = []

# Tracking accuracy while training. 

class MultipleChoiceAccuracy(deeplib.validation._AccuracyMeter):
    """Running accuracy meter for multiple-choice predictions.

    `update` accumulates the number of correct predictions and the number of
    examples seen; `metric` reports their ratio since the last `reset` and
    also records it in the module-level `accuracy` list.
    """

    def reset(self):
        """Clear the running counts."""
        self.num_correct = 0
        self.count = 0

    def update(self, outputs, labels):
        """Add one mini-batch of outputs and labels to the running counts.

        Args:
            outputs: model output exposing `logits`; the arg-max over the last
                axis is taken as the predicted choice.
            labels: tensor of correct choice indices; its first dimension is
                counted as the number of examples.
        """
        preds = outputs.logits.argmax(-1)
        # .item() converts the 0-d result to a Python int, so the running total
        # (and everything derived from it) is a plain number rather than a
        # tensor that stays alive on the compute device.
        self.num_correct += (labels == preds).sum().item()
        self.count += labels.shape[0]

    def metric(self):
        """Return the accuracy since the last reset as a float.

        Returns 0.0 (without recording it) when no example has been counted
        yet, instead of dividing by zero.
        """
        if self.count == 0:
            return 0.0
        value = self.num_correct / self.count
        accuracy.append(value)
        return value
        

In [12]:
# A weight decay of 0.01 is applied to every parameter except biases and
# LayerNorm weights, which are not decayed.

def makeoptimizer(*args, **kwargs):
    """Build the AdamW optimizer for the global `model`.

    Any positional or keyword arguments are accepted and ignored; the
    parameter groups and the learning rate (5e-5) are fixed here.

    Returns:
        A `torch.optim.AdamW` instance with two parameter groups: decayed
        parameters (weight decay 0.01) followed by exempt ones (0.0).
    """
    exempt_markers = ("bias", "LayerNorm.weight")

    decayed_params = []
    exempt_params = []
    for param_name, param in model.named_parameters():
        is_exempt = any(marker in param_name for marker in exempt_markers)
        (exempt_params if is_exempt else decayed_params).append(param)

    return torch.optim.AdamW(
        [
            {"params": decayed_params, "weight_decay": 0.01},
            {"params": exempt_params, "weight_decay": 0.0},
        ],
        lr=5e-5,
    )

In [13]:
# Number of passes over the training set; also used to compute the total
# number of training steps for the learning-rate schedule.
num_epochs = 3

In [14]:
# Learning-rate schedule: during the first 500 iterations (warm-up steps) the
# learning rate rises linearly from 0 to 5e-5; over the remaining iterations it
# falls linearly back to 0.

num_warmup_steps = 500
num_training_steps = num_epochs * len(train_loader)


def lr_lambda(current_step):
    """Return the learning-rate factor for `current_step`.

    The factor grows linearly from 0 to 1 during warm-up, then shrinks
    linearly to 0 at `num_training_steps` and never goes below 0.
    """
    if current_step >= num_warmup_steps:
        steps_left = float(num_training_steps - current_step)
        decay_span = float(max(1, num_training_steps - num_warmup_steps))
        return max(0.0, steps_left / decay_span)
    return float(current_step) / float(max(1, num_warmup_steps))


# The trailing -1 is forwarded to the callback unchanged
# (presumably LambdaLR's `last_epoch` — TODO confirm).
lr_schedule = TorchOnBatchLRScheduleCallback(LambdaLR, lr_lambda, -1)

In [15]:
def Loss_Wrapper(outputs, label):
    """Return the loss already computed by the model, recording it in `losses`.

    Args:
        outputs: model output exposing a `loss` attribute.
        label: unused; accepted so the function fits the criterion signature.

    Returns:
        `outputs.loss`, unchanged.
    """
    batch_loss = outputs.loss
    # Store a detached CPU/NumPy copy so the history does not hold the graph.
    losses.append(batch_loss.detach().cpu().numpy())
    return batch_loss

# Build each callback separately; training and validation each get their own
# accuracy meter instance.
gradient_clipper = deeplib.callbacks.GradientClipper(max_grad_norm=1)
training_loss_logger = deeplib.callbacks.TrainingLossLogger()
training_accuracy_logger = deeplib.callbacks.TrainingAccuracyLogger(MultipleChoiceAccuracy())
validator = deeplib.validation.Validator(dataloader=valid_loader, accuracy_meter=MultipleChoiceAccuracy())

trainingcallbacks = [
    gradient_clipper,
    training_loss_logger,
    training_accuracy_logger,
    validator,
    lr_schedule,
]

schedule = deeplib.schedule.TrainingSchedule(
    dataloader=train_loader,
    num_epochs=num_epochs,
    callbacks=trainingcallbacks,
)
sess = deeplib.session.Session(
    model=model,
    criterion=Loss_Wrapper,
    optim_fn=makeoptimizer,
    lrs=5e-5,
)

In [16]:
# Run fine-tuning with the training schedule (data loader, epoch count and
# callbacks) defined in the previous cell.
sess.train(schedule)

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/2436 [00:00<?, ?it/s]

Validating:   0%|          | 0/306 [00:00<?, ?it/s]

+-------+------------+----------------+---------------------+-----------------+---------------+
| Epoch | Loss/Train | Accuracy/Train | Accuracy/Validation | Loss/Validation | Learning Rate |
|     1 |     1.3851 |         0.4199 |              0.4685 |          1.2600 |      0.00E+00 |


Epoch 2:   0%|          | 0/2436 [00:00<?, ?it/s]

Validating:   0%|          | 0/306 [00:00<?, ?it/s]

|     2 |     0.8298 |         0.6922 |              0.5061 |          1.4479 |      3.58E-05 |


Epoch 3:   0%|          | 0/2436 [00:00<?, ?it/s]

Validating:   0%|          | 0/306 [00:00<?, ?it/s]

|     3 |     0.3178 |         0.9035 |              0.5086 |          2.7851 |      1.79E-05 |
+-------+------------+----------------+---------------------+-----------------+---------------+
