<a href="https://colab.research.google.com/github/federaspa/Finetuning-Bert-for-Text-Classification/blob/main/Finetuning_Bert_for_CoLA_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Necessary imports and setup

In [None]:
! pip install --quiet "datasets" "scikit-learn" "torchmetrics>=0.7" "scipy" "torch>=1.8" "transformers" "torchtext>=0.9" "setuptools==59.5.0" "ipython[notebook]"
%load_ext tensorboard

In [None]:
from datetime import datetime
from typing import Optional

import datasets
import torch
from torch.utils.data import DataLoader
from transformers import (
    AdamW,
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
)

In [None]:
# Fetch the "cola" configuration of the "glue" benchmark from the HuggingFace hub.
dataset = datasets.load_dataset("glue", "cola")

# Tokenizer that belongs to the pretrained checkpoint used below.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Pretrained BERT encoder with a sequence-classification head for two labels.
config = AutoConfig.from_pretrained("bert-base-uncased", num_labels=2)
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    config=config,
)

## Tokenization



In [None]:
def convert_to_features(example_batch, max_seq_length=128, indices=None):
    """Tokenize a batch of examples into fixed-length model inputs.

    Args:
        example_batch: Mapping with a "sentence" entry (list of strings) and a
            "label" entry, as provided by `Dataset.map(..., batched=True)`.
        max_seq_length: Length every sentence is padded or truncated to.
        indices: Unused; kept so existing callers keep working.

    Returns:
        The tokenizer output with an extra "labels" entry copied from
        `example_batch["label"]`.
    """
    # The default above is a literal on purpose: a default that reads a notebook
    # global is evaluated when this cell runs, i.e. before the cell that defines
    # the global, and fails with a NameError on a fresh kernel.
    features = tokenizer(
        example_batch["sentence"],
        max_length=max_seq_length,
        padding="max_length",  # replaces the deprecated `pad_to_max_length=True`
        truncation=True,
    )

    # Store the targets under "labels" so the batch can be passed to the model
    # as keyword arguments.
    features["labels"] = example_batch["label"]

    return features

In [None]:
# Batch sizes for the data loaders and the fixed token length of every example.
train_batch_size = 32
eval_batch_size = 64
max_seq_length = 128

# iterate over all splits in the dataset (train, validation and test)
for split in dataset.keys():

    # Replace each split with the tokenized version of itself. The original
    # "label" column is removed because `convert_to_features` re-emits it as
    # "labels". `max_seq_length` is forwarded explicitly; otherwise the function
    # would silently use its own default and the value set above would be ignored.
    dataset[split] = dataset[split].map(
        convert_to_features,
        batched=True,
        remove_columns=["label"],
        fn_kwargs={"max_seq_length": max_seq_length},
    )
    # Keep only the columns that are later passed to the model.
    columns = [c for c in dataset[split].column_names if c in ["input_ids", "attention_mask", "labels"]]

    # Return the selected columns as torch tensors when the split is indexed.
    dataset[split].set_format(type="torch", columns=columns)

In [None]:
# Training batches are drawn in a new random order on every pass.
train_dl = DataLoader(dataset['train'], batch_size=train_batch_size, shuffle=True, drop_last=False, pin_memory=True, num_workers=2)

# Both evaluation loaders keep the dataset order and share `eval_batch_size`
# (the validation loader previously used the training batch size).
val_dl = DataLoader(dataset['validation'], batch_size=eval_batch_size, shuffle=False, drop_last=False, num_workers=2)

test_dl = DataLoader(dataset['test'], batch_size=eval_batch_size, shuffle=False, drop_last=False, num_workers=2)

Let us look at the first batch of `train_dl` to check that the tokenization was successful. A `DataLoader` is a lazy iterable and cannot be indexed directly, so we have to iterate over it to obtain a batch.

In [None]:
# `next(iter(...))` fetches only the first batch; `list(train_dl)[0]` would load
# and collate every batch of the training set just to display one of them.
next(iter(train_dl))

Let's check our dataloaders' sizes.

In [None]:
# For every split, report the number of batches and the number of sentences.
for split_label, loader in (("Training", train_dl), ("Validation", val_dl), ("Test", test_dl)):
    print(f"# {split_label} iterations:", len(loader))
    print(f"# {split_label} sentences:", len(loader.dataset))

## Training

In [None]:
from tqdm.notebook import tqdm
import torch.nn.functional as F

def epoch(model, data_loader, logger, metric, mode="train", optim=None, schedule=False,
          device="cpu", epoch_idx=0, dropout=None, scheduler=None):
    """Run one pass over `data_loader`, optionally training, and log the results.

    Args:
        model: Called as `model(**batch)`; the result must expose `.loss` and
            `.logits`.
        data_loader: Sized iterable of dict batches whose values are tensors and
            which contain a "labels" entry.
        logger: Object providing `add_scalar(tag, value, step)`.
        metric: Callable invoked once as `metric(preds, labels)` on CPU tensors.
        mode: "train" performs optimisation steps; any other value only
            evaluates. The value is also the prefix of the logged tags.
        optim: Optimizer; required when `mode == "train"`.
        schedule: If true, step the learning-rate scheduler after every
            optimizer step (only in "train" mode).
        device: Device every batch tensor is moved to.
        epoch_idx: Index of the current epoch, used as the logging step.
        dropout: Unused; kept so existing callers keep working.
        scheduler: Learning-rate scheduler stepped when `schedule` is true. When
            omitted, the notebook-level `lr_scheduler` is used, as before.

    Returns:
        None. Results are reported through `logger` and `print`.
    """
    assert optim is not None or mode != "train", \
        "Optimizer required in 'train' mode"

    is_training = mode == "train"
    if is_training and schedule and scheduler is None:
        # Backward-compatible fallback to the global created in the training cell.
        scheduler = lr_scheduler

    outputs_list = []
    for batch_idx, batch in (pbar := tqdm(enumerate(data_loader), total=len(data_loader))):
        batch = {k: v.to(device) for k, v in batch.items()}
        labels = batch['labels']
        outputs = model(**batch)
        loss = outputs.loss
        preds = torch.argmax(outputs.logits, dim=1)

        if is_training:
            loss.backward()
            optim.step()
            if schedule:
                scheduler.step()
            optim.zero_grad()

        # Detach before logging/storing: keeping the attached loss of every batch
        # would carry autograd history for the whole epoch.
        loss = loss.detach()

        if is_training:
            logger.add_scalar(f"{mode}/batch_loss", loss, batch_idx +
                              len(data_loader)*epoch_idx)

        outputs_list.append({"loss": loss, "preds": preds, "labels": labels})
        pbar.set_description(desc=f"Epoch: {epoch_idx} - {mode}, Loss:{loss:.2f}")

    # Aggregate over the whole pass: the metric is computed once on all
    # predictions, the loss is the mean of the per-batch losses.
    preds = torch.cat([x["preds"] for x in outputs_list]).detach().cpu()
    labels = torch.cat([x["labels"] for x in outputs_list]).detach().cpu()
    loss = torch.stack([x["loss"] for x in outputs_list]).mean()
    acc = metric(preds, labels)

    logger.add_scalar(f"{mode}/loss", loss, epoch_idx)
    logger.add_scalar(f"{mode}/acc", acc, epoch_idx)

    print(f"Epoch: {epoch_idx} - {mode}, Loss: {loss:.2f}, Acc: {acc}")
    return

In [None]:
from sklearn.metrics import matthews_corrcoef
# Evaluation metric handed to `epoch`: the Matthews correlation coefficient.
# Note that `epoch` logs and prints this value under the name "acc".
metric = matthews_corrcoef

In [None]:
import pandas as pd
pd.options.display.max_rows = 4000
%tensorboard --logdir logs
from tensorboard import notebook
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import random
import torch.optim as optim
from torch.optim import AdamW
from transformers import get_scheduler

torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

tb_logger = SummaryWriter(f"logs/{datetime.now()}")

# Hyperparameters
epochs = 5
learning_rate = 3e-5
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Running training on device: {device}")

# Reinstantiate model to be sure we train a new model
config = AutoConfig.from_pretrained('bert-base-uncased', num_labels=num_labels, hidden_dropout_prob = 0.2, attention_probs_dropout_prob = 0.2)
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', 
                                                           config=config)
# Prepare optimizer
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)
# Prepare scheduler
train_steps = epochs * len(train_dl)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=train_steps
)
# Perform training over the epochs
model.to(device)

for e in range(epochs):
    # Training epoch
    model.train()
    epoch(model, train_dl, tb_logger, metric, "train", optimizer, schedule = True, device=device, epoch_idx=e)

    # Evaluation epoch
    model.eval()
    with torch.no_grad():
        epoch(model, val_dl, tb_logger, metric, "val", device=device, schedule = True, epoch_idx=e)
