<a href="https://colab.research.google.com/github/fuat-arslan/NLP_Course/blob/main/Fuat_Arslan_NLP_Assignment2_FinetuningBERTforTextClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Run this cell only when using Colab: it installs the required packages.
!pip -q install optuna transformers datasets

#Imports

In [None]:
import torch
import optuna

from transformers import logging
import time
import datetime

import random
from random import seed

import numpy as np
from sklearn.metrics import matthews_corrcoef
import torch.utils.data as Data

from datasets import load_dataset
from transformers import AutoTokenizer
import optuna.visualization as vis


logging.set_verbosity_error()

#Utils.py from Tutorial

In [None]:
def get_device():
    """Pick the accelerator to run on, preferring CUDA over Apple MPS.

    Returns:
        torch.device: ``cuda`` when a CUDA device is usable, otherwise ``mps``
        when the Metal backend is usable.

    Raises:
        Exception: If neither accelerator can be used at runtime.
    """
    # ``is_built()`` only says how PyTorch was compiled; a CUDA-enabled build
    # on a machine without a GPU would pass that check and fail later.
    # ``is_available()`` verifies that the device can actually be used.
    if torch.cuda.is_available():
        print("CUDA")
        return torch.device("cuda")
    if torch.backends.mps.is_available():
        print("mps")
        return torch.device("mps")
    raise Exception("GPU is not avalaible!")


# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Return the fraction of rows of ``preds`` whose arg-max matches ``labels``.

    ``preds`` is reduced with an arg-max along axis 1; ``labels`` is flattened
    before the element-wise comparison.
    """
    predicted = np.ravel(np.argmax(preds, axis=1))
    expected = labels.flatten()
    n_correct = np.sum(predicted == expected)
    return n_correct / len(expected)


def format_time(elapsed):
    """Render a duration given in seconds as an ``H:MM:SS`` string.

    The duration is rounded to the nearest whole second first.
    """
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds=whole_seconds)
    return str(delta)


def train_eval_loop(
    model, loader, optimizer, scheduler, device, n_epochs=2, seed_val=42
):
    """Fine-tune ``model`` and evaluate it on the validation split after each epoch.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        loader: Mapping with ``"train"`` and ``"validation"`` iterables that
            yield ``(input_ids, attention_mask, labels)`` tensor batches.
        optimizer: Optimizer stepped once per training batch.
        scheduler: Learning-rate scheduler stepped once per training batch.
        device: Device every batch tensor is moved to.
        n_epochs: Number of passes over the training split.
        seed_val: Seed applied to ``random``, NumPy and PyTorch.

    Returns:
        tuple: ``(val_mcc, (loss_values, val_MCC_list))`` where ``val_mcc`` is
        the validation MCC (times 100) of the last epoch, ``loss_values`` holds
        the average training loss per epoch and ``val_MCC_list`` the validation
        MCC per epoch.

    Raises:
        ValueError: If the validation loader yields no batches.
    """
    # Set the seed value all over the place to make this reproducible.
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    loss_values = []
    val_MCC_list = []
    val_mcc = 0.0  # keeps the return value defined when n_epochs == 0

    for epoch_i in range(n_epochs):
        print("")
        print("======== Epoch {:} / {:} ========".format(epoch_i + 1, n_epochs))
        print("Training...")

        # Measure how long the training epoch takes.
        t0 = time.time()

        # Reset the total loss for this epoch.
        total_loss = 0
        model.train()

        # For each batch of training data...
        for batch in loader["train"]:
            b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
            model.zero_grad()

            loss = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels).loss
            total_loss += loss.item()
            loss.backward()

            # Clip the norm of the gradients to 1.0, this is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_loss / len(loader["train"])
        loss_values.append(avg_train_loss)

        print("\nAverage training loss: {0:.2f}".format(avg_train_loss))
        # Time this epoch only (t0), not the whole run so far.
        print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

        print("\nRunning Validation...")
        t0 = time.time()
        model.eval()

        # Collect every prediction first: MCC is not an average of per-batch
        # MCCs, so it has to be computed once over the whole split.
        epoch_preds, epoch_labels = [], []
        for batch in loader["validation"]:
            b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

            with torch.no_grad():
                logits = model(b_input_ids, attention_mask=b_input_mask).logits

            logits = logits.detach().cpu().numpy()
            epoch_preds.append(np.argmax(logits, axis=1).flatten())
            epoch_labels.append(b_labels.to("cpu").numpy())

        if not epoch_preds:
            raise ValueError("validation loader produced no batches")

        val_mcc = 100 * matthews_corrcoef(
            np.concatenate(epoch_labels), np.concatenate(epoch_preds)
        )
        val_MCC_list.append(val_mcc)
        print("  Validation MCC: {0:.2f}".format(val_mcc))
        print("  Validation took: {:}".format(format_time(time.time() - t0)))

    return val_mcc, (loss_values, val_MCC_list)


def init_loader(max_length=16, batch_size=32, test_size=0.2, random_state=2023):
    """Build tokenized GLUE/CoLA data loaders for ``bert-base-uncased``.

    Args:
        max_length: Maximum token length; longer sentences are truncated.
        batch_size: Batch size used for every split.
        test_size: Unused; kept so existing callers keep working.
        random_state: Unused; kept so existing callers keep working.

    Returns:
        tuple: ``(loader, y)`` where ``loader`` maps ``"train"``,
        ``"validation"`` and ``"test"`` to a ``DataLoader`` yielding
        ``(input_ids, attention_mask, labels)`` batches, and ``y`` maps the
        same split names to the label arrays.
    """
    model_checkpoint = "bert-base-uncased"

    dataset = load_dataset("glue", "cola")
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

    y = {}
    loader = {}

    for split in ["train", "validation", "test"]:
        # Convert once per split instead of once per column.
        frame = dataset[split].to_pandas()
        sentences = frame.sentence.values
        y[split] = frame.label.values

        encoded = tokenizer(
            list(sentences),
            max_length=max_length,
            truncation=True,
            padding=True,
            return_tensors="pt",
        )

        split_dataset = Data.TensorDataset(
            encoded.input_ids, encoded.attention_mask, torch.LongTensor(y[split])
        )

        # Shuffle the training data each epoch; evaluation splits keep their
        # order so predictions stay aligned with ``y``.
        loader[split] = Data.DataLoader(
            split_dataset, batch_size=batch_size, shuffle=(split == "train")
        )
    return loader, y


from transformers import get_linear_schedule_with_warmup
from transformers import BertForSequenceClassification


def init_objects(
    lr, n_epochs, dropout_p=0.1, max_length=16, batch_size=32, test_size=0.2, random_state=2023
):
    """Create the model, data loaders, optimizer and LR scheduler for one run.

    Args:
        lr: Learning rate for AdamW.
        n_epochs: Number of epochs; used to size the linear schedule.
        dropout_p: Probability assigned to ``model.dropout``.
        max_length: Forwarded to ``init_loader``.
        batch_size: Forwarded to ``init_loader``.
        test_size: Not used by this function.
        random_state: Not used by this function.

    Returns:
        tuple: ``(model, loader, optimizer, scheduler)``.
    """
    checkpoint = "bert-base-uncased"

    loader = init_loader(max_length=max_length, batch_size=batch_size)[0]

    model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    model.dropout.p = dropout_p

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=1e-8)

    # One scheduler step per training batch, over all epochs, with no warm-up.
    n_steps = len(loader["train"]) * n_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=n_steps
    )

    return model, loader, optimizer, scheduler


###Sample train

In [None]:
# %%
# Quick single-epoch run to check the pipeline end to end.
lr = 2e-5
n_epochs = 1
max_length = 16
batch_size = 32
test_size = 0.2
random_state = 2023
device = torch.device("cuda")

# Pass everything after ``n_epochs`` by keyword: the third positional parameter
# of ``init_objects`` is ``dropout_p``, so a positional call would feed
# ``max_length`` into the dropout probability and shift every later argument.
model, loader, optimizer, scheduler = init_objects(
    lr,
    n_epochs,
    max_length=max_length,
    batch_size=batch_size,
    test_size=test_size,
    random_state=random_state,
)
model.to(device)
_, _ = train_eval_loop(
    model, loader, optimizer, scheduler, device, n_epochs=n_epochs, seed_val=42
)

#Tuner

In [None]:
# Hyperparameter search space read by ``BertObjective``:
# ``lr`` and ``dropout_p`` are [low, high] bounds, the rest are lists of choices.
param_dict = dict(
    lr=[1e-5, 2e-5],
    n_epochs=[1, 2, 3],
    max_length=[16, 32, 64],
    batch_size=[32],
    dropout_p=[0.0, 0.5],
)


class BertObjective:
    """Optuna objective: fine-tune BERT with sampled settings, return validation MCC.

    ``d`` is the search-space mapping (keys ``lr``, ``n_epochs``,
    ``max_length``, ``batch_size``, ``dropout_p``) and ``device`` is where the
    model is trained.
    """

    def __init__(self, d, device):
        self.d, self.device = d, device

    def __call__(self, trial: optuna.trial.Trial):
        """Sample one configuration, train with it and return the final validation MCC."""
        space = self.d

        # The sampled values are also kept on the instance.
        self.lr = trial.suggest_float("lr", space["lr"][0], space["lr"][1], log=True)
        self.n_epochs = trial.suggest_categorical("n_epochs", space["n_epochs"])
        self.max_length = trial.suggest_categorical("max_length", space["max_length"])
        self.batch_size = trial.suggest_categorical("batch_size", space["batch_size"])
        self.dp = trial.suggest_float(
            "dropout_p", space["dropout_p"][0], space["dropout_p"][1]
        )

        model, loader, optimizer, scheduler = init_objects(
            self.lr,
            self.n_epochs,
            dropout_p=self.dp,
            max_length=self.max_length,
            batch_size=self.batch_size,
        )
        model.to(self.device)

        final_mcc, _history = train_eval_loop(
            model, loader, optimizer, scheduler, self.device, n_epochs=self.n_epochs
        )
        return final_mcc


# Run the hyperparameter search (20 trials, maximising validation MCC).
device = torch.device("cuda")
study = optuna.create_study(direction="maximize", study_name="Stduy 0")
study.optimize(BertObjective(param_dict, device), n_trials=20)

# Train again with best parameters
lr, n_epochs, max_length, batch_size, dropout_p = (
    study.best_params[key]
    for key in ("lr", "n_epochs", "max_length", "batch_size", "dropout_p")
)

model, loader, optimizer, scheduler = init_objects(
    lr, n_epochs, dropout_p=dropout_p, max_length=max_length, batch_size=batch_size
)
model.to(device)
val_mcc, _ = train_eval_loop(
    model, loader, optimizer, scheduler, device, n_epochs=n_epochs
)
# Obtain Test Results

In [None]:
# Show the best hyperparameter values found by the study.
study.best_params

Visualize results

In [None]:
# Optuna parallel-coordinate plot of the trials in ``study``.
vis.plot_parallel_coordinate(study)

In [None]:
# Optuna hyperparameter-importance plot for ``study``.
vis.plot_param_importances(study)

#Train Best model

In [None]:
# Keep the best hyperparameters under a short name for the final training run.
b_params = study.best_params

In [None]:
# %%
# Final training run using the best hyperparameters stored in ``b_params``.
lr, n_epochs, max_length, batch_size, dropout_p = (
    b_params[name]
    for name in ("lr", "n_epochs", "max_length", "batch_size", "dropout_p")
)
test_size = 0.2
random_state = 2023
device = torch.device("cuda")

model, loader, optimizer, scheduler = init_objects(
    lr,
    n_epochs,
    dropout_p=dropout_p,
    max_length=max_length,
    batch_size=batch_size,
    test_size=test_size,
    random_state=random_state,
)
model.to(device)

# ``losses`` is the (training losses, validation MCCs) pair of per-epoch lists.
val_mcc, losses = train_eval_loop(
    model, loader, optimizer, scheduler, device, n_epochs=n_epochs, seed_val=42
)

In [None]:

# Plot the per-epoch validation MCC and average training loss of the final run.
# matplotlib is not imported in the notebook's import cell, so bring ``plt``
# into scope here; without this the cell fails with a NameError.
import matplotlib.pyplot as plt

val_MCC_list = losses[1]
loss_val = losses[0]

# Create a figure with two subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

# Plot val_MCC_list
ax1.plot(val_MCC_list, marker='o')
ax1.set_xlabel('Index')
ax1.set_ylabel('MCC Value')
ax1.set_title('val_MCC_list')

# Plot loss_val
ax2.plot(loss_val, marker='o')
ax2.set_xlabel('Index')
ax2.set_ylabel('Loss Value')
ax2.set_title('loss_val')

# Adjust the layout to prevent overlapping labels
plt.tight_layout()

# Display the plot
plt.show()

Test Run

In [None]:
def test_run(model,loader):
    """Evaluate ``model`` on ``loader["test"]`` and print its MCC.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``.
        loader: Mapping whose ``"test"`` entry yields
            ``(input_ids, attention_mask, labels)`` tensor batches.

    Returns:
        list: The MCC of each individual test batch (not scaled by 100).

    Raises:
        ValueError: If the test loader yields no batches.
    """
    # NOTE(review): the public GLUE test split is believed to ship placeholder
    # labels - confirm the labels are real before trusting this number.
    val_MCC_list = []
    print("\nRunning Test...")
    t0 = time.time()
    model.eval()

    # Use the device the model lives on rather than a global ``device``.
    device = next(model.parameters()).device

    all_preds, all_labels = [], []
    for batch in loader["test"]:
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            logits = model(b_input_ids, attention_mask=b_input_mask).logits

        preds = np.argmax(logits.detach().cpu().numpy(), axis=1).flatten()
        label_ids = b_labels.to("cpu").numpy()

        # Record this batch's own MCC, not the running sum.
        val_MCC_list.append(matthews_corrcoef(label_ids, preds))
        all_preds.append(preds)
        all_labels.append(label_ids)

    if not all_preds:
        raise ValueError("test loader produced no batches")

    # MCC is not an average of per-batch MCCs; compute it over the whole split.
    val_mcc = 100 * matthews_corrcoef(
        np.concatenate(all_labels), np.concatenate(all_preds)
    )

    print("  Test MCC: {0:.2f}".format(val_mcc))
    print("  Test took: {:}".format(format_time(time.time() - t0)))

    return val_MCC_list

#Push Model

In [None]:
# Log in to the Hugging Face Hub so the fine-tuned model can be uploaded.
import huggingface_hub
from huggingface_hub import notebook_login
# Replace ``None`` with your own access token before running this cell.
your_token = None #Please generate a token from huggingface
huggingface_hub.login(token = your_token)

In [None]:
# Upload the current ``model`` to the Hub repository named 'bert_fine_tune'.
model.push_to_hub(repo_id = 'bert_fine_tune')

#Custom Model

In [None]:
def custom_train_eval_loop(
    model, loader, optimizer, scheduler, device, n_epochs=2, seed_val=42
):
    """Train and validate a model whose forward call returns ``(loss, logits)``.

    Args:
        model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
            that returns a ``(loss, logits)`` pair.
        loader: Mapping with ``"train"`` and ``"validation"`` iterables that
            yield ``(input_ids, attention_mask, labels)`` tensor batches.
        optimizer: Optimizer stepped once per training batch.
        scheduler: Learning-rate scheduler stepped once per training batch.
        device: Device every batch tensor is moved to.
        n_epochs: Number of passes over the training split.
        seed_val: Seed applied to ``random``, NumPy and PyTorch.

    Returns:
        tuple: ``(val_mcc, (loss_values, val_MCC_list))`` where ``val_mcc`` is
        the validation MCC (times 100) of the last epoch, ``loss_values`` holds
        the average training loss per epoch and ``val_MCC_list`` the validation
        MCC per epoch.

    Raises:
        ValueError: If the validation loader yields no batches.
    """
    # Set the seed value all over the place to make this reproducible.
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    loss_values = []
    val_MCC_list = []
    val_mcc = 0.0  # keeps the return value defined when n_epochs == 0

    for epoch_i in range(n_epochs):
        print("")
        print("======== Epoch {:} / {:} ========".format(epoch_i + 1, n_epochs))
        print("Training...")

        # Measure how long the training epoch takes.
        t0 = time.time()

        # Reset the total loss for this epoch.
        total_loss = 0
        model.train()

        # For each batch of training data...
        for batch in loader["train"]:
            b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)
            model.zero_grad()

            # Forward pass
            loss, logits = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)

            total_loss += loss.item()
            loss.backward()

            # Clip the norm of the gradients to 1.0
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_loss / len(loader["train"])
        loss_values.append(avg_train_loss)

        print("\nAverage training loss: {0:.2f}".format(avg_train_loss))
        # Time this epoch only (t0), not the whole run so far.
        print("Training epoch took: {:}".format(format_time(time.time() - t0)))

        print("\nRunning Validation...")
        t0 = time.time()
        model.eval()

        # Collect every prediction first: MCC is not an average of per-batch
        # MCCs, so it has to be computed once over the whole split.
        epoch_preds, epoch_labels = [], []
        for batch in loader["validation"]:
            b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

            with torch.no_grad():
                _, logits = model(b_input_ids, attention_mask=b_input_mask)

            logits = logits.detach().cpu().numpy()
            epoch_preds.append(np.argmax(logits, axis=1).flatten())
            epoch_labels.append(b_labels.to("cpu").numpy())

        if not epoch_preds:
            raise ValueError("validation loader produced no batches")

        val_mcc = 100 * matthews_corrcoef(
            np.concatenate(epoch_labels), np.concatenate(epoch_preds)
        )
        val_MCC_list.append(val_mcc)
        print("Validation MCC: {0:.2f}".format(val_mcc))
        print("Validation took: {:}".format(format_time(time.time() - t0)))

    return val_mcc, (loss_values, val_MCC_list)

def custom_init_objects(
    lr, n_epochs, dropout_p=0.1, max_length=16, batch_size=32, test_size=0.2, random_state=2023
):
    """Create the max-pooling ``BertClassifier`` with its loaders, optimizer and scheduler.

    Args:
        lr: Learning rate for AdamW.
        n_epochs: Number of epochs; used to size the linear schedule.
        dropout_p: Not used by this function.
        max_length: Forwarded to ``init_loader``.
        batch_size: Forwarded to ``init_loader``.
        test_size: Not used by this function.
        random_state: Not used by this function.

    Returns:
        tuple: ``(model, loader, optimizer, scheduler)``.
    """
    loader = init_loader(max_length=max_length, batch_size=batch_size)[0]

    model = BertClassifier(
        pretrained_model_name='bert-base-uncased',
        num_classes=2,
        pooling_fn=max_pooling,
    )

    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=1e-8)

    # One scheduler step per training batch, over all epochs, with no warm-up.
    n_steps = len(loader["train"]) * n_epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=n_steps
    )

    return model, loader, optimizer, scheduler

In [None]:
import torch
import torch.nn as nn
from transformers import BertModel

In [None]:
# Max pooling function
def max_pooling(tensor, dim):
    """Return the maximum values of ``tensor`` along ``dim``, dropping the indices."""
    reduced = tensor.max(dim=dim)
    return reduced.values

In [None]:
class BertClassifier(nn.Module):
    """BERT encoder followed by a pooling step and a linear classification head.

    ``pooling_fn`` is called as ``pooling_fn(last_hidden_state, dim=1)`` and its
    result is fed to the linear layer.
    """

    def __init__(self, pretrained_model_name, num_classes, pooling_fn=torch.mean):
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_model_name)
        self.pooling_fn = pooling_fn
        self.classifier = nn.Linear(self.bert.config.hidden_size, num_classes)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Return ``(loss, logits)``; ``loss`` is the integer 0 when ``labels`` is None."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled = self.pooling_fn(encoded.last_hidden_state, dim=1)
        logits = self.classifier(pooled)

        # Without labels there is nothing to score, so return a placeholder loss.
        if labels is None:
            return 0, logits

        return self.loss_fn(logits, labels), logits

In [None]:
# Hyperparameter search space for the custom classifier:
# ``lr`` is a [low, high] range, the other entries are lists of choices.
param_dict = dict(
    lr=[1e-5, 2e-5],
    n_epochs=[1, 2, 3],
    max_length=[16, 32, 64],
    batch_size=[32],
)


class BertObjective:
    """Optuna objective for the custom classifier; returns the final validation MCC.

    ``d`` is the search-space mapping (keys ``lr``, ``n_epochs``,
    ``max_length``, ``batch_size``) and ``device`` is where the model is trained.
    """

    def __init__(self, d, device):
        self.d, self.device = d, device

    def __call__(self, trial: optuna.trial.Trial):
        """Sample one configuration, train with it and return the final validation MCC."""
        space = self.d

        # The sampled values are also kept on the instance.
        self.lr = trial.suggest_float("lr", space["lr"][0], space["lr"][1], log=True)
        self.n_epochs = trial.suggest_categorical("n_epochs", space["n_epochs"])
        self.max_length = trial.suggest_categorical("max_length", space["max_length"])
        self.batch_size = trial.suggest_categorical("batch_size", space["batch_size"])

        # The third argument (dropout_p) is fixed at 0.1 rather than tuned.
        model, loader, optimizer, scheduler = custom_init_objects(
            self.lr, self.n_epochs, 0.1, self.max_length, self.batch_size
        )
        model.to(self.device)

        final_mcc, _history = custom_train_eval_loop(
            model, loader, optimizer, scheduler, self.device, n_epochs=self.n_epochs
        )
        return final_mcc


# Run the hyperparameter search for the custom classifier (20 trials, maximising MCC).
device = torch.device("cuda")
study = optuna.create_study(direction="maximize", study_name="Stduy 0")
study.optimize(BertObjective(param_dict, device), n_trials=20)

# Train again with best parameters
lr, n_epochs, max_length, batch_size = (
    study.best_params[key]
    for key in ("lr", "n_epochs", "max_length", "batch_size")
)

model, loader, optimizer, scheduler = custom_init_objects(
    lr, n_epochs, 0.1, max_length, batch_size
)
model.to(device)

# ``losses_l`` is the (training losses, validation MCCs) pair of per-epoch lists.
val_mcc, losses_l = custom_train_eval_loop(
    model, loader, optimizer, scheduler, device, n_epochs=n_epochs
)
# Obtain Test Results

In [None]:
# Plot the per-epoch validation MCC and average training loss of the custom model.
# matplotlib is not imported in the notebook's import cell, so bring ``plt``
# into scope here; without this the cell fails with a NameError.
import matplotlib.pyplot as plt

val_MCC_list = losses_l[1]
loss_val = losses_l[0]

# Create a figure with two subplots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))

# Plot val_MCC_list
ax1.plot(val_MCC_list, marker='o')
ax1.set_xlabel('Index')
ax1.set_ylabel('MCC Value')
ax1.set_title('val_MCC_list')

# Plot loss_val
ax2.plot(loss_val, marker='o')
ax2.set_xlabel('Index')
ax2.set_ylabel('Loss Value')
ax2.set_title('loss_val')

# Adjust the layout to prevent overlapping labels
plt.tight_layout()

# Display the plot
plt.show()

In [None]:
# Optuna hyperparameter-importance plot for the custom-model ``study``.
vis.plot_param_importances(study)

In [None]:
# Optuna parallel-coordinate plot of the trials in the custom-model ``study``.
vis.plot_parallel_coordinate(study)