# MVP: Fine-tuning a Pretrained Model for Sentiment Analysis

In [None]:
import random

import pandas as pd
from datasets import Dataset
from dotenv import load_dotenv
import numpy as np
import torch

from src.data_loader import load_datasets

# Load variables from a local .env file into the process environment
# (presumably credentials such as the W&B key needed by report_to="wandb" — confirm).
load_dotenv()

# Checkpoint that gets fine-tuned, and the seed shared by every RNG in this notebook.
MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
SEED = 1337

# Ask PyTorch to use deterministic algorithms wherever it has them.
torch.use_deterministic_algorithms(True)

# Accelerator preference: CUDA first, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print('Device:', device)

def set_seed(seed=None):
    """Seed Python, NumPy and PyTorch RNGs and put cuDNN in deterministic mode.

    Args:
        seed: Integer seed to apply. When omitted, the notebook-wide ``SEED``
            constant is used, which keeps the original ``set_seed()`` call
            working unchanged.
    """
    if seed is None:
        seed = SEED

    random.seed(seed)
    np.random.seed(seed)

    torch.manual_seed(seed)
    if torch.cuda.is_available():
        # Seeds every visible CUDA device, which includes the current one.
        torch.cuda.manual_seed_all(seed)

    # Trade cuDNN autotuning speed for run-to-run reproducibility.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Seed all RNGs once, before the data loading and model initialisation cells below.
set_seed()

In [None]:
# Load the labelled and unlabelled development sets, the validation set and the
# nested splits from ../data/ via the project loader.
labelled_dev, unlabelled_dev, val_set, nested_splits = load_datasets("../data/", include_nested_splits=True)
# The training frame stacks both development sets row-wise.
# NOTE(review): the original row indices are kept (no ignore_index), so index
# labels may repeat across the two inputs — confirm nothing downstream needs a unique index.
# NOTE(review): presumably unlabelled_dev still carries a label column usable for
# supervised fine-tuning — confirm against load_datasets.
train_df = pd.concat([labelled_dev, unlabelled_dev])

# Notebook display: shapes of the three loaded sets.
labelled_dev.shape, unlabelled_dev.shape, val_set.shape

In [None]:
# Print each nested split's key next to the shape of its data.
for split_name, split_data in nested_splits.items():
    print(split_name, split_data.shape)

In [None]:
# Wrap the pandas frames as Hugging Face Dataset objects for tokenization and training.
# NOTE(review): from_pandas is called with its defaults, so a non-trivial pandas
# index may be carried over as an extra column — confirm that is acceptable.
train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_set)

# Fine-tuning

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the tokenizer and a sequence-classification model for MODEL_NAME.
# force_download=True re-fetches the files on every run instead of reusing the
# local cache — NOTE(review): confirm this is intentional, it slows every restart.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, force_download=True)
# NOTE(review): no num_labels is passed, so the classification head size comes
# from the checkpoint config / library default — confirm it matches the number of
# sentiment classes in the data.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, force_download=True)

In [None]:
# Materialise the trainable parameters as a list so the name stays reusable
# after counting (a filter() iterator would be exhausted by the sum below).
model_parameters = [p for p in model.parameters() if p.requires_grad]
# numel() returns a plain int per tensor; np.prod(p.size()) would return the
# float 1.0 for a 0-dim parameter and turn the whole total into a float.
params = sum(p.numel() for p in model_parameters)
print(f"Model has {params} trainable parameters.")

## Training

In [None]:
from transformers import TrainingArguments

training_epochs = 3
training_batch_size = 16
# Log about once per pass over the training frame. Clamp to 1 because a frame
# smaller than one batch would give 0, which TrainingArguments rejects for the
# step-based logging strategy.
logging_steps = max(1, len(train_df) // training_batch_size)

# Write checkpoints under a dedicated folder. Using MODEL_NAME itself as the
# output directory creates a local path identical to the Hub repo id, and
# from_pretrained(MODEL_NAME) would then resolve to that local directory on the
# next run instead of downloading the pretrained checkpoint.
output_dir = f"checkpoints/{MODEL_NAME}"

# TODO: Use hyperparams for fine-tuning stated on https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the pinned library version.
training_args = TrainingArguments(output_dir=output_dir,
                                  num_train_epochs=training_epochs,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=training_batch_size,
                                  per_device_eval_batch_size=training_batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  log_level="error",
                                  report_to="wandb",
                                  run_name="amazon_sentiment_analysis",
                                  optim="adamw_torch",
                                  # The Trainer re-seeds from this argument, so pass the
                                  # notebook seed instead of relying on the library default.
                                  seed=SEED,
                                  )

In [None]:
def tokenize(batch, max_length=512):
    """Encode the ``content`` field of ``batch`` with the notebook's tokenizer.

    Args:
        batch: Mapping with a ``content`` entry holding the text(s) to encode.
        max_length: Upper bound on the encoded length; longer inputs are truncated.

    Returns:
        Whatever the tokenizer returns for the given texts, padded and truncated.
    """
    texts = batch['content']
    encoding_options = dict(padding=True, truncation=True, max_length=max_length)
    return tokenizer(texts, **encoding_options)

def tokenize_dataset(dataset, max_size=100, process_batch_size=100, batched=True):
    """Apply ``tokenize`` to every record of a Hugging Face ``Dataset``.

    Args:
        dataset: The ``Dataset`` to tokenize.
        max_size: Maximum token length forwarded to ``tokenize``.
        process_batch_size: Number of records handed to each ``map`` call.
        batched: Whether ``map`` passes records in batches.

    Returns:
        The result of ``dataset.map`` with the tokenizer output added.

    Raises:
        ValueError: If ``dataset`` is not a ``Dataset`` instance.
    """
    if isinstance(dataset, Dataset):
        def _encode(batch):
            return tokenize(batch, max_size)

        return dataset.map(_encode, batched=batched, batch_size=process_batch_size)

    raise ValueError("The dataset must be a huggingface Dataset object.")

# Tokenize both splits with tokenize_dataset's defaults (max_size=100, map batches of 100).
train_ds_tokenized = tokenize_dataset(train_ds)
val_ds_tokenized = tokenize_dataset(val_ds)

## Metrics

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

def compute_metrics(pred):
    """Score a prediction bundle with accuracy and weighted F1.

    Args:
        pred: Object exposing ``label_ids`` and ``predictions``; the predicted
            class is taken as the argmax over the last axis of ``predictions``
            (presumably the Trainer's evaluation output — confirm).

    Returns:
        Dict with the keys ``accuracy`` and ``f1``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

In [None]:
from transformers import Trainer, TrainerCallback

class EpochResultsCallback(TrainerCallback):
    """Trainer callback that records the metrics of every evaluation run.

    After each evaluation, ``results_df`` holds one row per evaluation seen so
    far, with the columns ``Epoch``, ``Validation Loss``, ``Accuracy`` and
    ``F1``. Rows are numbered 0..n-1 in the order they were recorded.
    """

    _COLUMNS = ['Epoch', 'Validation Loss', 'Accuracy', 'F1']

    def __init__(self):
        super().__init__()
        # Rows are kept as plain dicts and the frame is rebuilt from them. This
        # avoids concatenating onto an empty frame and gives a unique index.
        self._rows = []
        self.results_df = pd.DataFrame(columns=self._COLUMNS)

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        """Append the metrics of the evaluation that just finished.

        Args:
            args: Training arguments passed by the Trainer (unused).
            state: Trainer state; its ``epoch`` value is stored in the row.
            control: Trainer control object (unused).
            metrics: Evaluation metrics dict. Missing entries are stored as
                ``None`` instead of raising.
            **kwargs: Extra keyword arguments passed by the Trainer (unused).
        """
        metrics = metrics or {}
        self._rows.append({
            'Epoch': state.epoch,
            'Validation Loss': metrics.get('eval_loss', None),
            'Accuracy': metrics.get('eval_accuracy', None),
            'F1': metrics.get('eval_f1', None),
        })
        self.results_df = pd.DataFrame(self._rows, columns=self._COLUMNS)


def fine_tune_model(model, training_args, train_dataset, eval_dataset, tokenizer):
    """Evaluate once, train ``model``, and return the recorded evaluation metrics.

    Args:
        model: Model handed to the ``Trainer``.
        training_args: Arguments object handed to the ``Trainer``.
        train_dataset: Dataset used by ``trainer.train()``.
        eval_dataset: Dataset used for evaluation.
        tokenizer: Tokenizer handed to the ``Trainer``.

    Returns:
        The ``results_df`` frame collected by an ``EpochResultsCallback``
        attached to the trainer.
    """
    metrics_recorder = EpochResultsCallback()
    trainer_options = dict(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[metrics_recorder],
    )
    trainer = Trainer(**trainer_options)

    # Baseline evaluation before any training step is taken.
    trainer.evaluate()

    # Release cached CUDA memory before the training loop starts.
    torch.cuda.empty_cache()
    trainer.train()

    return metrics_recorder.results_df

# Run the baseline evaluation plus fine-tuning and keep the recorded metrics table.
eval_df = fine_tune_model(model, training_args, train_ds_tokenized, val_ds_tokenized, tokenizer)

In [None]:
# Notebook display: the metrics table returned by fine_tune_model.
eval_df

# Transfer Learning