In [None]:
from pathlib import Path
from sklearn.model_selection import train_test_split

from datasets import load_dataset, Dataset, DatasetDict
from transformers import DistilBertTokenizerFast, DistilBertModel
import numpy as np

from sklearn.metrics import accuracy_score
import torch
import gc
from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
from transformers import DistilBertForSequenceClassification, DistilBertForMaskedLM, Trainer, TrainingArguments

# Fine-tuning example notebook

The code below is an example of how you can fine-tune an existing model. There are some TODO items that suggest things to explore, but, as mentioned, do not feel restricted by them; instead, think about what is still unclear to you or what else you could try out.

# Download data

Use the IMDb dataset for this exercise. The dataset contains movie reviews and the sentiment of each review (positive or negative).

In [None]:
# Load the IMDb dataset from Hugging Face
dataset = load_dataset("stanfordnlp/imdb")
# Shuffle every split with a fixed seed so that "Restart Kernel -> Run All"
# reproduces the same ordering, and therefore the same small train/eval/test
# subsets that are sliced from the front of each split later on.
dataset = dataset.shuffle(seed=42)

In [None]:
# The dataset has train and test splits; every row is a (text, label) pair
# where label is 0 or 1 depending on sentiment.
first_train_example = dataset["train"][0]
print(first_train_example.keys())
print(first_train_example["text"])
print(first_train_example["label"])

### Fetch tokenizer and encode the data

In [None]:
# Name of the pretrained checkpoint used for the tokenizer and the models below.
model_name = "distilbert-base-uncased"

In [None]:
# Load the fast tokenizer for the checkpoint named by model_name.
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of reviews into fixed-length model inputs.

    Args:
        examples: A batch of dataset rows; only ``examples["text"]`` is read.

    Returns:
        The tokenizer output as PyTorch tensors, with every sequence padded or
        truncated to exactly 512 tokens.
    """
    # Over-long reviews are cut from the left, i.e. the end of the review is kept.
    tokenizer.truncation_side = "left"
    return tokenizer(
        text=examples["text"],
        return_tensors="pt",
        truncation=True,
        # Pad to max_length instead of to the longest review of the current
        # batch: with padding=True each map() batch could end up with a
        # different width, and rows of different lengths cannot be stacked
        # into one tensor later (Trainer collation, batched inference).
        padding="max_length",
        max_length=512,
    )

In [None]:
# Run tokenize_function over the dataset; batched=True hands it a batch of
# examples per call instead of a single example.
dataset_encoded = dataset.map(tokenize_function, batched=True)

## TODO
What did the tokenizer function add to the dataset and how does it look now?

## Fetch the model to be fine-tuned

In [None]:
# This is the base model: the pretrained checkpoint named by model_name,
# loaded through the plain DistilBertModel class.
base_model = DistilBertModel.from_pretrained(model_name)

In [None]:
# Test the base model on a single short input.
test_base_model_input = "Test"
encoded_input = tokenizer(test_base_model_input, return_tensors="pt")
# This is inference only, so disable gradient tracking: no autograd graph is
# built and the displayed tensors carry no grad_fn.
with torch.no_grad():
    output = base_model(**encoded_input)
output

As we see, the base model does not have a head, so fetch a base model with sequence classification head.

In [None]:
# Same checkpoint as the base model, but loaded with a sequence-classification head.
# model_name is reused so the checkpoint is defined in exactly one place.
base_model_classification = DistilBertForSequenceClassification.from_pretrained(model_name)

## Question

What is the difference between the base_model and base_model_classification?

Can we use the base model with classification head directly for classification? Why? Try it out.



## Fine-tune model
Test first with a smaller number of samples (num_train_epochs = 3, number of train samples = 500)

In [None]:
# Split train data into train and eval datasets. Dataset has a method for this built in.
# A fixed seed keeps the split identical across kernel restarts, so the
# training runs below are comparable between sessions.
dataset_train_eval = dataset_encoded["train"].train_test_split(test_size=0.2, seed=42)
dataset_train_eval

In [None]:

# Start with small subsets so a full run stays quick; raise these to train on more data.
n_train_samples = 500
n_eval_samples = 100

training_args = TrainingArguments(
    output_dir='./distilbert_classification_results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir='./distilbert_classification_logs',
    logging_steps=10,
    #use_mps_device=True
)

# Load a fresh copy of the checkpoint for full fine-tuning. model_name is
# reused so the checkpoint is defined in exactly one place.
fine_tuned_model = DistilBertForSequenceClassification.from_pretrained(model_name)

train_split = dataset_train_eval["train"]
eval_split = dataset_train_eval["test"]

trainer = Trainer(
    model=fine_tuned_model,
    args=training_args,
    # select() takes the first n rows without copying them through Python
    # objects and keeps the dataset's feature metadata; min() keeps the
    # request valid when a split has fewer rows than asked for.
    train_dataset=train_split.select(range(min(n_train_samples, len(train_split)))),
    eval_dataset=eval_split.select(range(min(n_eval_samples, len(eval_split)))),
)
trainer.train()

## TODO

Test the fine-tuned model with some test reviews.

Hint: If you encounter an error about tensors not being on the same device, move the tokenized inputs to the same device as the model first.
Example:

`tokenizer(test_review, return_tensors="pt").to("mps")`

## TODO

Is the true base model also changed when fine-tuning as above? Can you check that somehow?

Hint: Check whether the models' weights are the same, for example by comparing:

`base_model_classification.distilbert.embeddings.word_embeddings.weight`

# Evaluation

In [None]:
# Here is a method for doing inference in batches, and calculating the model accuracy.
def inference(model, dataset_encoded_test, batch_size: int, n_samples: int, device: str="mps"):
    """Classify the first ``n_samples`` rows in batches and report the accuracy.

    Prints the progress per batch, the first 20 predicted and true labels, and
    the resulting accuracy.

    Args:
        model: Model whose call returns an object with a ``logits`` attribute.
        dataset_encoded_test: Encoded dataset providing "input_ids",
            "attention_mask" and "label" columns; it must support ``len()``
            and row slicing.
        batch_size: Number of rows sent through the model per forward pass.
        n_samples: Number of rows to evaluate, counted from the start of the
            dataset and capped at the dataset length. A final, smaller batch
            is evaluated when ``n_samples`` is not a multiple of ``batch_size``.
        device: Torch device the model and the inputs are moved to.

    Returns:
        The accuracy computed by ``accuracy_score`` over the evaluated rows.

    Raises:
        ValueError: If ``batch_size`` is not positive or there is nothing to evaluate.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    n_samples = min(n_samples, len(dataset_encoded_test))
    if n_samples <= 0:
        raise ValueError("n_samples must be positive and the dataset must not be empty")

    print(f"Inference on {device}...")
    model.to(device)
    # A model coming straight out of training is still in train mode; switch to
    # eval mode so dropout is disabled and the predictions are deterministic.
    model.eval()

    logits_per_batch = []
    for batch_index, start in enumerate(range(0, n_samples, batch_size)):
        print("Batch: ", batch_index)
        # Slice the rows first and pick the columns afterwards, so only this
        # batch is materialized instead of the whole column on every iteration.
        batch = dataset_encoded_test[start:min(start + batch_size, n_samples)]
        input_ids = torch.as_tensor(batch["input_ids"], dtype=torch.long, device=device)
        attention_mask = torch.as_tensor(batch["attention_mask"], dtype=torch.long, device=device)

        with torch.no_grad():
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        # Collect on the CPU so the accelerator only ever holds one batch of logits.
        logits_per_batch.append(logits.detach().cpu())

    predicted_labels = np.argmax(torch.cat(logits_per_batch, dim=0).numpy(), axis=1)
    test_set_labels = dataset_encoded_test[:n_samples]["label"]

    print(predicted_labels[:20])
    print(test_set_labels[:20])
    accuracy = accuracy_score(test_set_labels, predicted_labels)
    print(f"Model accuracy is: {accuracy}")
    return accuracy

In [None]:
# Evaluate the fully fine-tuned model on test reviews, 200 per batch.
inference(
    model=fine_tuned_model,
    dataset_encoded_test=dataset_encoded["test"],
    batch_size=200,
    n_samples=1000,
    device="mps",
)

### Test Freezing layers

In [None]:
# Load a fresh copy of the checkpoint. model_name is reused so the checkpoint
# is defined in exactly one place.
model_freezed = DistilBertForSequenceClassification.from_pretrained(model_name)

# Freeze every parameter whose name starts with "distilbert."; the remaining
# parameters stay trainable.
for name, param in model_freezed.named_parameters():
    if name.startswith("distilbert."):
        param.requires_grad = False

train_split = dataset_train_eval["train"]
eval_split = dataset_train_eval["test"]

trainer_freezed = Trainer(
    model=model_freezed,
    args=training_args,
    # select() takes the first n rows without copying them through Python
    # objects and keeps the dataset's feature metadata; min() keeps the
    # request valid when a split has fewer rows than asked for.
    train_dataset=train_split.select(range(min(n_train_samples, len(train_split)))),
    eval_dataset=eval_split.select(range(min(n_eval_samples, len(eval_split)))),
)
trainer_freezed.train()

In [None]:
# Evaluate the model trained with frozen "distilbert." parameters on test reviews, 200 per batch.
inference(
    model=model_freezed,
    dataset_encoded_test=dataset_encoded["test"],
    batch_size=200,
    n_samples=1000,
    device="mps",
)

## PEFT training

In [None]:
# Use the same base model for LoRA training
model_lora_base = DistilBertForSequenceClassification.from_pretrained(model_name)

# Names of the modules that LoRA is applied to; every other LoraConfig option
# keeps its default value.
lora_config = LoraConfig(
    target_modules=[
        "word_embeddings",
        "q_lin",
        "k_lin",
        "v_lin",
        "out_lin",
        "pre_classifier",
        "classifier",
    ]
)
model_lora = get_peft_model(model_lora_base, lora_config)

In [None]:
train_split = dataset_train_eval["train"]
eval_split = dataset_train_eval["test"]

# Train the LoRA-wrapped model with the same arguments and data subsets as the
# other runs, so accuracy and training time are comparable.
trainer_lora = Trainer(
    model=model_lora,
    args=training_args,
    # select() takes the first n rows without copying them through Python
    # objects and keeps the dataset's feature metadata; min() keeps the
    # request valid when a split has fewer rows than asked for.
    train_dataset=train_split.select(range(min(n_train_samples, len(train_split)))),
    eval_dataset=eval_split.select(range(min(n_eval_samples, len(eval_split)))),
)
trainer_lora.train()

In [None]:
# Evaluate the LoRA model on test reviews, 200 per batch.
inference(
    model=model_lora,
    dataset_encoded_test=dataset_encoded["test"],
    batch_size=200,
    n_samples=1000,
    device="mps",
)

## TODO

Here are some questions and topics to explore.

1. What do you notice about the accuracy and training time with full fine-tuning, with frozen layers, and with LoRA fine-tuning? Can you find some explanations for your findings?
2. Test different LoRA config parameters. How does the rank (parameter `r`) affect the number of trainable parameters? Hint: `print_trainable_parameters`.
3. How is the model accuracy affected when using a lower rank, for instance?
4. For further study: Following the above principles, try fine-tuning a classifier with some different data that you can find on Hugging Face. Or try to fine-tune these models further with some other sentiment data (other than IMDb reviews).
5. Explore how the model accuracy improves as you use more data for training.