# Lightweight Fine-Tuning Project

TODO: In this cell, describe your choices for each of the following

* PEFT technique: 
* Model: 
* Evaluation approach: 
* Fine-tuning dataset: 

## Setup and Imports

In [1]:
# ! pip install -q "datasets==2.15.0"

In [2]:
# ! pip show bitsandbytes

In [3]:
# ! pip install scikit-learn

In [4]:
import numpy as np
import torch
import os

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer, 
    DataCollatorWithPadding
)

from peft import LoraConfig, get_peft_model

from sklearn.metrics import accuracy_score, f1_score

# root_dir = "/tmp"
root_dir = "./../../../../data/GenAI/02_genai_fundamentals/project2/results"


foundation_model_name = "distilbert-base-uncased"

foundation_model_path = os.path.join(root_dir, "foundation_model", "model")
foundation_model_output_path = os.path.join(root_dir, "foundation_model", "output")

lora_model_path = os.path.join(root_dir, "lora_model", "model")
lora_model_output_path = os.path.join(root_dir, "lora_model", "output")

qlora_model_path = os.path.join(root_dir, "qlora_model", "model")
qlora_model_output_path = os.path.join(root_dir, "qlora_model", "output")
                                
                                
batch_size = 16
train_epochs = 2

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  from .autonotebook import tqdm as notebook_tqdm


## Load Dataset

In [5]:
# # The sms_spam dataset only has a train split, so we use the train_test_split method to split it into train and test
# dataset = load_dataset("sms_spam", split="train").train_test_split(
#     test_size=0.2, shuffle=True, seed=23
# )

# splits = ["train", "test"]

# # View the dataset characteristics
# dataset["train"]

In [6]:
dataset = load_dataset("sms_spam", split="train")
dataset = dataset.select(range(3))

dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=23)


splits = ["train", "test"]

# View the dataset characteristics
dataset["train"]

Dataset({
    features: ['sms', 'label'],
    num_rows: 2
})

In [7]:
# Inspect the first example
dataset["train"][0]

{'sms': 'Ok lar... Joking wif u oni...\n', 'label': 0}

## Pre-process datasets

In [8]:
tokenizer = AutoTokenizer.from_pretrained(foundation_model_name)

# Let's use a lambda function to tokenize all the examples
tokenized_dataset = {}
for split in splits:
    tokenized_dataset[split] = dataset[split].map(
        lambda x: tokenizer(x["sms"], truncation=True), batched=True
    )


# Inspect the available columns in the dataset
tokenized_dataset["train"]

Dataset({
    features: ['sms', 'label', 'input_ids', 'attention_mask'],
    num_rows: 2
})

In [9]:
# # Define tokenization function
# def preprocess_function(examples):
#     return tokenizer(
#         examples["sms"],
#         truncation=True,
#         padding="max_length",  # or "longest" or True
#         max_length=128
#     )

In [10]:
# Tokenize datasets
# tokenized_dataset = {}
# for split in splits:
#     tokenized_dataset[split] = dataset[split].map(preprocess_function, batched=True)

# # Set format for PyTorch (this makes tensors!)
# for split in splits:
#     tokenized_dataset[split].set_format(
#         type="torch", columns=["input_ids", "attention_mask", "label"]
#     )

In [11]:
# Inspect the available columns in the dataset
tokenized_dataset["train"]

Dataset({
    features: ['sms', 'label', 'input_ids', 'attention_mask'],
    num_rows: 2
})

## Loading and Evaluating a Foundation Model

TODO: In the cells below, load your chosen pre-trained Hugging Face model and evaluate its performance prior to fine-tuning. This step includes loading an appropriate tokenizer and dataset.

In [12]:
foundation_model = AutoModelForSequenceClassification.from_pretrained(
    foundation_model_name,
    num_labels=2,
    id2label={0: "not spam", 1: "spam"},
    label2id={"not spam": 0, "spam": 1},
)

# Unfreeze all the model parameters.
# Hint: Check the documentation at https://huggingface.co/transformers/v4.2.2/training.html
for param in foundation_model.parameters():
    param.requires_grad = True

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
print(foundation_model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [14]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {"accuracy": (predictions == labels).mean()}


In [15]:
def TrainAndValidate(model, model_output_path, model_path):
    model.to(device)
    
    # Read more about it here https://huggingface.co/docs/transformers/main_classes/trainer
    trainer = Trainer(
        model=foundation_model,
        args=TrainingArguments(
            output_dir=model_output_path,
            # Set the learning rate
            learning_rate=2e-5,
            # Set the per device train batch size and eval batch size
            per_device_train_batch_size=batch_size,
            per_device_eval_batch_size=batch_size,
            # Evaluate and save the model after each epoch
            # evaluation_strategy="epoch",
            eval_strategy="epoch",
            save_strategy="epoch",
            num_train_epochs=train_epochs,
            weight_decay=0.01,
            load_best_model_at_end=True,
        ),
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["test"],
        tokenizer=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
        compute_metrics=compute_metrics,
    )

    trainer.train()
    
    
    model.to(device)
    trainer.evaluate()
    
    model.save_pretrained(model_path)

## Train Foundation Model

In [16]:
foundation_trainer = TrainAndValidate(foundation_model, foundation_model_output_path, foundation_model_path)
# foundation_model.to(device)


# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
#     predictions = np.argmax(predictions, axis=1)
#     return {"accuracy": (predictions == labels).mean()}


# # The HuggingFace Trainer class handles the training and eval loop for PyTorch for us.
# # Read more about it here https://huggingface.co/docs/transformers/main_classes/trainer
# trainer = Trainer(
#     model=foundation_model,
#     args=TrainingArguments(
#         output_dir=foundation_model_output_path,
#         # Set the learning rate
#         learning_rate=2e-5,
#         # Set the per device train batch size and eval batch size
#         per_device_train_batch_size=batch_size,
#         per_device_eval_batch_size=batch_size,
#         # Evaluate and save the model after each epoch
#         # evaluation_strategy="epoch",
#         eval_strategy="epoch",
#         save_strategy="epoch",
#         num_train_epochs=train_epochs,
#         weight_decay=0.01,
#         load_best_model_at_end=True,
#     ),
#     train_dataset=tokenized_dataset["train"],
#     eval_dataset=tokenized_dataset["test"],
#     tokenizer=tokenizer,
#     data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
#     compute_metrics=compute_metrics,
# )

# trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.742299,0.0
2,No log,0.763981,0.0


## Evaluate the model

In [17]:
# foundation_model.to(device)
# trainer.evaluate()

In [18]:
# foundation_model.save_pretrained(foundation_model_path)

## Performing Parameter-Efficient Fine-Tuning

TODO: In the cells below, create a PEFT model from your loaded model, run a training loop, and save the PEFT model weights.

In [19]:
# lora_config = LoraConfig(
#     r=8,
#     lora_alpha=32,
#     target_modules=["q_lin", "k_lin", "v_lin"],
#     lora_dropout=0.05,
#     bias="none",
#     task_type="SEQ_CLS"
# )

# lora_model = get_peft_model(foundation_model, lora_config)
# lora_model.print_trainable_parameters()

In [20]:
# lora_model.to(device)

# lora_trainer = Trainer(
#     model=lora_model,
#     args=TrainingArguments(
#         output_dir="./tmp/data/lora_spam_not_spam",
#         # Set the learning rate
#         learning_rate=2e-5,
#         # Set the per device train batch size and eval batch size
#         per_device_train_batch_size=16,
#         per_device_eval_batch_size=16,
#         # Evaluate and save the model after each epoch
#         evaluation_strategy="epoch",
#         save_strategy="epoch",
#         num_train_epochs=2,
#         weight_decay=0.01,
#         load_best_model_at_end=True,
#     ),
#     train_dataset=tokenized_dataset["train"],
#     eval_dataset=tokenized_dataset["test"],
#     tokenizer=tokenizer,
#     data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
#     compute_metrics=compute_metrics,
# )

# lora_trainer.train()

In [21]:
# lora_model.to(device)
# lora_trainer.evaluate()

###  ⚠️ IMPORTANT ⚠️

Due to workspace storage constraints, you should not store the model weights in the same directory but rather use `/tmp` to avoid workspace crashes which are irrecoverable.
Ensure you save it in /tmp always.

In [22]:
# # Saving the model
# lora_model.save_pretrained(lora_model_path)

## Performing Bits and Bytes

TODO: In the cells below, create a PEFT model from your loaded model, run a training loop, and save the PEFT model weights.

In [23]:
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,                # 4-bit quantization
#     bnb_4bit_use_double_quant=True,   # nested quantization for stability
#     bnb_4bit_quant_type="nf4",        # normal float 4 (better accuracy)
#     bnb_4bit_compute_dtype=torch.float16,  # compute in half precision
# )

In [24]:
# qlora_model = AutoModelForSequenceClassification.from_pretrained(
#     foundation_model_name,
#     num_labels=2,
#     quantization_config=bnb_config,
#     # device_map="auto",   # automatically place modules on GPU
#     id2label={0: "not spam", 1: "spam"},
#     label2id={"not spam": 0, "spam": 1},
# )

## Apply LoRA(PEFT)

In [25]:
# qlora_model = get_peft_model(qlora_model, lora_config)

# qlora_model.print_trainable_parameters()


In [26]:
# qlora_model.to(device)

# qlora_trainer = Trainer(
#     model=qlora_model,
#     args=TrainingArguments(
#         output_dir="./tmp/data/qlora_spam_not_spam",
#         # Set the learning rate
#         learning_rate=2e-5,
#         # Set the per device train batch size and eval batch size
#         per_device_train_batch_size=16,
#         per_device_eval_batch_size=16,
#         # Evaluate and save the model after each epoch
#         evaluation_strategy="epoch",
#         save_strategy="epoch",
#         num_train_epochs=2,
#         weight_decay=0.01,
#         load_best_model_at_end=True,
#     ),
#     train_dataset=tokenized_dataset["train"],
#     eval_dataset=tokenized_dataset["test"],
#     tokenizer=tokenizer,
#     data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
#     compute_metrics=compute_metrics,
# )

# qlora_trainer.train()

In [27]:
# qlora_model.to(device)
# qlora_trainer.evaluate()

In [28]:
# # Saving the model
# qlora_model.save_pretrained(qlora_model_path)

## Performing Inference with a PEFT Model

TODO: In the cells below, load the saved PEFT model weights and evaluate the performance of the trained PEFT model. Be sure to compare the results to the results from prior to fine-tuning.

In [29]:
# def evaluate_model(model, dataset):
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#     model.to(device)
#     preds, labels = [], []
#     for i in range(0, len(dataset), batch_size):  # batch size = 16
#         batch = dataset[i:i+16]
#         inputs = {k: torch.tensor(batch[k]) for k in ["input_ids", "attention_mask"]}
#         with torch.no_grad():
#             outputs = model(**inputs)
#         preds += torch.argmax(outputs.logits, dim=1).tolist()
#         labels += batch["label"]
#     acc = accuracy_score(labels, preds)
#     f1 = f1_score(labels, preds)
#     return acc, f1

In [30]:
# tokenized_dataset = {}
# for split in splits:
#     tokenized_dataset[split] = dataset[split].map(preprocess_function, batched=True)

# # Set format for PyTorch (this makes tensors!)
# for split in splits:
#     tokenized_dataset[split].set_format(
#         type="torch", columns=["input_ids", "attention_mask", "label"]
#     )

In [31]:
def evaluate_model(model, dataset, tokenizer):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()
    preds, labels = [], []

    for i in range(0, len(dataset), batch_size):
        batch = dataset[i : i + batch_size]

        # Tokenize this batch of text dynamically with padding + truncation
        inputs = tokenizer(
            batch["sms"],
            truncation=True,
            padding=True,
            return_tensors="pt",
            max_length=128
        ).to(device)

        with torch.no_grad():
            outputs = model(**inputs)

        preds += torch.argmax(outputs.logits, dim=1).cpu().tolist()
        labels += batch["label"]

    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds)
    return acc, f1


In [32]:
eval_dataset=tokenized_dataset["test"]

In [33]:
foundation_acc, foundation_f1 = evaluate_model(foundation_model, eval_dataset, tokenizer)
# lora_acc, lora_f1 = evaluate_model(lora_model, eval_dataset, tokenizer)
# qlora_acc, qlora_f1 = evaluate_model(qlora_model, eval_dataset, tokenizer)

In [34]:
print(f"Foundation Model - Accuracy: {foundation_acc:.4f}, F1: {foundation_f1:.4f}")
# print(f"Lora Model - Accuracy: {lora_acc:.4f}, F1: {lora_f1:.4f}")
# print(f"Qlora Model - Accuracy: {qlora_acc:.4f}, F1: {qlora_f1:.4f}")

Foundation Model - Accuracy: 0.0000, F1: 0.0000
