## 1. Installs and Environment Setup

In [1]:
# check for pytorch install and CUDA availability:
import torch
torch.cuda.is_available(), torch.cuda.get_device_name(0) if torch.cuda.is_available() else None

(True, 'NVIDIA GeForce RTX 5070 Ti Laptop GPU')

In [2]:
# standard library imports
import os
import random

# 3rd-party libs & hugging face ecosystem 
import numpy as np
import evaluate
import accelerate
import matplotlib
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments, 
    Trainer,
)
from peft import LoraConfig, get_peft_model
from sklearn.metrics import classification_report, accuracy_score, f1_score

## 2. Data Preparation: load, tokenize, splits

In [3]:
# load dataset (note splits)
ds = load_dataset("dair-ai/emotion")
print(ds)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})


In [4]:
# choose model tokenizer
MODEL_NAME = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [5]:
# tokenizing function
def tokenize_batch(batch):
    tok = tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )
    # DistilRoBERTa doesn't use token_type_ids:
    tok.pop("token_type_ids", None)
    return tok

In [6]:
# apply tokenization
ds = ds.map(tokenize_batch, batched=True)

In [7]:
# only keep columns needed for PyTorch and correct format for PyTorch
cols_to_keep = ["input_ids", "attention_mask", "label"]
ds = ds.remove_columns([c for c in ds.column_names["train"] if c not in cols_to_keep])
ds.set_format(type="torch")

## 3. Load base model & classification head

In [8]:
NUM_LABELS = len(set(ds["train"]["label"]))

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=NUM_LABELS
)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(DEVICE)
# print(set(ds["train"]["label"]))

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
           

## 4. Apply LoRA using PEFT

In [9]:
peft_config = LoraConfig(
    task_type="SEQ_CLS", # sequence classification
    inference_mode=False,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters() # sanity - should only show LoRA params trainable

trainable params: 13,042,048 || all params: 107,464,448 || trainable%: 12.1362


## 5. Training: Trainer + TrainingArguments

In [10]:
# metric setup
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")
precision = evaluate.load("precision")
recall = evaluate.load("recall")

def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {
        "accuracy": accuracy.compute(predictions=preds, references=p.label_ids)["accuracy"],
        "f1": f1.compute(predictions=preds, references=p.label_ids, average="macro")["f1"],
        "precision": precision.compute(predictions=preds, references=p.label_ids, average="macro")["precision"],
        "recall": recall.compute(predictions=preds, references=p.label_ids, average="macro")["recall"],
    }

training_args = TrainingArguments(
    output_dir="./peft-emotion",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=100,
    per_device_train_batch_size=16, # adjust to fit GPU memory, use smaller if needed
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    learning_rate=2e-4,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    save_total_limit=2,
)

print("TrainingArguments Instantiated Successfully!")

TrainingArguments Instantiated Successfully!


In [11]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    processing_class=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.4354,0.332546,0.8835,0.853027,0.848675,0.864902
2,0.2915,0.2375,0.9115,0.882219,0.889927,0.876634
3,0.2793,0.220894,0.9175,0.891699,0.887327,0.897005


TrainOutput(global_step=3000, training_loss=0.5187485879262288, metrics={'train_runtime': 241.0572, 'train_samples_per_second': 199.123, 'train_steps_per_second': 12.445, 'total_flos': 2523908800512000.0, 'train_loss': 0.5187485879262288, 'epoch': 3.0})

## 6. Evaluation on Test Set

In [12]:
results = trainer.predict(ds["test"])
preds = np.argmax(results.predictions, axis=1)
labels = results.label_ids

print(classification_report(labels, preds, digits=4))

              precision    recall  f1-score   support

           0     0.9531    0.9449    0.9490       581
           1     0.9371    0.9223    0.9297       695
           2     0.7759    0.8491    0.8108       159
           3     0.8986    0.9018    0.9002       275
           4     0.8761    0.8839    0.8800       224
           5     0.7344    0.7121    0.7231        66

    accuracy                         0.9090      2000
   macro avg     0.8625    0.8690    0.8655      2000
weighted avg     0.9101    0.9090    0.9094      2000



In [13]:
# print metric dict:
print("Eval metrics:", compute_metrics(results)) 

Eval metrics: {'accuracy': 0.909, 'f1': 0.8654557409224403, 'precision': 0.8625255818695735, 'recall': 0.8690248791245195}


In [14]:
# chatgpt5 made this for me when I said it seemed weird I didn't "interact" with the emotions from the Twitter dataset:
example = "Iâ€™m so proud of myself today!"
inputs = tokenizer(example, return_tensors="pt").to(model.device)
pred = model(**inputs).logits.argmax(dim=-1).item()
ds["train"].features["label"].int2str(pred)


'joy'

In [15]:
example2 = "Square chair eats my round table."
inputs = tokenizer(example2, return_tensors="pt").to(model.device)
pred = model(**inputs).logits.argmax(dim=-1).item()
ds["train"].features["label"].int2str(pred)

'fear'

## 7. Baseline Comparison (majority class accuracy)

In [16]:
# I tried to iterate over everything which I am told is 0(n^2)
# this should be 0(n)

from collections import Counter

# count labels
label_counts = Counter(ds["train"]["label"])

# majority class
majority_class = label_counts.most_common(1)[0][0]
majority_class


tensor(0)

In [17]:
# accuracy of always guessing majority class
test_labels = ds["test"]["label"]
majority_baseline_acc = sum([1 for x in test_labels if x == majority_class]) / len(test_labels)
majority_baseline_acc


0.2905

In [18]:
trainer.save_model("lora-distilroberta-emotion")