In [None]:
# Honour code
# Course: Statistical Languague Processing
# Homework: 04
# I pledge that this submission is my own work.

# Finetuning a Pretrained Transformer Model
In this assignment, you will finetune a pretrained [DistilBERT uncased](https://huggingface.co/distilbert/distilbert-base-uncased) transformer model for the task of classifying tweets into 4 classes (anger, joy, optimism, sadness).

You will use tools from
[Huggingface Transformers](https://huggingface.co/docs/transformers/index) to fine-tune the model on the tweet_eval dataset.

To be able to finetune the model quickly, you need to use a **GPU**.

For this assignment, it is best to use the training loop provided by Huggingface transformers
(Trainer). It will automatically perform batching for you and use a GPU when it is available.

### Dataset
The [tweet_eval](https://huggingface.co/datasets/cardiffnlp/tweet_eval) dataset will be used as training data. Use the emotion subset of the data, where
each tweet is labelled as expressing anger, joy, optimism, or sadness.

## Step 1 Set Up (2 pts)
Import packages.
Load models.
Load the datasets.

In [1]:
!pip install transformers datasets



In [2]:
!pip install accelerate --upgrade
!pip install transformers[torch] --upgrade
!pip install evaluate



In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM # import model

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [4]:
# Mount Google Drive

from google.colab import drive
drive.mount('/content/drive')
import os

# Create directories to save training checkpoints and models
checkpoint_dir = '/content/drive/My Drive/transformer_checkpoints'
model_dir = '/content/drive/My Drive/transformer_models'

os.makedirs(checkpoint_dir, exist_ok=True)
os.makedirs(model_dir, exist_ok=True)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Prepare Training Data (5 pts)

In [5]:
# (a) Load the emotion subset of the tweet dataset (1 pt)
####################

from datasets import load_dataset
ds = load_dataset("cardiffnlp/tweet_eval", "emotion")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
# (b) tokenise the entire dataset
####################

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
def tokenize_function(example):
    return tokenizer(example['text'], padding="max_length", truncation=True)

tokenized_datasets = ds.map(tokenize_function, batched=True)

Map:   0%|          | 0/1421 [00:00<?, ? examples/s]

In [7]:
# (c) Create small data splits for cpu training (2 pts)
# For dataset loading instructions, see the "Use this dataset" button in the right column of the HuggingFace page.
# Until you get all bugs out of your code, you should use a cpu:
# Runtime -> Change runtime type -> CPU
# Also, you don’t need to use much training data just for debugging your code.
# If device.type == "cpu", use only the first (50/10/10) of the tokenized train, dev, test splits.
####################

if device.type == "cpu":
    train_dataset = tokenized_datasets['train'].select(range(50))
    eval_dataset = tokenized_datasets['validation'].select(range(10))
    test_dataset = tokenized_datasets['test'].select(range(10))
else:
    train_dataset = tokenized_datasets['train']
    eval_dataset = tokenized_datasets['validation']
    test_dataset = tokenized_datasets['test']

# Print to verify
print(f"Train dataset: {len(train_dataset)} samples")
print(f"Eval dataset: {len(eval_dataset)} samples")
print(f"Test dataset: {len(test_dataset)} samples")

Train dataset: 3257 samples
Eval dataset: 374 samples
Test dataset: 1421 samples


## Step 4 Pre-Training Tasks

In [8]:
# Load the pretrained distilbert-base-uncased model (5 pts)
####################
# load the model
from transformers import AutoModelForSequenceClassification

# Retrieve the number of labels from the dataset
num_labels = len(ds['train'].features['label'].names)

# Create label2id and id2label dictionaries
label_names = ds['train'].features['label'].names
label2id = {label: idx for idx, label in enumerate(label_names)}
id2label = {idx: label for label, idx in label2id.items()}

# Load the model
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Training Arguments (4 pts)
####################

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir=checkpoint_dir,          # Directory to save checkpoints
    save_total_limit=2,                 # Limit the total amount of checkpoints
    load_best_model_at_end=True,        # Load the best model at the end of training
    evaluation_strategy="steps",        # Evaluation strategy
    save_strategy="steps",              # Save strategy
    logging_strategy="steps",           # Logging strategy
    eval_steps=500,                     # Evaluate every 500 steps
    save_steps=500,                     # Save every 500 steps
    logging_steps=500,                  # Log every 500 steps
    per_device_train_batch_size=8,      # Batch size for training
    per_device_eval_batch_size=8,       # Batch size for evaluation
    num_train_epochs=3,                 # Number of training epochs
    report_to="none"                    # Disable reporting to online services (e.g., WandB)
)

print(training_args)

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=500,
eval_strategy=steps,
evaluation_strategy=steps,
fp16=False,
fp16_backend=auto,
fp



In [22]:
# Evaluation Setup (5 pts)
# Define the compute_metrics function that will be used for evaluation,
# both during training and for evaluation on the test set.
# Since this is a classification task, use the
# F1 score (with macro averaging): metric = evaluate.load("f1")
####################

from datasets import load_metric
import numpy as np

# Load the F1 score metric using datasets
metric = load_metric("f1")

# Define the compute_metrics function
def compute_metrics(pred):
    labels = pred.label_ids  # Access labels from the dictionary
    preds = np.argmax(pred.predictions, axis=1)  # Get predicted labels from the predictions
    f1 = metric.compute(predictions=preds, references=labels, average='macro')  # Calculate F1 score
    return {
        'f1': f1['f1'],  # Return the F1 score
    }


  metric = load_metric("f1")


Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

The repository for f1 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/f1.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


## Step 5 Initialize Trainer and Train (Total: 2 pts)
Important: Move the model to the device before training: model.to(device)

In [16]:
# check the names
print(train_dataset.column_names)
print(eval_dataset.column_names)

['text', 'label', 'input_ids', 'attention_mask']
['text', 'label', 'input_ids', 'attention_mask']


In [23]:
# Initialize Trainer and Train (Total: 2 pts)
# Important: Move the model to the device before training: model.to(device)
# from transformers import Trainer

# Move the model to the device
model.to(device)

# Initialize the Trainer
trainer = Trainer(
    model=model,                          # The model to be trained
    args=training_args,                   # Training arguments
    train_dataset=train_dataset,          # Training dataset
    eval_dataset=eval_dataset,            # Evaluation dataset
    compute_metrics=compute_metrics       # Metrics computation function
)

# Start training
trainer.train()


Step,Training Loss,Validation Loss,F1
500,0.149,1.488981,0.705724
1000,0.1506,1.419626,0.706961


TrainOutput(global_step=1224, training_loss=0.13341447808384116, metrics={'train_runtime': 512.9073, 'train_samples_per_second': 19.05, 'train_steps_per_second': 2.386, 'total_flos': 1294385117663232.0, 'train_loss': 0.13341447808384116, 'epoch': 3.0})

## Step 6 Save the Best Model (Total: 3 pts)
Save your model to the directory that was created for that purpose when mounting the drive.

After the model is saved, it is safe to delete the contents of the directory used for training. You will need to do this to avoid running out of space.

Check often how much of your google drive quota you are using.

Also keep in mind that items you delete from the file browser on your PC are moved to the trash, where they remain for 30 days before being deleted. This means that they are still counted on your quota.

Check the contents of the trash and empty often.

In [24]:
# Save the best model
trainer.save_model(model_dir)

# Optionally, save the tokenizer as well
tokenizer.save_pretrained(model_dir)

# Clean up the training checkpoint directory
import shutil

shutil.rmtree(checkpoint_dir)  # Deletes the contents of the checkpoint directory


### Step 7 Load the Saved Model
If all went well, you should now be able to load your model in the same way you loaded the distilbert-base-uncased model earlier, using AutoModelForSequenceClassification. But this time you only need to provide the model location, since no further training is being done.

Then create a [TextClassificationPipeline](https://huggingface.co/docs/transformers/main_classes/pipelines#transformers.TextClassificationPipeline). Note that the pipeline task is "sentiment-analysis", and you will need to provide the distilbert-base-uncased tokenizer as an argument to the pipeline.

In [29]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

# Load the saved model
model = AutoModelForSequenceClassification.from_pretrained(model_dir)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


# Specify device
device = 0 if torch.cuda.is_available() else -1

# Create a Text Classification Pipeline
classifier = pipeline(
    task="sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device = device # pass device to pipeline
)

# Example usage of the pipeline
result_positive = classifier("This movie is disgustingly good!")
result_negative = classifier("Director tried too much.")

print(result_positive)
print(result_negative)


[{'label': 'anger', 'score': 0.8470107316970825}]
[{'label': 'optimism', 'score': 0.9292612075805664}]


## Step 8 Use your model for inference (Total: 5 pts)
Use the pipeline created in the previous step to classify some tweets (at least 15). You can use tweets from the test set for this part.

For each tweet, print the model’s prediction, the model’s confidence score, and the tweet.

In [32]:
# Select a subset of tweets from the test set
test_tweets = ds['test']['text'][:15]  # Adjust this slice if needed for more tweets

# # Classify each tweet and print the results
# for tweet in test_tweets:
#     result = classifier(tweet)
#     label = result[0]['label']
#     score = result[0]['score']
#     print(f"Tweet: {tweet}\nPrediction: {label}, Confidence: {score:.4f}\n")

# Select a subset of tweets from the test set
test_tweets = ds['test']['text'][:15]  # Adjust this slice if needed for more tweets

# Classify tweets in batches
results = classifier(test_tweets)

# Print the results
for tweet, result in zip(test_tweets, results):
    label = result['label']
    score = result['score']
    print(f"Tweet: {tweet}\nPrediction: {label}, Confidence: {score:.4f}\n")


Tweet: #Deppression is real. Partners w/ #depressed people truly dont understand the depth in which they affect us. Add in #anxiety &amp;makes it worse
Prediction: sadness, Confidence: 0.9996

Tweet: @user Interesting choice of words... Are you confirming that governments fund #terrorism? Bit of an open door, but still...
Prediction: anger, Confidence: 0.9992

Tweet: My visit to hospital for care triggered #trauma from accident 20+yrs ago and image of my dead brother in it. Feeling symptoms of #depression
Prediction: sadness, Confidence: 0.9996

Tweet: @user Welcome to #MPSVT! We are delighted to have you! #grateful #MPSVT #relationships
Prediction: joy, Confidence: 0.9997

Tweet: What makes you feel #joyful?
Prediction: joy, Confidence: 0.9996

Tweet: i am revolting.
Prediction: anger, Confidence: 0.9996

Tweet: Rin might ever appeared gloomy but to be a melodramatic person was not her thing.\n\nBut honestly, she missed her old friend. The special one.
Prediction: sadness, Confidence: