In [None]:
# All necessary libraries for this project

import os
from datasets import load_dataset
from transformers import DistilBertTokenizer, set_seed, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import classification_report
from collections import Counter

In [None]:
# Mounting/linking my Google Drive account with this Google Colab project.
# Drive has to be mounted before os.chdir() below, because the project folder lives inside the mount point.

from google.colab import drive
ROOT = "/content/drive"
drive.mount(ROOT, force_remount=True)

# Project folder inside the mounted Drive. After the chdir, relative paths such as "./Results"
# (the training output directory) resolve against this folder.
path = "/content/drive/My Drive/Essex Summer School 2025/3N Deep Learning for Text and Vision/Exam/"
os.chdir(path)
print("Current working directory:", os.getcwd())

Mounted at /content/drive
Current working directory: /content/drive/My Drive/Essex Summer School 2025/3N Deep Learning for Text and Vision/Exam


First, I have to retrieve the data from the Stanford Sentiment Treebank (SST-2) through the Hugging Face Hub's datasets library

In [None]:
# Fetch SST-2 from the Hugging Face Hub; `ds` holds the train, validation and test splits used in the cells below.
ds = load_dataset("stanfordnlp/sst2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Now that I have the data needed, I move on to the data preprocessing stage where:


*   I tokenise the entire dataset (ds) across all splits (train, validation, and test) using map and batched for efficiency.
*   I clean and format the data so that it is ready for training.



In [None]:
# Data preprocessing


# Initialising the DistilBert tokeniser.
# The checkpoint name is the same one the model is loaded from in the training cell,
# so the token IDs produced here correspond to that model's vocabulary.
tokeniser = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def format_data_for_training(tokenised_ds: dict) -> dict:
  """
    Prepare the tokenised dataset for the Trainer.

    Steps, in order:
    - Drop the "sentence" and "idx" columns, which the model does not consume.
    - Rename "label" to "labels", the target field name the Trainer expects by default.
    - Switch the output format to PyTorch tensors for the model input columns.

    A three-row sample of the "train" split is printed before and after the
    format switch so that the effect of the conversion is visible.

    Parameters:
      tokenised_ds (dict):
        The full tokenised dataset, with one entry per split (must include "train").

    Returns:
      The dataset with the unused columns removed, the target column renamed
      and the PyTorch format applied.
  """
  unused_columns = ["sentence", "idx"]
  model_columns = ["input_ids", "attention_mask", "labels"]

  prepared_ds = tokenised_ds.remove_columns(unused_columns).rename_column("label", "labels")
  print("\nBEFORE TORCH FORMAT:\n", prepared_ds["train"][:3])

  # set_format changes prepared_ds itself, so its return value is not reassigned.
  prepared_ds.set_format(type = "torch", columns = model_columns)
  print("\nAFTER TORCH FORMAT:\n", prepared_ds["train"][:3])
  return prepared_ds


def tokenise_text(data: dict) -> dict:
  """
    Convert a batch of raw sentences into DistilBert model inputs.

    The tokeniser is asked to:
    - Pad every sentence to the maximum length ("max_length"), so all rows end up with equal length.
    - Truncate any sentence that would exceed that maximum.
    - Split the text into tokens and map each token to its numeric token ID.

    Parameters:
      data (dict):
        A batch of dataset rows; only the "sentence" field is read.

    Returns:
      The tokeniser output, containing input IDs and attention masks for the batch.
  """
  sentences = data["sentence"]
  encoded_batch = tokeniser(sentences, padding="max_length", truncation=True)
  return encoded_batch


# Tokenise every split in one pass; batched=True hands tokenise_text lists of sentences
# instead of one sentence at a time.
tokenised_ds = ds.map(tokenise_text, batched=True)

# Show the dataset summary (splits, columns, row counts) rather than raw rows: every row is
# padded to the maximum length, so printing rows floods the output with padding zeros.
print(tokenised_ds)

cleaned_ds = format_data_for_training(tokenised_ds)

# format_data_for_training already prints a before/after sample of the rows,
# so only the summary of the cleaned dataset is shown here.
print(cleaned_ds)






{'idx': [0, 1, 2], 'sentence': ['hide new secretions from the parental units ', 'contains no wit , only labored gags ', 'that loves its characters and communicates something rather beautiful about human nature '], 'label': [0, 0, 1], 'input_ids': [[101, 5342, 2047, 3595, 8496, 2013, 1996, 18643, 3197, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

Model training (and fine-tuning) stage

In [None]:
# Sanity check: class balance in the first rows of the train and validation splits.

train_labels = cleaned_ds["train"].select(range(300))["labels"]
val_labels = cleaned_ds["validation"].select(range(100))["labels"]

# The labels come back as torch tensors, and tensors hash by identity rather than by value,
# so Counter(train_labels) would treat every label as its own key with a count of 1.
# Converting each label to a plain int makes equal labels collapse into a single key.
print("Train labels:", Counter(int(label) for label in train_labels))
print("Val labels:", Counter(int(label) for label in val_labels))

Train labels: Counter({tensor(0): 1, tensor(0): 1, tensor(1): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(1): 1, tensor(1): 1, tensor(0): 1, tensor(1): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(1): 1, tensor(0): 1, tensor(1): 1, tensor(0): 1, tensor(1): 1, tensor(1): 1, tensor(1): 1, tensor(1): 1, tensor(1): 1, tensor(0): 1, tensor(1): 1, tensor(0): 1, tensor(1): 1, tensor(1): 1, tensor(0): 1, tensor(0): 1, tensor(1): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(1): 1, tensor(1): 1, tensor(1): 1, tensor(0): 1, tensor(1): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(1): 1, tensor(0): 1, tensor(1): 1, tensor(1): 1, tensor(0): 1, tensor(1): 1, tensor(0): 1, tensor(0): 1, tensor(1): 1, tensor(0): 1, tensor(1): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(1): 1, tensor(1): 1, tensor(0): 1, tensor(1): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(1): 1, tensor(0): 1, tensor(0): 1, tensor(0): 1, tensor(1): 

Time to train my model and fine-tune it.



*   I have initialised the DistilBert model I want to train on, for binary classification (num_labels = 2).
* I have reduced the batch sizes (i.e., how many samples to use at a time for training and validation) and epochs (i.e., how many training repetitions over the full ds), otherwise my machine can't handle it.
* I have set a seed with a value of 1 and requested deterministic execution, so that the assignment results are reproducible.
*   Fine-tuning begins when I call trainer.train()
* Forward pass and backpropagation happen inside Hugging Face's Trainer, which uses the AdamW optimiser by default (https://huggingface.co/docs/transformers/en/main_classes/trainer)



In [None]:

# Seeding before the model is created, so that the newly initialised classification head
# starts from the same random weights on every run. The Trainer applies its own seed only
# when it is constructed, which is after the model has already been instantiated.
set_seed(1)

# Initialising the DistilBert model for binary classification
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels = 2)

# Configuring the training arguments for my model
training_args = TrainingArguments(
    output_dir = "./Results",
    per_device_train_batch_size = 16,
    per_device_eval_batch_size = 32,
    num_train_epochs = 2,
    save_strategy="epoch",
    # NOTE(review): recent transformers releases name this argument `eval_strategy` -
    # confirm against the installed version.
    evaluation_strategy = "epoch",
    seed = 1,
    # TrainingArguments has no `deterministic` argument; `full_determinism` is the
    # supported switch for deterministic execution.
    full_determinism = True
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = cleaned_ds["train"],
    # Only the first 20 validation rows are used for the per-epoch evaluation, to keep it cheap.
    eval_dataset = cleaned_ds["validation"].select(range(20))
)

# Training my model based on the given configuration.
trainer.train()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss


TrainOutput(global_step=4, training_loss=0.6766969561576843, metrics={'train_runtime': 258.0447, 'train_samples_per_second': 0.233, 'train_steps_per_second': 0.016, 'total_flos': 7948043919360.0, 'train_loss': 0.6766969561576843, 'epoch': 2.0})

Finally, it's the model prediction stage

In [None]:
# Sanity check: peek at the labels of each split.
# The test split prints -1 for every row, i.e. it carries no usable labels.

print("Train:\n", cleaned_ds["train"][:3]["labels"])
print("Validation:\n", cleaned_ds["validation"][:20]["labels"])
print("Test:\n", cleaned_ds["test"][:3]["labels"])

# The labels are torch tensors, which hash by identity, so set(...) over them keeps every
# element instead of de-duplicating. Converting to plain ints yields the distinct classes.
print("Train labels:", {int(label) for label in cleaned_ds["train"].select(range(30))["labels"]})
print("Validation labels:", {int(label) for label in cleaned_ds["validation"].select(range(20))["labels"]})

Train:
 tensor([0, 0, 1])
Validation:
 tensor([1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0])
Test:
 tensor([-1, -1, -1])
Train labels: {tensor(0), tensor(0), tensor(0), tensor(0), tensor(1), tensor(1), tensor(1), tensor(1), tensor(0), tensor(1), tensor(0), tensor(1), tensor(1), tensor(0), tensor(0), tensor(0), tensor(0), tensor(1), tensor(0), tensor(0), tensor(1), tensor(0), tensor(1), tensor(0), tensor(0), tensor(1), tensor(1), tensor(1), tensor(0), tensor(1)}
Validation labels: {tensor(0), tensor(1), tensor(0), tensor(1), tensor(1), tensor(0), tensor(1), tensor(1), tensor(1), tensor(0), tensor(1), tensor(0), tensor(0), tensor(1), tensor(0), tensor(0), tensor(1), tensor(1), tensor(0), tensor(0)}


In [None]:
# Model prediction stage

# Running predictions using the validation set
preds = trainer.predict(cleaned_ds["validation"])
# The index of the largest score along axis 1 is taken as the predicted class
# (0 = negative, 1 = positive, matching target_names below).
y_pred = preds.predictions.argmax(axis = 1)
y_true = preds.label_ids

# Printing final evaluation results from the classification report.
report = classification_report(y_true, y_pred, target_names=["negative", "positive"], output_dict = True)
accuracy = report["accuracy"]
macro_f1 = report["macro avg"]["f1-score"]
print("Accuracy:", accuracy)
print("F1-score (macro):", macro_f1)

# The summary sentence is built from the computed metrics, so it cannot drift out of
# sync with the numbers printed above when the model or the data changes.
print(
    "Since the SST-2 test set does not seem to provide labels, all final evaluation was performed on the validation set.\n"
    f"The current results show that the sentiment analysis model achieved an accuracy of {accuracy:.1%} "
    f"and a macro-averaged F1-score of {macro_f1:.1%} on the SST-2 validation set."
)


Accuracy: 0.7
F1-score (macro): 0.696969696969697
Since the SST-2 test set does not provide labels, all final evaluation was performed on the validation set.
 The current results show that the sentiment analysis model achieved an accuracy of 70% and a macro-averaged F1-score of 69.7% on the SST-2 validation set.
