This notebook serves as a starting point for the final project of the Introduction to Machine Learning 2024/25 course. You must use exactly the code provided to load and split the dataset.

# Code to load the dataset

In [None]:
!pip install -U "datasets==3.6.0"

In [None]:
### YOU MUST NOT CHANGE THIS CELL! ###

from datasets import load_dataset

full_dataset = load_dataset("skeskinen/TinyStories-GPT4", split="train")
full_dataset = full_dataset.remove_columns([c for c in full_dataset.column_names if c not in ["story", "features"]])
assert len(full_dataset) == 2745100

splits = full_dataset.train_test_split(test_size=10000, seed=42, shuffle=True)

train_dataset = splits["train"]
test_dataset  = splits["test"]

assert len(train_dataset) == 2735100
assert len(test_dataset)  == 10000

assert train_dataset[0]["story"][:33] == "One day, a little girl named Lily"
assert train_dataset[0]["features"] == ["Dialogue", "Conflict"]

In [None]:
# Inspect the first training example: a dict holding the 'story' text and its 'features' tag list

from pprint import pprint
pprint(train_dataset[0])



# MODEL 2 : Convolutional Neural Network

In [None]:
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import random
import copy
import pandas as pd
import hashlib
import json
import os
import string


from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.nn import BCEWithLogitsLoss
from torch.optim.lr_scheduler import StepLR
from sklearn.metrics import f1_score, accuracy_score, precision_recall_curve, confusion_matrix
from sklearn.model_selection import train_test_split, ParameterGrid
from collections import defaultdict

In [None]:
# Fix the seed for reproducibility
SEED = 42

# Seed, in this order: Python's `random`, NumPy, PyTorch (CPU) and PyTorch (CUDA, current and all devices)
for _seed_fn in (random.seed,
                 np.random.seed,
                 torch.manual_seed,
                 torch.cuda.manual_seed,
                 torch.cuda.manual_seed_all):
    _seed_fn(SEED)

# Ask cuDNN for deterministic kernels and switch off its auto-tuner
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
# Define the device: use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# The six tags to predict; the position of a tag in this list is its column in every label vector
TAGS = ["BadEnding", "Conflict", "Dialogue", "Foreshadowing", "MoralValue", "Twist"]

def f1_per_tag(true_tag_lists, pred_tag_lists):
    """Return the F1 score of every tag as a ``{tag: score}`` dictionary.

    Both arguments are forwarded unchanged to ``sklearn.metrics.f1_score`` with
    ``average=None`` (one score per label) and ``zero_division=0``. The scores
    are paired with ``TAGS`` by position.
    """
    per_label_scores = f1_score(true_tag_lists, pred_tag_lists, average=None, zero_division=0)

    result = {}
    for tag, score in zip(TAGS, per_label_scores):
        result[tag] = float(score)
    return result

In [None]:
# Number of tags to predict; used as the output size of the final linear layer of the model
num_labels = len(TAGS)

Create a smaller training dataset, representative of all the labels:

Since we cannot train the model on the entire dataset of stories, we build a smaller training set that is still representative of all the labels. To ensure that no tag is significantly underrepresented, we first sample a minimum number of stories for each label and then fill the rest of the set with randomly chosen stories.

In [None]:
# Construction of a small training dataset
target_size = 400000      # Number of stories in the small dataset
min_per_tag = 8000        # Minimum number of stories sampled for each tag


# Map each tag to the list of indices of the stories that contain it.
# Only the 'features' column is read: the story texts are not needed to build the index,
# so iterating over whole rows of the full training split would do unnecessary work.
tag_to_indices = defaultdict(list)
for idx, story_tags in enumerate(train_dataset['features']):
    for tag in story_tags:
        if tag in TAGS:
            tag_to_indices[tag].append(idx)

# Guarantee the per-tag minimum. A set is used so that a story carrying several tags is kept only once.
final_indices = set()
for tag in TAGS:
    indices_for_tag = tag_to_indices[tag]

    # If fewer than min_per_tag stories carry this tag, take them all
    num_to_sample = min(min_per_tag, len(indices_for_tag))
    sampled_indices = random.sample(indices_for_tag, num_to_sample)
    final_indices.update(sampled_indices)

print(f"Unique stories after guaranteeing minimums: {len(final_indices)}")

# Add random stories to reach target_size.
num_to_add = target_size - len(final_indices) # How many stories are still missing
if num_to_add > 0:
    # Candidates are all the indices that have not been selected yet
    all_indices = set(range(len(train_dataset)))
    remaining_indices = list(all_indices - final_indices)
    random.shuffle(remaining_indices)

    # Take the first num_to_add indices of the shuffled candidates
    final_indices.update(remaining_indices[:num_to_add])

# Create the final dataset from the selected indices
final_indices_list = list(final_indices) # A set cannot be shuffled, a list can

random.shuffle(final_indices_list) # Avoid an index-ordered dataset

# Safety net: never exceed target_size
if len(final_indices_list) > target_size:
    final_indices_list = final_indices_list[:target_size]

# Reduced 'controlled' dataset
small_dataset = train_dataset.select(final_indices_list)

print(f"Final 'small_dataset' created with {len(small_dataset)} stories.")

# Verify the new label distribution
print("\nVerifying the new label distribution:")
new_counts = {tag: 0 for tag in TAGS}
for item_features in small_dataset['features']:
    for feature in item_features:
        if feature in new_counts:
            new_counts[feature] += 1
pprint(new_counts)

# Extract stories and labels from the small dataset
stories, labels = small_dataset['story'], small_dataset['features']



In [None]:
# # DOWNLOAD SMALL DATASET
# import csv
# from google.colab import files

# df_small = pd.DataFrame({
#     'stories': stories,
#     'labels': [str(l) for l in labels]  # This mantains the commas
# })
# # Create a DataFrame and I convert elements in strings

# df_small.to_csv('small_dataset_400.000.csv', index=False, quoting=1)  # Save the DataFrame in a csv file
# files.download('small_dataset_400.000.csv')

Convert every label to a binary vector:

We define a mapping from each label to a unique index, then create a function that converts a list of labels into a binary vector. This lets us represent the presence or absence of each label as 1s and 0s for model training.

In [None]:
# Map each label to its position in TAGS (BadEnding->0, Conflict->1,...); this is the column of the label in the binary vectors
label_to_index = {label: i for i, label in enumerate(TAGS)}

In [None]:
def list_to_binary(label_list):
    """Encode a list of tag names as a 0/1 vector ordered like ``TAGS``.

    Surrounding whitespace of every label is ignored. A label that is not a
    key of ``label_to_index`` raises ``KeyError``.
    """
    # Positions that must be switched on
    active_positions = {label_to_index[raw_label.strip()] for raw_label in label_list}

    return [1 if position in active_positions else 0 for position in range(len(TAGS))]

def binary_to_list(vector):
    """Decode a 0/1 vector back into the list of tag names whose entry equals 1."""
    active_tags = []
    for position, value in enumerate(vector):
        if value == 1:
            active_tags.append(TAGS[position])
    return active_tags

# Binary label vector of every story of the small dataset, in the same order as `labels`
labels_bin = list(map(list_to_binary, labels))

In [None]:
# Define a function that separate the words from the punctuation with an empty space and then split the stories into tokens:
def simple_tokenizer(stories):
    """Lower-case each story and split it into word and punctuation tokens.

    Every character of ``string.punctuation`` is surrounded by spaces so that
    it becomes a token of its own; the text is then split on whitespace.

    Args:
        stories: iterable of story strings.

    Returns:
        A list containing one list of string tokens per story, in input order.
    """
    # One translation table shared by all stories: each punctuation mark is
    # mapped to itself with a space on both sides.
    spaced_punctuation = str.maketrans({mark: f' {mark} ' for mark in string.punctuation})

    return [story.lower().translate(spaced_punctuation).split() for story in stories]

In [None]:
# Apply the simple_tokenizer function to all the stories of the small_dataset:
# the result holds one list of lower-cased tokens per story, in the same order as `stories`
stories_split = simple_tokenizer(stories)

Create the vocabulary

In [None]:
# Special tokens: padding and out-of-vocabulary
PAD_TOKEN = '[PAD]'
UNK_TOKEN = '[UNK]'

# Collect the tokens of all tokenized stories into a single flat list
all_tokens = []
for story in stories_split:
    all_tokens.extend(story)

# Unique tokens in sorted order
vocab_tokens = sorted(set(all_tokens))

# The vocabulary: the two special tokens come first, so [PAD] has index 0 and [UNK] has index 1.
# NOTE: the vocabulary is built from every story of the small dataset, i.e. before the
# train/validation split performed further down.
V = [PAD_TOKEN, UNK_TOKEN, *vocab_tokens]
token_to_index = dict(zip(V, range(len(V))))

vocab_size = len(V)

In [None]:
# Define a function to encode a story:
def encode(x):
    """Map a list of tokens to their vocabulary indices; unknown tokens get the index of ``UNK_TOKEN``."""
    unknown_index = token_to_index[UNK_TOKEN]
    return [token_to_index.get(token, unknown_index) for token in x]

In [None]:
# How to choose max_length

lengths = [len(story) for story in stories_split]              # Number of tokens of each story
max_length = int(np.percentile(lengths, 95))                    # 95th percentile of the lengths (truncated to an int): about 95% of the stories fit without truncation

print("Max length chosen:", max_length)

In [None]:
# Define a function to have all the stories of the same length:
def truncate_and_pad(sequence):
    """Return a new sequence of exactly ``max_length`` token indices.

    Longer stories keep only their last ``max_length`` tokens, so the end of
    the story is favoured over its beginning. Shorter stories are padded on
    the right with the index of ``PAD_TOKEN``. The input is never modified.
    """
    n_tokens = len(sequence)

    if n_tokens > max_length:
        # Too long: keep the tail of the story
        return sequence[-max_length:]

    if n_tokens < max_length:
        # Too short: append PAD indices at the end
        return sequence + [token_to_index[PAD_TOKEN]] * (max_length - n_tokens)

    # Already the right length: still hand back a copy
    return copy.copy(sequence)

In [None]:
# Apply the encode function to all the stories in small_dataset (tokens -> vocabulary indices)
stories_encoded = [encode(story) for story in stories_split]

# Apply truncate_and_pad function to each story, so that every sequence has exactly max_length indices
stories_padded = [truncate_and_pad(story) for story in stories_encoded]

Split the 'controlled' smaller dataset into training and validation sets, keeping 80% for training and 20% for validation, with a fixed random seed for reproducibility.

In [None]:
# Divide the small_dataset in train_dataset and val_dataset (80% / 20%).
# Stories and binary label vectors are split together, so they stay aligned; random_state fixes the split.
train_stories, val_stories, train_labels, val_labels = train_test_split(stories_padded, labels_bin, test_size=0.2, random_state=42)

Create the DataLoader

In [None]:
# Construct the TensorDataset: token indices as long tensors, label vectors as float tensors
# (BCEWithLogitsLoss, used for training, compares float targets with the logits).
# NOTE(review): from here on `train_dataset` no longer refers to the Hugging Face split created
# in the loading cell, it is rebound to a TensorDataset. Cells above that read the original split
# (e.g. the small-dataset construction) cannot be re-run after this cell without reloading the data.
train_dataset =TensorDataset(torch.tensor(train_stories, dtype = torch.long), torch.tensor(train_labels, dtype = torch.float))
val_dataset = TensorDataset(torch.tensor(val_stories, dtype = torch.long), torch.tensor(val_labels, dtype = torch.float))

# Construct DataLoader instances: training batches are reshuffled, validation batches keep their order
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

Define the training and evaluating procedure

In [None]:
# Training procedure:
def train(model, train_loader, val_loader, optimizer, epochs, pos_weights, device, save_best_model=False):
    """Train `model` and keep track of the best validation epoch according to the per-tag F1.

    After every epoch the model is scored with `evaluate` on `val_loader`. An epoch becomes
    the new "best" when at least one per-tag F1 is higher than the stored best value for
    that tag AND every tag that did not improve still has an F1 of at least 0.75.

    Args:
        model: network mapping a batch of inputs to raw logits (one per tag).
        train_loader: iterable of (data, target) batches used for the optimisation.
        val_loader: iterable of (data, target) batches handed to `evaluate`.
        optimizer: optimizer already built on the parameters of `model`.
        epochs: number of passes over `train_loader`; must be at least 1.
        pos_weights: tensor passed to `BCEWithLogitsLoss(pos_weight=...)`, or None.
        device: device on which the model, the batches and `pos_weights` are placed.
        save_best_model: if True, the state dict of every new best epoch is written to
            "best_model_CNN.pt" (overwriting the previous one).

    Returns:
        A tuple (true_labels, pred_labels, pred_prob) with the validation targets, the
        thresholded predictions and the predicted probabilities of the best epoch. If no
        epoch ever satisfied the acceptance rule, the values of the last epoch are returned.

    Raises:
        ValueError: if `epochs` is smaller than 1.
    """
    if epochs < 1:
        raise ValueError(f"epochs must be >= 1, got {epochs}")

    model.to(device)    # Move the model to the correct device

    # The learning rate is multiplied by 0.1 every `step_size` epochs (three quarters of the run).
    # `verbose` is intentionally not passed: it is deprecated in recent PyTorch releases and
    # the learning rate is already printed at the end of every epoch below.
    scheduler = StepLR(optimizer, step_size=max(1, round(epochs * 3/4)), gamma=0.1)

    # Make sure the class weights live on the same device as the logits
    if pos_weights is not None:
        pos_weights = pos_weights.to(device)
    loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weights)   # Loss function with the pos_weights

    # Results of the best epoch seen so far
    best_f1_scores_per_tag = np.zeros(len(TAGS))
    pred_prob = None
    pred_labels = None
    best_true_labels = None

    # Training cycle
    for epoch in range(epochs):
        model.train()   # Training mode
        losses = []

        # Iterate over the batches
        for data, target in train_loader:
            data = data.to(device)
            target = target.to(device)

            optimizer.zero_grad()   # Reset the gradients
            output = model(data)    # Forward pass
            loss = loss_fn(output, target)    # Compute the loss
            loss.backward()   # Backward pass
            optimizer.step()    # Update the weights

            losses.append(loss.item())

        print(f'Train epoch: {epoch + 1} | Loss: {np.mean(losses):.4f}')

        # Evaluate after each epoch (the validation loss itself is only printed by `evaluate`)
        _, current_f1_values, current_pred_prob, current_pred_labels, true_labels = evaluate(model, val_loader, device)

        # Acceptance rule, part 1: at least one tag has a strictly better F1 than the stored best
        improvements = current_f1_values > best_f1_scores_per_tag
        at_least_one_improved = improvements.any()

        # Acceptance rule, part 2: every tag that did NOT improve must still have F1 >= 0.75
        # (vacuously True when all the tags improved)
        non_improved = ~improvements
        acceptable_degradation = np.all(current_f1_values[non_improved] >= 0.75)

        if at_least_one_improved and acceptable_degradation:
            best_f1_scores_per_tag = current_f1_values.copy()
            pred_prob = current_pred_prob
            pred_labels = current_pred_labels

            # Save the true labels at the best epoch
            best_true_labels = true_labels.copy()

            if save_best_model:
                torch.save(model.state_dict(), "best_model_CNN.pt")

            print(f"✅ New best model saved (F1-based)")

        scheduler.step()
        print(f"Learning rate: {scheduler.get_last_lr()}")

    # No epoch satisfied the acceptance rule: return the results of the last epoch, so that
    # labels, predictions and probabilities always describe the same evaluation.
    if best_true_labels is None:
        print("No epoch satisfied the acceptance rule: returning the results of the last epoch")
        best_true_labels = true_labels
        pred_labels = current_pred_labels
        pred_prob = current_pred_prob

    return best_true_labels, pred_labels, pred_prob

In [None]:
# Evaluate function
def evaluate(model, loader, device):
    """Score `model` on `loader` and print the loss and the per-tag F1.

    The loss is the unweighted `BCEWithLogitsLoss`, averaged over the batches.
    Probabilities are obtained with a sigmoid on the logits and turned into
    0/1 predictions with a 0.5 threshold.

    Returns:
        A tuple (loss, f1_values, probabilities, predictions, targets) where
        `f1_values` is an array ordered like `TAGS` and the last three items
        are arrays with one row per example of `loader`.
    """
    model.to(device)
    model.eval()    # Evaluation mode

    criterion = nn.BCEWithLogitsLoss()
    batch_losses = []
    target_chunks = []
    logit_chunks = []

    with torch.no_grad():
        for data, target in loader:
            data, target = data.to(device), target.to(device)

            output = model(data)
            batch_losses.append(criterion(output, target).item())

            # Keep everything on the CPU as NumPy arrays for the metric computation
            target_chunks.append(target.cpu().numpy())
            logit_chunks.append(output.cpu().numpy())

    # Mean of the batch losses
    loss = sum(batch_losses) / len(loader)

    all_targets = np.concatenate(target_chunks)
    all_outputs = np.concatenate(logit_chunks)

    # Sigmoid on the logits, then threshold at 0.5
    pred_probabilities = torch.sigmoid(torch.from_numpy(all_outputs)).numpy()
    pred_tags = (pred_probabilities > 0.5).astype(int)

    # F1 per tag, as a dict and as an array ordered like TAGS
    f1_scores = f1_per_tag(all_targets, pred_tags)
    current_f1_values = np.array([f1_scores[tag] for tag in TAGS])

    print(f'Evaluation loss: {loss:.4f}')
    print("F1 Score per tag:")
    for tag, score in f1_scores.items():
        print(f"  {tag}: {score:.4f}")

    return loss, current_f1_values, pred_probabilities, pred_tags, all_targets

We add pos_weights to the loss function to mitigate class imbalance: with these weights, the errors on the positive examples of a rare tag count more in the loss, so each tag contributes according to its distribution.
The weight of each tag is computed as the ratio of the number of negative examples to the number of positive examples. \\
https://docs.pytorch.org/docs/stable/generated/torch.nn.BCEWithLogitsLoss.html#torch.nn.BCEWithLogitsLoss

In [None]:
# Binary label matrix: one row per story, one column per tag
labels_bin_np = np.array(labels_bin)

# Positive examples per tag (column-wise sum)
num_positives = np.sum(labels_bin_np, axis=0)

# Negative examples per tag: total number of stories minus the positives
num_negatives = len(labels_bin_np) - num_positives

# Small constant that prevents a division by zero for a tag without positives
epsilon = 1e-6

# pos_weight = negatives / positives, one value per tag
pos_weight_value = np.divide(num_negatives, num_positives + epsilon)

# Float tensor on the training device, ready to be handed to the loss
pos_weight_tensor = torch.tensor(pos_weight_value, dtype=torch.float, device=device)

print("\n Pos_weight for each tag:")
for i, tag in enumerate(TAGS):
    print(f"- {tag}: {pos_weight_tensor[i].item():.2f}")

Function for the CNN model:
In PyTorch, nn.Embedding() returns a $b \times l \times d$ tensor (where $b$ is the batch size, $l$ is the sequence length, and $d$ is the embedding dimension), whereas nn.Conv1d() expects a $b \times d \times l$ tensor. Therefore, if you use these layers in your 1-dimensional CNN, you need to swap the second and third
dimensions between the embedding layer and the first convolutional layer!

In [None]:
class Transpose(nn.Module):
    """Layer that exchanges two dimensions of its input, usable inside ``nn.Sequential``."""

    def __init__(self, dim0: int, dim1: int):
        super().__init__()
        # The two dimensions to exchange
        self.dim0 = dim0
        self.dim1 = dim1

    def forward(self, x):
        # The result is a view of `x`: no data is copied
        return torch.transpose(x, self.dim0, self.dim1)

### Grid Search

We ran the following (now commented-out) cells on a dataset of 50,000 stories to find the best hyperparameters.
We assume that these hyperparameters remain a good choice when the size of the dataset increases, so we saved them manually and re-ran the entire script using only the model configured with the best parameters.

In [None]:
# def run_training_with_params(params, vocab_size, num_labels, epochs, pos_weight_tensor, train_loader, val_loader, device):
#   # Parameters for the grid
#   learning_rate = params["learning_rate"]
#   embedding_dim = params["embedding_dim"]
#   drop_out = params["dropout"]
#   filters_conv1 = params["filters_conv1"]
#   kernel_size_conv1 = params["kernel_size_conv1"]
#   filters_conv2 = params["filters_conv2"]
#   kernel_size_conv2 = params["kernel_size_conv2"]

#   # Build the model
#   model = nn.Sequential(
#       nn.Embedding(vocab_size, embedding_dim),
#       Transpose(1, 2),
#       nn.Conv1d(in_channels=embedding_dim, out_channels=filters_conv1, kernel_size=kernel_size_conv1, padding="same"),
#       nn.ReLU(),
#       nn.MaxPool1d(kernel_size=2),
#       nn.Conv1d(in_channels=filters_conv1, out_channels=filters_conv2, kernel_size=kernel_size_conv2, padding="same"),
#       nn.ReLU(),
#       nn.AdaptiveMaxPool1d(1),
#       nn.Flatten(),
#       nn.Dropout(drop_out),
#       nn.Linear(filters_conv2, num_labels)
#       )
#   model.to(device)

#   # Optimizer:
#   optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

#   # Training:
#   y_true_bin, _ , y_pred_prob = train(
#       model=model,
#       optimizer=optimizer,
#       train_loader=train_loader,
#       val_loader=val_loader,
#       epochs=epochs,
#       pos_weights=pos_weight_tensor,
#       device=device
#       )

#   # Predicted tags
#   y_pred_bin = (y_pred_prob > 0.5).astype(int)

#   # Macro-F1 score
#   f1 = f1_score(y_true_bin, y_pred_bin, average='macro')

#   return f1

In [None]:
# epochs = 10

# # Define the grid of hyperparameters
# param_grid = {
#     "learning_rate": [0.001, 0.0005],
#     "embedding_dim" : [100, 200, 300],
#     "dropout" : [0.4, 0.5],
#     "filters_conv1": [64, 128],
#     "kernel_size_conv1": [3, 5],
#     "filters_conv2": [32, 64],
#     "kernel_size_conv2": [3]
# }

# results = []

# for params in ParameterGrid(param_grid):
#     print(f"\n Training with parameters: {params}\n")

#     f1 = run_training_with_params(params, vocab_size, num_labels, epochs, pos_weight_tensor, train_loader, val_loader, device)
#     results.append({"params": params, "f1": f1})

# # Find the best combination
# best = max(results, key=lambda x: x["f1"])
# print("\n--- Best hyperparameters found: ---")
# print(best["params"])
# print(f"Macro-F1: {best['f1']:.4f}")

In [None]:
# # Converts results into a DataFrame:
# df_results = pd.DataFrame([
#     {**r["params"], "f1_score": r["f1"]} for r in results
# ])

# # Sorts by F1 in descending order:
# df_results = df_results.sort_values(by="f1_score", ascending=False)

# print(df_results.to_string(index=False))

# df_results.to_csv("grid_search_results.csv", index=False)

### Training loop

In [None]:
# Save the best hyperparameters: (by hand, but from the gridsearch)
# Each value is one of the candidates listed in the commented-out param_grid above.
best_lr = 0.0005                # learning rate of the Adam optimizer
best_embedding_dim = 300        # size of the token embeddings
best_drop_out = 0.4             # dropout probability before the final linear layer
best_filters_conv1 = 128        # output channels of the first convolution
best_kernel_size_conv1 = 3      # kernel size of the first convolution
best_filters_conv2 = 64         # output channels of the second convolution
best_kernel_size_conv2 = 3      # kernel size of the second convolution

print ("Best hyperparameters:")
print(best_lr, best_embedding_dim, best_drop_out, best_filters_conv1, best_kernel_size_conv1, best_filters_conv2, best_kernel_size_conv2)

In [None]:
# NOTE(review): torchinfo is installed without a version pin and imported here rather than in the import cell at the top
!pip install torchinfo
from torchinfo import summary

# Definition of the model:
model_A = nn.Sequential(
    nn.Embedding(vocab_size, best_embedding_dim),   # token indices -> embeddings, (batch, length, emb_dim)
    Transpose(1, 2),                                # -> (batch, emb_dim, length), the layout Conv1d expects
    nn.Conv1d(in_channels=best_embedding_dim, out_channels=best_filters_conv1, kernel_size=best_kernel_size_conv1, padding="same"),
    nn.ReLU(),
    nn.MaxPool1d(kernel_size=2),                    # pooling over windows of 2 positions
    nn.Conv1d(in_channels=best_filters_conv1, out_channels=best_filters_conv2, kernel_size=best_kernel_size_conv2, padding="same"),
    nn.ReLU(),
    nn.AdaptiveMaxPool1d(1),                        # global max over the sequence: one value per channel
    nn.Flatten(),                                   # -> (batch, best_filters_conv2)
    nn.Dropout(best_drop_out),
    nn.Linear(best_filters_conv2, num_labels)       # one raw logit per tag (the sigmoid is applied by the loss / in evaluate)
    )

# Random batch of token indices with the same shape as a real batch, only used to trace the layer shapes
dummy_input = torch.randint(0, vocab_size, (batch_size, max_length), dtype=torch.long)

summary(model_A, input_data=dummy_input)


In [None]:
# Optimizer configuration: Adam on all the parameters of model_A, with the learning rate selected above
optimizer = torch.optim.Adam(model_A.parameters(), best_lr)

# Training:
# The returned labels/predictions are discarded here; with save_best_model=True the weights of
# every new best epoch are written to "best_model_CNN.pt".
epochs=10
_, _, _ = train(model_A, train_loader, val_loader, optimizer, epochs, pos_weight_tensor, device, save_best_model=True)