In [1]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split

from datamodule import PromptDataset, get_length
from model_baseline import DistilBertClassifier



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from pathlib import Path

import pandas as pd

# === Tokenizers ===
# tokenizer_BERT encodes the prompts inside PromptDataset; tokenizer_LLaDa is handed
# to get_length together with the model responses.
# NOTE: trust_remote_code=True executes Python code shipped with the model repository;
# only keep it for sources you trust.
tokenizer_BERT = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenizer_LLaDa = AutoTokenizer.from_pretrained('GSAI-ML/LLaDA-8B-Instruct', trust_remote_code=True)
steps = [32, 64, 128, 256, 512, 1024]
# 2048 and 4096 are left out: per the author, no training example reaches those lengths.

# === Load data ===
# Paths are built with pathlib so the relative location resolves on POSIX and Windows
# alike (a raw "..\data\..." string only works on Windows).
data_dir = Path("..") / "data"

df_train = pd.read_csv(data_dir / "train.csv")
# Same filtering as the test split below: rows without a response cannot be labelled.
df_train = df_train.dropna(subset=["model_response"])
# Each item is (prompt, value returned by get_length) -- presumably the length-bucket label.
train_data = list(zip(df_train["user_prompt"], get_length(df_train["model_response"], tokenizer_LLaDa, max_length=1024, steps=steps)))
del df_train

df_test = pd.read_csv(data_dir / "test.csv")
df_test = df_test.dropna(subset=["model_response"])
data_test = list(zip(df_test["user_prompt"], get_length(df_test["model_response"], tokenizer_LLaDa, max_length=1024, steps=steps)))
del df_test

# The labelled test file is split 70/30 into validation and held-out test parts.
val_data, test_data = train_test_split(data_test, test_size=0.3, random_state=42)

# Author's note: all training prompts except one are shorter than 64 tokens.
train_ds = PromptDataset(train_data, tokenizer_BERT, max_len=64)
val_ds = PromptDataset(val_data, tokenizer_BERT, max_len=128)
test_ds = PromptDataset(test_data, tokenizer_BERT, max_len=128)

# Only the training loader is shuffled.
train_dl = DataLoader(train_ds, batch_size=16, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=16)
test_dl = DataLoader(test_ds, batch_size=16)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



In [13]:
# Six output classes, matching the six entries of `steps`.
# (A torch.compile variant of this line was left disabled by the author.)
model = DistilBertClassifier(n_classes=6).to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
@torch.no_grad()
def estimate_loss(eval_iters = 10):
    """Estimate the mean loss on the training and validation loaders.

    Runs at most ``eval_iters`` batches from the notebook-level ``train_dl`` and
    ``val_dl`` through the notebook-level ``model`` in eval mode, then puts the
    model back in train mode.

    Args:
        eval_iters: maximum number of batches evaluated per split.

    Returns:
        dict with keys 'train' and 'val', each a 0-dim tensor holding the mean of
        the batch losses actually computed (NaN if no batch was evaluated).
    """
    out = {}
    model.eval()
    for split, dataloader in (('train', train_dl), ('val', val_dl)):
        # Collect only the losses that were really computed: a buffer pre-sized to
        # eval_iters would drag the mean towards zero whenever the loader is shorter.
        batch_losses = []
        # range() comes first in zip so no extra batch is fetched once the cap is hit.
        for _, batch in zip(range(eval_iters), dataloader):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            logits, loss = model(input_ids, attention_mask, labels)
            assert loss is not None, "Loss should not be None"
            batch_losses.append(loss.item())
        out[split] = torch.tensor(batch_losses).mean()
    model.train()
    return out

### Freezing the DistilBERT parameters

In [15]:
# Freeze the encoder: none of its weights will receive gradient updates.
for encoder_param in model.encoder.parameters():
    encoder_param.requires_grad_(False)

# Only the classifier parameters are handed to the optimizer.
optimizer = torch.optim.AdamW(params=model.classifier.parameters(), lr=2e-5)

In [None]:
eval_interval = 200
max_iters = len(train_dl)

# === Training loop ===
# 10 passes over train_dl with whatever `optimizer` currently points to.
for epoch in range(10):
    model.train()
    total_loss = 0
    for i, batch in enumerate(train_dl):
        # Move the three tensors of the batch to the target device.
        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'label')
        )

        logits, loss = model(input_ids, attention_mask, labels)
        assert loss is not None, "Loss should not be None"

        # Standard optimisation step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Report estimated losses every `eval_interval` steps and on the last step of the epoch.
        is_last_step = i == max_iters - 1
        if i % eval_interval == 0 or is_last_step:
            losses = estimate_loss()
            print(f"step {i}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # total_loss is the sum (not the mean) of the batch losses of this epoch.
    print(f"[Epoch {epoch+1}] Loss: {total_loss:.4f}")

### Fine-tuning: also updating the DistilBERT parameters

In [31]:
# Unfreeze every encoder parameter so the whole network is fine-tuned.
for encoder_param in model.encoder.parameters():
    encoder_param.requires_grad_(True)

# Fresh optimizer over all model parameters, with a smaller learning rate.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-6)

In [None]:
eval_interval = 200
max_iters = len(train_dl)

# === Training loop ===
# 6 passes over train_dl with whatever `optimizer` currently points to.
for epoch in range(6):
    model.train()
    total_loss = 0
    for i, batch in enumerate(train_dl):
        # Move the three tensors of the batch to the target device.
        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ('input_ids', 'attention_mask', 'label')
        )

        logits, loss = model(input_ids, attention_mask, labels)
        assert loss is not None, "Loss should not be None"

        # Standard optimisation step.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        # Report estimated losses every `eval_interval` steps and on the last step of the epoch.
        is_last_step = i == max_iters - 1
        if i % eval_interval == 0 or is_last_step:
            losses = estimate_loss()
            print(f"step {i}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # total_loss is the sum (not the mean) of the batch losses of this epoch.
    print(f"[Epoch {epoch+1}] Loss: {total_loss:.4f}")

In [33]:
import os

# torch.save does not create missing parent directories, so make sure the folder exists.
os.makedirs("checkpoints", exist_ok=True)
torch.save(model.state_dict(), "checkpoints/DistilBERT_LLaDa_1.pth")


### Evaluation

In [3]:
# Rebuild the architecture, then restore the fine-tuned weights saved earlier.
model = DistilBertClassifier(n_classes=6)
# map_location remaps the stored tensors onto `device`, so a checkpoint written from a
# CUDA model can still be loaded on a CPU-only machine.
state_dict = torch.load("checkpoints/DistilBERT_LLaDa_1.pth", map_location=device)
model.load_state_dict(state_dict)
model = model.to(device)
model.eval()
0

0

In [4]:
input_text = "Can you explain the theory of relativity?"
input_enc = tokenizer_BERT(input_text, return_tensors="pt", padding='max_length', truncation=True, max_length=512).to(device)
# Inference only: disable autograd so no graph is built for the forward pass.
with torch.no_grad():
    output = model(input_enc['input_ids'], input_enc['attention_mask'])
# `output` is the (logits, loss) pair returned by the model; loss is None without labels.
print(output)

(tensor([[-2.1020, -0.9028,  1.1409,  2.2895,  2.6583, -3.0865]],
       device='cuda:0', grad_fn=<AddmmBackward0>), None)


In [5]:
input_text = "What's your name?"
input_enc = tokenizer_BERT(input_text, return_tensors="pt", padding='max_length', truncation=True, max_length=512).to(device)
# Inference only: disable autograd so no graph is built for the forward pass.
with torch.no_grad():
    output = model(input_enc['input_ids'], input_enc['attention_mask'])
# `output` is the (logits, loss) pair returned by the model; loss is None without labels.
print(output)

(tensor([[ 2.9584,  2.9615,  1.7493, -0.4979, -3.9948, -4.0273]],
       device='cuda:0', grad_fn=<AddmmBackward0>), None)


In [6]:
input_text = "What is 3 + 3?"
input_enc = tokenizer_BERT(input_text, return_tensors="pt", padding='max_length', truncation=True, max_length=512).to(device)
# Inference only: disable autograd so no graph is built for the forward pass.
with torch.no_grad():
    output = model(input_enc['input_ids'], input_enc['attention_mask'])
# `output` is the (logits, loss) pair returned by the model; loss is None without labels.
print(output)

(tensor([[ 1.6350,  0.8858,  1.1286,  0.3247, -1.9087, -2.3112]],
       device='cuda:0', grad_fn=<AddmmBackward0>), None)


In [7]:
@torch.no_grad()
def see_prediction(kappa = 1):
    """Print (prediction, label) pairs for the first ``kappa`` validation batches.

    Uses the notebook-level ``model``, ``val_dl`` and ``device``. For every batch the
    argmax over dim 1 of the logits is paired with the batch labels and printed;
    afterwards the mean of the batch losses is printed.

    Args:
        kappa: maximum number of validation batches to inspect.

    Returns:
        None. Everything is reported through ``print``.
    """
    # Remember the current mode so an evaluation-time call does not silently leave
    # the model in training mode.
    was_training = model.training
    model.eval()
    batch_losses = []
    # range() comes first in zip so no extra batch is fetched once the cap is hit.
    for _, batch in zip(range(kappa), val_dl):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        logits, loss = model(input_ids, attention_mask, labels)
        pred = torch.argmax(logits, dim=1)
        # Pair over the real batch length: the last batch of a loader may hold fewer
        # than batch_size items, so a fixed range(16) would raise IndexError.
        print(f"pred: {list(zip(pred.tolist(), labels.tolist()))}")
        assert loss is not None, "Loss should not be None"
        batch_losses.append(loss.item())
    model.train(was_training)
    # Mean over the batches actually seen (NaN if none), not over a zero-padded buffer.
    print("mean cross entropy loss: ", torch.tensor(batch_losses).mean())
    return

In [8]:
# Inspect the first 10 validation batches.
see_prediction(kappa=10)

pred: [(2, 4), (3, 1), (2, 2), (3, 3), (2, 1), (4, 4), (3, 2), (2, 3), (4, 3), (4, 3), (4, 4), (4, 4), (4, 4), (0, 2), (3, 3), (4, 4)]
pred: [(4, 3), (3, 4), (4, 2), (3, 4), (4, 3), (4, 4), (2, 0), (3, 3), (4, 4), (3, 3), (0, 0), (4, 3), (3, 3), (3, 2), (3, 3), (4, 0)]
pred: [(2, 1), (3, 2), (0, 0), (2, 3), (2, 3), (4, 3), (1, 4), (4, 2), (4, 2), (3, 4), (4, 3), (2, 3), (3, 2), (0, 0), (3, 1), (1, 3)]
pred: [(0, 1), (2, 1), (4, 4), (4, 4), (3, 2), (4, 3), (4, 4), (3, 4), (4, 4), (4, 4), (2, 0), (0, 1), (4, 2), (4, 3), (4, 4), (0, 0)]
pred: [(1, 0), (0, 0), (3, 3), (4, 4), (3, 4), (4, 4), (2, 2), (3, 4), (4, 4), (4, 2), (4, 4), (4, 3), (1, 0), (2, 2), (2, 1), (0, 2)]
pred: [(2, 0), (3, 4), (3, 3), (3, 2), (4, 4), (3, 4), (4, 4), (2, 2), (4, 4), (4, 3), (3, 3), (3, 3), (3, 3), (2, 2), (1, 0), (4, 3)]
pred: [(4, 0), (3, 2), (4, 4), (4, 2), (3, 2), (4, 3), (4, 4), (4, 4), (2, 0), (4, 3), (3, 3), (2, 2), (3, 4), (2, 3), (3, 3), (4, 4)]
pred: [(3, 4), (4, 3), (4, 3), (3, 2), (3, 3), (3, 3), 

In [43]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error


def evaluate_accuracy(model, dataloader, device='cpu'):
    """Compute accuracy and mean squared error of the model over a dataloader.

    The model is switched to eval mode and run without gradient tracking. The
    predicted class of each item is the argmax over dim 1 of the logits.

    Args:
        model: callable as ``model(input_ids, attention_mask, labels)`` returning
            ``(logits, loss)``.
        dataloader: iterable of batches exposing 'input_ids', 'attention_mask'
            and 'label'.
        device: device the batch tensors are moved to before the forward pass.

    Returns:
        Tuple ``(accuracy, mse)`` as returned by sklearn's ``accuracy_score`` and
        ``mean_squared_error`` on the collected labels and predictions.
    """
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = (
                batch[key].to(device) for key in ('input_ids', 'attention_mask', 'label')
            )
            # The loss returned alongside the logits is not needed here.
            logits, _ = model(input_ids, attention_mask, labels)

            predicted += logits.argmax(dim=1).cpu().tolist()
            expected += labels.cpu().tolist()

    return accuracy_score(expected, predicted), mean_squared_error(expected, predicted)



In [44]:
# Validation metrics; the MSE here is computed on the label values yielded by the loader.
acc, mse = evaluate_accuracy(model, val_dl, device=device)
print(f"Accuracy: {acc:.2%}", f"Mean Squared Error: {mse:.4f}", sep="\n")


Accuracy: 48.26%
Mean Squared Error: 1.0589


### Final evaluation: obtain the list of predictions on the test set


In [20]:
from pathlib import Path

# Path is built with pathlib so it resolves on POSIX and Windows alike
# (a raw "..\data\..." string only works on Windows).
df_test = pd.read_csv(Path("..") / "data" / "test.csv")
print(df_test.shape)
# Rows without a model response are dropped before get_length is applied.
df_test = df_test.dropna(subset=["model_response"])
print(df_test.shape)
data_test = list(zip(df_test["user_prompt"], get_length(df_test["model_response"], tokenizer_LLaDa, max_length=1024, steps=steps)))
del df_test


# The loader covers the whole (filtered) test file, not only the 30% hold-out split.
# NOTE(review): max_len is 64 here, while the earlier val/test datasets used 128 -- confirm which is intended.
test_ds = PromptDataset(data_test, tokenizer_BERT, max_len=64)
test_dl = DataLoader(test_ds, batch_size=16)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

(5000, 3)
(4998, 3)


In [21]:
def get_predictions(model, dataloader, device='cpu'):
    """Collect predicted classes and labels for every item of a dataloader.

    The model is switched to eval mode and run without gradient tracking. The
    predicted class of each item is the argmax over dim 1 of the logits.

    Args:
        model: callable as ``model(input_ids, attention_mask, labels)`` returning
            ``(logits, loss)``.
        dataloader: iterable of batches exposing 'input_ids', 'attention_mask'
            and 'label'.
        device: device the batch tensors are moved to before the forward pass.

    Returns:
        Tuple ``(predictions, labels)`` of plain Python lists, in loader order.
    """
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = (
                batch[key].to(device) for key in ('input_ids', 'attention_mask', 'label')
            )
            # The loss returned alongside the logits is not needed here.
            logits, _ = model(input_ids, attention_mask, labels)

            predicted += logits.argmax(dim=1).cpu().tolist()
            expected += labels.cpu().tolist()

    return predicted, expected

In [22]:
# Predictions and labels for every item served by test_dl.
all_preds, all_labels = get_predictions(model=model, dataloader=test_dl, device=device)

In [24]:
steps = [32, 64, 128, 256, 512, 1024]


def _class_to_length(value):
    """Map a class index to its entry in ``steps``; leave already-converted values alone.

    Class indices (0..5) and the entries of ``steps`` (>= 32) never overlap, so a
    value found in ``steps`` has already been converted. This keeps the cell safe to
    re-run: the original in-place mapping raised IndexError on a second execution.
    """
    return value if value in steps else steps[value]


all_preds = [_class_to_length(pred) for pred in all_preds]
all_labels = [_class_to_length(label) for label in all_labels]

In [29]:
import numpy as np
# Show predictions and labels side by side as (abbreviated) NumPy arrays.
tuple(np.array(values) for values in (all_preds, all_labels))


(array([256, 512, 128, ..., 256, 512, 256]),
 array([ 32, 128, 256, ..., 128, 512,  32]))

In [30]:
from sklearn.metrics import mean_squared_error

# MSE between the label and prediction lists as they currently stand in the notebook.
mse = mean_squared_error(y_true=all_labels, y_pred=all_preds)
print(f"Mean Squared Error: {mse:.4f}")

Mean Squared Error: 29398.1016


In [32]:
import os
import numpy as np

# Create the output folder when it is missing, then store the predictions as a .npy file.
os.makedirs("prediction_test", exist_ok=True)
preds_array = np.array(all_preds)
np.save("prediction_test/DistilBERT_LLaDa_clas.npy", preds_array)