In [4]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split

from datamodule import PromptDataset, get_length_reg
from model_baseline import DistilBertRegressor



In [3]:
import pandas as pd

# === Tokenizer ===
tokenizer_BERT = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenizer_GPT2 = AutoTokenizer.from_pretrained("gpt2")

steps = [32, 64, 128, 256, 512]

# === Load data ===
df_train = pd.read_csv(r"..\data\train.csv")
train_data = list(zip(df_train["user_prompt"], get_length_reg(df_train["model_response"], tokenizer_GPT2)))
del df_train

df_test = pd.read_csv(r"..\data\test.csv")
df_test = df_test.dropna(subset=["model_response"])
data_test = list(zip(df_test["user_prompt"], get_length_reg(df_test["model_response"], tokenizer_GPT2)))
del df_test

val_data, test_data = train_test_split(data_test, test_size=0.3, random_state=42)

# All the training prompt except one have length < 64
train_ds = PromptDataset(train_data, tokenizer_BERT, max_len=64)
val_ds = PromptDataset(val_data, tokenizer_BERT, max_len=128)
test_ds = PromptDataset(test_data, tokenizer_BERT, max_len=128)

train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=32)
test_dl = DataLoader(test_ds, batch_size=32)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


Token indices sequence length is longer than the specified maximum sequence length for this model (2258 > 1024). Running this sequence through the model will result in indexing errors


In [5]:
model = DistilBertRegressor()
model = model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
@torch.no_grad()
def estimate_loss(eval_iters = 10):
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        if split == 'train':
            dataloader = train_dl
        else:
            dataloader = val_dl
        k = 0
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            logits, loss = model(input_ids, attention_mask, labels)
            assert loss is not None, "Loss should not be None"
            losses[k] = loss.item()
            k += 1
            if k >= eval_iters:
                break
        out[split] = losses.mean()
    model.train()
    return out

### Freezing the DistilBERT parameters

In [8]:
for param in model.encoder.parameters():
    param.requires_grad = False

optimizer = torch.optim.AdamW(model.regressor.parameters(), lr=2e-5)

In [10]:
eval_interval = 200
max_iters = len(train_dl)

# === Training loop ===
for epoch in range(10):
    model.train()
    total_loss = 0
    for i, batch in enumerate(train_dl):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        logits, loss = model(input_ids, attention_mask, labels)
        assert loss is not None, "Loss should not be None"
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        if i % eval_interval == 0 or i == max_iters - 1:
                losses = estimate_loss()
                print(f"step {i}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    print(f"[Epoch {epoch+1}] Loss: {total_loss:.4f}")

step 0: train loss 58442.3633, val loss 60240.4141
step 200: train loss 59391.0703, val loss 59417.0625
step 400: train loss 58902.9922, val loss 58486.9883
step 445: train loss 57124.1562, val loss 58249.7500
[Epoch 1] Loss: 26795071.7773
step 0: train loss 59229.4297, val loss 58244.4688
step 200: train loss 53898.8203, val loss 57062.4180
step 400: train loss 54039.8984, val loss 55720.2500
step 445: train loss 57511.3945, val loss 55401.7734
[Epoch 2] Loss: 25719094.5078
step 0: train loss 55070.4492, val loss 55394.3672
step 200: train loss 59326.0742, val loss 53834.3125
step 400: train loss 72129.3594, val loss 52162.4805
step 445: train loss 50230.7109, val loss 51759.8750
[Epoch 3] Loss: 24264466.9238
step 0: train loss 44094.7461, val loss 51750.6758
step 200: train loss 49264.7305, val loss 49945.2148
step 400: train loss 44973.9414, val loss 48069.3008
step 445: train loss 55843.3047, val loss 47624.8359
[Epoch 4] Loss: 22440830.6426
step 0: train loss 44991.9727, val loss 

In [11]:
eval_interval = 200
max_iters = len(train_dl)

# === Training loop ===
for epoch in range(5):
    model.train()
    total_loss = 0
    for i, batch in enumerate(train_dl):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        logits, loss = model(input_ids, attention_mask, labels)
        assert loss is not None, "Loss should not be None"
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        if i % eval_interval == 0 or i == max_iters - 1:
                losses = estimate_loss()
                print(f"step {i}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    print(f"[Epoch {epoch+1}] Loss: {total_loss:.4f}")

step 0: train loss 42148.8086, val loss 25034.8301
step 200: train loss 23393.8242, val loss 24014.9648
step 400: train loss 22812.0625, val loss 23148.7617
step 445: train loss 21875.8711, val loss 22972.7422
[Epoch 1] Loss: 10232775.1543
step 0: train loss 20114.6992, val loss 22968.9180
step 200: train loss 19020.5820, val loss 22227.3477
step 400: train loss 19536.5352, val loss 21609.3281
step 445: train loss 18065.9258, val loss 21507.3184
[Epoch 2] Loss: 9306091.7202
step 0: train loss 17381.3652, val loss 21504.8770
step 200: train loss 18614.4707, val loss 21013.4336
step 400: train loss 16912.7148, val loss 20663.8125
step 445: train loss 18904.1133, val loss 20589.4883
[Epoch 3] Loss: 8612875.9502
step 0: train loss 19518.0117, val loss 20587.9844
step 200: train loss 17718.7891, val loss 20322.0762
step 400: train loss 16243.4248, val loss 20127.0098
step 445: train loss 17747.9766, val loss 20092.1680
[Epoch 4] Loss: 8183896.3242
step 0: train loss 15621.2051, val loss 200

### Fine tuning: changing also the parameters of DistilBERT

In [13]:
# Congela tutti i parametri di BERT
for param in model.encoder.parameters():
    param.requires_grad = True

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-6)

In [14]:
eval_interval = 200
max_iters = len(train_dl)

# === Training loop ===
for epoch in range(8):
    model.train()
    total_loss = 0
    for i, batch in enumerate(train_dl):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        logits, loss = model(input_ids, attention_mask, labels)
        assert loss is not None, "Loss should not be None"
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        if i % eval_interval == 0 or i == max_iters - 1:
                losses = estimate_loss()
                print(f"step {i}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    print(f"[Epoch {epoch+1}] Loss: {total_loss:.4f}")

step 0: train loss 14524.5029, val loss 19836.0664
step 200: train loss 10631.6387, val loss 15585.8418
step 400: train loss 13101.1016, val loss 14772.9277
step 445: train loss 12384.4404, val loss 14201.4404
[Epoch 1] Loss: 6293407.0283
step 0: train loss 11882.6992, val loss 14186.6846
step 200: train loss 10996.8926, val loss 13713.1484
step 400: train loss 10839.2715, val loss 14229.2861
step 445: train loss 9426.0381, val loss 13533.6191
[Epoch 2] Loss: 5425324.1470
step 0: train loss 9032.8008, val loss 13543.1904
step 200: train loss 10041.5566, val loss 13698.6191
step 400: train loss 9306.3955, val loss 13121.5215
step 445: train loss 10005.4297, val loss 13029.6270
[Epoch 3] Loss: 5090905.2915
step 0: train loss 8356.2285, val loss 13007.3262
step 200: train loss 20821.6719, val loss 13144.1436
step 400: train loss 8848.1729, val loss 12586.4395
step 445: train loss 8791.7285, val loss 12603.7168
[Epoch 4] Loss: 4819823.4707
step 0: train loss 8986.2627, val loss 12635.4404


KeyboardInterrupt: 

In [15]:
torch.save(model.state_dict(), "checkpoints/DistilBERT_DGPT_reg.pth")


### Evaluation

In [16]:
model = DistilBertRegressor()
model.load_state_dict(torch.load("checkpoints/DistilBERT_DGPT_reg.pth"))
model = model.to(device)
model.eval()
0

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  model.load_state_dict(torch.load("checkpoints/DistilBERT_DGPT_reg.pth"))


0

In [17]:
input_text = "Can you explain the theory of relativity?"
input_enc = tokenizer_BERT(input_text, return_tensors="pt", padding='max_length', truncation=True, max_length=512).to(device)
output = model(input_enc['input_ids'], input_enc['attention_mask'])
print(output)

(tensor([230.7434], device='cuda:0', grad_fn=<SqueezeBackward1>), None)


In [18]:
input_text = "What's your name?"
input_enc = tokenizer_BERT(input_text, return_tensors="pt", padding='max_length', truncation=True, max_length=512).to(device)
output = model(input_enc['input_ids'], input_enc['attention_mask'])
print(output)

(tensor([56.2916], device='cuda:0', grad_fn=<SqueezeBackward1>), None)


In [19]:
input_text = "What is 3 + 3?"
input_enc = tokenizer_BERT(input_text, return_tensors="pt", padding='max_length', truncation=True, max_length=512).to(device)
output = model(input_enc['input_ids'], input_enc['attention_mask'])
print(output)

(tensor([103.5006], device='cuda:0', grad_fn=<SqueezeBackward1>), None)


In [22]:
@torch.no_grad()
def see_prediction(kappa = 1):
    out = {}
    model.eval()
    for split in ['val']:
        losses = torch.zeros(kappa)
        if split == 'train':
            dataloader = train_dl
        else:
            dataloader = val_dl
        k = 0
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            logits, loss = model(input_ids, attention_mask, labels)
            pred = logits
            print(f"pred: {[(pred[i].item(), labels[i].item()) for i in range(32)]}")
            assert loss is not None, "Loss should not be None"
            losses[k] = loss.item()
            k += 1
            if k >= kappa:
                break
        out[split] = losses
    model.train()
    print("mean cross entropy loss: ", out["val"].mean())
    return 

In [23]:
see_prediction(10)

pred: [(80.5232162475586, 272), (133.73294067382812, 47), (88.93971252441406, 67), (154.083251953125, 159), (91.1819839477539, 53), (317.1280517578125, 371), (245.73590087890625, 101), (132.80133056640625, 249), (422.2581481933594, 249), (252.9711151123047, 218), (350.1542053222656, 263), (327.7568664550781, 349), (316.2778625488281, 469), (73.82817840576172, 106), (170.40174865722656, 213), (267.37347412109375, 305), (265.1798095703125, 256), (186.89918518066406, 273), (243.48573303222656, 75), (219.05828857421875, 264), (209.02621459960938, 189), (229.86526489257812, 313), (129.1331329345703, 32), (183.33203125, 242), (385.2653503417969, 370), (158.89852905273438, 217), (113.5411376953125, 28), (351.6352844238281, 241), (206.50244140625, 220), (150.26824951171875, 97), (176.29559326171875, 180), (362.85272216796875, 2)]
pred: [(142.53115844726562, 55), (159.28858947753906, 128), (78.3479995727539, 26), (118.26432037353516, 169), (102.40924072265625, 153), (327.3758850097656, 272), (9

In [24]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error


def evaluate_accuracy(model, dataloader, device='cpu'):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            logits, loss = model(input_ids, attention_mask, labels)
            pred = logits

            all_preds.extend(pred.cpu().tolist())
            all_labels.extend(labels.cpu().tolist())

    mse = mean_squared_error(all_labels, all_preds)
    return mse



In [25]:
mse = evaluate_accuracy(model, val_dl, device=device)
print(f"Mean Squared Error: {mse:.4f}")


Mean Squared Error: 11504.4834
