In [1]:
import torch
import time
import matplotlib.pyplot as plt
from tqdm import tqdm
from torch.profiler import profile, ProfilerActivity, record_function
from torch.utils.flop_counter import FlopCounterMode
import psutil
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding, TrainerCallback, AutoModelForMaskedLM
import random
import datasets
import wandb
import numpy as np
import evaluate
from peft import get_peft_model, LoraConfig, TaskType

  from .autonotebook import tqdm as notebook_tqdm


### Exercise 4.1

Build a classifier based on ModernBERT and fine-tune the classification head only (not the model weights) so that
the accuracy is maximized for this task. Plot the accuracy on train and dev (validation) sets over the course of
training. Report the results on the test set corresponding to your best model measured on the dev (validation) set
in Table 1. Include the results in Table 1. Include a link to your code on Github.

In [2]:
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")

In [3]:
strategy_qa = datasets.load_dataset("wics/strategy-qa", split="test")
ds = strategy_qa.train_test_split(test_size=0.2, seed=42, shuffle=True)
tv = ds["test"].train_test_split(test_size=0.5, seed=42, shuffle=True)
label_map = {"true": 1, "false": 0}

ds = datasets.DatasetDict({
    "train": ds["train"],
    "test": tv["test"],
    "validation": tv["train"]
})

In [4]:
print(f"Train size: {len(ds['train'])}")
print(f"Validation size: {len(ds['validation'])}")
print(f"Test size: {len(ds['test'])}")

Train size: 1832
Validation size: 229
Test size: 229


In [5]:
true_pos_id = tokenizer("true")["input_ids"][1]
false_pos_id = tokenizer("false")["input_ids"][1]

In [6]:
def preprocess(ex):
    ans = ex["answer"]
    y = int(bool(ans))
    # appending facts to the question, because the model is not doing well at all
    text = ex["question"] + " Answer: [MASK]"
    text = " ".join(ex["facts"]) + " " + ex["question"] + " Answer: [MASK]"
    enc = tokenizer(text, padding="max_length", truncation=True, max_length=145)
    mask_position = enc['input_ids'].index(tokenizer.mask_token_id)
    labels = enc["input_ids"].copy()
    labels[mask_position] = true_pos_id if y == 1 else false_pos_id
    enc["labels"] = labels
    return enc

In [7]:
ds = ds.map(preprocess, remove_columns=strategy_qa.column_names)

In [8]:
max_len = 0
for x in ds["test"]:
    max_len = max(max_len, len(x["input_ids"]))

In [9]:
max_len

145

In [10]:
143

143

In [11]:
len(ds["train"][0]["labels"])

145

In [12]:
collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    B, L, V = logits.shape
    m = (labels == tokenizer.sep_token_id)
    sep_pos = m.argmax(axis=1)
    pos = sep_pos - 1
    batch_ix = np.arange(B)
    step_logits = logits[batch_ix, pos, :]
    logits = step_logits[:, [false_pos_id, true_pos_id]]
    refs = labels[batch_ix, pos]
    refs = (refs == true_pos_id).astype(int)

    predictions = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predictions, references=refs)

In [14]:
class TrainEvalCallback(TrainerCallback):
    def __init__(self, trainer, sample_size=229):
        self.trainer = trainer
        self.sample_size = sample_size
        self.train_sample = None

    def on_train_begin(self, args, state, control, **kwargs):
        ds = self.trainer.train_dataset
        self.train_sample = ds.select(range(self.sample_size))

    def on_epoch_end(self, args, state, control, **kwargs):
        metrics = self.trainer.evaluate(
            eval_dataset=self.train_sample,
            metric_key_prefix="train",
            ignore_keys=None,
        )
        self.trainer.log(metrics)
        control.should_evaluate = True
        control.should_log = True
        return control

In [15]:
args = TrainingArguments(
    output_dir="modernbert-strategyqa",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    lr_scheduler_type="cosine",
    warmup_ratio=0.2,
    greater_is_better=True,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,
    report_to=["wandb"],
    seed=42,
)

def model_init():
    model = AutoModelForMaskedLM.from_pretrained(
        "answerdotai/ModernBERT-base"
    )
    for param in model.parameters():
        param.requires_grad = False
    lora_config = LoraConfig(
        r=1,
        target_modules = ["attn.Wo"],
        task_type=TaskType.SEQ_CLS,
        layers_to_transform=[16],
    )
    model = get_peft_model(model, lora_config)
    for name, param in model.named_parameters():
        param.requires_grad = False
        if "lora" in name:
            param.requires_grad = True
    total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total trainable parameters: {total_trainable_params}")
    return model

trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

trainer.add_callback(TrainEvalCallback(trainer))


  trainer = Trainer(


Total trainable parameters: 1536


In [16]:
def hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 5e-6, 5e-3, log=True),
    }

best = trainer.hyperparameter_search(
    backend="optuna",
    direction="maximize",
    n_trials=2,
    hp_space=hp_space,
    compute_objective=lambda m: m["eval_accuracy"],
)
print(best)

[I 2025-09-04 23:32:39,583] A new study created in memory with name: no-name-7c28e69a-887a-4f38-869e-7313cce2b5d7
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': None, 'bos_token_id': None}.


Total trainable parameters: 1536


[34m[1mwandb[0m: Currently logged in as: [33melisabeth-fittschen[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy
1,7.0759,14.047585,0.550218
2,7.0187,14.047292,0.550218
3,7.2758,14.04722,0.550218


[I 2025-09-04 23:33:17,611] Trial 0 finished with value: 0.5502183406113537 and parameters: {'learning_rate': 5.21122452604712e-06}. Best is trial 0 with value: 0.5502183406113537.


Total trainable parameters: 1536


0,1
eval/accuracy,▁▁▁
eval/loss,█▂▁
eval/runtime,█▁▅
eval/samples_per_second,▁█▅
eval/steps_per_second,▁█▆
train/epoch,▁▁▂▂▃▃▃▃▃▃▄▄▄▅▅▆▆▆▆▆▆▇▇▇██████
train/global_step,▁▁▂▂▃▃▃▃▃▃▄▄▄▅▅▆▆▆▆▆▆▇▇▇██████
train/grad_norm,▁▁▂▄▃▄▃▆▄▅▃▄█▄▆█▅▅▆▅
train/learning_rate,▃▅▇████▇▆▆▅▄▄▃▃▂▂▁▁▁
train/loss,▄▃▂▅▅▅▄▄▃▃▃▂▄▁▅▅▃▃▃█

0,1
eval/accuracy,0.55022
eval/loss,14.04722
eval/runtime,2.5918
eval/samples_per_second,88.357
eval/steps_per_second,3.087
total_flos,530625089018880.0
train/epoch,3
train/global_step,174
train/grad_norm,1669.03589
train/learning_rate,0.0


Epoch,Training Loss,Validation Loss,Accuracy
1,6.7669,13.281297,0.519651
2,5.6977,11.29364,0.49345
3,5.6155,10.819053,0.475983


[I 2025-09-04 23:33:53,176] Trial 1 finished with value: 0.4759825327510917 and parameters: {'learning_rate': 0.0006553429689433842}. Best is trial 0 with value: 0.5502183406113537.


BestRun(run_id='0', objective=0.5502183406113537, hyperparameters={'learning_rate': 5.21122452604712e-06}, run_summary=None)


In [17]:
training_args = TrainingArguments(
    output_dir="modernbert-strategyqa-finetuned",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    learning_rate=best.hyperparameters["learning_rate"],
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.001,
    fp16=True,
    report_to=["wandb"],
    seed=42,
)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    tokenizer=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)
trainer.add_callback(TrainEvalCallback(trainer))
trainer.train()
trainer.evaluate(ds["validation"])
trainer.evaluate(ds["test"])

  trainer = Trainer(


Total trainable parameters: 1536


The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': None, 'bos_token_id': None}.


Total trainable parameters: 1536


Epoch,Training Loss,Validation Loss,Accuracy
1,7.0759,14.047539,0.550218
2,7.0187,14.047336,0.550218
3,7.2758,14.047254,0.550218






{'eval_loss': 13.780497550964355,
 'eval_accuracy': 0.611353711790393,
 'eval_runtime': 2.5805,
 'eval_samples_per_second': 88.742,
 'eval_steps_per_second': 3.1,
 'epoch': 3.0}

In [18]:
# get test set results
print(trainer.evaluate(ds["validation"]))
print(trainer.evaluate(ds["test"]))



{'eval_loss': 14.047538757324219, 'eval_accuracy': 0.5502183406113537, 'eval_runtime': 2.5815, 'eval_samples_per_second': 88.707, 'eval_steps_per_second': 3.099, 'epoch': 3.0}




{'eval_loss': 13.780497550964355, 'eval_accuracy': 0.611353711790393, 'eval_runtime': 2.5823, 'eval_samples_per_second': 88.68, 'eval_steps_per_second': 3.098, 'epoch': 3.0}


In [19]:
from peft import get_peft_model, LoraConfig, TaskType

In [20]:
for name, _ in model.named_parameters():
    print(f"{name}, {_.data.shape}")

NameError: name 'model' is not defined

In [None]:
lora_model = AutoModelForMaskedLM.from_pretrained(
    "answerdotai/ModernBERT-base"
)
lora_config = LoraConfig(
    r=1,
    target_modules = ["mlp.Wo"],
    task_type=TaskType.SEQ_CLS,
    layers_to_transform=[16],
)
lora_model = get_peft_model(lora_model, lora_config)
for name, param in lora_model.named_parameters():
    if "classifier" in name:
        print(param.data.shape)
        param.requires_grad = False
lora_model.print_trainable_parameters()

trainable params: 1,920 || all params: 149,657,152 || trainable%: 0.0013


In [None]:
for name, param in lora_model.named_parameters():
    print(f"{name}, {param.requires_grad}, {param.data.shape}")

base_model.model.model.embeddings.tok_embeddings.weight, False, torch.Size([50368, 768])
base_model.model.model.embeddings.norm.weight, False, torch.Size([768])
base_model.model.model.layers.0.attn.Wqkv.weight, False, torch.Size([2304, 768])
base_model.model.model.layers.0.attn.Wo.weight, False, torch.Size([768, 768])
base_model.model.model.layers.0.mlp_norm.weight, False, torch.Size([768])
base_model.model.model.layers.0.mlp.Wi.weight, False, torch.Size([2304, 768])
base_model.model.model.layers.0.mlp.Wo.weight, False, torch.Size([768, 1152])
base_model.model.model.layers.1.attn_norm.weight, False, torch.Size([768])
base_model.model.model.layers.1.attn.Wqkv.weight, False, torch.Size([2304, 768])
base_model.model.model.layers.1.attn.Wo.weight, False, torch.Size([768, 768])
base_model.model.model.layers.1.mlp_norm.weight, False, torch.Size([768])
base_model.model.model.layers.1.mlp.Wi.weight, False, torch.Size([2304, 768])
base_model.model.model.layers.1.mlp.Wo.weight, False, torch.Size

In [None]:
for name, param in lora_model.named_parameters():
    if "layers.21.mlp.Wo" in name:
        print(param.data.shape)
    if param.requires_grad == True:
        print(name)
        print(param.data.shape)

base_model.model.model.layers.16.mlp.Wo.lora_A.default.weight
torch.Size([1, 1152])
base_model.model.model.layers.16.mlp.Wo.lora_B.default.weight
torch.Size([768, 1])
torch.Size([768, 1152])


In [None]:
print(ds["train"][1])

{'input_ids': [50281, 5804, 14963, 20754, 11819, 16916, 4647, 521, 1442, 6850, 281, 247, 278, 1657, 24563, 32, 50282], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': 1}
