In [2]:
import torch
import time
import matplotlib.pyplot as plt
from tqdm import tqdm
from torch.profiler import profile, ProfilerActivity, record_function
from torch.utils.flop_counter import FlopCounterMode
import psutil
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments, DataCollatorWithPadding, TrainerCallback
import random
import datasets
import wandb
import numpy as np
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


### Exercise 4.1

Build a classifier based on ModernBERT and fine-tune only the classification head (keeping the pretrained model weights frozen) so that
accuracy on this task is maximized. Plot the accuracy on the train and dev (validation) sets over the course of
training. Select the best model according to its accuracy on the dev (validation) set and report its results on the
test set in Table 1. Include a link to your code on GitHub.

In [3]:
# Load ModernBERT-base with a two-label sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="answerdotai/ModernBERT-base",
    num_labels=2,
)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenizer that matches the ModernBERT-base checkpoint used for the model.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="answerdotai/ModernBERT-base"
)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
# Load the hub split named "test" and carve it into train / validation / test.
SPLIT_SEED = 42
strategy_qa = datasets.load_dataset("wics/strategy-qa", split="test")
label_map = {"true": 1, "false": 0}

# First cut: keep 80% for training and hold out the remaining 20%.
ds = strategy_qa.train_test_split(test_size=0.2, shuffle=True, seed=SPLIT_SEED)
# Second cut: halve the held-out part into validation ("train" side) and test.
tv = ds["test"].train_test_split(test_size=0.5, shuffle=True, seed=SPLIT_SEED)

ds = datasets.DatasetDict(
    {"train": ds["train"], "test": tv["test"], "validation": tv["train"]}
)

In [30]:
# Report how many examples ended up in each split.
for title, split_name in (("Train", "train"), ("Validation", "validation"), ("Test", "test")):
    print(f"{title} size: {len(ds[split_name])}")

Train size: 1832
Validation size: 229
Test size: 229


In [None]:
def preprocess(ex):
    """Tokenize one example and attach its binary label.

    The supporting facts are joined with spaces and prepended to the
    question; the combined string is what gets tokenized.

    Args:
        ex: A dataset row. Reads the "answer", "facts" and "question" fields.

    Returns:
        The tokenizer output with an extra "labels" entry: 1 when
        ex["answer"] is truthy, 0 otherwise.
    """
    # Facts are included because the question alone gave poor results.
    # NOTE(review): assumes ex["facts"] is a list of strings — confirm.
    text = " ".join(ex["facts"]) + " " + ex["question"]
    enc = tokenizer(text)
    # NOTE(review): assumes ex["answer"] is a real bool; a non-empty string
    # such as "false" would be mapped to 1 by bool().
    enc["labels"] = int(bool(ex["answer"]))
    return enc

In [32]:
# Tokenize every split and drop the raw columns so only model inputs remain.
# Not re-runnable on the mapped result: the raw columns are gone after one pass.
ds = ds.map(
    function=preprocess,
    remove_columns=strategy_qa.column_names,
)

Map:   0%|          | 0/1832 [00:00<?, ? examples/s]

Map: 100%|██████████| 1832/1832 [00:00<00:00, 3128.94 examples/s]
Map: 100%|██████████| 229/229 [00:00<00:00, 3821.42 examples/s]
Map: 100%|██████████| 229/229 [00:00<00:00, 3930.98 examples/s]


In [9]:
# Class balance of the training split (labels are 0/1, so the sum counts the 1s).
train_labels = ds["train"]["labels"]
n_true = sum(train_labels)
n_false = len(train_labels) - n_true
print(f"True labels in training set: {n_true}")
print(f"False labels in training set: {n_false}")

True labels in training set: 854
False labels in training set: 978


In [10]:
# Pads each batch dynamically using the tokenizer's padding settings.
collator = DataCollatorWithPadding(tokenizer)

In [11]:
# Accuracy metric object used by compute_metrics.
accuracy = evaluate.load(path="accuracy")
def compute_metrics(eval_pred):
    """Turn raw model outputs into an accuracy score.

    Args:
        eval_pred: A (logits, labels) pair.

    Returns:
        Whatever `accuracy.compute` returns for the arg-max predictions
        compared against the reference labels.
    """
    scores, gold = eval_pred
    # The predicted class is the index of the largest logit on the last axis.
    predicted = np.argmax(scores, axis=-1)
    result = accuracy.compute(predictions=predicted, references=gold)
    return result

In [12]:
class TrainEvalCallback(TrainerCallback):
    """Log metrics on a fixed slice of the training set at every evaluation.

    Each time the Trainer fires `on_evaluate`, the first `sample_size`
    training examples are run through `trainer.predict` and the resulting
    metrics are sent to `trainer.log`, so train-side metrics can be tracked
    alongside the validation metrics.

    Args:
        trainer: The Trainer whose `train_dataset` is sampled and whose
            `predict` / `log` methods are used.
        sample_size: Maximum number of leading training examples to score.
            Clamped to the size of the training set.
        metric_key_prefix: Prefix for the logged metric names. Defaults to
            "test", which is what `Trainer.predict` uses when no prefix is
            given, so existing consumers of the `test_*` keys keep working.
    """

    def __init__(self, trainer, sample_size=229, metric_key_prefix="test"):
        self.trainer = trainer
        self.sample_size = sample_size
        self.metric_key_prefix = metric_key_prefix
        # Filled in by on_train_begin; None until training has started.
        self.train_sample = None

    def on_train_begin(self, args, state, control, **kwargs):
        """Select the training slice that will be scored at each evaluation."""
        ds = self.trainer.train_dataset
        # Clamp so a training set smaller than sample_size does not raise.
        n_rows = min(self.sample_size, len(ds))
        self.train_sample = ds.select(range(n_rows))

    def on_evaluate(self, args, state, control, **kwargs):
        """Score the training slice and log its metrics."""
        # evaluate() may be called before train(); there is nothing to score yet.
        if self.train_sample is None:
            return
        metrics = self.trainer.predict(
            self.train_sample, metric_key_prefix=self.metric_key_prefix
        ).metrics
        self.trainer.log(metrics)

In [13]:
args = TrainingArguments(
    # Output and experiment tracking.
    output_dir="modernbert-strategyqa",
    report_to=["wandb"],
    seed=42,
    # Logging, evaluation and checkpoint cadence.
    logging_strategy="steps",
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    # Best-checkpoint selection: highest validation accuracy wins.
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    # Optimisation schedule. The learning rate set here is reassigned on
    # trainer.args before training starts.
    learning_rate=2e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.2,
    weight_decay=0,
    num_train_epochs=3,
    # Batch sizes and numeric precision.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,
)

def model_init():
    """Build a fresh ModernBERT classifier with only the classifier trainable.

    Every parameter whose name contains "classifier" keeps gradients enabled;
    all other parameters are frozen. The number of trainable parameters is
    printed before the model is returned.

    Returns:
        The freshly loaded sequence-classification model.
    """
    checkpoint = "answerdotai/ModernBERT-base"
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

    # One pass: a parameter is trainable exactly when it belongs to the classifier.
    for param_name, weight in model.named_parameters():
        weight.requires_grad = "classifier" in param_name

    total_trainable_params = 0
    for weight in model.parameters():
        if weight.requires_grad:
            total_trainable_params += weight.numel()
    print(f"Total trainable parameters: {total_trainable_params}")
    return model

# The Trainer builds its model through model_init, so every train() call
# (and every hyperparameter-search trial) starts from a fresh, frozen backbone.
trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    # `tokenizer=` is deprecated in Trainer.__init__; `processing_class=` is
    # the replacement and receives the same tokenizer object.
    processing_class=tokenizer,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

# Also score a slice of the training set each time the Trainer evaluates.
trainer.add_callback(TrainEvalCallback(trainer))


  trainer = Trainer(
Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total trainable parameters: 1538


In [None]:
# Optional Optuna search over the learning rate. It is switched off by default
# because it runs 25 full training trials; set RUN_HP_SEARCH = True to repeat it.
RUN_HP_SEARCH = False


def hp_space(trial):
    """Search space for the hyperparameter search: learning rate only.

    Args:
        trial: The trial object handed over by the search backend; must
            provide `suggest_float`.

    Returns:
        A dict with a single "learning_rate" entry sampled log-uniformly
        between 5e-6 and 5e-3.
    """
    return {
        "learning_rate": trial.suggest_float("learning_rate", 5e-6, 5e-3, log=True),
    }


if RUN_HP_SEARCH:
    best = trainer.hyperparameter_search(
        backend="optuna",
        direction="maximize",
        n_trials=25,
        hp_space=hp_space,
        # Optimise validation accuracy.
        compute_objective=lambda m: m["eval_accuracy"],
    )
    print(best)

In [22]:
# Learning rate used for the actual run; replaces the value passed to TrainingArguments.
# NOTE(review): presumably the best value found by the Optuna search — confirm.
BEST_LEARNING_RATE = 0.002371420117372919
trainer.args.learning_rate = BEST_LEARNING_RATE

In [23]:
# Fine-tune, then score the resulting model on the Trainer's default
# eval_dataset (the validation split).
trainer.train()
final_validation_metrics = trainer.evaluate()
print(final_validation_metrics)

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': None, 'bos_token_id': None}.
Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total trainable parameters: 1538




Epoch,Training Loss,Validation Loss,Accuracy
1,0.733,0.727907,0.510917
2,0.7124,0.701448,0.576419
3,0.6264,0.704988,0.598253






{'eval_loss': 0.7049884796142578, 'eval_accuracy': 0.5982532751091703, 'eval_runtime': 1.0863, 'eval_samples_per_second': 210.801, 'eval_steps_per_second': 7.364, 'epoch': 3.0}


In [24]:
# Validation metrics keep the default "eval" prefix.
print(trainer.evaluate(ds["validation"]))
# Report the held-out test split under its own "test" prefix so its metrics
# are not logged under the same eval_* keys as the validation results.
print(trainer.evaluate(ds["test"], metric_key_prefix="test"))





{'eval_loss': 0.7049884796142578, 'eval_accuracy': 0.5982532751091703, 'eval_runtime': 1.0953, 'eval_samples_per_second': 209.082, 'eval_steps_per_second': 7.304, 'epoch': 3.0}






{'eval_loss': 0.7125810384750366, 'eval_accuracy': 0.5240174672489083, 'eval_runtime': 1.0842, 'eval_samples_per_second': 211.209, 'eval_steps_per_second': 7.378, 'epoch': 3.0}
