## TRAINING

In [None]:
from datasets import load_from_disk

# Load the already-tokenized train and eval splits that were saved to disk.
train_dataset_tokenized = load_from_disk("Data/tokenized_train_dataset")
eval_dataset_tokenized = load_from_disk("Data/tokenized_eval_dataset")

def convert_labels_to_float(batch):
    """Add a float-valued ``labels`` entry derived from ``batch["label"]``.

    Args:
        batch: Mapping with a ``"label"`` entry holding an iterable of values
            that ``float()`` accepts. It is modified in place.

    Returns:
        The same ``batch`` object, now also carrying ``"labels"`` as a list of
        floats. The original ``"label"`` entry is left untouched.

    NOTE(review): the models in this notebook are built with ``num_labels=2``
    and scored with ``argmax`` over two logits; confirm that float labels are
    really what the loss should receive.
    """
    batch["labels"] = list(map(float, batch["label"]))
    return batch

# Apply the label conversion in batched mode to both splits, rebinding the same names.
train_dataset_tokenized = train_dataset_tokenized.map(convert_labels_to_float, batched=True)
eval_dataset_tokenized  = eval_dataset_tokenized.map(convert_labels_to_float, batched=True)


In [9]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

def compute_metrics(eval_pred):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        eval_pred: Object that unpacks into ``(logits, labels)``.

    Returns:
        Dict with keys ``"accuracy"`` and ``"f1"``. Predictions are the argmax
        of the logits along axis 1, and both scorers are called with their
        default arguments.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    results = {}
    results["accuracy"] = accuracy_score(gold, predicted)
    results["f1"] = f1_score(gold, predicted)
    return results



In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from peft import LoraConfig, get_peft_model
import torch

# Tokenizer - the data is already pre-tokenized, so it is only needed here by the
# padding data collator below and to be saved alongside the trained adapters.
tokenizer = AutoTokenizer.from_pretrained("jhu-clsp/mmBERT-base", use_fast=True)

# Metrics for evaluation (f1)
def compute_metrics(eval_pred):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        eval_pred: Object that unpacks into ``(logits, labels)``.

    Returns:
        Dict with keys ``"accuracy"`` and ``"f1"``. Predictions are the argmax
        of the logits along axis 1, and both scorers are called with their
        default arguments.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    results = {}
    results["accuracy"] = accuracy_score(gold, predicted)
    results["f1"] = f1_score(gold, predicted)
    return results

# Load mmBERT model with a sequence-classification head.
# The classifier weights are not part of the checkpoint and are newly initialized
# (see the warning printed when this cell runs).
model = AutoModelForSequenceClassification.from_pretrained(
    "jhu-clsp/mmBERT-base",
    num_labels=2,      # binary classification
    torch_dtype="auto" # dtype is taken from the checkpoint, not from the device or the fp16 flag
)

# LoRA configuration
# NOTE(review): no quantization config is passed anywhere in this cell, so this
# looks like plain LoRA even though the output paths say "Qlora" - confirm.
lora_config = LoraConfig(
    r=16,                 # rank of the LoRA update matrices
    lora_alpha=32,        # LoRA scaling parameter
    lora_dropout=0.05,    # dropout applied inside the LoRA layers
    bias="none",          # bias terms are not trained
    task_type="SEQ_CLS",  # sequence classification
    target_modules=["Wqkv", "Wo", "mlp.Wi", "mlp.Wo"]  # inside mmBERT
)

# Wrap the base model so that the LoRA adapters are attached to the target modules.
model = get_peft_model(model, lora_config)


# Training arguments
args = TrainingArguments(
    output_dir="training_checkpoints/mmbert_Qlora_checkpoints",  # where checkpoints are written
    learning_rate=1e-4,
    num_train_epochs=5,
    per_device_train_batch_size=32,
    fp16=True,                  # mixed precision
    eval_strategy="epoch",      # evaluate once per epoch
    save_strategy="epoch",      # checkpoint once per epoch (same cadence as evaluation)
    load_best_model_at_end=True,    # reload the best checkpoint when training finishes
    metric_for_best_model="f1",     # "best" is decided by the f1 value from compute_metrics
    greater_is_better=True          # higher f1 is better
)

# Data collator to handle padding of the pre-tokenized examples
data_collator = DataCollatorWithPadding(tokenizer)

# Trainer setup: PEFT-wrapped model, training arguments, both splits, collator and metrics
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset_tokenized,   # tokenized train dataset
    eval_dataset=eval_dataset_tokenized,     # tokenized eval dataset
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Run training; as the last expression, the returned TrainOutput is displayed by the cell.
trainer.train()


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at jhu-clsp/mmBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
W1210 12:21:07.674000 1167 site-packages/torch/_dynamo/convert_frame.py:1358] [1/8] torch._dynamo hit config.recompile_limit (8)
W1210 12:21:07.674000 1167 site-packages/torch/_dynamo/convert_frame.py:1358] [1/8]    function: 'compiled_mlp' (/opt/conda/lib/python3.12/site-packages/transformers/models/modernbert/modeling_modernbert.py:528)
W1210 12:21:07.674000 1167 site-packages/torch/_dynamo/convert_frame.py:1358] [1/8]    last reason: 1/3: tensor 'hidden_states' Tensor device index mismatch. Expected device index to be , actual 
W1210 12:21:07.674000 1167 site-packages/torch/_dynamo/convert_frame.py:1358] [1/8] To log all recompilation reasons, use TORCH_LOGS="recompiles".
W1210 12:21:

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,2.6129,2.149511,0.887799,0.660248
2,2.0551,2.114035,0.889879,0.708877
3,1.9024,2.103077,0.89499,0.692321
4,1.7382,2.064749,0.895911,0.719648
5,1.4831,2.123376,0.895525,0.722011


W1210 12:41:52.221000 1167 site-packages/torch/_dynamo/convert_frame.py:1358] [0/8] torch._dynamo hit config.recompile_limit (8)
W1210 12:41:52.221000 1167 site-packages/torch/_dynamo/convert_frame.py:1358] [0/8]    function: 'compiled_embeddings' (/opt/conda/lib/python3.12/site-packages/transformers/models/modernbert/modeling_modernbert.py:207)
W1210 12:41:52.221000 1167 site-packages/torch/_dynamo/convert_frame.py:1358] [0/8]    last reason: 0/5: GLOBAL_STATE changed: grad_mode 
W1210 12:41:52.221000 1167 site-packages/torch/_dynamo/convert_frame.py:1358] [0/8] To log all recompilation reasons, use TORCH_LOGS="recompiles".
W1210 12:41:52.221000 1167 site-packages/torch/_dynamo/convert_frame.py:1358] [0/8] To diagnose recompilation issues, see https://pytorch.org/docs/main/torch.compiler_troubleshooting.html


TrainOutput(global_step=3095, training_loss=1.8911208342273325, metrics={'train_runtime': 8859.5085, 'train_samples_per_second': 89.328, 'train_steps_per_second': 0.349, 'total_flos': 6.94737361631232e+16, 'train_loss': 1.8911208342273325, 'epoch': 5.0})

In [12]:
# Save the PEFT-wrapped model and the tokenizer into the same directory.
model.save_pretrained("trained_adapters/mmbert_Qlora_mawsa_adapters")
tokenizer.save_pretrained("trained_adapters/mmbert_Qlora_mawsa_adapters")

('trained_adapters/mmbert_Qlora_mawsa_adapters/tokenizer_config.json',
 'trained_adapters/mmbert_Qlora_mawsa_adapters/special_tokens_map.json',
 'trained_adapters/mmbert_Qlora_mawsa_adapters/tokenizer.json')

## QLoRA 2LCLS — LoRA fine-tuning with a two-layer classification head

In [None]:
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    TrainingArguments, 
    Trainer, 
    DataCollatorWithPadding
)
from peft import LoraConfig, get_peft_model, PeftModel
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import torch.nn as nn

# Tokenizer of the base checkpoint; used by the padding collator below and saved with the adapters.
tokenizer = AutoTokenizer.from_pretrained("jhu-clsp/mmBERT-base", use_fast=True)


def compute_metrics(eval_pred):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        eval_pred: Object that unpacks into ``(logits, labels)``.

    Returns:
        Dict with keys ``"accuracy"`` and ``"f1"``. Predictions are the argmax
        of the logits along axis 1, and both scorers are called with their
        default arguments.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    results = {}
    results["accuracy"] = accuracy_score(gold, predicted)
    results["f1"] = f1_score(gold, predicted)
    return results

# Load the base mmBERT checkpoint with a sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "jhu-clsp/mmBERT-base",
    num_labels=2,  # binary classification
    torch_dtype="auto"  # dtype is taken from the checkpoint, not from the fp16 training flag
)


# Replace the model's classifier with a two-layer head
# (Linear -> ReLU -> Dropout -> Linear), intended to be more robust.
# The layers are constructed here, so their weights start freshly initialized.
hidden = model.config.hidden_size  # width of the features fed to the classifier
model.classifier = nn.Sequential(
    nn.Linear(hidden, hidden),
    nn.ReLU(),
    nn.Dropout(0.1),
    nn.Linear(hidden, 2)  # output = 2 logits for CrossEntropyLoss
)


# LoRA configuration (same hyper-parameters as the single-layer-head run above).
# NOTE(review): no quantization config is passed in this cell, so this looks like
# plain LoRA even though the output paths say "Qlora" - confirm.
lora_config = LoraConfig(
    r=16,                 # rank of the LoRA update matrices
    lora_alpha=32,        # LoRA scaling parameter
    lora_dropout=0.05,    # dropout applied inside the LoRA layers
    bias="none",          # bias terms are not trained
    task_type="SEQ_CLS",  # sequence classification
    target_modules=["Wqkv", "Wo", "mlp.Wi", "mlp.Wo"]
)
# Wrap the model (including the custom classifier set above) with the LoRA adapters.
model = get_peft_model(model, lora_config)

# Local import: this cell only imports torch.nn, yet torch.float32 is needed below.
import torch

# No explicit model.to(...) here: the Trainer constructed below places the model
# on its own device (the first training run in this notebook relies on the same
# behaviour), and the previous call depended on a `device` variable that this
# notebook section never defines.

# With fp16 mixed precision the trainable weights must be stored in float32,
# so upcast every parameter that will receive gradients.
for param in model.parameters():
    if param.requires_grad:
        param.data = param.data.to(torch.float32)


# Training arguments for the two-layer-head run (same settings as the first run,
# but with a separate checkpoint directory).
training_args = TrainingArguments(
    output_dir="training_checkpoints/mmbert_Qlora_2lcls_checkpoints",  # where checkpoints are written
    learning_rate=1e-4,
    num_train_epochs=5,
    per_device_train_batch_size=32,
    fp16=True,                      # mixed precision
    eval_strategy="epoch",          # evaluate once per epoch
    save_strategy="epoch",          # checkpoint once per epoch (same cadence as evaluation)
    load_best_model_at_end=True,    # reload the best checkpoint when training finishes
    metric_for_best_model="f1",     # "best" is decided by the f1 value from compute_metrics
    greater_is_better=True          # higher f1 is better
)

# Local import: only needed for the checkpoint-existence check below.
import os

# Data collator that pads the pre-tokenized examples using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer)

# Trainer setup: PEFT-wrapped model, training arguments, both splits, collator and metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_tokenized,
    eval_dataset=eval_dataset_tokenized,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Resume from the previously saved checkpoint when it is present. On a fresh
# machine (or after the checkpoint directory was cleaned) fall back to training
# from scratch instead of failing because the hard-coded path is missing.
resume_checkpoint = "training_checkpoints/mmbert_Qlora_2lcls_checkpoints/checkpoint-2475"
if not os.path.isdir(resume_checkpoint):
    print(f"Checkpoint {resume_checkpoint!r} not found; training from scratch.")
    resume_checkpoint = None

# As the last expression, the returned TrainOutput is displayed by the cell.
trainer.train(resume_from_checkpoint=resume_checkpoint)


In [None]:
# Save the PEFT-wrapped model (two-layer classifier variant) and the tokenizer into the same directory.
model.save_pretrained("trained_adapters/mmbert_Qlora_mawsa_adapters_2layercls")
tokenizer.save_pretrained("trained_adapters/mmbert_Qlora_mawsa_adapters_2layercls")