## TRAINING

In [4]:
import torch

In [1]:
from datasets import load_from_disk

# Read the tokenized train and eval datasets back from disk (train first, then eval).
train_dataset_tokenized, eval_dataset_tokenized = map(
    load_from_disk,
    ("Data/tokenized_train_dataset", "Data/tokenized_eval_dataset"),
)

def convert_labels_to_float(batch):
    """Add a "labels" entry holding every "label" value cast to float.

    Meant to be used with `Dataset.map(..., batched=True)`: `batch["label"]`
    is iterated as a sequence, the incoming mapping is updated in place and
    the same object is returned.

    NOTE(review): the models in this notebook are created with num_labels=2
    and the metrics take an argmax over the logits, so float targets look
    like a leftover from a different setup — confirm they are intended.
    """
    batch["labels"] = list(map(float, batch["label"]))
    return batch

# Run the batched label conversion over both splits (train first, then eval)
# and rebind the dataset names to the mapped results.
train_dataset_tokenized, eval_dataset_tokenized = (
    split.map(convert_labels_to_float, batched=True)
    for split in (train_dataset_tokenized, eval_dataset_tokenized)
)


In [2]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

def compute_metrics(eval_pred):
    """Score one evaluation pass with accuracy and F1.

    `eval_pred` unpacks into (logits, labels). The predicted class of each
    row is the index of its largest logit along axis 1; both metrics compare
    those predictions against the labels.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=1)
    metrics = {}
    metrics["accuracy"] = accuracy_score(references, predicted)
    metrics["f1"] = f1_score(references, predicted)
    return metrics


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
from peft import LoraConfig, get_peft_model
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# Tokenizer: the datasets are already tokenized, so it is not used to encode text here.
# It is still passed to the padding data collator and saved next to the adapter.
tokenizer = AutoTokenizer.from_pretrained("jhu-clsp/mmBERT-base", use_fast=True)

# Evaluation metrics (accuracy and F1)
def compute_metrics(eval_pred):
    """Score one evaluation pass with accuracy and F1.

    `eval_pred` unpacks into (logits, labels). The predicted class of each
    row is the index of its largest logit along axis 1; both metrics compare
    those predictions against the labels.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=1)
    metrics = {}
    metrics["accuracy"] = accuracy_score(references, predicted)
    metrics["f1"] = f1_score(references, predicted)
    return metrics

# Load mmBERT with a 2-logit sequence-classification head
model = AutoModelForSequenceClassification.from_pretrained(
    "jhu-clsp/mmBERT-base",
    num_labels=2,  # binary classification
    # `torch_dtype` is deprecated in favour of `dtype` (see the warning this call used to emit).
    # "auto" keeps the precision recorded in the checkpoint; it does not look at the
    # device or at the Trainer's fp16 flag.
    dtype="auto",
)

# LoRA setup for sequence classification
lora_config = LoraConfig(
    task_type="SEQ_CLS",
    target_modules=["Wqkv", "Wo"],  # module names inside mmBERT that get adapters
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Rebind `model` to the PEFT-wrapped version of the base model
model = get_peft_model(model, lora_config)


# Trainer hyper-parameters
args = TrainingArguments(
    output_dir="training_checkpoints/mmbert_lora_checkpoints",
    # optimisation: 5 epochs, per-device batches of 8 accumulated over 2 steps
    num_train_epochs=5,
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    fp16=True,  # mixed precision
    # evaluate and checkpoint once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
    # at the end, reload the checkpoint with the highest eval F1
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

# Collator that pads the examples of each batch with the tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Wire model, hyper-parameters, data and metrics into the Trainer
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset_tokenized,  # tokenized training split
    eval_dataset=eval_dataset_tokenized,    # tokenized evaluation split
)

# Start training; the returned TrainOutput is this cell's displayed value
trainer.train()


`torch_dtype` is deprecated! Use `dtype` instead!
Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at jhu-clsp/mmBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5159,0.285702,0.875349,0.710989
2,0.4756,0.268481,0.894188,0.726477
3,0.3766,0.313678,0.901468,0.729923
4,0.3006,0.374423,0.895228,0.734847
5,0.2556,0.548018,0.896357,0.730864


TrainOutput(global_step=49465, training_loss=0.4018260232818798, metrics={'train_runtime': 11085.8687, 'train_samples_per_second': 71.388, 'train_steps_per_second': 4.462, 'total_flos': 6.88165004703744e+16, 'train_loss': 0.4018260232818798, 'epoch': 5.0})

In [None]:
# Save the PEFT-wrapped model (`model` was rebound by get_peft_model in the training cell)
# and the tokenizer files into the same directory.
# NOTE(review): load_best_model_at_end=True was set, so this is presumably the best-F1
# checkpoint rather than the final epoch — confirm.
model.save_pretrained("trained_adapters/mmbert_lora_mawsa_adapters")
tokenizer.save_pretrained("trained_adapters/mmbert_lora_mawsa_adapters")

In [None]:
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    TrainingArguments, 
    Trainer, 
    DataCollatorWithPadding
)
from peft import LoraConfig, get_peft_model
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import torch.nn as nn

# Tokenizer for mmBERT; in this cell it is passed to the padding data collator,
# and a later cell saves it next to the adapter.
tokenizer = AutoTokenizer.from_pretrained("jhu-clsp/mmBERT-base", use_fast=True)


def compute_metrics(eval_pred):
    """Score one evaluation pass with accuracy and F1.

    `eval_pred` unpacks into (logits, labels). The predicted class of each
    row is the index of its largest logit along axis 1; both metrics compare
    those predictions against the labels.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=1)
    metrics = {}
    metrics["accuracy"] = accuracy_score(references, predicted)
    metrics["f1"] = f1_score(references, predicted)
    return metrics

# Load mmBERT with a 2-logit sequence-classification head
# (the stock classifier is replaced by a custom head right after this).
model = AutoModelForSequenceClassification.from_pretrained(
    "jhu-clsp/mmBERT-base",
    num_labels=2,  # now HF uses CrossEntropyLoss automatically
    # `torch_dtype` is deprecated in favour of `dtype`; "auto" keeps the precision
    # recorded in the checkpoint.
    dtype="auto",
)


# Swap the stock classifier for a two-layer head:
# Linear(hidden -> hidden) -> ReLU -> Dropout(0.1) -> Linear(hidden -> 2)
hidden = model.config.hidden_size
head_layers = [
    nn.Linear(hidden, hidden),
    nn.ReLU(),
    nn.Dropout(0.1),
    nn.Linear(hidden, 2),  # 2 output logits for CrossEntropyLoss
]
model.classifier = nn.Sequential(*head_layers)



# LoRA setup for sequence classification
lora_config = LoraConfig(
    task_type="SEQ_CLS",
    target_modules=["Wqkv", "Wo"],  # module names that get adapters
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Rebind `model` to the PEFT-wrapped version (custom classifier head included)
model = get_peft_model(model, lora_config)


# Force every trainable parameter to float32; frozen parameters keep their dtype.
# NOTE(review): presumably needed because training runs with fp16=True — confirm.
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue
    param.data = param.data.float()


# Trainer hyper-parameters for the run with the two-layer classifier head
training_args = TrainingArguments(
    output_dir="training_checkpoints/mmbert_lora_2lcls_checkpoints",
    # optimisation: 5 epochs, per-device batches of 8 accumulated over 2 steps
    num_train_epochs=5,
    learning_rate=1e-4,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    fp16=True,  # mixed precision
    # evaluate and checkpoint once per epoch
    eval_strategy="epoch",
    save_strategy="epoch",
    # at the end, reload the checkpoint with the highest eval F1
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
)

# Collator that pads the examples of each batch with the tokenizer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Wire model, hyper-parameters, data and metrics into the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset_tokenized,  # tokenized training split
    eval_dataset=eval_dataset_tokenized,    # tokenized evaluation split
)

# Start training; the returned TrainOutput is this cell's displayed value
trainer.train()


Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at jhu-clsp/mmBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5108,0.262909,0.888186,0.72726
2,0.4607,0.263542,0.899299,0.73054
3,0.3588,0.331843,0.901468,0.735651
4,0.2876,0.440772,0.896565,0.730719
5,0.2156,0.592929,0.893326,0.724567


TrainOutput(global_step=49465, training_loss=0.37669459949980416, metrics={'train_runtime': 11053.7835, 'train_samples_per_second': 71.595, 'train_steps_per_second': 4.475, 'total_flos': 6.95344184358912e+16, 'train_loss': 0.37669459949980416, 'epoch': 5.0})

In [None]:
# Save the PEFT-wrapped model with the two-layer classifier head and the tokenizer
# files into the same directory.
# NOTE(review): reloading this adapter presumably requires rebuilding the same custom
# `classifier` head on the base model first — confirm before using it for inference.
model.save_pretrained("trained_adapters/mmbert_lora_mawsa_adapters_2layercls")
tokenizer.save_pretrained("trained_adapters/mmbert_lora_mawsa_adapters_2layercls")

('mmbert_lora_mawsa_adapters_2layercls\\tokenizer_config.json',
 'mmbert_lora_mawsa_adapters_2layercls\\special_tokens_map.json',
 'mmbert_lora_mawsa_adapters_2layercls\\tokenizer.json')