In [1]:
!pip install torch
!pip install transformers
!pip install peft
!pip install datasets
!pip install scikit-learn

Collecting torch
  Downloading torch-2.9.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (30 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx>=2.5.1 (from torch)
  Downloading networkx-3.6.1-py3-none-any.whl.metadata (6.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.8.93 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.8.93-py3-none-manylinux2010_x86_64.manylinux_2_12_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cuda-runtime-cu12==12.8.90 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cuda-cupti-cu12==12.8.90 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.8.90-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.7 kB)
Collecting nvidia-cudnn-cu12==9.10.2.21 (from torch)
  Downloading nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_x86_64.whl.metadata (1.8 kB)
Collecting nvidia-cubl

In [1]:
# ---------------------------------------------
# ModernBERT LoRA LANGUAGE ADAPTER TRAINING
# ---------------------------------------------

from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForMaskedLM,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    Trainer,
)
from peft import LoraConfig, get_peft_model
import torch
import os

# Device
device = "cuda" if torch.cuda.is_available() else "cpu"


# -------------------------------
# Minimal helper functions
# -------------------------------

def load_tokenizer():
    """Always load the ModernBERT tokenizer."""
    return AutoTokenizer.from_pretrained("jhu-clsp/mmBERT-base", use_fast=True)


def save_peft_adapter(model, output_dir):
    """Save only LoRA adapter weights."""
    os.makedirs(output_dir, exist_ok=True)
    model.save_pretrained(output_dir)


# -------------------------------
# Dataset preparation
# -------------------------------

def prepare_mlm_dataset(tokenizer, wiki_dataset_name, wiki_config, split="train[:20000]", seq_len=512):

    # Load split (ModernBERT is long-sequence capable)
    ds = load_dataset(wiki_dataset_name, wiki_config, split=split)

    def tokenize(example):
        return tokenizer(
            example["text"],
            truncation=True,
            max_length=seq_len,
            return_special_tokens_mask=True,
        )

    tokenized = ds.map(tokenize, batched=True, remove_columns=ds.column_names)

    return tokenized


# -------------------------------
# Train one LoRA language adapter
# -------------------------------

def train_language_lora(
    wiki_config,
    split="train[:20000]",
    out_dir="language_lora",
    seq_len=128,
    batch_size=8,
    epochs=2,
):
    print(f"\n=====================================")
    print(f" Training language LoRA for {wiki_config}")
    print(f"=====================================\n")

    tokenizer = load_tokenizer()

    dataset = prepare_mlm_dataset(
        tokenizer,
        wiki_dataset_name="wikimedia/wikipedia",
        wiki_config=wiki_config,
        split=split,
        seq_len=seq_len,
    )

    # Data collator for MLM
    collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm_probability=0.15
    )

    # Base model (ModernBERT MLM)
    model = AutoModelForMaskedLM.from_pretrained(
        "jhu-clsp/mmBERT-base",
        torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
    ).to(device)

    # LoRA config
    lora_cfg = LoraConfig(
        r=16,
        lora_alpha=32,
        lora_dropout=0.1,
        target_modules=["Wqkv", "Wo"],   # ModernBERT attention
        bias="none",
        task_type="SEQ_CLS",  # trick into actual mlm
    )

    model = get_peft_model(model, lora_cfg)

    # Training args
    args = TrainingArguments(
        output_dir=f"{out_dir}_checkpoints",
        per_device_train_batch_size=batch_size,
        num_train_epochs=epochs,
        learning_rate=1e-4,
        fp16=(device == "cuda"),
        bf16=False,
        logging_steps=50,
        logging_first_step=True,
        save_steps=5000,
        dataloader_num_workers=2,
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=dataset,
        data_collator=collator,
    )

    trainer.train()

    # Save LoRA adapter weights
    save_peft_adapter(model, out_dir)

    print(f"\nSaved LoRA language adapter → {out_dir}\n")



### Training Language Adapters

In [32]:
# English
train_language_lora(wiki_config="20231101.en", split="train[:10000]", out_dir="trained_adapters/language_lora_en", epochs=10)


 Training language LoRA for 20231101.en



Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Step,Training Loss
1,1.8106
50,1.39
100,1.3397
150,1.3469
200,1.2859
250,1.2922
300,1.2593
350,1.2935
400,1.2558
450,1.2546



Saved LoRA language adapter → trained_adapters/language_lora_en



In [33]:
# Danish
train_language_lora(wiki_config="20231101.da", split="train[:10000]", out_dir="trained_adapters/language_lora_da", epochs=10)


 Training language LoRA for 20231101.da



20231101.da/train-00000-of-00002.parquet:   0%|          | 0.00/203M [00:00<?, ?B/s]

20231101.da/train-00001-of-00002.parquet:   0%|          | 0.00/125M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/295347 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Step,Training Loss
1,1.6602
50,1.1902
100,1.0849
150,1.1767
200,1.0948
250,1.0751
300,1.0264
350,1.0381
400,0.9621
450,1.0614



Saved LoRA language adapter → trained_adapters/language_lora_da



In [34]:
# Italian
train_language_lora(wiki_config="20231101.it", split="train[:10000]", out_dir="trained_adapters/language_lora_it", epochs=10)


 Training language LoRA for 20231101.it



20231101.it/train-00000-of-00010.parquet:   0%|          | 0.00/529M [00:00<?, ?B/s]

20231101.it/train-00001-of-00010.parquet:   0%|          | 0.00/376M [00:00<?, ?B/s]

20231101.it/train-00002-of-00010.parquet:   0%|          | 0.00/340M [00:00<?, ?B/s]

20231101.it/train-00003-of-00010.parquet:   0%|          | 0.00/287M [00:00<?, ?B/s]

20231101.it/train-00004-of-00010.parquet:   0%|          | 0.00/241M [00:00<?, ?B/s]

20231101.it/train-00005-of-00010.parquet:   0%|          | 0.00/241M [00:00<?, ?B/s]

20231101.it/train-00006-of-00010.parquet:   0%|          | 0.00/222M [00:00<?, ?B/s]

20231101.it/train-00007-of-00010.parquet:   0%|          | 0.00/218M [00:00<?, ?B/s]

20231101.it/train-00008-of-00010.parquet:   0%|          | 0.00/220M [00:00<?, ?B/s]

20231101.it/train-00009-of-00010.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1833639 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Step,Training Loss
1,1.292
50,1.2484
100,1.0654
150,1.0861
200,1.051
250,1.0814
300,1.0453
350,1.0205
400,1.078
450,1.0044



Saved LoRA language adapter → trained_adapters/language_lora_it



In [35]:
# Polish
train_language_lora(wiki_config="20231101.pl", split="train[:10000]", out_dir="trained_adapters/language_lora_pl", epochs=10)


 Training language LoRA for 20231101.pl



20231101.pl/train-00000-of-00006.parquet:   0%|          | 0.00/431M [00:00<?, ?B/s]

20231101.pl/train-00001-of-00006.parquet:   0%|          | 0.00/339M [00:00<?, ?B/s]

20231101.pl/train-00002-of-00006.parquet:   0%|          | 0.00/274M [00:00<?, ?B/s]

20231101.pl/train-00003-of-00006.parquet:   0%|          | 0.00/248M [00:00<?, ?B/s]

20231101.pl/train-00004-of-00006.parquet:   0%|          | 0.00/243M [00:00<?, ?B/s]

20231101.pl/train-00005-of-00006.parquet:   0%|          | 0.00/231M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1587721 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Step,Training Loss
1,1.7809
50,1.0627
100,0.9603
150,0.9483
200,0.9436
250,0.9163
300,0.9189
350,0.8714
400,0.9225
450,0.8966



Saved LoRA language adapter → trained_adapters/language_lora_pl



### Dataset import for task training

In [2]:
from datasets import load_from_disk

train_dataset_tokenized = load_from_disk(r"Data/tokenized_train_dataset")
eval_dataset_tokenized = load_from_disk(r"Data/tokenized_eval_dataset")

def convert_labels_to_float(batch):
    batch["labels"] = [float(x) for x in batch["label"]] 
    return batch

train_dataset_tokenized = train_dataset_tokenized.map(convert_labels_to_float, batched=True)
eval_dataset_tokenized  = eval_dataset_tokenized.map(convert_labels_to_float, batched=True)

In [3]:
torch.cuda.is_available()

True

## LoRA Task Training

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "jhu-clsp/mmBERT-base",
    use_fast=True
)

from transformers import AutoModelForMaskedLM
from peft import PeftModel
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

# Load MLM model
mlm_model = AutoModelForMaskedLM.from_pretrained(
    "jhu-clsp/mmBERT-base"
)

# Load trained language LoRA
mlm_model = PeftModel.from_pretrained(
    mlm_model,
    "trained_adapters/language_lora_en",
    is_trainable=False
)

mlm_model.to(device)

import torch.nn as nn
from transformers import ModernBertConfig, ModernBertForSequenceClassification

config = ModernBertConfig.from_pretrained(
    "jhu-clsp/mmBERT-base",
    num_labels=2
)

# Create classification model
cls_model = ModernBertForSequenceClassification(config)

# Copy encoder weights (THIS PRESERVES THE LoRA-lang-adapter)
cls_model.modernbert = mlm_model.base_model.model

cls_model.to(device)

from peft import LoraConfig, get_peft_model

task_lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["Wqkv", "Wo"],
    bias="none",
    task_type="SEQ_CLS"
)

model = get_peft_model(cls_model, task_lora_config)
model.to(device)

model.print_trainable_parameters()

from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

data_collator = DataCollatorWithPadding(tokenizer)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds)
    }

training_args = TrainingArguments(
    output_dir="./mawsa_lora_task",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    fp16=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_steps=50,
    torch_compile=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_tokenized,
    eval_dataset=eval_dataset_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()




trainable params: 4,597,250 || all params: 619,915,268 || trainable%: 0.7416


  trainer = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
model.save_pretrained("trained_adapters/lora_lang_mawsa_adapter")
tokenizer.save_pretrained("trained_adapters/lora_lang_mawsa_adapter")

## QLoRA Training

In [40]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "jhu-clsp/mmBERT-base",
    use_fast=True
)

from transformers import AutoModelForMaskedLM
from peft import PeftModel
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

mlm_model = AutoModelForMaskedLM.from_pretrained(
    "jhu-clsp/mmBERT-base"
)

mlm_model = PeftModel.from_pretrained(
    mlm_model,
    "trained_adapters/language_lora_en",
    is_trainable=False
)

mlm_model.to(device)

import torch.nn as nn
from transformers import ModernBertConfig, ModernBertForSequenceClassification

# Build config
config = ModernBertConfig.from_pretrained(
    "jhu-clsp/mmBERT-base",
    num_labels=2
)

cls_model = ModernBertForSequenceClassification(config)

cls_model.modernbert = mlm_model.base_model.model

cls_model.to(device)

from peft import LoraConfig, get_peft_model

task_lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["Wqkv", "Wo", "mlp.Wi", "mlp.Wo"], # added mlp modules (QLoRA)
    bias="none",
    task_type="SEQ_CLS"
)

model = get_peft_model(cls_model, task_lora_config)
model.to(device)

model.print_trainable_parameters()

from transformers import Trainer, TrainingArguments, DataCollatorWithPadding
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

data_collator = DataCollatorWithPadding(tokenizer)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds)
    }

training_args = TrainingArguments(
    output_dir="./mawsa_Qlora_task",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    fp16=True,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_steps=50
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_tokenized,
    eval_dataset=eval_dataset_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()




trainable params: 6,759,938 || all params: 622,077,956 || trainable%: 1.0867


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6784,0.397786,0.831461,0.525673
2,0.7497,0.361398,0.856867,0.510915
3,0.6434,0.359815,0.859036,0.523599
4,0.6162,0.355824,0.855916,0.550894
5,0.6079,0.35619,0.860967,0.561932


TrainOutput(global_step=49465, training_loss=0.6960911192836624, metrics={'train_runtime': 16500.6765, 'train_samples_per_second': 47.962, 'train_steps_per_second': 2.998, 'total_flos': 1.391011983194112e+17, 'train_loss': 0.6960911192836624, 'epoch': 5.0})

In [41]:
model.save_pretrained("trained_adapters/Qlora_lang_mawsa_adapter")
tokenizer.save_pretrained("trained_adapters/Qlora_lang_mawsa_adapter")

('trained_adapters/Qlora_lang_mawsa_adapter/tokenizer_config.json',
 'trained_adapters/Qlora_lang_mawsa_adapter/special_tokens_map.json',
 'trained_adapters/Qlora_lang_mawsa_adapter/tokenizer.json')

languages = ["en", "da", "it", "pl"]
for lang in languages:
    # Load base + task LoRA
    base_model = AutoModelForSequenceClassification.from_pretrained(
        "jhu-clsp/mmBERT-base",
        num_labels=2
    )
    hidden = base_model.config.hidden_size
    base_model.classifier = nn.Sequential(
        nn.Linear(hidden, hidden),
        nn.ReLU(),
        nn.Dropout(0.1),
        nn.Linear(hidden, 2)
    )

    # Load language LoRA
    model = PeftModel.from_pretrained(base_model, f"language_lora_{lang}")

    # Freeze language LoRA
    for n, p in model.named_parameters():
        if "lora" in n:
            p.requires_grad = False

    # Apply task LoRA
    model = get_peft_model(model, task_lora_config)
    model.to(device)
    model.eval()

    # run evaluation on eval_dataset_tokenized or prediction pipeline
