In [10]:
import os, random
from datasets import Dataset
import numpy as np
import pandas as pd
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer)

from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, accuracy_score

# Seed Python's and NumPy's global RNGs; the same SEED is also handed to the
# trainer further down.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Dataset location; override by exporting DATA_DIR before starting the kernel.
DATA_DIR = os.environ.get("DATA_DIR", "../data/synth_clinical")

notes_v2  = os.path.join(DATA_DIR, "notes_hard_v2.csv")
labels_v2 = os.path.join(DATA_DIR, "doc_labels_hard_v2.csv")

notes = pd.read_csv(notes_v2)
labs  = pd.read_csv(labels_v2)

# Left join keeps every note. Notes without a row in the label file end up
# with NaN in both `label` and `split`, so the split filters below drop them.
df = (
    notes
    .merge(labs[['doc_id', 'hard_v2_label', 'split']], on='doc_id', how='left')
    .rename(columns={'hard_v2_label': 'label'})
)

# The NaNs introduced by the left join make pandas upcast an integer `label`
# column to float64. Cast back to int64 after selecting each split so the model
# and metrics receive integer class ids; the cast raises if a selected row has
# no label instead of silently passing NaN into training.
# NOTE(review): assumes labels are encoded as 0/1 integers -- confirm against the CSV.
train_df = (
    df.loc[df['split'] == 'train', ['text', 'label']]
    .astype({'label': 'int64'})
    .reset_index(drop=True)
)
test_df = (
    df.loc[df['split'] == 'test', ['text', 'label']]
    .astype({'label': 'int64'})
    .reset_index(drop=True)
)

ds = {
    "train": Dataset.from_pandas(train_df),
    "test":  Dataset.from_pandas(test_df),
}
len(ds["train"]), len(ds["test"])

(480, 120)

In [11]:
def compute_metrics(eval_pred):
    """Score two-class logits with accuracy, F1, AUROC and AUPRC.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` is indexed as a
            2-D array; column 1 is used as the positive-class score.

    Returns:
        dict: ``accuracy`` and ``f1`` computed from predictions thresholded
        at 0.5, and ``auroc`` and ``auprc`` computed from the positive-class
        probability.
    """
    logits, labels = eval_pred

    # Softmax, shifted by each row's maximum so the exponentials cannot overflow.
    exp_scores = np.exp(logits - logits.max(axis=1, keepdims=True))
    positive_prob = exp_scores[:, 1] / exp_scores.sum(axis=1)
    hard_preds = (positive_prob >= 0.5).astype(int)

    # (metric key, scorer, what the scorer is fed) in reporting order.
    scorers = (
        ('accuracy', accuracy_score, hard_preds),
        ('f1', f1_score, hard_preds),
        ('auroc', roc_auc_score, positive_prob),
        ('auprc', average_precision_score, positive_prob),
    )
    return {key: scorer(labels, scores) for key, scorer, scores in scorers}


In [14]:
def run_model(model_name:str, max_len=768, epochs=3, batch=16, lr=2e-5, fp16=True):
    """Fine-tune ``model_name`` on ``ds['train']`` and evaluate it on ``ds['test']``.

    Args:
        model_name: Model id or path passed to ``from_pretrained`` for both
            the tokenizer and the classification model.
        max_len: Maximum number of tokens per document; longer texts are truncated.
        epochs: Number of training epochs.
        batch: Per-device batch size, used for both training and evaluation.
        lr: Learning rate.
        fp16: Forwarded to ``TrainingArguments(fp16=...)``.

    Returns:
        dict: The metrics returned by ``trainer.evaluate()`` plus ``'seconds'``,
        the wall-clock duration of ``trainer.train()``. Metric keys are left
        exactly as the trainer reports them (by default it prefixes them with
        ``eval_``, e.g. ``eval_accuracy``).
    """
    import time

    tok = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

    def _tokenize(rows):
        return tok(rows['text'], max_length=max_len, truncation=True)

    # Tokenize BOTH splits and keep them keyed by split name. Mapping only
    # ds['train'] yields a single Dataset, on which enc['train'] / enc['test']
    # would be column lookups and fail, and the test split would never be
    # tokenized at all.
    enc = {
        split: ds[split].map(_tokenize, batched=True, remove_columns=['text'])
        for split in ('train', 'test')
    }

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
    args = TrainingArguments(
        output_dir=f"../reports/doc_cls_hard_{model_name.replace('/','_')}",
        per_device_train_batch_size=batch,
        per_device_eval_batch_size=batch,
        num_train_epochs=epochs,
        learning_rate=lr,
        fp16=fp16,
        logging_steps=50,
        eval_strategy='epoch',
        save_strategy='no',  # no checkpoints are written
        report_to=[],        # no external experiment trackers
        seed=SEED
    )
    # The examples are tokenized without padding above; the tokenizer is handed
    # to the Trainer alongside them.
    # NOTE(review): presumably the Trainer pads each batch via this tokenizer --
    # confirm for the installed transformers version.
    # NOTE(review): newer transformers releases rename `tokenizer=` to
    # `processing_class=` -- confirm against the installed version.
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=enc['train'],
        eval_dataset=enc['test'],
        tokenizer=tok,
        compute_metrics=compute_metrics,
    )

    # Monotonic clock so the measured duration is immune to system clock changes.
    t0 = time.perf_counter()
    trainer.train()
    dur = time.perf_counter() - t0

    metrics = trainer.evaluate()
    metrics['seconds'] = dur
    return metrics


In [None]:
# Display name -> checkpoint id, trained one after another in this order.
model_ids = {
    "BioClinical ModernBERT": "thomas-sounack/BioClinical-ModernBERT-base",
    "ModernBERT (vanilla)": "answerdotai/ModernBERT-base",
}

# Metrics dict per display name, filled as each run finishes.
results = {}
for name, mn in model_ids.items():
    print(f"\n==== Training {name}: {mn} ====")
    results[name] = run_model(mn, max_len=1024, epochs=10)
results



==== Training BioClinical ModernBERT: thomas-sounack/BioClinical-ModernBERT-base ====


In [None]:
# One row per model. Trainer.evaluate() reports the compute_metrics values
# under "eval_"-prefixed keys (eval_accuracy, eval_f1, ...), so selecting the
# bare names directly raises KeyError. Rename them first; the rename is a no-op
# for any column that is already unprefixed.
(
    pd.DataFrame(results).T
    .rename(columns={f"eval_{metric}": metric for metric in ('accuracy', 'f1', 'auroc', 'auprc')})
    [['accuracy', 'f1', 'auroc', 'auprc', 'eval_loss', 'seconds']]
)
