<a href="https://colab.research.google.com/github/chenhaodev/unsloth-practise/blob/main/bert_classification-modernbert-large.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%env UNSLOTH_DISABLE_FAST_GENERATION=1

env: UNSLOTH_DISABLE_FAST_GENERATION=1


In [None]:
# CH:
# google T5 will fail (maybe not supported by unsloth)
# answerdotai/ModernBERT-large will suc, but on NV GPU (L4, TPU will fail; T4 will fail due to fp16 format issue)

In [2]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [7]:
from unsloth import FastLanguageModel, FastModel
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import os
import torch
from torch import tensor
import torch.nn.functional as F
from transformers import TrainingArguments, Trainer, ModernBertModel, AutoModelForSequenceClassification, training_args
from datasets import load_dataset, Dataset
from tqdm import tqdm

model_name = 'answerdotai/ModernBERT-large'

NUM_CLASSES = 3
DATA_DIR = "data/"

model, tokenizer = FastModel.from_pretrained(
    model_name = model_name,load_in_4bit = False,
    max_seq_length = 2048,
    dtype = None,
    auto_model = AutoModelForSequenceClassification,
    num_labels = NUM_CLASSES,
)
print("model parameters:" + str(sum(p.numel() for p in model.parameters())))

# make all parameters trainable
for param in model.parameters():
    param.requires_grad = True

==((====))==  Unsloth 2025.6.8: Fast Modernbert patching. Transformers: 4.52.4.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


model.safetensors:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model parameters:395834371


In [8]:
# data = pd.read_csv(DATA_DIR + "finance_sentiment_multiclass.csv")
data = pd.read_csv("https://raw.githubusercontent.com/timothelaborie/text_classification_scripts/main/data/finance_sentiment_multiclass.csv")

labels = data["label"].tolist()
labels = [x-1 for x in labels]
# convert labels to one hot vectors
labels = np.eye(NUM_CLASSES)[labels]

train_data,val_data, train_labels, val_labels = train_test_split(data["text"], labels, test_size=0.1, random_state=42)
dataset = Dataset.from_list([{'text': text, 'labels': label} for text, label in zip(train_data, train_labels)])
val_dataset = Dataset.from_list([{'text': text, 'labels': label} for text, label in zip(val_data, val_labels)])

def tokenize_function(examples):
    return tokenizer(examples['text'])

dataset = dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
dataset

Map:   0%|          | 0/3893 [00:00<?, ? examples/s]

Map:   0%|          | 0/433 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'labels', 'input_ids', 'attention_mask'],
    num_rows: 3893
})

In [9]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    eval_dataset=val_dataset,
    args=TrainingArguments(
        per_device_train_batch_size=32,
        gradient_accumulation_steps=1,
        warmup_steps=10,
        fp16=not torch.cuda.is_bf16_supported(),
        bf16=torch.cuda.is_bf16_supported(),
        optim=training_args.OptimizerNames.ADAMW_TORCH,
        #optim=training_args.OptimizerNames.ADEMAMIX, # this can provide better results
        learning_rate=5e-5,
        weight_decay=0.001,
        lr_scheduler_type="cosine",
        seed=3407,
        num_train_epochs=3, # bert-style models usually need more than 1 epoch
        save_strategy="epoch",

        # report_to="wandb",
        report_to="none",

        group_by_length=True,

        # eval_strategy="no",
        eval_strategy="steps",
        eval_steps=0.25,
        logging_strategy="steps",
        logging_steps=0.25,

    ),
    compute_metrics=lambda eval_pred: { "accuracy": accuracy_score(eval_pred[1].argmax(axis=-1), eval_pred[0].argmax(axis=-1)) }
)
trainer_stats = trainer.train()

  trainer = Trainer(
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3,893 | Num Epochs = 3 | Total steps = 366
O^O/ \_/ \    Batch size per device = 32 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (32 x 1 x 1) = 32
 "-____-"     Trainable parameters = 395,834,371/395,834,371 (100.00% trained)


Step,Training Loss,Validation Loss,Accuracy
92,0.5472,0.389064,0.748268
184,0.3176,0.285406,0.833718
276,0.2349,0.277015,0.831409


Unsloth: Will smartly offload gradients to save VRAM!


In [10]:
model = model.cuda()
model = model.eval()
FastLanguageModel.for_inference(model)
print()




In [11]:
batch_size = 32
correct = 0
results = []

# If the val_labels are one-hot, convert to class indices
if isinstance(val_labels, np.ndarray) and val_labels.ndim == 2:
    val_true_labels = np.argmax(val_labels, axis=1)
else:
    val_true_labels = val_labels

val_texts = list(val_data)
val_true_labels = list(val_true_labels)

with torch.no_grad():
    for i in tqdm(range(0, len(val_texts), batch_size), desc="Evaluating"):
        batch_texts = val_texts[i:i+batch_size]
        batch_labels = val_true_labels[i:i+batch_size]
        # Tokenize
        inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=2048)
        inputs = {k: v.cuda() for k, v in inputs.items()}
        # Forward pass
        outputs = model(**inputs)
        logits = outputs.logits
        probs = F.softmax(logits, dim=-1)
        preds = torch.argmax(probs, dim=-1).cpu().numpy()
        # Count correct
        correct += np.sum(preds == batch_labels)
        # Store results for display
        for j in range(len(batch_texts)):
            results.append({
                "text": batch_texts[j][:200],
                "true": batch_labels[j],
                "pred": preds[j],
                "probs": probs[j].detach().float().cpu().numpy(),
                "ok": preds[j] == batch_labels[j]
        })

accuracy = 100 * correct / len(val_texts)
print(f"\nValidation accuracy: {accuracy:.2f}% ({correct}/{len(val_texts)})")

# Show a few random samples
import random
display = 20
print("\n--- Random samples ---")
for s in random.sample(results, min(display, len(results))):
    print(f"\nText: {s['text']}")
    print(f"True: {s['true']}  Pred: {s['pred']} {'✅' if s['ok'] else '❌'}")
    print("Probs:", ", ".join([f"{k}: {v:.3f}" for k, v in enumerate(s['probs'])]))

Evaluating: 100%|██████████| 14/14 [00:00<00:00, 18.15it/s]


Validation accuracy: 83.14% (360/433)

--- Random samples ---

Text: Turkey Stiffens Manipulation Penalties in Banking Overhaul
True: 0  Pred: 0 ✅
Probs: 0: 0.840, 1: 0.087, 2: 0.073

Text: The Manitowoc Company, Inc. Just Reported Earnings, And Analysts Cut Their Target Price
True: 2  Pred: 2 ✅
Probs: 0: 0.080, 1: 0.010, 2: 0.910

Text: $BLMN $EAT $SBUX - Restaurants stocks break higher, analysts reel in near-term expectations https://t.co/fOjVVJdfF0
True: 1  Pred: 1 ✅
Probs: 0: 0.000, 1: 1.000, 2: 0.000

Text: $CMCSA $LHX - Comcast sues L3Harris in patent dispute https://t.co/kWReshGbvz
True: 2  Pred: 2 ✅
Probs: 0: 0.032, 1: 0.002, 2: 0.966

Text: Libyan economic experts will study the distribution of crucial oil revenue as efforts continue to solve the war-rav… https://t.co/S9lmpnDTqJ
True: 0  Pred: 0 ✅
Probs: 0: 0.792, 1: 0.008, 2: 0.201

Text: Stocks Suffer 'Shocking' Down Week As Fed Balance Sheet Unexpectedly Shrinks https://t.co/bspsRi3Wow
True: 2  Pred: 2 ✅
Probs: 0: 0.001, 1




# to load the model again (run every cell above the one where the trainer is called)

In [None]:
from transformers.trainer_utils import get_last_checkpoint

output_dir = "trainer_output"
last_checkpoint = get_last_checkpoint(output_dir)
print("Last checkpoint:", last_checkpoint)

model, tokenizer = FastModel.from_pretrained(
    model_name = last_checkpoint,load_in_4bit = False,
    max_seq_length = 2048,
    dtype = None,
    auto_model = AutoModelForSequenceClassification,
    num_labels = NUM_CLASSES,
)

Last checkpoint: trainer_output\checkpoint-244
==((====))==  Unsloth 2025.4.5: Fast Modernbert patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 3090. Num GPUs = 1. Max memory: 23.999 GB. Platform: Windows.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.30. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: QLoRA and full finetuning all not selected. Switching to 16bit LoRA.


In [None]:
from torch import tensor
print(model(input_ids=tensor([[1,2,3,4,5]]).cuda(), attention_mask=tensor([[1,1,1,1,1]]).cuda()))

SequenceClassifierOutput(loss=None, logits=tensor([[-0.0579, -0.5859, -1.1719]], device='cuda:0', dtype=torch.bfloat16,
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
