In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import numpy as np
import pandas as pd

import torch
from transformers import AutoTokenizer, ModernBertForSequenceClassification, TrainingArguments, Trainer
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, f1_score, accuracy_score

!pip install tqdm -q
from tqdm import tqdm

In [3]:
file_path = '/content/drive/My Drive/metadata/4a.train.json'
df = pd.read_json(file_path)
len(df)

4578

In [4]:
mlb = MultiLabelBinarizer()
df["binarized_labels"] = mlb.fit_transform(df["filtered_issues"]).tolist()

train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["docket_entries"], df["binarized_labels"], test_size=0.2, random_state=42
)

In [5]:
# Load tokenizer and model
model_name = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load model with correct number of labels
model = ModernBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(mlb.classes_),
    problem_type="multi_label_classification"
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("DEVICE: ", device)
model.to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DEVICE:  cuda


ModernBertForSequenceClassification(
  (model): ModernBertModel(
    (embeddings): ModernBertEmbeddings(
      (tok_embeddings): Embedding(50368, 768, padding_idx=50283)
      (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (layers): ModuleList(
      (0): ModernBertEncoderLayer(
        (attn_norm): Identity()
        (attn): ModernBertAttention(
          (Wqkv): Linear(in_features=768, out_features=2304, bias=False)
          (rotary_emb): ModernBertRotaryEmbedding()
          (Wo): Linear(in_features=768, out_features=768, bias=False)
          (out_drop): Identity()
        )
        (mlp_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): ModernBertMLP(
          (Wi): Linear(in_features=768, out_features=2304, bias=False)
          (act): GELUActivation()
          (drop): Dropout(p=0.0, inplace=False)
          (Wo): Linear(in_features=1152, out_features=768, bias=False)
        )
      

In [6]:
def tokenize_and_encode(texts, labels):
    tokenized = tokenizer(texts, padding="max_length", truncation=True, max_length=512)
    tokenized["labels"] = labels
    return tokenized

train_encodings = tokenize_and_encode(train_texts.tolist(), train_labels.tolist())
val_encodings = tokenize_and_encode(val_texts.tolist(), val_labels.tolist())

In [7]:
class MultiLabelDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        item = {
            key: torch.tensor(val[idx]) for key, val in self.encodings.items() if key != "labels"
        }
        item["labels"] = torch.tensor(self.encodings["labels"][idx], dtype=torch.float)
        return item

train_dataset = MultiLabelDataset(train_encodings)
val_dataset = MultiLabelDataset(val_encodings)

In [8]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = torch.sigmoid(torch.tensor(logits)).numpy()
    preds = (probs > 0.5).astype(int)

    return {
        "f1_micro": f1_score(labels, preds, average="micro"),
        "f1_macro": f1_score(labels, preds, average="macro"),
        "accuracy": accuracy_score(labels, preds),
    }

In [9]:
training_args = TrainingArguments(
    output_dir="/content/drive/My Drive/metadata/model/",
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    warmup_ratio=0.1,
    fp16=True,
    bf16=False,
    eval_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    save_total_limit=2,
    report_to="none"
)

In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

trainer.train()

  trainer = Trainer(


Step,Training Loss,Validation Loss,F1 Micro,F1 Macro,Accuracy
100,No log,0.283043,0.280333,0.036676,0.068777
200,No log,0.244953,0.308955,0.04779,0.067686
300,No log,0.237952,0.343496,0.06297,0.025109
400,No log,0.23127,0.318625,0.052961,0.065502


W0424 20:23:37.825000 9322 torch/_inductor/utils.py:1137] [1/1] Not enough SMs to use max_autotune_gemm mode


TrainOutput(global_step=458, training_loss=0.2744152285646663, metrics={'train_runtime': 1047.4122, 'train_samples_per_second': 3.496, 'train_steps_per_second': 0.437, 'total_flos': 1248192620783616.0, 'train_loss': 0.2744152285646663, 'epoch': 1.0})

In [11]:
trainer.evaluate()

{'eval_loss': 0.23033729195594788,
 'eval_f1_micro': 0.3398516520566419,
 'eval_f1_macro': 0.06383013327891142,
 'eval_accuracy': 0.05895196506550218,
 'eval_runtime': 11.9056,
 'eval_samples_per_second': 76.938,
 'eval_steps_per_second': 9.659,
 'epoch': 1.0}

In [12]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) Y
Token is valid (permission: fineGrained).
The token `FLP` has been saved to /root/.cache/huggingface/stored_tokens
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenti

In [13]:
# Save the trained model
model.save_pretrained(f"/content/drive/My Drive/metadata/model/final")

In [14]:
# Push the trained model to hub
model.push_to_hub(f"finetune_try", private=True, exist_ok=True)

model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/rachelFLP/finetune_try/commit/2e8e829965e8bd725fe9eb65a3ee17425dd80467', commit_message='Upload ModernBertForSequenceClassification', commit_description='', oid='2e8e829965e8bd725fe9eb65a3ee17425dd80467', pr_url=None, repo_url=RepoUrl('https://huggingface.co/rachelFLP/finetune_try', endpoint='https://huggingface.co', repo_type='model', repo_id='rachelFLP/finetune_try'), pr_revision=None, pr_num=None)