In [26]:
from dataclasses import dataclass
from typing import Any, Dict, Optional, Tuple

from datasets import DatasetDict, load_dataset
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    PreTrainedTokenizer,
)

In [20]:
# Load the "split" configuration of the dair-ai/emotion dataset from the Hub.
dataset = load_dataset(path="dair-ai/emotion", name="split")

In [36]:
def get_id_label_mapping(dataset: DatasetDict) -> Tuple[Dict[int, str], Dict[str, int]]:
    """Build the lookup tables between integer class ids and class names.

    The class names are read from the ``label`` feature of the ``train``
    split; the position of a name in that list is its integer id.

    Args:
        dataset: Dataset whose ``train`` split has a ``label`` feature
            exposing a ``names`` sequence.

    Returns:
        A ``(id2label, label2id)`` pair: id -> name and name -> id.
    """
    names = dataset["train"].features["label"].names
    id2label = dict(enumerate(names))
    # Invert the forward mapping rather than re-reading ``names``.
    label2id = dict(zip(id2label.values(), id2label.keys()))
    return id2label, label2id


def process_dataset(
    dataset: DatasetDict,
    tokenizer: PreTrainedTokenizer,
    max_length: Optional[int] = None,
) -> DatasetDict:
    """Tokenize the ``text`` column of ``dataset`` in batches.

    Each batch is truncated and padded with ``padding="max_length"``, so
    every example ends up with the same sequence length.

    Args:
        dataset: Dataset with a ``text`` column to tokenize.
        tokenizer: Tokenizer called on each batch of texts.
        max_length: Target length for truncation and padding. When ``None``
            (the default, preserving the previous behaviour) the tokenizer
            falls back to the maximum input length of the model, which can
            be very large for long-context checkpoints; pass a smaller value
            to keep the padded tensors compact.

    Returns:
        The result of ``dataset.map``: the dataset with the tokenizer's
        output columns added.
    """

    def tokenize(batch):
        # With batched=True, batch["text"] holds a list of strings.
        return tokenizer(
            batch["text"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )

    return dataset.map(tokenize, batched=True)

In [None]:
# Build the id -> name and name -> id lookup tables from the loaded dataset.
id2label, label2id = get_id_label_mapping(dataset)

In [23]:
MODEL_ID = "answerdotai/ModernBERT-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# The checkpoint loads through the library's own ModernBERT implementation, so
# trust_remote_code is not needed; leaving it off avoids executing code from
# the model repository.
# id2label / label2id are stored in the model config so predictions are
# reported with the dataset's class names instead of generic LABEL_n ids.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Store the result under its own name: assigning it back to `process_dataset`
# would overwrite the function, so re-running this cell would fail.
# NOTE(review): update any later cell that still reads `process_dataset` as data.
processed_dataset = process_dataset(dataset, tokenizer)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

In [None]:
@dataclass
class FineTuningMetrics:
    """Metrics collected during fine-tuning."""

    # Classification report as a dictionary — presumably the output of
    # sklearn's classification_report(output_dict=True); TODO confirm.
    classification_report: Dict[str, Any]
    # Training duration — units are not shown here (presumably seconds; confirm).
    training_time: float
    # Number of trainable parameters (assumed from the name; verify at the call site).
    trainable_params: int
    # Peak memory usage — units and device (CPU/GPU) are not shown here; confirm.
    peak_memory_usage: float