<a href="https://colab.research.google.com/github/desve/labelcraft-2025-ml-challenge/blob/main/03_baseline_transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 03 — Baseline Transformer (LabelCraft 2025)
В этом ноутбуке:

* используем labeled_train.parquet и category_tree.csv;

* готовим поле text_clean;

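* обучаем простой классификатор на базе лёгкой RuBERT-модели (`cointegrated/rubert-tiny`; при необходимости её можно заменить на более крупную RuBERT/ruRoBERTa);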

* считаем macro/micro F1 на валидации и сравниваем с TF-IDF baseline.

# Импорты, монтирование Drive, пути

In [1]:
# Mount Google Drive at /content/drive (works only inside Google Colab).
# The data paths used later in the notebook point into this mount.
from google.colab import drive
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import torch
from torch.utils.data import Dataset, DataLoader

!pip install -q transformers accelerate

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Widen pandas display limits so long product texts and wide frames stay readable.
pd.set_option("display.max_colwidth", 200)
pd.set_option("display.max_columns", 50)

# torch.version is a submodule; the version string itself is torch.__version__.
print("Setup OK. Torch version:", torch.__version__)

# Directory on the mounted Google Drive that holds the competition files.
DATA_DIR = "/content/drive/MyDrive/LabelCraft_2025/data"

# Input files read by the data-loading cell.
train_path = os.path.join(DATA_DIR, "labeled_train.parquet")
categories_path = os.path.join(DATA_DIR, "category_tree.csv")

print("DATA_DIR:", DATA_DIR)
print("train_path:", train_path)
print("categories_path:", categories_path)

# Загрузка данных и подготовка text_clean

In [None]:
# Create DATA_DIR if it is missing (no error when it already exists), then list
# its contents so a missing input file is visible before the reads below.
os.makedirs(DATA_DIR, exist_ok=True)
print("Files in DATA_DIR:", os.listdir(DATA_DIR))

# Labeled training rows (parquet) and the category tree (CSV).
train = pd.read_parquet(train_path)
cat_tree = pd.read_csv(categories_path)

# Quick sanity check of what was loaded.
print("train:", train.shape)
print("category_tree:", cat_tree.shape)
print("train columns:", train.columns.tolist())

Склейка текста: source_name + attributes

In [None]:
# Replace missing names/attributes with empty strings so the concatenation
# below never produces NaN, then join the two fields with a single space.
train[["source_name", "attributes"]] = train[["source_name", "attributes"]].fillna("")
train["text"] = train["source_name"] + " " + train["attributes"]

Обрезаем до N слов для baseline

In [None]:
MAX_WORDS = 64  # default word budget per text for the baseline


def truncate_text(s, max_words=MAX_WORDS):
    """Return ``s`` reduced to at most ``max_words`` whitespace-separated words.

    The result is always whitespace-normalized: words are joined by single
    spaces and leading/trailing whitespace is dropped, regardless of whether
    the text had to be shortened.

    Args:
        s: Input text. Any non-string value (e.g. None or NaN) yields "".
        max_words: Maximum number of words to keep. Values <= 0 yield "".

    Returns:
        The normalized, possibly shortened text.
    """
    if not isinstance(s, str):
        return ""
    # Guard against non-positive budgets: a negative slice bound would
    # otherwise drop words from the end instead of limiting the count.
    if max_words <= 0:
        return ""
    return " ".join(s.split()[:max_words])

# Shorten every concatenated text, then coerce the column to plain strings
# with missing values replaced by "".
train["text_clean"] = train["text"].map(truncate_text).fillna("").astype(str)

# Ограничиваемся подвыборкой и кодируем классы в 0..num_labels-1

In [None]:
SAMPLE_SIZE = 40_000  # can be lowered to 20_000 if needed

# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling without replacement), so cap the request at the frame size.
sample = train.sample(n=min(SAMPLE_SIZE, len(train)), random_state=42).copy()

Гарантируем чистый текст

In [None]:
# Coerce text_clean to plain strings, replacing any missing values with "".
sample["text_clean"] = sample["text_clean"].fillna("").astype(str)

Частоты по cat_id в sample

In [None]:
# Rows per category in the sample, and the ids that occur at least twice.
cat_counts = sample["cat_id"].value_counts()
valid_cats = cat_counts.loc[cat_counts.ge(2)].index

Оставляем только категории с >= 2 объектами: стратифицированному разбиению (`stratify=y`) нужно минимум по два примера каждого класса.

In [None]:
# Keep only rows whose category is in valid_cats; copy to detach from the original frame.
sample = sample.loc[sample["cat_id"].isin(valid_cats)].copy()

Строим маппинг cat_id -> label_index на отфильтрованных данных

In [None]:
# Sorted category ids get consecutive label indices, so the mapping is deterministic.
unique_cats = sorted(sample["cat_id"].unique())
label2cat = dict(enumerate(unique_cats))
cat2label = {cat_id: idx for idx, cat_id in label2cat.items()}

sample["label"] = sample["cat_id"].map(cat2label)

print("Всего классов после фильтрации:", len(unique_cats))
print("Минимальная частота класса:", sample["cat_id"].value_counts().min())

# Plain arrays of texts and integer labels for the split below.
X_text = sample["text_clean"].values
y = sample["label"].values

from sklearn.model_selection import train_test_split

# 80/20 split, stratified by label, with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_text,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

print("Train size:", len(X_train), "Valid size:", len(X_valid))

# Dataset и токенизатор

In [None]:
# Model identifier passed to from_pretrained; used for both the tokenizer and the model.
MODEL_NAME = "cointegrated/rubert-tiny" # lightweight Russian BERT, can be replaced later

# Load the tokenizer that matches MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [None]:
from torch.utils.data import Dataset
import torch

class ProductsDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Args:
        texts: Iterable of input strings.
        labels: Iterable of integer class indices, one per text.
        tokenizer: Callable invoked as
            ``tokenizer(text, truncation=True, padding="max_length",
            max_length=..., return_tensors="pt")`` that returns a mapping of
            tensors with a leading batch dimension of size 1 (presumably a
            Hugging Face tokenizer — verify against the caller).
        max_length: Sequence length every item is padded/truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = list(texts)
        self.labels = list(labels)
        # Fail fast: a mismatch would otherwise surface as an IndexError in
        # the middle of training, or silently ignore surplus labels.
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors plus a ``labels`` entry."""
        text = self.texts[idx]
        label = int(self.labels[idx])
        enc = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )
        # The tokenizer output carries a batch dimension of 1; drop it so each
        # item is a single example.
        item = {key: val.squeeze(0) for key, val in enc.items()}
        item["labels"] = torch.tensor(label, dtype=torch.long)
        return item


In [None]:
# Wrap both splits with the same tokenizer and sequence length.
train_dataset, valid_dataset = (
    ProductsDataset(texts, labels, tokenizer, max_length=128)
    for texts, labels in ((X_train, y_train), (X_valid, y_valid))
)

print("Train dataset size:", len(train_dataset))
print("Valid dataset size:", len(valid_dataset))

# Модель и Trainer

In [None]:
# One output class per category kept after filtering.
num_labels = len(unique_cats)

# Load the pretrained checkpoint configured for num_labels classes.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=num_labels)

def compute_metrics(eval_pred):
    """Compute macro- and micro-averaged F1 for a (logits, labels) pair.

    Args:
        eval_pred: Two-element iterable of (logits, labels); the predicted
            class is the argmax of logits over the last axis.

    Returns:
        Dict with keys "macro_f1" and "micro_f1".
    """
    scores, targets = eval_pred
    predicted = scores.argmax(axis=-1)
    return {
        "macro_f1": f1_score(targets, predicted, average="macro"),
        "micro_f1": f1_score(targets, predicted, average="micro"),
    }

In [None]:
from transformers import TrainingArguments

# Training configuration for the one-epoch baseline run.
training_args = TrainingArguments(
    output_dir="/content/labelcraft_rubert_baseline",
    do_train=True,
    do_eval=True,
    # eval_steps is only honoured with a step-based evaluation strategy; without
    # this line no evaluation runs during training.
    # NOTE(review): transformers releases before 4.41 name this argument
    # `evaluation_strategy` — confirm against the installed version.
    eval_strategy="steps",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=1.0,
    weight_decay=0.01,
    logging_steps=50,
    eval_steps=200,  # evaluate every 200 steps
    save_strategy="no",  # do not save checkpoints
    report_to=[],
)

# Wire the model, data, tokenizer and metric function into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run training with the configuration above.
trainer.train()

# Оценка на валидации

In [None]:
# Evaluate the trained model on the eval dataset given to the Trainer and print the metrics.
eval_results = trainer.evaluate()
print("Eval results:", eval_results)