In [1]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from sklearn.metrics import f1_score
import numpy as np


In [2]:
train = pd.read_csv("train.csv")

# при необходимости очистить текст
train["text"] = train["text"].astype(str)


In [3]:
from sklearn.model_selection import train_test_split

train_df, val_df = train_test_split(
    train, test_size=0.1, random_state=42, stratify=train["label"]
)

train_ds = Dataset.from_pandas(train_df)
val_ds   = Dataset.from_pandas(val_df)


In [4]:
model_name = "cointegrated/rubert-tiny2"

tokenizer = AutoTokenizer.from_pretrained(model_name)


'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /cointegrated/rubert-tiny2/resolve/main/tokenizer_config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x15920fbc0>: Failed to resolve \'huggingface.co\' ([Errno 8] nodename nor servname provided, or not known)"))'), '(Request ID: e7db9aec-845e-47ef-9c40-3ac3fee81371)')' thrown while requesting HEAD https://huggingface.co/cointegrated/rubert-tiny2/resolve/main/tokenizer_config.json
Retrying in 1s [Retry 1/5].
'(MaxRetryError('HTTPSConnectionPool(host=\'huggingface.co\', port=443): Max retries exceeded with url: /cointegrated/rubert-tiny2/resolve/main/tokenizer_config.json (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x35d38f500>: Failed to resolve \'huggingface.co\' ([Errno 8] nodename nor servname provided, or not known)"))'), '(Request ID: dff4536e-9e62-4584-a3d1-5f607504ce91)')' thrown while requesting HEAD http

In [5]:
def tokenize(batch):
    return tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=128
    )

# === токенизация ===
train_ds = train_ds.map(tokenize, batched=True)
val_ds   = val_ds.map(tokenize, batched=True)

# === ПЕРЕИМЕНОВАНИЕ label -> labels ВЕЗДЕ ===
if "label" in train_ds.column_names:
    train_ds = train_ds.rename_column("label", "labels")

if "label" in val_ds.column_names:
    val_ds = val_ds.rename_column("label", "labels")

# Проверим ещё раз перед форматированием
print("TRAIN COLUMNS:", train_ds.column_names)
print("VAL COLUMNS:", val_ds.column_names)

# === ОСТАВЛЯЕМ ТОЛЬКО НУЖНЫЕ КОЛОНКИ ===
train_ds = train_ds.remove_columns(
    [col for col in train_ds.column_names if col not in ["input_ids", "attention_mask", "labels"]]
)

val_ds = val_ds.remove_columns(
    [col for col in val_ds.column_names if col not in ["input_ids", "attention_mask", "labels"]]
)

# === Устанавливаем формат ===
train_ds.set_format(type="torch")
val_ds.set_format(type="torch")


Map:   0%|          | 0/209129 [00:00<?, ? examples/s]

Map:   0%|          | 0/23237 [00:00<?, ? examples/s]

TRAIN COLUMNS: ['ID', 'text', 'src', 'labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask']
VAL COLUMNS: ['ID', 'text', 'src', 'labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask']


In [6]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=3
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at cointegrated/rubert-tiny2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def compute_metrics(pred):
    preds = np.argmax(pred.predictions, axis=1)
    f1 = f1_score(pred.label_ids, preds, average="macro")
    return {"macro_f1": f1}


In [8]:
def compute_metrics(pred):
    preds = np.argmax(pred.predictions, axis=1)
    f1 = f1_score(pred.label_ids, preds, average="macro")
    return {"macro_f1": f1}


In [9]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./bert_model",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    logging_steps=100,
)


In [10]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

# Явно вызываем оценку после обучения
metrics = trainer.evaluate()
print(metrics)

# Явно сохраняем модель
trainer.save_model("./bert_model")
tokenizer.save_pretrained("./bert_model")


  trainer = Trainer(


Step,Training Loss
100,1.0596
200,0.9503
300,0.8588
400,0.8307
500,0.7967
600,0.775
700,0.7862
800,0.7871
900,0.7866
1000,0.7733




{'eval_loss': 0.6122087240219116, 'eval_macro_f1': 0.7280065911059675, 'eval_runtime': 62.2475, 'eval_samples_per_second': 373.3, 'eval_steps_per_second': 23.342, 'epoch': 3.0}


('./bert_model/tokenizer_config.json',
 './bert_model/special_tokens_map.json',
 './bert_model/vocab.txt',
 './bert_model/added_tokens.json',
 './bert_model/tokenizer.json')