<a href="https://colab.research.google.com/github/componavt/ProJouRu/blob/main/src/vk/topics/Finetune_XLM_RoBERTa_Topic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ---- Input files ----
file_expert_labeled = "512_posts_24_topics.csv"  # expert-labeled dataset
unlabeled_posts = "speechvepkar.csv"

# ---- Base model ----
# 25 XLM-RoBERTa models (Russian + text classification) are listed at:
# https://huggingface.co/models?pipeline_tag=text-classification&language=ru&sort=trending&search=XLM-RoBERTa
#
# Candidates that were tried and set aside:
# model_name = "xlm-roberta-base"  # too heavy for the free tier of Google Colab (125M+ parameters)
# model_name = "mrm8488/XLM-RoBERTa-tiny"  # failed to load; cut-down XLM-R with multilingual support 🔽 22M
# model_name = "mrm8488/xlm-roberta-base-finetuned-HC3-mix"  # already ships a 2-output classification head, because it was trained for binary classification (e.g. truthful / deceptive)
# model_name = "cointegrated/rubert-tiny"  # 👌 extremely low metrics (accuracy 21.5%, F1 ~4-9%) - the model barely learns

model_name = "distilbert-base-multilingual-cased"

# todo:
# DeepPavlov/xlm-roberta-large-en-ru-mnli
# model_name = "distilbert-base-multilingual-cased"           # simplified multilingual DistilBERT 🔽 134M
# model_name = "papluca/xlm-roberta-base-language-detection"  # fast model for a multilingual classifier

# ---- Training hyper-parameters ----
# Lower the number of epochs (3, 2, 1) to speed training up.
num_train_epochs = 10  # 3

# Lower max_length for tokenization to speed things up (todo: measure the post lengths).
max_length = 256
# max_length = 128

# 2e-5 (0.00002): a smaller learning rate updates the weights more slowly,
# which can make training more stable but slower.
learning_rate = 2e-5
# learning_rate = 3e-5  # (0.00003) a slightly larger rate speeds up weight updates and may converge faster, but raises the risk of overshooting the optimum

In [2]:
# Fine-tune XLM-RoBERTa for topic classification on Russian social media posts (GPU-friendly 🧠⚡)

!pip install -U transformers datasets scikit-learn

import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, f1_score
from io import StringIO
import requests

# Load data labeled by expert from GitHub
response = requests.get(f'https://raw.githubusercontent.com/componavt/sns4human/refs/heads/main/data/vk/topics/{file_expert_labeled}')
# Fail fast on HTTP errors; otherwise an error page would be parsed as if it were the CSV
# and surface later as a confusing missing-column error.
response.raise_for_status()
df = pd.read_csv(StringIO(response.text), delimiter=',', encoding='utf-8')
# Rows without a topic or without text cannot be used for training: a missing topic slips
# through the string comparison below and ends up in the label set, and a missing text is
# not a string the tokenizer can process.
df = df.dropna(subset=['topic', 'text'])
# Discard posts whose topic is the placeholder 'пусто' (any letter case).
df = df[df['topic'].str.lower() != 'пусто'].copy()

# Turn every topic name into an integer class id; the fitted encoder is kept
# so that predicted ids can be mapped back to topic names later.
label_encoder = LabelEncoder()
encoded_topics = label_encoder.fit_transform(df['topic'])
df['label'] = encoded_topics

# Hold out 20% of the posts for validation, keeping class proportions equal
# in both parts (stratified on the encoded label, fixed seed).
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Optional: you can also encode 'domain' and 'type_group' if you later want to use them
# domain_encoder = LabelEncoder()
# df['domain_id'] = domain_encoder.fit_transform(df['domain'])

# Wrap both splits as HuggingFace Datasets, keeping only the input text and the target id
model_columns = ['text', 'label']
train_ds = Dataset.from_pandas(train_df[model_columns])
val_ds = Dataset.from_pandas(val_df[model_columns])
split_map = {"train": train_ds, "validation": val_ds}
datasets = DatasetDict(split_map)

# Tokenizer that matches the chosen checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)


def preprocess_function(examples):
    """Tokenize a batch of posts.

    Reads the "text" field of `examples` and returns the tokenizer output,
    with every sequence truncated and padded to exactly `max_length` tokens.
    """
    texts = examples["text"]
    return tokenizer(
        texts,
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )


# Apply the tokenizer to both splits, one batch at a time
tokenized_datasets = datasets.map(preprocess_function, batched=True)

# Load the checkpoint with a classification head sized to the number of distinct topics
num_topics = len(label_encoder.classes_)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_topics,
)

# Metrics for evaluation
def compute_metrics(eval_pred):
    """Compute accuracy and macro / weighted F1 for one evaluation pass.

    Args:
        eval_pred: a ``(predictions, labels)`` pair. ``predictions`` holds the
            per-class scores, one row per example; when the model returns
            several outputs it may arrive as a tuple, in which case the first
            element is used as the scores.

    Returns:
        dict with the keys ``accuracy``, ``f1_macro`` and ``f1_weighted``.
    """
    predictions, labels = eval_pred
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    preds = np.argmax(predictions, axis=1)
    # With many topics and a small validation split, some classes are never predicted.
    # zero_division=0 scores them as 0 (the same value as the default) without emitting
    # an undefined-metric warning on every evaluation.
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, average="macro", zero_division=0),
        "f1_weighted": f1_score(labels, preds, average="weighted", zero_division=0),
    }

# Training arguments, grouped by concern (keyword order has no effect)
training_args = TrainingArguments(
    # Output locations and reporting
    output_dir="./results_xlm_topic",
    logging_dir="./logs",
    report_to="none",
    # Optimisation settings (epochs and learning rate come from the config cell)
    num_train_epochs=num_train_epochs,
    learning_rate=learning_rate,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then restore the checkpoint
    # with the best macro F1 when training finishes
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
)

# Trainer setup
import inspect

# Recent transformers releases expect the tokenizer under the `processing_class` keyword
# and deprecate `tokenizer`; older releases only know `tokenizer`. The notebook installs
# the latest release, so pick whichever keyword the installed Trainer accepts.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

# Run the fine-tuning loop
trainer.train()

# Persist the fine-tuned weights and the tokenizer files side by side
finetuned_dir = "./topic_model_finetuned"
model.save_pretrained(finetuned_dir)
tokenizer.save_pretrained(finetuned_dir)

# Run inference on new unlabeled data
from transformers import TextClassificationPipeline

# Load unlabeled posts
test_response = requests.get(f'https://raw.githubusercontent.com/componavt/sns4human/refs/heads/main/data/vk/posts/{unlabeled_posts}')
# Fail fast on HTTP errors instead of parsing an error page as CSV.
test_response.raise_for_status()
test_df = pd.read_csv(StringIO(test_response.text), delimiter=',', encoding='utf-8')
# Use GPU 0 when CUDA is available, otherwise run on the CPU (-1)
inference_device = 0 if torch.cuda.is_available() else -1

# NOTE(review): training tokenizes to `max_length` from the config cell (256) while
# inference allows 512 tokens - confirm the difference is intended.
pipe = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    device=inference_device,
    top_k=None,
    truncation=True,  # 👈 cut off long texts
    max_length=512,   # 👈 safe maximum for BERT
    padding=True,     # 👈 so that batching works correctly
)

# Predict the most likely topic (and its score) for every usable post
results = []
for _, post in test_df.iterrows():
    text = post['text']
    has_content = isinstance(text, str) and bool(text.strip())
    if not has_content:
        continue  # nothing to classify: missing, non-string or blank text

    # The pipeline output is wrapped in a list; its first item is the list of
    # {label: 'LABEL_<id>', score: ...} dicts for this text.
    scored_labels = pipe(text)[0]
    top = max(scored_labels, key=lambda candidate: candidate['score'])

    # 'LABEL_<id>' -> <id> -> original topic name
    class_id = int(top['label'].split('_')[-1])
    topic_label = label_encoder.inverse_transform([class_id])[0]

    record = {
        "id": post.get('id', None),  # None when the column is absent
        # the two calculated fields
        "topic": topic_label,
        "relatedness": round(top['score'], 4),
        # columns carried over from the input
        "text": text,
        "date": post.get('date'),
        "likes": post.get('likes'),
        "reposts": post.get('reposts'),
        "views": post.get('views'),
    }
    results.append(record)

# Write the labeled posts to disk
predictions_df = pd.DataFrame(results)
predictions_df.to_csv("labeled_predictions.csv", index=False, encoding="utf-8")

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.wh

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/466 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Map:   0%|          | 0/370 [00:00<?, ? examples/s]

Map:   0%|          | 0/93 [00:00<?, ? examples/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/542M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


KeyboardInterrupt: 