<a href="https://colab.research.google.com/github/componavt/ProJouRu/blob/main/src/vk/topics/Finetune_XLM_RoBERTa_Topic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Input files
file_expert_labeled = "512_posts_24_topics.csv"  # posts labeled by an expert
unlabeled_posts = "speechvepkar.csv"  # posts to be labeled by the fine-tuned model

# Base checkpoint to fine-tune.
# model_name = "xlm-roberta-base"  # too heavy for the free tier of Google Colab (125M+ parameters)
model_name = "mrm8488/XLM-RoBERTa-tiny"  # trimmed-down XLM-R with multilingual support, ~22M
# TODO: other candidates to try:
# model_name = "distilbert-base-multilingual-cased"           # simplified multilingual DistilBERT, ~134M
# model_name = "papluca/xlm-roberta-base-language-detection"  # fast model for a multilingual classifier

# Number of training epochs; lower it (3, 2, 1) to speed training up.
num_train_epochs = 3

# Maximum sequence length used at tokenization; lower it to speed things up
# (TODO: measure the actual post lengths).
# max_length = 256
max_length = 128

In [None]:
# Fine-tune XLM-RoBERTa for topic classification on Russian social media posts (GPU-friendly 🧠⚡)

!pip install -U transformers datasets scikit-learn

import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch
import numpy as np
from sklearn.metrics import classification_report, accuracy_score, f1_score
from io import StringIO
import requests

# Load the expert-labeled data from GitHub.
response = requests.get(
    f'https://raw.githubusercontent.com/componavt/sns4human/refs/heads/main/data/vk/topics/{file_expert_labeled}',
    timeout=60,  # do not hang the notebook forever on a stalled connection
)
# Fail fast on HTTP errors: otherwise the body of an error page (e.g. a 404)
# would be parsed as CSV and only surface later as a missing-column error.
response.raise_for_status()
df = pd.read_csv(StringIO(response.text), delimiter=',', encoding='utf-8')
# Drop rows whose topic is the 'пусто' ("empty") placeholder, compared
# case-insensitively; .copy() detaches the result from the original frame.
df = df[df['topic'].str.lower() != 'пусто'].copy()

# Map every topic name to an integer class id and store it in 'label'
label_encoder = LabelEncoder().fit(df['topic'])
df['label'] = label_encoder.transform(df['topic'])

# Hold out 20% of the rows for validation, stratified by class id
# (fixed seed so the split is reproducible)
from sklearn.model_selection import train_test_split
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Optional: you can also encode 'domain' and 'type_group' if you later want to use them
# domain_encoder = LabelEncoder()
# df['domain_id'] = domain_encoder.fit_transform(df['domain'])

# Convert to HuggingFace Datasets, keeping only the model input and target.
# preserve_index=False: after filtering and shuffling, the frames carry a
# non-range pandas index that would otherwise be stored as an extra
# `__index_level_0__` column in each dataset.
train_ds = Dataset.from_pandas(train_df[['text', 'label']], preserve_index=False)
val_ds = Dataset.from_pandas(val_df[['text', 'label']], preserve_index=False)
datasets = DatasetDict({"train": train_ds, "validation": val_ds})

# Tokenizer that matches the chosen checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)


def preprocess_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Every sequence is truncated and padded to exactly ``max_length`` tokens.
    Returns the tokenizer output for the batch.
    """
    encoding = tokenizer(
        examples["text"],
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )
    return encoding


# Apply the tokenizer to both splits, one batch at a time
tokenized_datasets = datasets.map(preprocess_function, batched=True)

# Load the checkpoint with a classification head: one output per topic class
num_topics = len(label_encoder.classes_)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_topics,
)

# Evaluation metrics reported by the Trainer
def compute_metrics(eval_pred):
    """Compute accuracy, macro-F1 and weighted-F1 for one evaluation pass.

    ``eval_pred`` unpacks into ``(predictions, labels)``; the predicted class
    is the arg-max of ``predictions`` along axis 1.
    Returns a dict with the keys "accuracy", "f1_macro" and "f1_weighted".
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)

    accuracy = accuracy_score(gold, predicted)
    macro_f1 = f1_score(gold, predicted, average="macro")
    weighted_f1 = f1_score(gold, predicted, average="weighted")

    return dict(accuracy=accuracy, f1_macro=macro_f1, f1_weighted=weighted_f1)

# Training configuration
training_args = TrainingArguments(
    # Output locations and reporting
    output_dir="./results_xlm_topic",
    logging_dir="./logs",
    report_to="none",
    # Optimisation hyper-parameters
    num_train_epochs=num_train_epochs,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch, then reload the best checkpoint
    # as ranked by the "f1_macro" value returned from compute_metrics
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
)

# Trainer setup: wires the model, the training configuration, both tokenized
# splits and the metric function together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    # `processing_class` replaces the deprecated `tokenizer` argument in
    # current transformers releases; this notebook installs the newest
    # release with `pip install -U`.
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

# Persist the model and its tokenizer side by side in one directory
save_dir = "./topic_model_finetuned"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)

# Run inference on new unlabeled data
from transformers import TextClassificationPipeline

# Load the unlabeled posts from GitHub.
test_response = requests.get(
    f'https://raw.githubusercontent.com/componavt/sns4human/refs/heads/main/data/vk/posts/{unlabeled_posts}',
    timeout=60,  # do not hang the notebook forever on a stalled connection
)
# Fail fast on HTTP errors: otherwise the body of an error page (e.g. a 404)
# would be parsed as CSV and only surface later as a missing-column error.
test_response.raise_for_status()
test_df = pd.read_csv(StringIO(test_response.text), delimiter=',', encoding='utf-8')
# Classification pipeline around the fine-tuned model; uses GPU 0 when CUDA is
# available and the CPU otherwise, and returns the score of every class.
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer, return_all_scores=True, device=0 if torch.cuda.is_available() else -1)

# Predict the most likely topic and its score for every unlabeled post.
# NOTE(review): the model is fine-tuned on the `text` column, but inference
# reads the `tokens` column — confirm that this difference is intentional.
label2id = model.config.label2id  # pipeline label name -> integer class id
results = []
for _, row in test_df.iterrows():
    tokens = row['tokens']
    # Truncate exactly as during training; without this, a long post can
    # exceed the model's maximum input length and make the call fail.
    preds = pipe(tokens, truncation=True, max_length=max_length)[0]  # list of dicts [{label: ..., score: ...}, ...]
    best = max(preds, key=lambda x: x['score'])
    # Resolve the class id through the model config instead of parsing the
    # 'LABEL_<n>' name, so the lookup does not depend on the label format.
    topic_label = label_encoder.inverse_transform([label2id[best['label']]])[0]
    results.append({
        "id": row['id'],
        "topic": topic_label,
        "relatedness": round(best['score'], 4),
        "text": row['text'],
        "date": row['date'],
        "likes": row['likes'],
        "reposts": row['reposts'],
        "views": row['views'],
    })

# Save labeled results
pd.DataFrame(results).to_csv("labeled_predictions.csv", index=False, encoding="utf-8")