<a href="https://colab.research.google.com/github/componavt/sns4human/blob/main/src/vk/topics/Finetune_XLM_RoBERTa_Topic_HuggingFace_Top-N-posts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**English:**
This script loads a fine-tuned XLM-RoBERTa model for topic classification from the Hugging Face Hub. It then classifies posts from up to `n_files` CSV files in the `data/vk/posts/` directory of the cloned `sns4human` repository. Each post is assigned a topic with a confidence score. For each predicted topic, only the `k_top` posts with the highest confidence are kept; they are saved into a separate CSV file per topic, sorted by confidence in descending order. Progress is printed to the console.


**Русский:**
Этот скрипт загружает дообученную модель XLM-RoBERTa из Hugging Face Hub для тематической классификации. Он классифицирует посты из не более чем `n_files` CSV-файлов в директории `data/vk/posts/` склонированного репозитория `sns4human`. Каждому посту присваивается тема и рассчитывается степень уверенности. Для каждой темы в отдельный CSV-файл сохраняются только `k_top` постов с наибольшей уверенностью, отсортированных по убыванию уверенности. Ход выполнения выводится в консоль.

In [8]:
# Hugging Face Hub repository that holds the fine-tuned classifier
model_repo = "componavt/xlm-roberta-base-topic-classification-2025"

# How many of the highest-confidence posts are retained for each topic
k_top = 10

# Upper bound on the number of CSV files to process (adjust as needed)
n_files = 2

In [None]:
### Inference with Fine-Tuned Model from Hugging Face Hub

!pip install -U transformers pandas scikit-learn

import torch
import pandas as pd
import numpy as np
from io import StringIO
import requests
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
from sklearn.preprocessing import LabelEncoder
import os
from huggingface_hub import snapshot_download
from tqdm import tqdm
from collections import defaultdict
import heapq

# Load the fine-tuned classifier and its tokenizer from the Hub repository
model = AutoModelForSequenceClassification.from_pretrained(model_repo)
tokenizer = AutoTokenizer.from_pretrained(model_repo)

# Pipeline device index: GPU 0 when CUDA is available, otherwise -1
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# top_k=None requests a score for every label instead of only the best one;
# the tokenizer options limit each input to 512 tokens (truncating/padding).
pipe = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    device=device,
    top_k=None,
    padding=True,
    truncation=True,
    max_length=512,
)

# Topic names used for classification (stated to be the same as during training)
topic_labels = [
    "благоустройство",
    "вепсы",
    "война",
    "выставка",
    "день рождения и юбилей",
    "еда кухня",
    "карельский язык",
    "конкурс",
    "мероприятия проекты",
    "музей. экскурсия",
    "музыка",
    "образование",
    "праздник",
    "природа",
    "поэзия Калевала литература",
    "социальная сфера",
    "традиция",
    "фестиваль",
    "этнокультура фольклор",
    "язык",
    "Эпос_Калевала",
    "поэзия",
    "литература",
]

# Encoder used to map a predicted class index back to its topic name.
# NOTE(review): LabelEncoder orders its classes by sorted value, not by the
# position in the list above - confirm the training code indexed labels the
# same way.
label_encoder = LabelEncoder().fit(topic_labels)

# Clone the GitHub repo and collect CSVs
if not os.path.exists("sns4human"):
    !git clone https://github.com/componavt/sns4human.git
posts_dir = "sns4human/data/vk/posts"

csv_files = [f for f in os.listdir(posts_dir) if f.endswith(".csv")][:n_files]

# Per-topic min-heaps, each holding at most `k_top` entries.
# Every entry is a tuple (score, -post_seq, record):
#   * score     - raw confidence of the winning label, used for ranking
#                 (the 4-decimal rounded value is only stored/printed);
#   * -post_seq - negated running number of the stored post, a unique
#                 tie-breaker. Without it two posts with equal scores make
#                 heapq compare the record dicts, which raises TypeError and
#                 caused such posts to be silently skipped. Negating it makes
#                 the most recent of several equally scored posts the first
#                 one to be evicted;
#   * record    - the row that ends up in the output CSV.
top_k_heap = defaultdict(list)
post_seq = 0

for file in tqdm(csv_files, desc="Processing files"):
    df = pd.read_csv(os.path.join(posts_dir, file), encoding='utf-8')
    for _, row in df.iterrows():
        text = row.get('text', '')
        # Skip missing (non-string) and blank texts
        if not isinstance(text, str) or not text.strip():
            continue

        # Only inference and label decoding are guarded: a failure on a single
        # post must not abort the whole run (deliberate best-effort).
        try:
            preds = pipe(text)[0]  # list of dicts [{label: 'LABEL_0', score: ...}, ...]
            best = max(preds, key=lambda x: x['score'])
            # assumes the model emits default 'LABEL_<index>' names - TODO confirm
            pred_label = int(best['label'].replace('LABEL_', ''))
            topic_label = label_encoder.inverse_transform([pred_label])[0]
        except Exception as e:
            print(f"Skipping row due to error: {e}")
            continue

        score = best['score']
        relatedness = round(score, 4)
        print(f"Classified: {topic_label} (confidence: {relatedness:.2%}) | Text snippet: {text[:50]}...")

        heap = top_k_heap[topic_label]
        heap_is_full = len(heap) >= k_top
        # A full heap only accepts posts that beat its weakest entry
        # (`not heap` covers k_top <= 0, where nothing is ever stored).
        if heap_is_full and (not heap or score <= heap[0][0]):
            continue

        post_seq += 1
        entry = (score, -post_seq, {
            "topic": topic_label,
            "relatedness": relatedness,
            "text": text,
            "date": row.get('date'),
            "group": row.get('group'),
            "likes": row.get('likes'),
            "reposts": row.get('reposts'),
            "views": row.get('views')
        })
        if heap_is_full:
            # Insert the new post and drop the weakest one in a single step
            heapq.heappushpop(heap, entry)
        else:
            heapq.heappush(heap, entry)

# Save top-k results for each topic
for topic, heap in top_k_heap.items():
    # Descending confidence; equally scored posts keep their encounter order.
    # The key never reaches the record dict, so no dict comparison can occur.
    sorted_items = sorted(heap, key=lambda entry: entry[:2], reverse=True)
    records = [record for _, _, record in sorted_items]
    pd.DataFrame(records).to_csv(f"{topic}.csv", index=False, encoding="utf-8")