<a href="https://colab.research.google.com/github/componavt/sns4human/blob/main/src/vk/topics/accuracy/xlm-roberta_accuracy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

🧮 XLM-RoBERTa accuracy: {accuracy}%  
(Tested on {N} expert-labeled posts; `{accuracy}` and `{N}` are placeholders for the values printed by the final cell)  

In [1]:
# Model identifier handed to `from_pretrained` (Hugging Face repo)
model_repo = "componavt/xlm-roberta-base-topic-classification-2025"

# File name of the expert-labeled CSV; it is interpolated into the
# raw GitHub URL when the dataset is downloaded
file_expert_labeled = "512_posts_24_topics.csv"

In [None]:
# Install necessary libraries
!pip install -U transformers pandas scikit-learn

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
from sklearn.preprocessing import LabelEncoder
from io import StringIO
import requests

# ========== 1. Load expert-labeled dataset ==========
# Download the CSV named by `file_expert_labeled` from the project's GitHub repo.
url = f'https://raw.githubusercontent.com/componavt/sns4human/refs/heads/main/data/vk/topics/{file_expert_labeled}'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=60)
# Fail right here on 404/5xx instead of parsing an error page as CSV and
# hitting a confusing KeyError on 'topic' further down.
response.raise_for_status()
# Decode the body explicitly as UTF-8: `read_csv(encoding=...)` has no effect
# on an already-decoded StringIO buffer, so the decoding must happen here.
response.encoding = 'utf-8'
df = pd.read_csv(StringIO(response.text))

# Keep only posts that carry a real topic: drop rows with a missing topic
# (they would break sorting of the topic names later) and rows whose topic
# is the 'пусто' ("empty") placeholder, compared case-insensitively.
df = df.dropna(subset=['topic'])
df = df[df['topic'].str.lower() != 'пусто'].copy()

print(f"Loaded {len(df)} posts with expert-labeled topics.")

# ========== 2. Prepare label encoder ==========
# Fit the encoder on the sorted distinct topic names, then store every
# post's integer class id next to its topic.
label_encoder = LabelEncoder()
unique_topics = sorted(df['topic'].unique())
df['true_label_id'] = label_encoder.fit(unique_topics).transform(df['topic'])

# ========== 3. Load model and tokenizer ==========
# Both artifacts are loaded from the same repo identifier.
tokenizer = AutoTokenizer.from_pretrained(model_repo)
model = AutoModelForSequenceClassification.from_pretrained(model_repo)

# Device index handed to the pipeline: 0 when CUDA is available, otherwise -1.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# top_k=None: the inference step receives a list of scored labels and picks
# the best one itself. Inputs are truncated/padded with a 512 max length.
pipe = TextClassificationPipeline(
    model=model, tokenizer=tokenizer, device=device,
    top_k=None,
    truncation=True, padding=True, max_length=512,
)

# ========== 4. Run inference and compare ==========
# Predicted class ids are decoded with an encoder fitted on THIS dataset's
# topics, so the decoding is only trustworthy when the model and the dataset
# agree on the number of classes. Warn (do not abort) when they differ.
num_model_labels = model.config.num_labels
num_dataset_topics = len(label_encoder.classes_)
if num_model_labels != num_dataset_topics:
    print(
        f"⚠️ Model has {num_model_labels} labels but the dataset has "
        f"{num_dataset_topics} topics; predicted ids may be decoded to the wrong topic."
    )

correct = 0
total = 0    # posts actually sent to the model (denominator of the accuracy)
skipped = 0  # posts without usable text; never classified, so not scored

for idx, text, true_topic in zip(df.index, df['text'], df['topic']):
    # Missing (NaN) or whitespace-only text cannot be classified.
    if not isinstance(text, str) or not text.strip():
        skipped += 1
        continue

    total += 1
    try:
        # The pipeline returns one list of {'label', 'score'} dicts per input.
        preds = pipe(text)[0]
        best = max(preds, key=lambda x: x['score'])
        # Labels are expected in the form 'LABEL_<id>'.
        pred_id = int(best['label'].replace('LABEL_', ''))
        pred_topic = label_encoder.inverse_transform([pred_id])[0]
    except Exception as e:
        # Deliberately best-effort: a failed post is reported and stays in
        # the denominator as a miss instead of aborting the whole run.
        print(f"Error on row {idx}: {e}")
        continue

    if pred_topic == true_topic:
        correct += 1

# ========== 5. Report results ==========
# Guard against an empty evaluation set (no division by zero).
accuracy = correct / total * 100 if total else 0.0
if skipped:
    print(f"Skipped {skipped} posts with empty or non-text content.")
print(f"\n✅ Correct predictions: {correct} / {total}")
print(f"🎯 Accuracy: {accuracy:.2f}%")