<a href="https://colab.research.google.com/github/componavt/sns4human/blob/main/src/vk/topics/accuracy/xlm-roberta_accuracy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

🧮 xlm-roberta Accuracy: 76.46%  
(Tested on 463 expert-labeled posts)  

In [1]:
# Hugging Face Hub repository that hosts the classifier under evaluation
model_repo = "componavt/xlm-roberta-base-topic-classification-2025"

# File name of the expert-labeled CSV that serves as the evaluation set
file_expert_labeled = "512_posts_24_topics.csv"

In [2]:
# Install necessary libraries
!pip install -U transformers pandas scikit-learn

import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
from sklearn.preprocessing import LabelEncoder
from io import StringIO
import requests

# ========== 1. Load expert-labeled dataset ==========
# The CSV is downloaded from the project's GitHub repository. The rest of the
# notebook relies on its `topic` column (expert label) and `text` column.
url = f'https://raw.githubusercontent.com/componavt/sns4human/refs/heads/main/data/vk/topics/{file_expert_labeled}'
response = requests.get(url, timeout=60)
# Fail loudly on 404/5xx instead of feeding an error page to the CSV parser.
response.raise_for_status()
# `encoding=` in pd.read_csv has no effect on an already-decoded text buffer,
# so the decoding is pinned on the response itself.
response.encoding = 'utf-8'
df = pd.read_csv(StringIO(response.text))
# Rows without an expert label would pass the string comparison below (NaN != 'пусто')
# and later break sorting of the topic names, so drop them first.
df = df.dropna(subset=['topic'])
# 'пусто' is Russian for "empty" — presumably a placeholder for posts without a topic.
df = df[df['topic'].str.lower() != 'пусто'].copy()

print(f"Loaded {len(df)} posts with expert-labeled topics.")

# ========== 2. Prepare label encoder ==========
# Distinct topic names in sorted order; a name's position in this list is the
# integer id the encoder assigns to it.
unique_topics = df['topic'].unique().tolist()
unique_topics.sort()
label_encoder = LabelEncoder().fit(unique_topics)
df['true_label_id'] = label_encoder.transform(df['topic'])

# ========== 3. Load model and tokenizer ==========
model = AutoModelForSequenceClassification.from_pretrained(model_repo)
tokenizer = AutoTokenizer.from_pretrained(model_repo)

# The evaluation maps the model's class index i to label_encoder.classes_[i].
# That can only be right when both sides have the same number of classes, so a
# mismatch is reported here instead of silently producing misaligned labels.
n_model_labels = model.config.num_labels
n_dataset_topics = len(label_encoder.classes_)
if n_model_labels != n_dataset_topics:
    print(
        f"⚠️ Model predicts {n_model_labels} classes but the dataset contains "
        f"{n_dataset_topics} topics; predicted ids may not map to the right topic names."
    )

# Pipeline device index: 0 selects the first CUDA GPU, -1 selects the CPU.
device = 0 if torch.cuda.is_available() else -1
pipe = TextClassificationPipeline(
    model=model,
    tokenizer=tokenizer,
    device=device,
    top_k=None,        # return a score for every class, not only the best one
    truncation=True,   # cut inputs that exceed max_length
    max_length=512,
    padding=True
)

# ========== 4. Run inference and compare ==========
correct = 0
total = 0     # posts that were actually scored by the model (accuracy denominator)
skipped = 0   # posts without usable text
failed = 0    # posts on which inference raised an exception

for idx, row in df.iterrows():
    text = row['text']
    true_topic = row['topic']

    # Posts with missing or blank text cannot be classified; count them
    # separately instead of treating them as wrong predictions.
    if not isinstance(text, str) or not text.strip():
        skipped += 1
        continue

    try:
        preds = pipe(text)[0]
        best = max(preds, key=lambda x: x['score'])
        # Resolve the class id through the model config rather than assuming
        # the labels are spelled 'LABEL_<id>'.
        pred_id = int(model.config.label2id[best['label']])
        pred_topic = label_encoder.inverse_transform([pred_id])[0]
    except Exception as e:
        # Best-effort evaluation: report the failure and move on, but keep the
        # row out of the accuracy denominator.
        failed += 1
        print(f"Error on row {idx}: {e}")
        continue

    total += 1
    if pred_topic == true_topic:
        correct += 1

if skipped or failed:
    print(f"Skipped {skipped} posts without text; {failed} posts failed during inference.")

# ========== 5. Report results ==========
# Accuracy is undefined for an empty evaluation set; report NaN instead of
# raising ZeroDivisionError.
accuracy = correct / total * 100 if total else float('nan')
print(f"\n✅ Correct predictions: {correct} / {total}")
print(f"🎯 Accuracy: {accuracy:.2f}%")

Collecting pandas
  Downloading pandas-2.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (91 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.2/91.2 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-learn
  Downloading scikit_learn-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (17 kB)
Downloading pandas-2.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.4/12.4 MB[0m [31m89.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading scikit_learn-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m89.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn, pandas
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.1
    Uninstalling scikit-learn-1.6.1:
    

config.json:   0%|          | 0.00/1.70k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



✅ Correct predictions: 354 / 463
🎯 Accuracy: 76.46%
