# DistilBERT multilingual — дообучение для классификации ролей участников PR

## Цель
Обучить модель классификации паттернов активности участников open-source проектов на основе данных PR.

## Используемые алгоритмы
- **Модель**: DistilBERT-base-multilingual-cased (134M параметров, поддержка 104 языков)
- **Архитектура**: Transformer encoder + классификационная голова (head): слой pre_classifier и линейный классификатор
- **Оптимизатор**: AdamW с weight decay
- **Loss**: CrossEntropyLoss для многоклассовой классификации

## Метрики оценки
- **Accuracy**: доля правильных предсказаний
- **Precision/Recall/F1 per class**: качество по каждому паттерну
- **Macro F1**: среднее F1 по всем классам (важно при дисбалансе)
- **Weighted F1**: F1 с весами по размеру класса
- **Confusion Matrix**: матрица ошибок для анализа


In [None]:
# Устанавливаем зависимости и рабочую директорию (Colab-friendly)
import sys, subprocess, pkg_resources, os
from pathlib import Path
try:
    pkg_resources.get_distribution("accelerate>=0.26.0")
except pkg_resources.DistributionNotFound:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet", "accelerate>=0.26.0"])

if "google.colab" in sys.modules:
    !git clone https://github.com/elenagernichenko/mcp-analyzer.git
    %cd /content/mcp-analyzer

PROJECT_ROOT = Path(".").resolve()
PR_SAMPLES = PROJECT_ROOT / "data" / "pr_samples_vscode.json"  # 1000 PR из microsoft/vscode
print("DATA PATH:", PR_SAMPLES) 

In [None]:
# Optional: !pip install transformers datasets scikit-learn pandas
import json
import random
import os

import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

import sys
from pathlib import Path
PROJECT_ROOT = Path(".").resolve()
sys.path.append(str(PROJECT_ROOT / "analysis"))
from role_classifier import classify_participant

# Report the compute device by asking PyTorch directly. The environment
# variable CUDA_VISIBLE_DEVICES is usually unset on GPU machines and, when
# set, does not prove a GPU exists, so it is not a reliable signal by itself.
# torch.cuda.is_available() still honours CUDA_VISIBLE_DEVICES="" / "-1".
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
print("device:", device)



  from .autonotebook import tqdm as notebook_tqdm


device: cpu


In [None]:
# Load the PR sample and build the participant -> pattern dataset.
with open(PR_SAMPLES, "r", encoding="utf-8") as f:
    data = json.load(f)
prs = data.get("prs", [])
total_prs = len(prs)

# Per-user activity counters, keyed by username in first-seen order.
participants = {}


def _bump(name, counter):
    """Increment one counter for a user, creating a zeroed record on first sight."""
    record = participants.setdefault(name, {"authored": 0, "reviewed": 0, "comments": 0})
    record[counter] += 1


for pr in prs:
    author = pr.get("author")
    if author:
        _bump(author, "authored")
    # "participants" may be missing or null; both are treated as an empty list.
    others = pr.get("participants", []) or []
    for user in others:
        # Skip empty entries and the PR author: only other users count as reviewers.
        if not user or user == author:
            continue
        _bump(user, "reviewed")

# Guard against division by zero when the sample contains no PRs.
rate_denominator = max(total_prs, 1)
rows = []
for user, stats in participants.items():
    authored = stats["authored"]
    reviewed = stats["reviewed"]
    # The target label comes from the project's classify_participant helper.
    label = classify_participant(
        username=user,
        prs_authored=authored,
        prs_reviewed=reviewed,
        comments_count=stats["comments"],
        total_prs=total_prs,
    )
    participation_rate = (authored + reviewed) / rate_denominator
    # Model input: a short textual summary of the same counters.
    text = (
        f"{user}: authored {authored} PRs, reviewed {reviewed} PRs, "
        f"participation_rate {participation_rate:.2f}"
    )
    rows.append({"text": text, "label": label})

print("PRs:", total_prs, "participants:", len(rows))
pd.DataFrame(rows).head()


samples: 10


Unnamed: 0,text,label
0,"User jeroen authored 1 PRs, reviewed 0 PRs in ...",Периферийного участия
1,"User andrey-khropov authored 0 PRs, reviewed 5...",Кураторства и управления
2,"User topepo authored 0 PRs, reviewed 1 PRs in ...",Периферийного участия
3,"User isaigordeev authored 1 PRs, reviewed 0 PR...",Периферийного участия
4,"User Shersula authored 1 PRs, reviewed 0 PRs i...",Периферийного участия


In [None]:
# Build the HF Dataset and the label mapping (stratified split when feasible).
import math

if not rows:
    # Fail early with a clear message instead of a KeyError on df["label"].
    raise ValueError("No participants found: `rows` is empty, nothing to split or train on")

unique_labels = sorted({r["label"] for r in rows})
label2id = {lbl: i for i, lbl in enumerate(unique_labels)}
id2label = {i: lbl for lbl, i in label2id.items()}

df = pd.DataFrame(rows)

# Class distribution (to inspect class imbalance).
print("=" * 50)
print("РАСПРЕДЕЛЕНИЕ ПАТТЕРНОВ В ДАННЫХ")
print("=" * 50)
counts = df["label"].value_counts()
for label, count in counts.items():
    print(f"  {label}: {count} ({count/len(df)*100:.1f}%)")
print("-" * 50)

TEST_SIZE = 0.2
min_count = counts.min()
n_classes = len(counts)
# Partition sizes computed the way train_test_split does for a float test_size.
n_test = math.ceil(TEST_SIZE * len(df))
n_train = len(df) - n_test
# Stratification needs >= 2 samples in every class AND both partitions at
# least as large as the number of classes; otherwise sklearn raises ValueError.
can_stratify = min_count >= 2 and n_classes >= 2 and min(n_train, n_test) >= n_classes

if not can_stratify:
    train_df, test_df = train_test_split(df, test_size=TEST_SIZE, random_state=42, shuffle=True, stratify=None)
    print("Stratify disabled: min_count=", min_count, "classes=", len(counts))
else:
    train_df, test_df = train_test_split(df, test_size=TEST_SIZE, random_state=42, stratify=df["label"])
    print("Stratified split: train/test сохраняют пропорции классов")

# preserve_index=False: the shuffled pandas index would otherwise be stored
# as an extra "__index_level_0__" column in the datasets.
train_ds = Dataset.from_pandas(train_df, preserve_index=False)
test_ds = Dataset.from_pandas(test_df, preserve_index=False)

model_name = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess(batch):
    """Tokenize a batch of texts and attach integer class ids as `labels`."""
    enc = tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)
    enc["labels"] = [label2id[lbl] for lbl in batch["label"]]
    return enc

# Drop the raw columns so only the tokenizer outputs and `labels` remain.
train_ds = train_ds.map(preprocess, batched=True, remove_columns=["text", "label"])
test_ds = test_ds.map(preprocess, batched=True, remove_columns=["text", "label"])

datasets = DatasetDict({"train": train_ds, "test": test_ds})
print(f"\nГотово: train={len(train_ds)}, test={len(test_ds)}, classes={len(unique_labels)}")



Map: 100%|██████████| 7/7 [00:00<00:00, 707.20 examples/s]
Map: 100%|██████████| 3/3 [00:00<00:00, 848.53 examples/s]


In [None]:
# Train DistilBERT to classify activity patterns.
# Approach: fine-tune the pretrained DistilBERT with a new classification head.
# Optimizer: AdamW (lr=3e-5, weight_decay=0.01)
# Scheduler: linear warmup + decay (Trainer default)
import inspect

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id,
)

# Hyperparameters for a full training run.
args = TrainingArguments(
    output_dir="./roles-checkpoints",
    num_train_epochs=5,  # more epochs for better convergence
    per_device_train_batch_size=16,  # larger batch for more stable gradients
    per_device_eval_batch_size=16,
    learning_rate=3e-5,  # common LR for BERT fine-tuning
    weight_decay=0.01,  # weight decay regularization
    warmup_ratio=0.1,  # 10% of the steps are warmup
    logging_steps=50,
    save_total_limit=1,  # keep at most one checkpoint on disk; the final model is saved separately
    remove_unused_columns=True,
    report_to="none",
)

# Newer transformers releases take the tokenizer as `processing_class` and
# deprecate the `tokenizer` keyword; pick whichever this version supports.
trainer_params = inspect.signature(Trainer.__init__).parameters
tokenizer_kwarg = "processing_class" if "processing_class" in trainer_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=datasets["train"],
    eval_dataset=datasets["test"],
    **{tokenizer_kwarg: tokenizer},
)

print(f"Training on {len(datasets['train'])} samples, evaluating on {len(datasets['test'])} samples")
print(f"Classes: {len(unique_labels)}, Epochs: {args.num_train_epochs}, Batch: {args.per_device_train_batch_size}")
trainer.train();


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
20,0.6359


In [None]:
# Evaluate the model on the key metrics.
# - Accuracy: overall share of correct predictions
# - Precision: TP / (TP + FP), how many predicted positives are correct
# - Recall: TP / (TP + FN), how many actual positives are found
# - F1-score: harmonic mean of precision and recall
# - Macro F1: unweighted mean of F1 over all classes (important under imbalance)
# - Weighted F1: F1 weighted by class size

import numpy as np
from sklearn.metrics import accuracy_score, f1_score

preds = trainer.predict(datasets["test"])
logits = preds.predictions
pred_labels = np.argmax(logits, axis=1)
true_labels = preds.label_ids

# Full list of class ids, so every metric is computed over the same label set
# even when some class is absent from the test split.
labels_list = sorted(id2label)
target_names = [id2label[i] for i in labels_list]

# Headline metrics. `labels=labels_list` keeps Macro F1 consistent with the
# "macro avg" row of classification_report below; without it f1_score would
# average only over the classes that occur in true_labels/pred_labels.
accuracy = accuracy_score(true_labels, pred_labels)
macro_f1 = f1_score(true_labels, pred_labels, labels=labels_list, average="macro", zero_division=0)
weighted_f1 = f1_score(true_labels, pred_labels, labels=labels_list, average="weighted", zero_division=0)

print("=" * 60)
print("РЕЗУЛЬТАТЫ ДООБУЧЕНИЯ DistilBERT")
print("=" * 60)
print(f"Test size: {len(true_labels)} samples")
print(f"Classes: {len(unique_labels)}")
print("-" * 60)
print(f"Accuracy:    {accuracy:.2%}")
print(f"Macro F1:    {macro_f1:.2%}")
print(f"Weighted F1: {weighted_f1:.2%}")
print("-" * 60)
print("\nПодробный отчет по классам:")
print(classification_report(true_labels, pred_labels, labels=labels_list, target_names=target_names, zero_division=0))
print("Confusion Matrix:")
print(confusion_matrix(true_labels, pred_labels, labels=labels_list))

# Save the fine-tuned model together with its tokenizer.
model.save_pretrained("distilbert-finetune-roles")
tokenizer.save_pretrained("distilbert-finetune-roles")
print("\nМодель сохранена в distilbert-finetune-roles/")




test size: 3
                             precision    recall  f1-score   support

Активного соисполнительства       0.00      0.00      0.00         1
   Кураторства и управления       0.00      0.00      0.00         0
      Периферийного участия       0.67      1.00      0.80         2

                   accuracy                           0.67         3
                  macro avg       0.22      0.33      0.27         3
               weighted avg       0.44      0.67      0.53         3

Confusion matrix:
 [[0 0 1]
 [0 0 0]
 [0 0 2]]


SafetensorError: Error while serializing: I/O error: No space left on device (os error 28)