# DistilBERT — классификация паттернов и ролей участников

## Цель
Обучить модель классификации паттернов активности и ролей участников open-source проектов на основе:
- **Текстов комментариев** пользователя (агрегированные)
- **Метрик активности**: PRs authored, PRs reviewed, comments count

## Данные
- **Источники**: microsoft/vscode + chaoss/augur
- **Объём**: 7000+ комментариев
- **Вход модели**: `[PRs:5 Reviews:12 Comments:30] Текст комментариев...`

## Классы (7 паттернов → роли)
| Паттерн | Роли |
|---------|------|
| Пассивного потребления | Lurker, Passive user, Rare contributor |
| Инициации обратной связи | Bug reporter, Coordinator |
| Периферийного участия | Peripheral developer, Nomad Coder, Independent |
| Активного соисполнительства | Bug fixer, Active developer, Code Warrior |
| Кураторства и управления | Project steward, Coordinator, Progress controller |
| Лидерства и наставничества | Project leader, Core member, Core developer |
| Социального влияния | Project Rockstar |

## Алгоритм
- **Модель**: DistilBERT-base-multilingual-cased
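- **Вход**: метрики активности + тексты комментариев (первые 400 символов объединённых комментариев, до 256 токенов)
- **Выход**: паттерн участия (один из 7 классов); роль определяется по паттерну через таблицу соответствия «паттерн → роли»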


In [None]:
# Устанавливаем зависимости (Colab-friendly)
import sys, subprocess, pkg_resources, os
from pathlib import Path
try: pkg_resources.get_distribution("accelerate>=0.26.0")
except: subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "accelerate>=0.26.0"])

if "google.colab" in sys.modules:
    !git clone https://github.com/elenagernichenko/mcp-analyzer.git 2>/dev/null || true
    %cd /content/mcp-analyzer

PROJECT_ROOT = Path(".").resolve()
PR_SAMPLES = PROJECT_ROOT / "data" / "pr_samples_multi.json"  # 7000+ комментариев из 2 репозиториев
print("DATA:", PR_SAMPLES) 

In [None]:
# Optional: !pip install transformers datasets scikit-learn pandas
import json
import random
import os

import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

# Ask PyTorch whether a CUDA device is actually usable. The CUDA_VISIBLE_DEVICES
# environment variable is normally unset even on GPU machines, so inferring the
# device from it reported "cpu" when a GPU was available. torch.cuda.is_available()
# still honours CUDA_VISIBLE_DEVICES when the variable is used to hide GPUs.
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
print("device:", device)



  from .autonotebook import tqdm as notebook_tqdm


device: cpu


In [None]:
# Load the PR dump and aggregate it PER USER (activity metrics + comment texts).
from collections import Counter, defaultdict

with open(PR_SAMPLES, "r", encoding="utf-8") as f:
    data = json.load(f)
prs = data.get("prs", [])

# Per-user accumulator. "reviewed" is initialised but never incremented in this
# cell; downstream code derives the review count from "prs_participated".
user_stats = defaultdict(lambda: {"authored": 0, "reviewed": 0, "comments": [], "prs_participated": set()})

for pr in prs:
    author = pr.get("author", "")
    if author:
        user_stats[author]["authored"] += 1

    # `or []` / `or ""` protect against explicit JSON nulls: dict.get() only
    # applies its default when the key is absent, so a `"body": null` entry
    # used to crash on `.strip()`.
    for c in pr.get("comments") or []:
        user = c.get("user") or ""
        body = (c.get("body") or "").strip()
        # Skip empty comments and accounts whose login contains "bot".
        if user and body and "bot" not in user.lower():
            user_stats[user]["comments"].append(body)
            user_stats[user]["prs_participated"].add(pr.get("number"))

# Heuristic labelling of a participation pattern from activity metrics + comment content.
def classify_user(stats: dict, comments_text: str) -> str:
    """Assign a participation-pattern label to one user via rule-based heuristics.

    Rules are evaluated top-down and the first matching rule wins.

    Args:
        stats: Per-user record with ``"authored"`` (int), ``"comments"``
            (list of comment bodies) and ``"prs_participated"`` (collection of
            PR identifiers the user commented on).
        comments_text: The user's comments joined into a single string; it is
            lower-cased and searched for keyword substrings.

    Returns:
        One of the seven pattern label strings.
    """
    authored = stats["authored"]
    # Clamp at zero: a user can author more PRs than they comment on, and a
    # negative value would cancel `authored` out of `total_activity`, labelling
    # prolific authors as passive. Matches the `max(0, reviewed)` used when the
    # text feature is built.
    reviewed = max(0, len(stats["prs_participated"]) - authored)
    n_comments = len(stats["comments"])
    total_activity = authored + reviewed + n_comments
    text_lower = comments_text.lower()

    def mentions(*keywords: str) -> bool:
        """Return True if any keyword occurs as a substring of the comments."""
        return any(k in text_lower for k in keywords)

    # By activity level and comment content.
    if total_activity < 3:
        return "Пассивного потребления"
    if authored == 0 and n_comments > 0 and mentions("bug", "issue", "error", "?"):
        return "Инициации обратной связи"
    if reviewed > authored * 2 or mentions("lgtm", "approved", "merge", "please fix"):
        return "Кураторства и управления"
    if authored > 10 and n_comments > 20 and mentions("architecture", "design", "explain"):
        return "Лидерства и наставничества"
    if mentions("community", "team", "everyone", "milestone", "release"):
        return "Социального влияния"
    if authored > 3 or mentions("fixed", "implemented", "added", "commit"):
        return "Активного соисполнительства"
    return "Периферийного участия"

# Build the dataset: one user = one example.
def _build_row(user, stats):
    """Turn one user's aggregated stats into a {"text", "label", "user"} record."""
    # Activity metrics.
    authored = stats["authored"]
    reviewed = len(stats["prs_participated"]) - authored
    n_comments = len(stats["comments"])

    # Text feature: metrics header + joined comments (first 400 characters).
    comments_text = " | ".join(stats["comments"])[:400]
    text = f"[PRs:{authored} Reviews:{max(0,reviewed)} Comments:{n_comments}] {comments_text}"

    return {"text": text, "label": classify_user(stats, comments_text), "user": user}


# Only users with at least 2 comments become examples.
rows = [
    _build_row(login, record)
    for login, record in user_stats.items()
    if len(record["comments"]) >= 2
]

label_counts = Counter(row["label"] for row in rows)
print(f"Пользователей: {len(rows)}, Классов: {len(label_counts)}")
print("Распределение:", dict(label_counts.most_common()))
pd.DataFrame(rows).head()


samples: 10


Unnamed: 0,text,label
0,"User jeroen authored 1 PRs, reviewed 0 PRs in ...",Периферийного участия
1,"User andrey-khropov authored 0 PRs, reviewed 5...",Кураторства и управления
2,"User topepo authored 0 PRs, reviewed 1 PRs in ...",Периферийного участия
3,"User isaigordeev authored 1 PRs, reviewed 0 PR...",Периферийного участия
4,"User Shersula authored 1 PRs, reviewed 0 PRs i...",Периферийного участия


In [None]:
# Prepare the dataset for training.
# Label <-> integer id maps; sorting keeps the ids stable between runs.
unique_labels = sorted({r["label"] for r in rows})
label2id = {lbl: i for i, lbl in enumerate(unique_labels)}
id2label = {i: lbl for lbl, i in label2id.items()}

df = pd.DataFrame(rows)[["text", "label"]]  # keep only the columns the model needs
counts = df["label"].value_counts()
print("Классы:", dict(counts))

# Stratified split when possible. train_test_split raises ValueError when
# stratification is infeasible (e.g. a class with a single member); only that
# case falls back to a plain random split, so unrelated errors are no longer
# hidden by a bare `except:`.
try:
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df["label"])
except ValueError as err:
    print(f"Stratified split not possible ({err}); falling back to a random split")
    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

train_ds = Dataset.from_pandas(train_df, preserve_index=False)
test_ds = Dataset.from_pandas(test_df, preserve_index=False)

model_name = "distilbert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess(batch):
    """Tokenize a batch of texts and attach the integer class id of each label.

    Reads ``batch["text"]`` and ``batch["label"]``; returns the tokenizer
    output with an added ``"labels"`` entry.
    """
    encoded = tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=256,
    )
    # Map every label string to its id; an unknown label raises KeyError.
    encoded["labels"] = list(map(label2id.__getitem__, batch["label"]))
    return encoded

# Tokenize both splits (train first, then test), dropping the raw columns
# once they have been encoded.
train_ds, test_ds = [
    split.map(preprocess, batched=True, remove_columns=["text", "label"])
    for split in (train_ds, test_ds)
]
datasets = DatasetDict({"train": train_ds, "test": test_ds})
print(f"Train: {len(train_ds)}, Test: {len(test_ds)}, Classes: {len(unique_labels)}")



Map: 100%|██████████| 7/7 [00:00<00:00, 707.20 examples/s]
Map: 100%|██████████| 3/3 [00:00<00:00, 848.53 examples/s]


In [None]:
# Train DistilBERT for pattern classification.
# Approach: fine-tune the pretrained DistilBERT with a new classification head.
# Optimizer: AdamW (lr=3e-5, weight_decay=0.01)
# Scheduler: linear warmup + decay (the Trainer default)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id,
)

# Hyperparameters for a full training run, collected in one place.
training_config = {
    "output_dir": "./roles-checkpoints",
    "num_train_epochs": 5,  # more epochs for better convergence
    "per_device_train_batch_size": 16,  # larger batch for more stable gradients
    "per_device_eval_batch_size": 16,
    "learning_rate": 3e-5,  # standard LR for fine-tuning BERT
    "weight_decay": 0.01,  # L2 regularisation
    "warmup_ratio": 0.1,  # 10% of the steps are warmup
    "logging_steps": 50,
    "remove_unused_columns": True,
    "report_to": "none",
}
args = TrainingArguments(**training_config)

trainer_config = {
    "model": model,
    "args": args,
    "train_dataset": datasets["train"],
    "eval_dataset": datasets["test"],
    "tokenizer": tokenizer,
}
trainer = Trainer(**trainer_config)

print(f"Training on {len(datasets['train'])} samples, evaluating on {len(datasets['test'])} samples")
print(f"Classes: {len(unique_labels)}, Epochs: {args.num_train_epochs}, Batch: {args.per_device_train_batch_size}")
trainer.train();


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss
20,0.6359


In [None]:
# Evaluate the model on the key metrics:
# - Accuracy: overall share of correct predictions
# - Precision: TP / (TP + FP) — how many of the predicted are correct
# - Recall: TP / (TP + FN) — how many of the actual are found
# - F1-score: harmonic mean of precision and recall
# - Macro F1: unweighted mean of per-class F1 (important under class imbalance)
# - Weighted F1: per-class F1 weighted by class size

import numpy as np
from sklearn.metrics import accuracy_score, f1_score

preds = trainer.predict(datasets["test"])
logits = preds.predictions
true_labels = preds.label_ids
# The predicted class is the index of the largest logit in each row.
pred_labels = np.asarray(logits).argmax(axis=1)

# Report classes in ascending id order, with their readable names.
labels_list = sorted(id2label.keys())
target_names = list(map(id2label.get, labels_list))

# Headline metrics.
accuracy = accuracy_score(true_labels, pred_labels)
macro_f1, weighted_f1 = [
    f1_score(true_labels, pred_labels, average=averaging, zero_division=0)
    for averaging in ("macro", "weighted")
]

heavy_rule = "=" * 60
light_rule = "-" * 60
print(heavy_rule)
print("РЕЗУЛЬТАТЫ ДООБУЧЕНИЯ DistilBERT")
print(heavy_rule)
print(f"Test size: {len(true_labels)} samples")
print(f"Classes: {len(unique_labels)}")
print(light_rule)
print(f"Accuracy:    {accuracy:.2%}")
print(f"Macro F1:    {macro_f1:.2%}")
print(f"Weighted F1: {weighted_f1:.2%}")
print(light_rule)
print("\nПодробный отчет по классам:")
report = classification_report(
    true_labels,
    pred_labels,
    labels=labels_list,
    target_names=target_names,
    zero_division=0,
)
print(report)
print("Confusion Matrix:")
print(confusion_matrix(true_labels, pred_labels, labels=labels_list))

# Persist the fine-tuned model and its tokenizer side by side.
model.save_pretrained("distilbert-finetune-roles")
tokenizer.save_pretrained("distilbert-finetune-roles")
print("\nМодель сохранена в distilbert-finetune-roles/")




test size: 3
                             precision    recall  f1-score   support

Активного соисполнительства       0.00      0.00      0.00         1
   Кураторства и управления       0.00      0.00      0.00         0
      Периферийного участия       0.67      1.00      0.80         2

                   accuracy                           0.67         3
                  macro avg       0.22      0.33      0.27         3
               weighted avg       0.44      0.67      0.53         3

Confusion matrix:
 [[0 0 1]
 [0 0 0]
 [0 0 2]]


SafetensorError: Error while serializing: I/O error: No space left on device (os error 28)

In [None]:
# Pattern -> roles lookup used to turn a predicted pattern into a role.
# Keys are the pattern label strings produced by classify_user; each value
# lists the roles associated with that pattern. predict_user_role reports the
# FIRST role of the matching list, so the order within each list matters.
PATTERN_TO_ROLES = {
    "Пассивного потребления": ["Lurker", "Passive user", "Rare contributor"],
    "Инициации обратной связи": ["Bug reporter", "Coordinator"],
    "Периферийного участия": ["Peripheral developer", "Nomad Coder", "Independent"],
    "Активного соисполнительства": ["Bug fixer", "Active developer", "Code Warrior"],
    "Кураторства и управления": ["Project steward", "Coordinator", "Progress controller"],
    "Лидерства и наставничества": ["Project leader", "Core member", "Core developer"],
    "Социального влияния": ["Project Rockstar"],
}

def predict_user_role(prs_authored: int, reviews: int, comments: list[str]) -> tuple[str, str, float]:
    """Predict a user's participation pattern and role.

    Args:
        prs_authored: Number of PRs the user authored.
        reviews: Number of PRs the user reviewed; negative values are clamped
            to 0, as in the training text format.
        comments: The user's comment bodies; they are joined with " | " and
            truncated to 400 characters, mirroring the training inputs.

    Returns:
        ``(pattern, role, confidence)``: the predicted pattern label, the first
        role listed for it in ``PATTERN_TO_ROLES`` ("Unknown" if the pattern is
        not in the table), and the softmax probability of the predicted class.
    """
    # Local import so the function does not depend on a later cell line having
    # imported torch before it is called.
    import torch

    n_comments = len(comments)
    comments_text = " | ".join(comments)[:400]
    text = f"[PRs:{prs_authored} Reviews:{max(0, reviews)} Comments:{n_comments}] {comments_text}"

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=256)
    # The Trainer may have moved the model to a GPU; the tokenizer always
    # returns CPU tensors, so move them to the model's device to avoid a
    # device-mismatch error.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    model.eval()  # disable dropout so predictions are deterministic
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.softmax(outputs.logits, dim=1)
    pred_id = torch.argmax(probs, dim=1).item()

    pattern = model.config.id2label[pred_id]
    role = PATTERN_TO_ROLES.get(pattern, ["Unknown"])[0]
    confidence = probs[0][pred_id].item()
    return pattern, role, confidence

# Usage examples: (PRs authored, PRs reviewed, comment bodies).
import torch
examples = [
    (0, 0, ["Thanks!", "Nice work"]),
    (5, 2, ["Fixed the bug", "Implemented new feature", "Added tests"]),
    (0, 10, ["LGTM", "Please fix this", "Approved", "Merge when ready"]),
    (0, 1, ["This causes a crash", "Bug: doesn't work on Windows"]),
]

print("Примеры предсказаний:")
# Loop variables are named so they do not overwrite the notebook-global `prs`
# list holding the loaded PR data.
for n_prs, n_reviews, example_comments in examples:
    pattern, role, conf = predict_user_role(n_prs, n_reviews, example_comments)
    print(f"  PRs:{n_prs} Reviews:{n_reviews} → {pattern} ({role}) [{conf:.0%}]")
