# DistilBERT — 7 паттернов + роли

## Данные
- Источник: microsoft/vscode (цель 7000 PR, файл data/pr_samples_vscode_7000.json)
- Вход: строка вида `[PRs:x Reviews:y Comments:z] тексты комментариев пользователя` (комментарии соединяются через ` | ` и обрезаются до 400 символов)

## Паттерны → роли
| Паттерн | Роли |
|---------|------|
| Пассивного потребления | Lurker, Passive user, Rare contributor |
| Инициации обратной связи | Bug reporter, Coordinator |
| Периферийного участия | Peripheral developer, Nomad Coder, Independent |
| Активного соисполнительства | Bug fixer, Active developer, Code Warrior |
| Кураторства и управления | Project steward, Coordinator, Progress controller |
| Лидерства и наставничества | Project leader, Core member, Core developer |
| Социального влияния | Project Rockstar |


In [None]:
# Setup (Colab-friendly)
import sys, subprocess, pkg_resources, os
from pathlib import Path
# Install accelerate only when it is missing or older than 0.26.0.
# get_distribution() signals those two cases with DistributionNotFound and
# VersionConflict; catching only them keeps Ctrl-C and unrelated failures
# from silently triggering a pip run.
try:
    pkg_resources.get_distribution('accelerate>=0.26.0')
except (pkg_resources.DistributionNotFound, pkg_resources.VersionConflict):
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-q', 'accelerate>=0.26.0'])

# When running inside Google Colab, fetch the repository and switch into it.
if 'google.colab' in sys.modules:
    # stderr is discarded and `|| true` forces a zero exit status, so a failed
    # clone (e.g. the directory already exists) does not stop the cell.
    !git clone https://github.com/elenagernichenko/github-analyzer.git 2>/dev/null || true
    %cd /content/github-analyzer

# Resolved after the optional %cd above: on Colab this is the cloned repo,
# otherwise it is the current working directory of the notebook.
PROJECT_ROOT = Path('.').resolve()
PR_SAMPLES = PROJECT_ROOT / 'data' / 'pr_samples_vscode_7000.json'
print('DATA:', PR_SAMPLES)


In [None]:
# Imports
import json
from collections import Counter, defaultdict
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

# Use the GPU when torch can see one; fall back to the CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print('device:', device)


In [None]:
# Load data
# Read the PR dump; the list of PRs is stored under the top-level 'prs' key.
with open(PR_SAMPLES, 'r', encoding='utf-8') as f:
    data = json.load(f)
prs = data.get('prs', [])

# Per-user aggregates:
#   authored - number of PRs the user opened
#   comments - non-empty comment bodies written by the user
#   prs      - numbers of the PRs the user commented on
user_stats = defaultdict(lambda: {'authored': 0, 'comments': [], 'prs': set()})
for pr in prs:
    author = pr.get('author', '')
    if author:
        user_stats[author]['authored'] += 1
    # `or []` / `or ''` also cover keys that are present but null in the JSON;
    # a plain .get(key, default) only helps when the key is missing, so a
    # null body would otherwise crash on .strip().
    for c in pr.get('comments') or []:
        user = c.get('user') or ''
        body = (c.get('body') or '').strip()
        if not (user and body):
            continue
        # Substring filter for automation accounts. NOTE(review): it also drops
        # any human login that merely contains "bot" or "copilot".
        login = user.lower()
        if 'bot' in login or 'copilot' in login:
            continue
        user_stats[user]['comments'].append(body)
        user_stats[user]['prs'].add(pr.get('number'))

print('Users total:', len(user_stats))


In [None]:
# Heuristic classification (7 паттернов)
def classify(text: str, authored: int, reviewed: int, n_comments: int) -> str:
    """Pick one of the seven behaviour patterns using count and keyword rules.

    The rules are tried in a fixed order and the first one that matches wins.
    Keyword checks are case-insensitive substring searches over ``text``.

    Args:
        text: Concatenated comment text of the user.
        authored: Number of PRs the user authored.
        reviewed: Number of PRs counted as reviewed by the user.
        n_comments: Number of comments the user wrote.

    Returns:
        The pattern name (a Russian label string).
    """
    lowered = text.lower()

    def mentions(*keywords: str) -> bool:
        # True when at least one keyword occurs anywhere in the lowered text.
        return any(word in lowered for word in keywords)

    # Almost no activity at all.
    if authored + reviewed + n_comments < 2:
        return 'Пассивного потребления'

    # Comments but no own PRs, and the comments talk about problems.
    only_commenting = authored == 0 and n_comments > 0
    if only_commenting and mentions('bug', 'issue', 'error', 'crash', 'fail'):
        return 'Инициации обратной связи'

    if mentions('community', 'team', 'milestone', 'release'):
        return 'Социального влияния'

    # Heavy contributor whose comments touch on design or documentation.
    heavy_contributor = authored > 10 and n_comments > 20
    if heavy_contributor and mentions('architecture', 'design', 'explain', 'doc'):
        return 'Лидерства и наставничества'

    # Mostly reviewing, or using typical review vocabulary.
    if reviewed > authored * 2 or mentions('lgtm', 'approved', 'merge', 'please fix', 'please add'):
        return 'Кураторства и управления'

    if authored > 3 or mentions('fixed', 'implemented', 'added', 'commit', 'updated'):
        return 'Активного соисполнительства'

    # Nothing matched.
    return 'Периферийного участия'

# One training row per user who wrote at least two comments. The label comes
# from the heuristic classifier; the text is a counter prefix followed by the
# user's comments.
rows = []
for login, info in user_stats.items():
    user_comments = info['comments']
    n_comments = len(user_comments)
    if n_comments < 2:
        continue

    authored = info['authored']
    # PRs the user commented on, minus the ones they authored (never negative).
    reviewed = max(0, len(info['prs']) - authored)

    # Comments are joined with ' | ' and cut to the first 400 characters.
    joined = ' | '.join(user_comments)[:400]
    rows.append({
        'text': f'[PRs:{authored} Reviews:{reviewed} Comments:{n_comments}] {joined}',
        'label': classify(joined, authored, reviewed, n_comments),
    })

print('Users with >=2 comments:', len(rows))
print('Distribution:', dict(Counter(r['label'] for r in rows)))


In [None]:
# Dataset + class weights
# Label <-> id maps; ids follow the sorted order of the label names.
unique_labels = sorted({row['label'] for row in rows})
id2label = dict(enumerate(unique_labels))
label2id = {name: idx for idx, name in id2label.items()}

df = pd.DataFrame(rows)
counts = df['label'].value_counts()

# Weight per class id: n_samples / (n_classes * class_count).
n_samples, n_classes = len(df), len(counts)
class_weights = {
    label2id[name]: n_samples / (n_classes * freq)
    for name, freq in counts.items()
}
print('Weights:', {id2label[i]: f'{w:.2f}' for i, w in class_weights.items()})

# 80/20 split; stratification is skipped when some class has fewer than two
# samples.
too_rare = counts.min() < 2
if too_rare:
    print("Stratify disabled: rare class has <2 samples")
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=None if too_rare else df['label'],
)

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-multilingual-cased')

def preprocess(batch):
    """Tokenize a batch of texts and attach integer class ids as ``labels``.

    Texts are truncated and padded to exactly 256 tokens; label strings are
    mapped to ids through the module-level ``label2id``.
    """
    encoded = tokenizer(
        batch['text'],
        truncation=True,
        padding='max_length',
        max_length=256,
    )
    encoded['labels'] = [label2id[name] for name in batch['label']]
    return encoded

# Turn both frames into tokenized datasets; the raw text/label columns are
# dropped once preprocess() has produced the model inputs.
raw_columns = ['text', 'label']
train_ds = (
    Dataset.from_pandas(train_df, preserve_index=False)
    .map(preprocess, batched=True, remove_columns=raw_columns)
)
test_ds = (
    Dataset.from_pandas(test_df, preserve_index=False)
    .map(preprocess, batched=True, remove_columns=raw_columns)
)
print(f'Train: {len(train_ds)}, Test: {len(test_ds)}, Classes: {len(unique_labels)}')


In [None]:
# Weighted Trainer
class WeightedTrainer(Trainer):
    """Trainer whose loss is cross-entropy scaled by per-class weights."""

    def __init__(self, class_weights, *args, **kwargs):
        """Keep ``class_weights`` as a float tensor ordered by class id.

        ``class_weights`` is indexed with 0..len-1; all remaining arguments
        are forwarded to ``Trainer`` unchanged.
        """
        super().__init__(*args, **kwargs)
        ordered = [class_weights[idx] for idx in range(len(class_weights))]
        self.w = torch.tensor(ordered, dtype=torch.float)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model and return the class-weighted cross-entropy loss.

        ``labels`` is removed from ``inputs`` before the forward pass. Extra
        keyword arguments are accepted and ignored.
        """
        labels = inputs.pop('labels')
        outputs = model(**inputs)
        logits = outputs.logits
        # The weight tensor has to sit on the same device as the logits.
        criterion = torch.nn.CrossEntropyLoss(weight=self.w.to(logits.device))
        loss = criterion(logits, labels)
        if return_outputs:
            return loss, outputs
        return loss

# Sequence classifier with one output per pattern; the label maps are stored
# in the model config so predictions can be decoded later.
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-multilingual-cased',
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id,
)

args = TrainingArguments(
    output_dir='./ckpt',
    num_train_epochs=10,
    per_device_train_batch_size=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    logging_steps=10,
    report_to='none',
)

trainer = WeightedTrainer(
    class_weights=class_weights,
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
)

print(f'Training {len(train_ds)} samples, {len(unique_labels)} classes')
trainer.train()


In [None]:
# (obsolete cell, intentionally left empty; the evaluation is in the next cell)


In [None]:
# Evaluation (fixed)
# Predict on the held-out split and take the highest-scoring class per row.
preds = trainer.predict(test_ds)
pred_labels = np.argmax(preds.predictions, axis=1)
true_labels = preds.label_ids

# Pin the full label set. Without `labels=`, sklearn derives the classes from
# the values that actually occur; if a rare class is absent from both the
# test labels and the predictions, the list no longer matches `target_names`
# and classification_report raises a ValueError. It also keeps the confusion
# matrix rows/columns aligned with the class ids.
all_ids = sorted(id2label)
target_names = [id2label[i] for i in all_ids]

print('=' * 60)
print('RESULTS (class weights)')
print('=' * 60)
print(f'Accuracy: {accuracy_score(true_labels, pred_labels):.2%}')
# Macro F1 is left over the classes that occur, as before.
print(f'Macro F1: {f1_score(true_labels, pred_labels, average="macro", zero_division=0):.2%}')
print('\n' + classification_report(true_labels, pred_labels, labels=all_ids, target_names=target_names, zero_division=0))
print('Confusion Matrix:')
print(confusion_matrix(true_labels, pred_labels, labels=all_ids))


In [None]:
# Pattern -> role mapping and predict helper
# Candidate roles for every behaviour pattern. predict_user() reports the
# first role of the list.
PATTERN_TO_ROLES = {
    'Пассивного потребления': [
        'Lurker', 'Passive user', 'Rare contributor',
    ],
    'Инициации обратной связи': [
        'Bug reporter', 'Coordinator',
    ],
    'Периферийного участия': [
        'Peripheral developer', 'Nomad Coder', 'Independent',
    ],
    'Активного соисполнительства': [
        'Bug fixer', 'Active developer', 'Code Warrior',
    ],
    'Кураторства и управления': [
        'Project steward', 'Coordinator', 'Progress controller',
    ],
    'Лидерства и наставничества': [
        'Project leader', 'Core member', 'Core developer',
    ],
    'Социального влияния': [
        'Project Rockstar',
    ],
}

def predict_user(prs_authored: int, reviews: int, comments: list[str]) -> tuple[str, str, float]:
    """Predict the behaviour pattern and a representative role for one user.

    The input string is built the same way as the training rows: a
    ``[PRs:.. Reviews:.. Comments:..]`` prefix followed by the comments joined
    with ' | ' and cut to 400 characters.

    Args:
        prs_authored: Number of PRs the user authored.
        reviews: Number of PRs the user reviewed.
        comments: Comment texts written by the user.

    Returns:
        ``(pattern, role, confidence)``: the predicted pattern name, the first
        role listed for it in ``PATTERN_TO_ROLES`` ('Unknown' when the pattern
        has no entry) and the softmax probability of the predicted class.
    """
    text_comments = ' | '.join(comments)[:400]
    text = f'[PRs:{prs_authored} Reviews:{reviews} Comments:{len(comments)}] {text_comments}'
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=256)
    # The tokenizer returns CPU tensors, while the model stays wherever
    # training put it (the GPU when one is available). Move the inputs to the
    # model's device to avoid a device-mismatch error.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
    # Inference must run with dropout disabled, even if no evaluation pass
    # switched the model to eval mode beforehand.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    probs = torch.softmax(outputs.logits, dim=1)
    pred_id = torch.argmax(probs, dim=1).item()
    pattern = model.config.id2label[pred_id]
    role = PATTERN_TO_ROLES.get(pattern, ['Unknown'])[0]
    conf = probs[0][pred_id].item()
    return pattern, role, conf

# Smoke test on a few hand-written users: (authored PRs, reviews, comments).
demo_users = [
    (0, 0, ['Thanks!', 'Nice work']),
    (5, 2, ['Fixed bug', 'Implemented feature', 'Added tests']),
    (0, 5, ['LGTM', 'Please fix this', 'Approved']),
    (0, 1, ['Bug: crashes on start', 'Error on Windows']),
]

print('Примеры предсказаний:')
for n_prs, n_reviews, sample_comments in demo_users:
    pattern_name, role_name, confidence = predict_user(n_prs, n_reviews, sample_comments)
    print(f'  [{confidence:.0%}] {pattern_name} -> {role_name}')
