# Baseline: время первого комментария (регрессия)
Простой числовой baseline для сравнения с LLM+MCP.


In [None]:
import json
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

# Locate the PR sample file under the parent of the current working directory
# and parse it as JSON.
PR_SAMPLES = Path("..").resolve().joinpath("data", "pr_samples.json")
with PR_SAMPLES.open("r", encoding="utf-8") as f:
    data = json.load(f)
# A payload without a "prs" key yields an empty list.
prs = data.get("prs", [])

# Build a simple tabular dataset for the regression baseline.
#
# PRs without a recorded first-response time are skipped instead of being
# imputed with 0.0: a zero target would label an unanswered PR as "answered
# instantly", biasing the fit and inflating MAPE (which divides by the target).
rows = []
for pr in prs:
    target = pr.get("first_response_hours")
    if target is None:
        # Key missing or explicitly null: no usable label for this PR.
        continue
    rows.append({
        "first_response_hours": target,
        # `or 0` also covers an explicit JSON null, which `.get(key, 0)`
        # would pass through as None and later turn into NaN.
        "issue_comments": pr.get("issue_comment_count") or 0,
        "review_comments": pr.get("review_comment_count") or 0,
        "total_comments": pr.get("total_comment_count") or 0,
        "participants": len(pr.get("participants") or []),
    })

# Explicit columns keep the schema stable even when no PR qualifies, so the
# column selection below does not raise KeyError on an empty frame.
df = pd.DataFrame(
    rows,
    columns=[
        "first_response_hours",
        "issue_comments",
        "review_comments",
        "total_comments",
        "participants",
    ],
)
# Features are every column except the target; the target is the regression label.
X = df.drop(columns=["first_response_hours"])
y = df["first_response_hours"]

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Ordinary least-squares baseline; fit() returns the fitted estimator itself.
model = LinearRegression().fit(X_train, y_train)

# Score the held-out rows: MAE in hours, MAPE rescaled from a fraction to percent.
preds = model.predict(X_test)
mae = mean_absolute_error(y_test, preds)
mape = 100 * mean_absolute_percentage_error(y_test, preds)

print("MAE (hours):", round(mae, 2))
print("MAPE (%):", round(mape, 2))
