# Domain Risk Training and Evaluation

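This notebook trains the domain risk model, evaluates it with a precision-recall curve and a confusion matrix at a threshold chosen for a 1% target false-positive rate, and exports the model artifacts.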

In [None]:
from pathlib import Path
import json
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import PrecisionRecallDisplay, confusion_matrix, ConfusionMatrixDisplay, precision_recall_curve, roc_curve
from sklearn.model_selection import train_test_split

from sentineldns.features.domain_features import build_domain_feature_matrix
from sentineldns.models.domain_risk import train_domain_risk_model
from sentineldns.models.export import export_joblib, export_metadata

# Input dataset and output location for exported artifacts, both relative
# to the notebook's working directory.
csv_path = Path('../data/processed/labeled_domains.csv')
artifacts_dir = Path('../data/artifacts/domain_risk')

df = pd.read_csv(csv_path)

# Quick sanity summary: total rows, rows per label, distinct domains per label.
row_count = df.shape[0]
rows_per_label = df['label'].value_counts()
unique_domains_per_label = df.groupby('label')['domain'].nunique()
print('rows:', row_count)
print(rows_per_label)
print(unique_domains_per_label)

# Last expression of the cell: preview of the first rows.
df.head()

In [None]:
# Operating-point configuration for the decision threshold.
TARGET_FPR = 0.01  # maximum false-positive rate tolerated on the held-out split
FALLBACK_THRESHOLD = 0.9  # used only when no ROC threshold satisfies TARGET_FPR

domains = df['domain'].astype(str).tolist()
y = df['label'].astype(int).to_numpy()

# The third return value of build_domain_feature_matrix is not needed here.
X, vectorizer, _ = build_domain_feature_matrix(domains)
# Stratified 75/25 split. The analysis cell below repeats test_size and
# random_state to recover the same test rows, so keep the two in sync.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

model = LogisticRegression(solver='liblinear', class_weight='balanced', max_iter=500, random_state=42)
model.fit(X_train, y_train)

# Probability of the positive class (label 1) for each held-out row.
probs = model.predict_proba(X_test)[:, 1]
precision, recall, _ = precision_recall_curve(y_test, probs)

# Pick the decision threshold from the ROC curve. Thresholds are returned in
# decreasing order while FPR is non-decreasing, so among the thresholds whose
# FPR stays within TARGET_FPR the *lowest* one flags the most positives
# (highest recall) without exceeding the false-positive budget. Taking the
# highest one instead would flag almost nothing.
# Non-finite thresholds (the "predict nothing" sentinel) are excluded.
fpr, _, thresholds = roc_curve(y_test, probs)
eligible = thresholds[np.isfinite(thresholds) & (fpr <= TARGET_FPR)]
threshold = float(eligible.min()) if eligible.size else FALLBACK_THRESHOLD
pred = (probs >= threshold).astype(int)

# NOTE(review): the threshold is selected on the same split it is evaluated
# on, so the precision/recall reported below are optimistic estimates.
cm = confusion_matrix(y_test, pred)
tp = cm[1, 1]
fp_count = cm[0, 1]
fn = cm[1, 0]
# max(..., 1) avoids a division by zero when nothing is predicted positive
# or when the split has no positive rows.
precision_at_threshold = float(tp / max(tp + fp_count, 1))
recall_at_threshold = float(tp / max(tp + fn, 1))

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
PrecisionRecallDisplay(precision=precision, recall=recall).plot(ax=axes[0])
axes[0].set_title('PR Curve (Domain Risk)')
ConfusionMatrixDisplay(cm).plot(ax=axes[1], colorbar=False)
axes[1].set_title(f'Confusion Matrix @ threshold={threshold:.4f}')
plt.tight_layout()

print({'threshold': threshold, 'precision': precision_at_threshold, 'recall': recall_at_threshold})

# Persist the fitted model, the vectorizer and the chosen operating point.
artifacts_dir.mkdir(parents=True, exist_ok=True)
export_joblib(artifacts_dir / 'model.joblib', model)
export_joblib(artifacts_dir / 'vectorizer.joblib', vectorizer)
export_metadata(
    artifacts_dir / 'metadata.json',
    {
        # Timestamp.now(tz='UTC') replaces the deprecated Timestamp.utcnow().
        'model_version': pd.Timestamp.now(tz='UTC').strftime('%Y%m%dT%H%M%SZ'),
        'train_rows': int(len(y_train)),
        'test_rows': int(len(y_test)),
        'threshold': float(threshold),
        'target_fpr': TARGET_FPR,
    },
)

# Also run package training path for parity with CLI/service behavior.
# NOTE(review): if train_domain_risk_model writes into the same artifacts
# directory it may overwrite the files exported above; confirm.
metrics = train_domain_risk_model(csv_path)
metrics

In [None]:
# False-positive / true-positive analysis
# Rebuild the exact test split indices so we can inspect domain-level outcomes.
# Positional row numbers are split (not index labels) because they are fed to
# df.iloc below; test_size / random_state / stratify must match the split used
# for training so the same rows are recovered.
row_positions = np.arange(len(df))
_, idx_test, _, y_test_rebuilt = train_test_split(
    row_positions, y, test_size=0.25, random_state=42, stratify=y
)

# Guard against hidden state: probs/pred/y_test come from the training cell
# and must line up row-for-row with the rebuilt split.
assert len(idx_test) == len(probs) == len(pred), 'rebuilt test split does not match probs/pred length'
assert np.array_equal(y_test_rebuilt, y_test), 'rebuilt test split does not match y_test from the training cell'

analysis_df = df.iloc[idx_test].copy().reset_index(drop=True)
analysis_df['prob'] = probs
analysis_df['pred'] = pred

# Rows flagged as risky (pred == 1), split by their true label and ordered by
# descending model confidence.
false_positives = analysis_df[(analysis_df['label'] == 0) & (analysis_df['pred'] == 1)].sort_values('prob', ascending=False)
true_positives = analysis_df[(analysis_df['label'] == 1) & (analysis_df['pred'] == 1)].sort_values('prob', ascending=False)

print('Top false positives')
display(false_positives[['domain', 'prob', 'source']].head(20))
print('Top true positives')
display(true_positives[['domain', 'prob', 'source']].head(20))