# CRIS — Model Training & Evaluation

Trains Logistic Regression (baseline), Random Forest, and (when installed) XGBoost on the
SQL-engineered feature matrix. Evaluates with PR-AUC, ROC-AUC, F1, and confusion matrices.

**Prerequisites:** Run `python src/data_ingestion.py` first.

In [None]:
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from pathlib import Path

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    precision_recall_curve,
    average_precision_score,
    roc_auc_score,
    f1_score,
    ConfusionMatrixDisplay,
    PrecisionRecallDisplay
)

try:
    from xgboost import XGBClassifier
    HAS_XGBOOST = True
except ImportError:
    HAS_XGBOOST = False
    print('XGBoost not installed. Skipping XGBoost model.')

from src.preprocessing import prepare_data, FEATURE_COLUMNS

# Trained artifacts are written to the sibling `models` directory, which is
# created here if it does not exist yet.
MODELS_DIR = Path('..', 'models')
MODELS_DIR.mkdir(exist_ok=True)

# Use the same seaborn theme for every figure in this notebook.
sns.set_theme(style='whitegrid')

print('Libraries loaded.')

## 1. Load & Prepare Data

In [None]:
# prepare_data() returns a dict; pull out the entries used by later cells.
data = prepare_data()

X_train, X_test = data['X_train'], data['X_test']
y_train, y_test = data['y_train'], data['y_test']
X_train_scaled, X_test_scaled = data['X_train_scaled'], data['X_test_scaled']
scaler = data['scaler']

# Quick sanity report on the feature list and split sizes.
print(f'\nFeatures: {FEATURE_COLUMNS}')
print(f'Train shape: {X_train.shape}')
print(f'Test shape:  {X_test.shape}')

## 2. Model Training

In [None]:
# ── Baseline: Logistic Regression ──
# Fitted on the scaled feature matrix (the tree models below use the
# unscaled one).
lr = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    random_state=42
)
lr.fit(X_train_scaled, y_train)
print('✅ Logistic Regression trained.')

# ── Challenger 1: Random Forest ──
rf = RandomForestClassifier(
    n_estimators=200,
    class_weight='balanced',
    max_depth=10,
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)  # RF doesn't need scaling
print('✅ Random Forest trained.')

# ── Challenger 2: XGBoost ──
# Only trained when the optional xgboost import succeeded.
if HAS_XGBOOST:
    # Compute scale_pos_weight for class imbalance: ratio of negative to
    # positive training labels, falling back to 1 when there are no positives.
    n_neg = (y_train == 0).sum()
    n_pos = (y_train == 1).sum()
    spw = n_neg / n_pos if n_pos > 0 else 1

    xgb = XGBClassifier(
        n_estimators=200,
        max_depth=6,
        scale_pos_weight=spw,
        learning_rate=0.1,
        random_state=42,
        # NOTE(review): `use_label_encoder` is deprecated in recent XGBoost
        # releases and may only produce a warning — confirm against the
        # pinned xgboost version before removing it.
        use_label_encoder=False,
        eval_metric='logloss'
    )
    xgb.fit(X_train, y_train)
    print('✅ XGBoost trained.')

## 3. Evaluation

In [None]:
def evaluate_model(model, X, y, model_name, needs_scaling=False, X_scaled=None):
    """Evaluate a fitted binary classifier, print a report, and return metrics.

    Args:
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X: Feature matrix used as-is when ``needs_scaling`` is false.
        y: True labels for the rows being evaluated.
        model_name: Label used in the printed report and the returned dict.
        needs_scaling: True when ``model`` must be fed scaled features.
        X_scaled: Scaled counterpart of ``X``. Only consulted when
            ``needs_scaling`` is true. When omitted, the notebook-level
            ``X_test_scaled`` is used, which is only correct if ``X`` and
            ``y`` are the test split. Pass it explicitly for any other data.

    Returns:
        dict with keys ``name``, ``model``, ``pr_auc``, ``roc_auc``, ``f1``,
        ``y_pred``, ``y_prob`` and ``needs_scaling``.
    """
    # Pick the matrix the model is actually scored on. Previously `X` was
    # silently ignored whenever needs_scaling was true; the explicit
    # `X_scaled` argument lets callers keep features and labels aligned.
    if not needs_scaling:
        X_eval = X
    elif X_scaled is not None:
        X_eval = X_scaled
    else:
        X_eval = X_test_scaled

    y_pred = model.predict(X_eval)
    # Column 1 of predict_proba is the probability of the positive class.
    y_prob = model.predict_proba(X_eval)[:, 1]

    pr_auc = average_precision_score(y, y_prob)
    roc = roc_auc_score(y, y_prob)
    f1 = f1_score(y, y_pred)

    separator = "═" * 50
    print(f'\n{separator}')
    print(f'  {model_name}')
    print(f'{separator}')
    print(f'  PR-AUC:   {pr_auc:.4f}')
    print(f'  ROC-AUC:  {roc:.4f}')
    print(f'  F1-Score: {f1:.4f}')
    print(f'\n{classification_report(y, y_pred, target_names=["Safe", "Churned"])}')

    return {'name': model_name, 'model': model, 'pr_auc': pr_auc, 'roc_auc': roc, 'f1': f1,
            'y_pred': y_pred, 'y_prob': y_prob, 'needs_scaling': needs_scaling}

# Score each trained model on the test split. Only Logistic Regression is
# flagged as needing scaled features; XGBoost is included when available.
results = [
    evaluate_model(lr, X_test, y_test, 'Logistic Regression', needs_scaling=True),
    evaluate_model(rf, X_test, y_test, 'Random Forest'),
]
if HAS_XGBOOST:
    results.append(evaluate_model(xgb, X_test, y_test, 'XGBoost'))

## 4. Model Comparison

In [None]:
# ── Comparison table ──
# One row per evaluated model, indexed by model name.
comparison = pd.DataFrame([{
    'Model': r['name'],
    'PR-AUC': r['pr_auc'],
    'ROC-AUC': r['roc_auc'],
    'F1-Score': r['f1']
} for r in results]).set_index('Model')

print('\n📊 Model Comparison:')
print(comparison.to_string())

# ── Bar chart ──
# Grouped bars: one group per model, one bar per metric.
fig, ax = plt.subplots(figsize=(10, 5))
comparison.plot.bar(ax=ax, rot=0, colormap='viridis')
ax.set_title('Model Comparison', fontsize=16, fontweight='bold')
ax.set_ylabel('Score')
ax.legend(loc='lower right')
ax.set_ylim(0, 1.1)  # headroom above 1.0 so the bar labels are not clipped
for container in ax.containers:
    ax.bar_label(container, fmt='%.3f', fontsize=9)
plt.tight_layout()
plt.show()

## 5. Confusion Matrices

In [None]:
# One confusion-matrix panel per evaluated model, laid out in a single row.
n_models = len(results)
fig, axes = plt.subplots(1, n_models, figsize=(6 * n_models, 5))
if n_models == 1:
    # A single subplot comes back as a bare Axes; wrap it so zip() works.
    axes = [axes]

for panel, result in zip(axes, results):
    ConfusionMatrixDisplay.from_predictions(
        y_test,
        result['y_pred'],
        display_labels=['Safe', 'Churned'],
        cmap='Blues',
        ax=panel,
    )
    panel.set_title(result['name'], fontweight='bold')

fig.suptitle('Confusion Matrices', fontsize=16, fontweight='bold', y=1.02)
fig.tight_layout()
plt.show()

## 6. Precision-Recall Curves

In [None]:
# Overlay the precision-recall curve of every evaluated model on one axis.
curve_colors = ['#3498db', '#2ecc71', '#e74c3c']
fig, ax = plt.subplots(figsize=(10, 7))

# zip() pairs each result with a colour, so at most len(curve_colors)
# curves are drawn.
for result, curve_color in zip(results, curve_colors):
    precision, recall, _ = precision_recall_curve(y_test, result['y_prob'])
    curve_label = f"{result['name']} (PR-AUC={result['pr_auc']:.3f})"
    ax.plot(recall, precision, color=curve_color, linewidth=2, label=curve_label)

ax.set_title('Precision-Recall Curves', fontsize=16, fontweight='bold')
ax.set_xlabel('Recall', fontsize=13)
ax.set_ylabel('Precision', fontsize=13)
ax.set_xlim(0, 1.05)
ax.set_ylim(0, 1.05)
ax.legend(fontsize=12)
fig.tight_layout()
plt.show()

## 7. Feature Importances

In [None]:
# Use the best tree-based model for feature importances.
# Candidates are the evaluated models that expose `feature_importances_`;
# the one with the highest PR-AUC is selected instead of relying on a fixed
# position in `results`.
tree_results = [r for r in results if hasattr(r['model'], 'feature_importances_')]
best_tree = max(tree_results, key=lambda r: r['pr_auc'])

# NOTE(review): assumes FEATURE_COLUMNS lists the training columns in the
# same order the model saw them — confirm against src.preprocessing.
importances = pd.Series(
    best_tree['model'].feature_importances_,
    index=FEATURE_COLUMNS
).sort_values(ascending=True)

# Ascending sort puts the most important feature at the top of the barh plot.
fig, ax = plt.subplots(figsize=(10, 8))
importances.plot.barh(ax=ax, color='#3498db')
ax.set_title(f"Feature Importances ({best_tree['name']})",
             fontsize=16, fontweight='bold')
ax.set_xlabel('Importance')
plt.tight_layout()
plt.show()

## 8. Save Best Model

In [None]:
import json

# Select best model by PR-AUC
best = max(results, key=lambda r: r['pr_auc'])
print(f'\n🏆 Best model: {best["name"]} (PR-AUC: {best["pr_auc"]:.4f})')

# Save model
joblib.dump(best['model'], MODELS_DIR / 'best_model.pkl')
print('✅ Model saved to models/best_model.pkl')

# Save scaler (written regardless of which model won; `needs_scaling` in the
# metadata below records whether the saved model needs it)
joblib.dump(scaler, MODELS_DIR / 'scaler.pkl')
print('✅ Scaler saved to models/scaler.pkl')

# Save metadata. Values are coerced to built-in types so json.dump does not
# depend on how numpy scalars or the feature sequence serialize.
metadata = {
    'model_name': best['name'],
    'pr_auc': float(best['pr_auc']),
    'roc_auc': float(best['roc_auc']),
    'f1': float(best['f1']),
    'features': list(FEATURE_COLUMNS),
    'needs_scaling': bool(best['needs_scaling'])
}

with open(MODELS_DIR / 'model_metadata.json', 'w', encoding='utf-8') as f:
    json.dump(metadata, f, indent=2)
print('✅ Metadata saved to models/model_metadata.json')

print(f'\n📦 All artifacts saved to {MODELS_DIR.resolve()}')