# Model Training

**Purpose**: Interactive model training and quick iteration

This notebook provides:
- Quick model training experiments
- Parameter adjustments
- Immediate feedback on metrics
- Model comparison
- Optional model saving (disabled by default)

## Setup

In [None]:
# Put the parent directory first on the import path before importing the
# project modules below (presumably so `packages` and `notebook_utils`
# resolve when the notebook runs from its own folder — confirm layout).
import sys
sys.path.insert(0, '../')

from packages.training import FeatureExtractor, FeatureBuilder, ModelTrainer
from packages.storage import ClientFactory, get_connection_params
# NOTE(review): wildcard import. setup_plotting and the plot_* helpers used
# later in this notebook are not defined here, so they presumably come from
# this module; importing them by name would make that explicit.
from notebook_utils import *

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from loguru import logger
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score

# Apply the shared plotting configuration once, before any figure is drawn.
setup_plotting()

## Configuration

In [None]:
# --- Experiment configuration ---------------------------------------------
# Data selection: network passed to get_connection_params, and the date range
# handed to the feature extractor.
NETWORK = 'ethereum'
START_DATE = '2024-01-01'
END_DATE = '2024-02-29'

# Window length (days) forwarded to extract_training_data.
WINDOW_DAYS = 7

# Hold-out fraction and seed used by train_test_split.
TEST_SIZE = 0.2
RANDOM_STATE = 42

# Echo the active settings so they are recorded next to the results.
print("\n".join((
    f"Network: {NETWORK}",
    f"Training Period: {START_DATE} to {END_DATE}",
    f"Window: {WINDOW_DAYS} days",
    f"Test Split: {TEST_SIZE * 100}%",
)))

## Load and Prepare Data

In [None]:
# Resolve connection settings for the configured network and build the
# factory that hands out storage clients.
connection_params = get_connection_params(NETWORK)
client_factory = ClientFactory(connection_params)

# Extraction arguments, taken straight from the configuration cell.
extraction_args = {
    'start_date': START_DATE,
    'end_date': END_DATE,
    'window_days': WINDOW_DAYS,
}

# The client is only needed while extracting; the context manager scopes it.
with client_factory.client_context() as client:
    extractor = FeatureExtractor(client)
    data = extractor.extract_training_data(**extraction_args)

sample_count = len(data)
print(f"Extracted {sample_count:,} samples")

## Build Features

In [None]:
# Turn the extracted samples into a feature matrix X and a target y.
builder = FeatureBuilder()
X, y = builder.build_training_features(data)

print(f"Feature matrix: {X.shape}")
print(f"Target: {y.shape}")

# Absolute number of samples per class.
class_counts = y.value_counts()
print("\nClass distribution:")
print(class_counts)

# Share of each class in percent. Printed one class per line: formatting the
# whole Series inside a single f-string would place the '%' after the Series'
# dtype footer instead of next to each value.
class_share_pct = (class_counts / len(y) * 100).round(2)
print("\nClass balance:")
print("\n".join(
    f"  {label}: {share}%" for label, share in class_share_pct.items()
))

## Train/Test Split

In [None]:
# Stratified hold-out split: stratify=y keeps the class proportions of y in
# both partitions; the seed makes the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

# Shapes first, then the per-class counts of each partition.
print(f"Training set: {X_train.shape}", f"Test set: {X_test.shape}", sep="\n")
print("\nTraining class distribution:", y_train.value_counts(), sep="\n")
print("\nTest class distribution:", y_test.value_counts(), sep="\n")

## Train Alert Scorer Model

In [None]:
# Fit the alert scorer on the training partition with 5-fold cross-validation.
trainer = ModelTrainer(model_type='alert_scorer')
model, metrics = trainer.train(X_train, y_train, cv_folds=5)

print("\n=== Alert Scorer Metrics ===")
# NOTE(review): train() only receives the training partition, so this
# 'test_auc' presumably comes from a hold-out made inside ModelTrainer, not
# from X_test/y_test — confirm against the trainer implementation.
print(f"Test AUC: {metrics['test_auc']:.4f}")

# Cross-validation summaries are stored as '<prefix>_mean' / '<prefix>_std'.
for display_name, key_prefix in (
    ("AUC", "cv_auc"),
    ("Precision", "cv_precision"),
    ("Recall", "cv_recall"),
    ("F1", "cv_f1"),
):
    cv_mean = metrics[key_prefix + "_mean"]
    cv_std = metrics[key_prefix + "_std"]
    print(f"CV {display_name}: {cv_mean:.4f} ± {cv_std:.4f}")

## Evaluate on Test Set

In [None]:
# Score the held-out partition.
# NOTE(review): the result is treated as a continuous score below (thresholded
# and passed to roc_auc_score) — confirm that model.predict returns
# probabilities rather than hard class labels.
y_pred_proba = model.predict(X_test)

# Scores strictly above the cut-off are labelled as the positive class (1).
DECISION_THRESHOLD = 0.5
y_pred = (y_pred_proba > DECISION_THRESHOLD).astype(int)

# AUC uses the raw scores; the classification report uses the hard labels.
test_auc = roc_auc_score(y_test, y_pred_proba)
print(f"Test Set AUC: {test_auc:.4f}")
print("\nClassification Report:")
report_text = classification_report(y_test, y_pred)
print(report_text)

## Visualize ROC Curve

In [None]:
# ROC curve for the scorer, using the test labels and the scores computed in
# the evaluation cell. plot_roc_curve is not defined in this notebook —
# presumably it is provided by the notebook_utils wildcard import.
plot_roc_curve(y_test, y_pred_proba, model_name='Alert Scorer')
plt.show()

## Visualize Precision-Recall Curve

In [None]:
# Precision-recall curve for the scorer, using the same test labels and
# scores as the ROC cell. plot_pr_curve is not defined in this notebook —
# presumably it is provided by the notebook_utils wildcard import.
plot_pr_curve(y_test, y_pred_proba, model_name='Alert Scorer')
plt.show()

## Confusion Matrix

In [None]:
# Confusion matrix of the hard predictions (scores thresholded at 0.5 in the
# evaluation cell) against the test labels.
# NOTE(review): label order presumably maps class 0 -> 'Low Risk' and
# class 1 -> 'High Risk' — confirm against plot_confusion_matrix.
plot_confusion_matrix(y_test, y_pred, labels=['Low Risk', 'High Risk'])
plt.show()

## Score Distribution

In [None]:
# Compare the predicted scores of the two actual classes, as overlaid
# histograms (left) and as box plots (right).

# Convert to plain NumPy arrays so the boolean masks are applied by position;
# this avoids any pandas index alignment between scores and labels.
scores = np.asarray(y_pred_proba)
actual = np.asarray(y_test)
scores_by_class = {
    'Low Risk': scores[actual == 0],
    'High Risk': scores[actual == 1],
}

fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: one semi-transparent histogram per actual class.
for class_name, class_scores in scores_by_class.items():
    hist_ax.hist(
        class_scores,
        bins=50,
        alpha=0.6,
        label=f'Actual: {class_name}',
        edgecolor='black',
    )
hist_ax.set_xlabel('Predicted Probability')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Score Distribution by Actual Class')
hist_ax.legend()
hist_ax.grid(True, alpha=0.3)

# Right panel: box plot per class. Tick labels are set explicitly instead of
# via boxplot(labels=...), which Matplotlib 3.9 deprecated in favour of
# tick_labels; setting the ticks directly works on old and new versions.
# boxplot places the boxes at positions 1..N by default.
box_ax.boxplot(list(scores_by_class.values()))
box_ax.set_xticks(list(range(1, len(scores_by_class) + 1)))
box_ax.set_xticklabels(list(scores_by_class))
box_ax.set_ylabel('Predicted Probability')
box_ax.set_title('Score Distribution Box Plot')
box_ax.grid(True, alpha=0.3, axis='y')

fig.tight_layout()
plt.show()

## Train Alert Ranker Model

In [None]:
# Fit the ranking variant on the same training partition (5-fold CV).
ranker_trainer = ModelTrainer(model_type='alert_ranker')
ranker_fit = ranker_trainer.train(X_train, y_train, cv_folds=5)
ranker_model, ranker_metrics = ranker_fit

print("\n=== Alert Ranker Metrics ===")
# Either metric may be absent from the returned dict; fall back to 'N/A'.
for heading, metric_key in (
    ("Test NDCG", "test_ndcg"),
    ("CV NDCG", "cv_ndcg_mean"),
):
    print(f"{heading}: {ranker_metrics.get(metric_key, 'N/A')}")

## Compare Models

In [None]:
# Compare the trained models on test AUC.
comparison_metrics = {'Alert Scorer': metrics['test_auc']}

# The ranker's metrics are read with .get() because 'test_auc' may be absent
# (the ranker cell reports NDCG). A missing value is left out of the chart
# rather than plotted as 0.0, which would look like a real, very poor score.
ranker_test_auc = ranker_metrics.get('test_auc')
if ranker_test_auc is None:
    print("Alert Ranker did not report 'test_auc'; omitting it from the AUC comparison.")
else:
    comparison_metrics['Alert Ranker'] = ranker_test_auc

plot_metric_comparison(comparison_metrics, 'Model Comparison: Test AUC')
plt.show()

## Save Model (Optional)

In [None]:
# Persisting the trained scorer is opt-in: set SAVE_MODEL to True to store it.
# A flag replaces the previous commented-out block, so enabling the save needs
# no code editing and the "disabled" message can no longer be printed after a
# successful save.
SAVE_MODEL = False

if SAVE_MODEL:
    # Imported here so ModelStorage is only required when saving is enabled.
    from packages.training import ModelStorage

    with client_factory.client_context() as client:
        storage = ModelStorage(client)
        # Store the scorer together with its metrics, the feature column
        # order, and the data-selection settings it was trained with.
        model_id = storage.save_model(
            model=model,
            model_type='alert_scorer',
            network=NETWORK,
            metrics=metrics,
            feature_names=X.columns.tolist(),
            training_config={
                'start_date': START_DATE,
                'end_date': END_DATE,
                'window_days': WINDOW_DAYS
            }
        )
        print(f"Model saved with ID: {model_id}")
else:
    print("Model saving disabled (set SAVE_MODEL = True to enable)")

## Experiment: Different Parameters

In [None]:
# Sweep the scorer's learning rate and record the reported test AUC per value.
learning_rates = [0.01, 0.05, 0.1]
lr_results = {}

for lr in learning_rates:
    print(f"\nTraining with learning_rate={lr}")
    trainer_kwargs = {'model_type': 'alert_scorer', 'learning_rate': lr}
    exp_trainer = ModelTrainer(**trainer_kwargs)
    # 3 folds instead of 5 here; only the metrics are used, not the model.
    exp_model, exp_metrics = exp_trainer.train(X_train, y_train, cv_folds=3)
    lr_auc = exp_metrics['test_auc']
    lr_results[f'LR={lr}'] = lr_auc
    print(f"Test AUC: {lr_auc:.4f}")

# Chart of test AUC, one entry per learning rate.
print("\nLearning Rate Comparison:")
plot_metric_comparison(lr_results, 'Learning Rate Impact on Test AUC')
plt.show()

## Experiment: Different Tree Depths

In [None]:
# Sweep the scorer's max_depth and record the reported test AUC per value.
max_depths = [3, 5, 7, 10]
depth_results = {}

for depth in max_depths:
    print(f"\nTraining with max_depth={depth}")
    trainer_kwargs = {'model_type': 'alert_scorer', 'max_depth': depth}
    exp_trainer = ModelTrainer(**trainer_kwargs)
    # 3 folds instead of 5 here; only the metrics are used, not the model.
    exp_model, exp_metrics = exp_trainer.train(X_train, y_train, cv_folds=3)
    depth_auc = exp_metrics['test_auc']
    depth_results[f'Depth={depth}'] = depth_auc
    print(f"Test AUC: {depth_auc:.4f}")

# Chart of test AUC, one entry per depth.
print("\nTree Depth Comparison:")
plot_metric_comparison(depth_results, 'Max Depth Impact on Test AUC')
plt.show()

## Feature Importance Quick View

In [None]:
# Pair each feature column with the importance reported by the underlying
# estimator (model.model) and rank from most to least important.
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.model.feature_importances_
}).sort_values('importance', ascending=False)

top_features = feature_importance.head(15)

print("\nTop 15 Most Important Features:")
print(top_features)

# Create the axes first and pass them to DataFrame.plot. Without ax=...,
# pandas opens a new figure of its own, which leaves a blank figure behind
# and ignores the requested figsize.
fig, ax = plt.subplots(figsize=(10, 6))
top_features.plot(x='feature', y='importance', kind='barh', ax=ax)
ax.set_title('Top 15 Feature Importances')
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
# Horizontal bars are drawn bottom-up; flip so the top-ranked feature is first.
ax.invert_yaxis()
fig.tight_layout()
plt.show()

## Conclusions

**Model Performance**:
- Review test AUC and cross-validation scores
- Check for overfitting (compare cross-validation scores against test performance)
- Analyze score distributions

**Parameter Experiments**:
- Learning rate impact
- Tree depth impact
- Other hyperparameter effects

**Next Steps**:
- Proceed to Hyperparameter Tuning for systematic optimization
- Use Model Evaluation notebook for detailed analysis
- Review Feature Importance notebook for deeper insights
- Consider Error Analysis to understand mistakes