# Hyperparameter Tuning

**Purpose**: Systematic hyperparameter optimization for the `alert_scorer` model, comparing grid search, random search, and Bayesian optimization.

This notebook demonstrates:
- Grid search
- Random search
- Bayesian optimization with Optuna
- Cross-validation
- Parameter importance analysis

## Setup

In [None]:
import sys
sys.path.insert(0, '../')

from packages.training import FeatureExtractor, FeatureBuilder, ModelTrainer
from packages.storage import ClientFactory, get_connection_params
from notebook_utils import *

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from loguru import logger
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

# Apply the shared plotting defaults. setup_plotting is not defined in this
# notebook; presumably it is provided by the notebook_utils star import — confirm.
setup_plotting()
# Only surface Optuna log messages at WARNING level or above, which keeps the
# per-trial log lines out of the notebook output during optimization.
optuna.logging.set_verbosity(optuna.logging.WARNING)

## Configuration

In [None]:
# --- Data selection: which network and date range to extract training data for ---
NETWORK = 'ethereum'
START_DATE, END_DATE = '2024-01-01', '2024-02-29'
WINDOW_DAYS = 7          # forwarded as window_days to the feature extractor

# --- Evaluation and search budget ---
TEST_SIZE = 0.2          # fraction held out by train_test_split
RANDOM_STATE = 42        # seed for the split and the randomized search
N_TRIALS = 50            # number of Optuna trials

# Echo the key settings so the rendered notebook records what was run.
print(
    f"Network: {NETWORK}",
    f"Date Range: {START_DATE} to {END_DATE}",
    f"Optimization Trials: {N_TRIALS}",
    sep="\n",
)

## Load and Prepare Data

In [None]:
# Build the storage client factory for the configured network.
connection_params = get_connection_params(NETWORK)
client_factory = ClientFactory(connection_params)

# The client is only needed while the raw training data is extracted, so it
# stays scoped to the context manager.
with client_factory.client_context() as client:
    extractor = FeatureExtractor(client)
    data = extractor.extract_training_data(
        start_date=START_DATE, end_date=END_DATE, window_days=WINDOW_DAYS
    )

# Convert the extracted data into a feature matrix X and target y.
builder = FeatureBuilder()
X, y = builder.build_training_features(data)

# Hold out a test split; stratifying on y keeps the class ratio equal in both parts.
split_options = dict(test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print(f"Training set: {X_train.shape}", f"Test set: {X_test.shape}", sep="\n")

## Grid Search

In [None]:
# Candidate values per hyperparameter; the grid search evaluates every combination.
param_grid = dict(
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5, 7],
    n_estimators=[100, 200],
)
n_combinations = np.prod([len(values) for values in param_grid.values()])

print("Grid Search Configuration:")
print(f"Parameter grid: {param_grid}")
print(f"Total combinations: {n_combinations}")

# NOTE(review): GridSearchCV needs an object implementing the scikit-learn
# estimator interface; ModelTrainer is project code — confirm it qualifies.
base_trainer = ModelTrainer(model_type='alert_scorer')
grid_search = GridSearchCV(
    estimator=base_trainer,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=1,
)

print("\nRunning grid search...")
grid_search.fit(X_train, y_train)

print(f"\nBest parameters: {grid_search.best_params_}")
print(f"Best CV score: {grid_search.best_score_:.4f}")

## Grid Search Results Analysis

In [None]:
# Tabulate the per-combination CV results, best rank first.
results_df = pd.DataFrame(grid_search.cv_results_).sort_values('rank_test_score')

print("Top 10 parameter combinations:")
display_cols = ['params', 'mean_test_score', 'std_test_score', 'rank_test_score']
top_combinations = results_df[display_cols].head(10)
print(top_combinations)

## Random Search

In [None]:
from scipy.stats import uniform, randint

# Sampling distributions for the randomized search.
# uniform(loc, scale) covers [loc, loc + scale]; randint(low, high) excludes high.
# The resulting ranges match the ones used by the Optuna objective below.
param_distributions = dict(
    learning_rate=uniform(loc=0.01, scale=0.29),
    max_depth=randint(low=3, high=11),
    n_estimators=randint(low=50, high=301),
    min_child_weight=randint(low=1, high=11),
)

# Reuses base_trainer from the grid-search cell; 20 sampled configurations, 3-fold CV.
random_search = RandomizedSearchCV(
    estimator=base_trainer,
    param_distributions=param_distributions,
    n_iter=20,
    scoring='roc_auc',
    cv=3,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    verbose=1,
)

print("Running random search...")
random_search.fit(X_train, y_train)

print(f"\nBest parameters: {random_search.best_params_}")
print(f"Best CV score: {random_search.best_score_:.4f}")

## Bayesian Optimization with Optuna

In [None]:
def objective(trial):
    """Optuna objective: score one sampled hyperparameter configuration.

    Args:
        trial: The ``optuna`` trial object used to sample hyperparameters.

    Returns:
        The ``'cv_auc_mean'`` entry of the metrics returned by
        ``ModelTrainer.train`` when fitted on ``X_train``/``y_train`` with
        3 CV folds. The study maximizes this value.
    """
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0)
    }

    trainer = ModelTrainer(model_type='alert_scorer', **params)
    # Only the metrics are needed here; the fitted model is discarded.
    _, metrics = trainer.train(X_train, y_train, cv_folds=3)

    return metrics['cv_auc_mean']

# Seed the sampler so that a re-run proposes the same sequence of trials.
# Without an explicit sampler Optuna uses an unseeded TPE sampler, which makes
# the tuning result differ on every Restart & Run All.
# NOTE(review): full reproducibility also requires ModelTrainer.train to be
# deterministic for fixed inputs — confirm.
study = optuna.create_study(
    direction='maximize',
    study_name='alert_scorer_tuning',
    sampler=optuna.samplers.TPESampler(seed=RANDOM_STATE),
)
print(f"Running Optuna optimization with {N_TRIALS} trials...")
study.optimize(objective, n_trials=N_TRIALS, show_progress_bar=True)

print(f"\nBest trial:")
print(f"  Value: {study.best_trial.value:.4f}")
print(f"  Params: {study.best_trial.params}")

## Optimization History

In [None]:
# Plot the optimization history of the study (imported from
# optuna.visualization in the setup cell).
fig = plot_optimization_history(study)
fig.show()

## Parameter Importance

In [None]:
# Plot Optuna's hyperparameter importance estimate for the study's objective.
fig = plot_param_importances(study)
fig.show()

## Parallel Coordinate Plot

In [None]:
from optuna.visualization import plot_parallel_coordinate

# Parallel-coordinate view of the study's trials: sampled parameters together
# with the objective value.
fig = plot_parallel_coordinate(study)
fig.show()

## Parameter Relationships

In [None]:
# One row per trial, best objective value first (the study maximizes CV AUC).
trials_df = study.trials_dataframe().sort_values('value', ascending=False)

top_trial_columns = [
    'value',
    'params_learning_rate',
    'params_max_depth',
    'params_n_estimators',
]

print("Top 10 trials:")
print(trials_df[top_trial_columns].head(10))

In [None]:
# 2x3 grid: one scatter panel per tuned hyperparameter against the trial value.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

param_names = ['learning_rate', 'max_depth', 'n_estimators',
               'min_child_weight', 'subsample', 'colsample_bytree']

for ax, param in zip(axes, param_names):
    col_name = f'params_{param}'
    # A parameter without a column in the trials frame leaves its panel empty.
    if col_name not in trials_df.columns:
        continue
    ax.scatter(trials_df[col_name], trials_df['value'], alpha=0.6)
    ax.set_xlabel(param)
    ax.set_ylabel('CV AUC')
    ax.set_title(f'{param} vs Performance')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Train Final Model with Best Parameters

In [None]:
# Take the hyperparameters of the best Optuna trial as the final configuration.
best_params = study.best_trial.params
print(f"Training final model with best parameters:")
print(best_params)

# Refit on the training split. This run uses 5 CV folds, whereas the Optuna
# objective used 3, so the CV AUC here is not directly the study's best value.
final_trainer = ModelTrainer(model_type='alert_scorer', **best_params)
final_model, final_metrics = final_trainer.train(X_train, y_train, cv_folds=5)

# NOTE(review): 'test_auc' is read from the metrics returned by train(), which
# only received X_train/y_train — presumably an internal hold-out; confirm.
# It is not the score on X_test, which is computed in the next section.
print("\n=== Final Model Metrics ===")
print(f"Test AUC: {final_metrics['test_auc']:.4f}")
print(f"CV AUC: {final_metrics['cv_auc_mean']:.4f} ± {final_metrics['cv_auc_std']:.4f}")
print(f"CV Precision: {final_metrics['cv_precision_mean']:.4f} ± {final_metrics['cv_precision_std']:.4f}")
print(f"CV Recall: {final_metrics['cv_recall_mean']:.4f} ± {final_metrics['cv_recall_std']:.4f}")

## Evaluate on Test Set

In [None]:
from sklearn.metrics import roc_auc_score

# ROC and precision-recall analysis need continuous scores, not hard class labels.
# If the trained model exposes the scikit-learn classifier API, predict() would
# return labels, so prefer predict_proba() and keep the positive-class column.
# Models without predict_proba() keep the original behaviour.
# NOTE(review): the type returned by ModelTrainer.train is not visible here —
# confirm which branch applies and that predict() yields probabilities.
if hasattr(final_model, 'predict_proba'):
    class_scores = np.asarray(final_model.predict_proba(X_test))
    # A 2-D result has one column per class; the last column is the positive class.
    y_pred_proba = class_scores[:, -1] if class_scores.ndim == 2 else class_scores
else:
    y_pred_proba = final_model.predict(X_test)

test_auc = roc_auc_score(y_test, y_pred_proba)

print(f"Test Set AUC: {test_auc:.4f}")

plot_roc_curve(y_test, y_pred_proba, model_name='Optimized Model')
plt.show()

plot_pr_curve(y_test, y_pred_proba, model_name='Optimized Model')
plt.show()

## Compare Methods

In [None]:
# Best score per tuning method. The first three are cross-validated AUCs on the
# training split; the last is the AUC of the final model on the held-out test set.
comparison = {
    'Grid Search': grid_search.best_score_,
    'Random Search': random_search.best_score_,
    'Bayesian (Optuna)': study.best_trial.value,
    'Final Model (Test)': test_auc,
}

plot_metric_comparison(comparison, 'Hyperparameter Tuning Method Comparison')
plt.ylabel('AUC Score')
plt.show()

summary_rows = [f"{method:25s}: {score:.4f}" for method, score in comparison.items()]
print("\nComparison Summary:", *summary_rows, sep="\n")

## Cross-Validation Stability

In [None]:
# Repeat the 5-fold training run five times and collect the mean CV AUC of each.
# NOTE(review): if ModelTrainer.train is deterministic for fixed inputs, all
# five runs will coincide and the spread below will be zero — confirm.
cv_scores = []
for _ in range(5):
    _, run_metrics = final_trainer.train(X_train, y_train, cv_folds=5)
    cv_scores.append(run_metrics['cv_auc_mean'])

mean_score = np.mean(cv_scores)
run_numbers = range(1, len(cv_scores) + 1)

fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(run_numbers, cv_scores, marker='o', linewidth=2, markersize=8)
ax.axhline(y=mean_score, color='r', linestyle='--', label=f'Mean: {mean_score:.4f}')
ax.set_xlabel('Run')
ax.set_ylabel('CV AUC')
ax.set_title('Cross-Validation Stability (5 Runs)')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print(f"CV Score Stability:")
print(f"  Mean: {mean_score:.4f}")
print(f"  Std:  {np.std(cv_scores):.4f}")
print(f"  Min:  {np.min(cv_scores):.4f}")
print(f"  Max:  {np.max(cv_scores):.4f}")

## Save Best Parameters

In [None]:
import json

# Data window the tuning was run on, recorded alongside the parameters.
training_period = {
    'start_date': START_DATE,
    'end_date': END_DATE,
    'window_days': WINDOW_DAYS,
}

# Everything needed to reproduce or reuse the tuned configuration.
best_config = {
    'model_type': 'alert_scorer',
    'network': NETWORK,
    'tuning_method': 'optuna',
    'n_trials': N_TRIALS,
    'best_params': best_params,
    'best_cv_score': study.best_trial.value,
    'test_score': test_auc,
    'training_period': training_period,
}

print("Best Configuration:")
print(json.dumps(best_config, indent=2))

# Persisting is opt-in so that a plain Run All does not write files.
# Uncomment to save
# with open('best_hyperparameters.json', 'w') as f:
#     json.dump(best_config, f, indent=2)
# print("\nSaved to best_hyperparameters.json")

## Conclusions

**Hyperparameter Tuning Results**:

1. **Best Method**: Compare Grid Search, Random Search, and Bayesian Optimization
2. **Important Parameters**: Review parameter importance plot
3. **Optimal Configuration**: Use best parameters for production
4. **Stability**: Verify cross-validation consistency

**Key Insights**:
- Bayesian optimization typically finds better parameters in fewer trials than grid or random search
- Parameter importance helps identify which hyperparameters drive performance
- Consistent scores across repeated cross-validation runs indicate a robust configuration

**Next Steps**:
- Use optimized model for comprehensive evaluation
- Compare with baseline in Model Comparison notebook
- Analyze predictions in Error Analysis notebook
- Deploy best model to production