# Model Ensemble & Hyperparameter Tuning with Optuna

In [None]:
import numpy as np
import pandas as pd
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
import optuna
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load test data and model predictions
from sklearn.ensemble import IsolationForest

y_test = np.load('../models/y_test.npy')

# Load individual model predictions. Only a *missing* file triggers the dummy
# fallback; any other failure (corrupt file, interrupt, ...) is allowed to
# propagate instead of being silently replaced by random numbers.
# print() is used for the notice because warnings are globally suppressed
# in the import cell.
try:
    gnn_preds = np.load('../models/gnn_test_predictions.npy')
except FileNotFoundError:
    print("WARNING: GNN predictions not found - using random dummy predictions")
    gnn_preds = np.random.random(len(y_test)) * 0.1  # Dummy predictions

# Scores and hard predictions are loaded as a pair: if either file is missing,
# both are replaced so that they stay consistent with each other.
try:
    autoencoder_scores = np.load('../models/autoencoder_test_scores.npy')
    autoencoder_preds = np.load('../models/autoencoder_test_predictions.npy')
except FileNotFoundError:
    print("WARNING: autoencoder outputs not found - using random dummy scores")
    autoencoder_scores = np.random.random(len(y_test)) * 0.1
    autoencoder_preds = (autoencoder_scores > 0.05).astype(int)

# Create Isolation Forest predictions (simulate streaming results)
X_test = np.load('../models/X_test.npy')
iso_forest = IsolationForest(contamination=0.1, random_state=42)
iso_forest.fit(np.load('../models/X_train.npy'))
# decision_function: lower values = more anomalous (negated downstream).
iso_scores = iso_forest.decision_function(X_test)
# predict() labels anomalies as -1; convert to a 0/1 anomaly flag.
iso_preds = (iso_forest.predict(X_test) == -1).astype(int)

print(f"Test samples: {len(y_test)}")
print(f"GNN predictions shape: {gnn_preds.shape}")
print(f"Autoencoder predictions shape: {autoencoder_preds.shape}")
print(f"Isolation Forest predictions shape: {iso_preds.shape}")

In [None]:
# Individual model performance
# ROC AUC of each base model against the test labels. The Isolation Forest
# decision scores are negated because lower scores = more anomalous.
anomaly_scores_by_model = {
    'GNN': gnn_preds,
    'Autoencoder': autoencoder_scores,
    'Isolation Forest': -iso_scores,
}
models_performance = {
    label: roc_auc_score(y_test, scores)
    for label, scores in anomaly_scores_by_model.items()
}

print("Individual Model Performance (AUC):")
for label in models_performance:
    print(f"{label}: {models_performance[label]:.4f}")

In [None]:
# Create ensemble features
def _min_max_scale(values):
    """Rescale *values* to roughly [0, 1]; the 1e-8 term avoids division by zero."""
    lowest = values.min()
    return (values - lowest) / (values.max() - lowest + 1e-8)


# Normalize scores to [0,1] range.
# Isolation Forest scores are negated first so that higher = more anomalous.
gnn_norm = _min_max_scale(gnn_preds)
autoencoder_norm = _min_max_scale(autoencoder_scores)
iso_norm = _min_max_scale(-iso_scores)

# Ensemble feature matrix: three normalized score columns followed by the
# raw GNN output and the two hard-prediction columns (one row per sample).
feature_columns = [
    gnn_norm,
    autoencoder_norm,
    iso_norm,
    gnn_preds,
    autoencoder_preds,
    iso_preds,
]
ensemble_features = np.column_stack(feature_columns)

print(f"Ensemble features shape: {ensemble_features.shape}")

In [None]:
# Optuna hyperparameter optimization
from sklearn.model_selection import train_test_split

# Tune only on the rows the final-model cell trains on. The arguments mirror
# that cell's train_test_split call (test_size=0.3, random_state=42,
# stratified), so the 30% hold-out used for the final AUC is never seen during
# the hyperparameter search and the reported ensemble AUC is not inflated.
X_tune, _, y_tune, _ = train_test_split(
    ensemble_features, y_test, test_size=0.3, random_state=42, stratify=y_test
)


def objective(trial):
    """Return the mean 3-fold CV ROC AUC for one sampled meta-model.

    The trial picks either a random forest ('rf': n_estimators, max_depth)
    or a logistic regression ('lr': C on a log scale). The candidate is
    scored by cross-validation on the tuning split only.
    """
    # Suggest hyperparameters
    model_type = trial.suggest_categorical('model', ['rf', 'lr'])

    if model_type == 'rf':
        n_estimators = trial.suggest_int('n_estimators', 50, 200)
        max_depth = trial.suggest_int('max_depth', 3, 10)
        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=42
        )
    else:
        C = trial.suggest_float('C', 0.01, 10.0, log=True)
        model = LogisticRegression(C=C, random_state=42, max_iter=1000)

    # Cross-validation score (tuning split only - see note above)
    cv_scores = cross_val_score(model, X_tune, y_tune, cv=3, scoring='roc_auc')
    return cv_scores.mean()


# Run optimization
print("Starting hyperparameter optimization...")
# Seed the sampler so the search is reproducible, matching the fixed
# random_state=42 used for the models and splits.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)

print(f"Best AUC: {study.best_value:.4f}")
print(f"Best parameters: {study.best_params}")

In [None]:
# Train final ensemble model
from sklearn.model_selection import train_test_split

# Rebuild the winning estimator from the best Optuna trial.
best_params = study.best_params
if best_params['model'] != 'rf':
    final_model = LogisticRegression(
        C=best_params['C'], random_state=42, max_iter=1000
    )
else:
    final_model = RandomForestClassifier(
        n_estimators=best_params['n_estimators'],
        max_depth=best_params['max_depth'],
        random_state=42,
    )

# Split data for training and testing ensemble (stratified 70/30).
ensemble_split = train_test_split(
    ensemble_features,
    y_test,
    test_size=0.3,
    random_state=42,
    stratify=y_test,
)
X_ens_train, X_ens_test, y_ens_train, y_ens_test = ensemble_split

# Fit on the training part, score the held-out part with the
# positive-class probability.
final_model.fit(X_ens_train, y_ens_train)
holdout_proba = final_model.predict_proba(X_ens_test)
ensemble_preds = holdout_proba[:, 1]
ensemble_auc = roc_auc_score(y_ens_test, ensemble_preds)

print(f"Final Ensemble AUC: {ensemble_auc:.4f}")

In [None]:
# Performance comparison
import os

# One row per model: the three base models plus the stacked ensemble.
results_df = pd.DataFrame({
    'Model': ['GNN', 'Autoencoder', 'Isolation Forest', 'Ensemble'],
    'AUC': [models_performance['GNN'], models_performance['Autoencoder'],
            models_performance['Isolation Forest'], ensemble_auc]
})

plt.figure(figsize=(10, 6))
bars = plt.bar(results_df['Model'], results_df['AUC'],
               color=['skyblue', 'lightgreen', 'orange', 'red'])
plt.title('Model Performance Comparison')
plt.ylabel('AUC Score')
plt.ylim(0, 1)

# Add value labels on bars
for bar, auc in zip(bars, results_df['AUC']):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01,
             f'{auc:.3f}', ha='center', va='bottom')

plt.tight_layout()
# savefig fails if the target directory is missing, so create it first.
os.makedirs('../reports', exist_ok=True)
plt.savefig('../reports/ensemble_performance.png', dpi=300, bbox_inches='tight')
plt.show()

print(results_df)

In [None]:
# Feature importance (if Random Forest)
import os

# Only estimators exposing feature_importances_ are plotted; when the chosen
# model lacks that attribute this cell does nothing.
if hasattr(final_model, 'feature_importances_'):
    # Names follow the column order of the ensemble feature matrix.
    feature_names = ['GNN_norm', 'Autoencoder_norm', 'IsoForest_norm',
                    'GNN_pred', 'Autoencoder_pred', 'IsoForest_pred']

    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': final_model.feature_importances_
    }).sort_values('Importance', ascending=False)

    plt.figure(figsize=(10, 6))
    plt.barh(importance_df['Feature'], importance_df['Importance'])
    plt.title('Ensemble Feature Importance')
    plt.xlabel('Importance')
    plt.tight_layout()
    # savefig fails if the target directory is missing, so create it first.
    os.makedirs('../reports', exist_ok=True)
    plt.savefig('../reports/ensemble_feature_importance.png', dpi=300, bbox_inches='tight')
    plt.show()

    print(importance_df)

In [None]:
# Save ensemble model
# Pickle the fitted meta-model and the winning hyperparameters, then store
# the hold-out probabilities as a NumPy array.
pickle_targets = (
    ('../models/ensemble_model.pkl', final_model),
    ('../models/ensemble_best_params.pkl', best_params),
)
for pkl_path, payload in pickle_targets:
    with open(pkl_path, 'wb') as f:
        pickle.dump(payload, f)

np.save('../models/ensemble_test_predictions.npy', ensemble_preds)

print("Ensemble model training completed!")
print(f"Best ensemble AUC: {ensemble_auc:.4f}")
print("Model saved to: ../models/ensemble_model.pkl")