# 📊 Advanced Feature Analysis & Model Interpretation

This notebook demonstrates advanced techniques for:
- Feature correlation analysis
- SHAP value interpretation
- Hyperparameter optimization with Optuna
- Prediction confidence analysis
- Walk-forward validation (not covered in the sections below)

## Prerequisites

Make sure you have completed the basic tutorial first.

In [None]:
import sys
import os
sys.path.insert(0, os.path.abspath('../src'))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import optuna

from utils.data_loader import DataLoader
from features.technical_indicators import TechnicalIndicators
from models.ml_models import TradingModel
from sklearn.metrics import accuracy_score, classification_report

# Global plot styling: matplotlib style sheet first, then the seaborn palette.
# NOTE(review): presumably the order matters because both calls touch the
# default color cycle — confirm before reordering.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Suppress optuna logging: raise the library's verbosity threshold to WARNING
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Only reached when every import and configuration call above succeeded
print("✅ All imports successful!")

## 1️⃣ Prepare Data

In [None]:
# Build the modelling dataset: synthetic data -> indicator features -> target -> splits
loader = DataLoader()
data = loader.generate_synthetic_data(n_days=1000, random_state=42)

# Features are computed on a copy so `data` keeps the raw generated frame
indicators = TechnicalIndicators()
data_with_features = indicators.add_all_features(data.copy())

# Attach the target column, then discard every row that still has a missing value
target_values = loader.create_target_variable(
    data_with_features, horizon=5, threshold=0.01, binary=False
)
data_with_features['target'] = target_values
data_with_features = data_with_features.dropna()

# Train / validation / test partitions, as produced by the loader
splits = loader.prepare_training_data(data_with_features, target_col='target')
X_train, X_val, X_test, y_train, y_val, y_test = splits

print(f"✅ Data prepared: {X_train.shape[0]} train, {X_val.shape[0]} val, {X_test.shape[0]} test samples")

## 2️⃣ Feature Correlation Analysis

Understand relationships between features.

In [None]:
# Calculate correlation matrix (pandas default method) across the training features
correlation_matrix = X_train.corr()

# Plot correlation heatmap for all features (per-cell annotations are disabled)
plt.figure(figsize=(16, 14))
sns.heatmap(correlation_matrix, cmap='coolwarm', center=0,
            square=True, linewidths=0.5, cbar_kws={"shrink": 0.8},
            annot=False)
plt.title('Feature Correlation Heatmap', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

# Find highly correlated features.
# Only the strict upper triangle is inspected, so each unordered pair is
# considered exactly once and the diagonal (self-correlation) is skipped.
corr_threshold = 0.9
max_pairs_shown = 10

feature_names = correlation_matrix.columns
upper_rows, upper_cols = np.triu_indices(len(feature_names), k=1)
upper_corrs = correlation_matrix.to_numpy()[upper_rows, upper_cols]
# NaN correlations compare False here and are therefore never reported
is_high = np.abs(upper_corrs) > corr_threshold

# Strongest relationships first, so a truncated listing shows the most
# redundant pairs rather than whichever columns happen to come first.
high_corr_pairs = sorted(
    zip(feature_names[upper_rows[is_high]],
        feature_names[upper_cols[is_high]],
        upper_corrs[is_high]),
    key=lambda pair: abs(pair[2]),
    reverse=True,
)

if high_corr_pairs:
    print(f"\n⚠️ Highly Correlated Features (>{corr_threshold}):")
    for feat1, feat2, corr in high_corr_pairs[:max_pairs_shown]:
        print(f"  {feat1} <-> {feat2}: {corr:.3f}")
    # Make the truncation explicit instead of silently hiding the remainder
    hidden_pairs = len(high_corr_pairs) - max_pairs_shown
    if hidden_pairs > 0:
        print(f"  ... and {hidden_pairs} more pair(s) not shown")
else:
    print("\n✅ No highly correlated features found")

## 3️⃣ Hyperparameter Optimization with Optuna

In [None]:
def objective(trial):
    """Score one Optuna trial.

    Asks ``trial`` for ``n_estimators``, ``max_depth`` and ``learning_rate``,
    fits an XGBoost-type ``TradingModel`` with those values (fixed
    ``random_state=42``) on the training split, and returns the accuracy of
    its predictions on the validation split.
    """
    # Suggestions are requested in a fixed order: estimators, depth, learning rate
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 3, 15)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)

    candidate = TradingModel(
        model_type='xgboost',
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        random_state=42,
    )
    candidate.fit(X_train, y_train, X_val, y_val)

    # Validation accuracy is the quantity the study optimizes
    return accuracy_score(y_val, candidate.predict(X_val))

# Run optimization
# The sampler is seeded so the sequence of suggested hyperparameters — and
# therefore the reported best trial — is the same on every fresh run of the
# notebook. TPE is Optuna's default single-objective sampler, so only the
# seeding changes, not the search algorithm.
print("🔍 Starting hyperparameter optimization...")
study = optuna.create_study(
    direction='maximize',
    study_name='xgboost_optimization',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30, show_progress_bar=True)

# Report the winning configuration and its validation score
print("\n✅ Optimization complete!")
print("\n🏆 Best Parameters:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")
print(f"\n📊 Best Validation Accuracy: {study.best_value:.4f}")

In [None]:
# Visualize optimization history
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: objective value of every trial, with the best value as a reference line
trial_values = [trial.value for trial in study.trials]
axes[0].plot(trial_values, marker='o', linestyle='-', alpha=0.7)
axes[0].axhline(y=study.best_value, color='r', linestyle='--', label=f'Best: {study.best_value:.4f}')
axes[0].set_xlabel('Trial Number', fontsize=12)
axes[0].set_ylabel('Validation Accuracy', fontsize=12)
axes[0].set_title('Optimization Progress', fontsize=14, fontweight='bold')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Right panel: parameter importance.
# Only the importance computation is guarded. `Exception` (not a bare except)
# lets KeyboardInterrupt / SystemExit propagate, and plotting errors are no
# longer disguised as "not enough trials".
try:
    importance = optuna.importance.get_param_importances(study)
except Exception as exc:
    # Best-effort panel: keep the figure, but surface the real reason
    print(f"⚠️ Could not compute parameter importance: {exc}")
    axes[1].text(0.5, 0.5, f'Parameter importance\nunavailable ({type(exc).__name__})',
                 ha='center', va='center', fontsize=12)
else:
    param_names = list(importance.keys())
    importance_values = list(importance.values())
    axes[1].barh(param_names, importance_values)
    axes[1].set_xlabel('Importance', fontsize=12)
    axes[1].set_title('Parameter Importance', fontsize=14, fontweight='bold')
    axes[1].grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

## 4️⃣ Train Optimized Model

In [None]:
# Train final model with best parameters.
# study.best_params holds only the values suggested inside `objective`; the
# fixed random_state=42 used for every trial is not part of it. It is added
# back here so the final model is built with the same configuration that
# produced the best validation score.
final_params = {**study.best_params, 'random_state': 42}
best_model = TradingModel(model_type='xgboost', **final_params)
best_model.fit(X_train, y_train, X_val, y_val)

# Evaluate on test set (data not used during the hyperparameter search)
test_preds = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_preds)

print("\n📊 Test Set Performance:")
print(f"Accuracy: {test_accuracy:.4f}")
print("\nDetailed Classification Report:")
print(classification_report(y_test, test_preds))

## 5️⃣ SHAP Value Analysis

Understand which features contribute most to predictions.

In [None]:
# Explain the tuned model with SHAP
print("🔍 Calculating SHAP values... (this may take a moment)")

# Only the first 100 test rows are explained, to keep the computation quick
shap_sample = X_test[:100]

# The explainer wraps the estimator stored on the TradingModel wrapper
explainer = shap.TreeExplainer(best_model.model)
shap_values = explainer.shap_values(shap_sample)

print("✅ SHAP values calculated!")

In [None]:
# Summary plot
# shap.summary_plot draws on the current figure and, with its default
# plot_size="auto", resizes that figure itself. The requested size is
# therefore passed through plot_size so it is actually honoured.
plt.figure(figsize=(12, 8))
shap.summary_plot(shap_values, X_test[:100], plot_size=(12, 8), show=False)
plt.title('SHAP Summary Plot - Feature Impact on Predictions', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

In [None]:
# Feature importance from SHAP (bar variant of the summary plot)
# As with the default summary plot, plot_size="auto" would resize the current
# figure, so the size is passed explicitly to keep the 12x8 layout.
plt.figure(figsize=(12, 8))
shap.summary_plot(shap_values, X_test[:100], plot_type="bar", plot_size=(12, 8), show=False)
plt.title('SHAP Feature Importance', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()
plt.show()

In [None]:
# Analyze specific feature
feature_to_analyze = X_test.columns[0]  # Change to analyze different features

# shap.dependence_plot opens a figure of its own unless an Axes is supplied,
# which would leave a blank 12x6 figure behind and ignore the requested size.
# Passing `ax` makes it draw on the figure created here.
dep_fig, dep_ax = plt.subplots(figsize=(12, 6))
shap.dependence_plot(feature_to_analyze, shap_values, X_test[:100], ax=dep_ax, show=False)
dep_ax.set_title(f'SHAP Dependence Plot - {feature_to_analyze}', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

## 6️⃣ Prediction Confidence Analysis

In [None]:
# Get prediction probabilities
test_probas = best_model.predict_proba(X_test)

# Calculate confidence (max probability across the predicted columns, per sample)
confidence = np.max(test_probas, axis=1)

# Analyze prediction confidence
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Confidence distribution
axes[0].hist(confidence, bins=30, edgecolor='black', alpha=0.7)
axes[0].axvline(x=confidence.mean(), color='r', linestyle='--',
                label=f'Mean: {confidence.mean():.3f}')
axes[0].set_xlabel('Confidence', fontsize=12)
axes[0].set_ylabel('Frequency', fontsize=12)
axes[0].set_title('Prediction Confidence Distribution', fontsize=14, fontweight='bold')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Accuracy by confidence level
confidence_bins = [0, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
accuracy_by_conf = []
samples_by_conf = []

last_bin_index = len(confidence_bins) - 2
for bin_index, (lower, upper) in enumerate(zip(confidence_bins[:-1], confidence_bins[1:])):
    # Bins are half-open [lower, upper), except the last one, which is closed
    # so that predictions with confidence exactly 1.0 are not dropped.
    if bin_index == last_bin_index:
        below_upper = confidence <= upper
    else:
        below_upper = confidence < upper
    mask = (confidence >= lower) & below_upper

    if mask.sum() > 0:
        acc = accuracy_score(y_test[mask], test_preds[mask])
        accuracy_by_conf.append(acc)
        samples_by_conf.append(mask.sum())
    else:
        # Empty bin: drawn as a zero-height bar and labelled n=0
        accuracy_by_conf.append(0)
        samples_by_conf.append(0)

bin_labels = [f'{lower:.1f}-{upper:.1f}'
              for lower, upper in zip(confidence_bins[:-1], confidence_bins[1:])]

x = np.arange(len(bin_labels))
width = 0.35

ax2 = axes[1]
bars = ax2.bar(x, accuracy_by_conf, width, label='Accuracy', alpha=0.7)
ax2.set_xlabel('Confidence Range', fontsize=12)
ax2.set_ylabel('Accuracy', fontsize=12)
ax2.set_title('Accuracy by Confidence Level', fontsize=14, fontweight='bold')
ax2.set_xticks(x)
ax2.set_xticklabels(bin_labels, rotation=45)
ax2.grid(True, alpha=0.3, axis='y')

# Add sample counts on top of bars
for bar, count in zip(bars, samples_by_conf):
    height = bar.get_height()
    ax2.text(bar.get_x() + bar.get_width()/2., height,
            f'n={count}', ha='center', va='bottom', fontsize=8)

plt.tight_layout()
plt.show()

print("\n📊 Confidence Statistics:")
print(f"Mean Confidence: {confidence.mean():.4f}")
print(f"Std Confidence: {confidence.std():.4f}")
print(f"High Confidence (>0.7): {(confidence > 0.7).sum()} samples ({(confidence > 0.7).mean()*100:.1f}%)")

## 🎯 Key Takeaways

From this analysis, you've learned:

1. **Feature Correlations**: Which features are redundant
2. **Hyperparameter Optimization**: How to find optimal model parameters
3. **SHAP Values**: Which features drive predictions
4. **Prediction Confidence**: How confident the model is in its predictions

## 📚 Next Steps

- Remove highly correlated features to reduce redundancy and make feature-importance results easier to interpret
- Focus on the most important features from SHAP analysis
- Use confidence levels to filter trading signals
- Experiment with different model architectures