# 🔬 Feature Importance & Model Comparison

Este notebook foca na análise de feature importance e na comparação detalhada entre diferentes modelos.

## Objetivos
1. Analisar importância das features
2. Comparar performance de modelos individuais
3. Otimizar threshold de sinais
4. Visualizar curvas de decisão

In [None]:
# Setup: extend the import path, load libraries, and configure plotting.
import sys
# Must run before the project imports below (utils, features, models,
# backtesting) so they can be resolved; presumably those packages live
# under ../src relative to this notebook — TODO confirm.
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from utils.data_loader import DataLoader
from features.technical_indicators import TechnicalIndicators
from models.ml_models import TradingModel, EnsembleModel
from backtesting.backtest_engine import BacktestEngine
from sklearn.metrics import classification_report, confusion_matrix
import warnings
# Silence all warnings from here on to keep cell output readable.
# NOTE(review): this also hides deprecation and convergence warnings.
warnings.filterwarnings('ignore')

plt.style.use('seaborn-v0_8')
# IPython magic (not plain Python): render figures inline in the notebook.
%matplotlib inline

print("✅ Libraries loaded!")

## 1. Data Preparation

In [None]:
# Data preparation: download prices, engineer features, label, and split.
ticker = "AAPL"
loader = DataLoader()
data = loader.download_stock_data(ticker, period="5y")

# Add every indicator the project provides, then attach the target column
# built by the loader with horizon=5 and threshold=0.01.
indicators = TechnicalIndicators()
data_with_features = indicators.add_all_features(data)
target_values = loader.create_target_variable(
    data_with_features, horizon=5, threshold=0.01
)
data_with_features['target'] = target_values

# Keep only rows where every column (features and target) is populated.
data_clean = data_with_features.dropna()

# The loader hands back the six split pieces in this fixed order.
(X_train, X_val, X_test,
 y_train, y_val, y_test) = loader.prepare_training_data(
    data_clean, target_col='target', test_size=0.2, validation_size=0.1
)

train_rows, val_rows, test_rows = len(X_train), len(X_val), len(X_test)
print(f"Train: {train_rows}, Val: {val_rows}, Test: {test_rows}")
print(f"Features: {len(X_train.columns)}")

## 2. Train Multiple Models

In [None]:
# Train one TradingModel per algorithm and collect them in `models`.
# Each spec is (dict key, label used in the progress message,
# model_type passed to TradingModel, extra keyword arguments for fit).
model_specs = [
    ('RF', 'Random Forest', 'random_forest', {'n_estimators': 100, 'max_depth': 10}),
    ('XGB', 'XGBoost', 'xgboost', {'n_estimators': 100, 'max_depth': 6}),
    ('LGBM', 'LightGBM', 'lightgbm', {'n_estimators': 100, 'max_depth': 6}),
    ('LR', 'Logistic Regression', 'logistic', {}),
]

models = {}
for key, label, model_type, fit_kwargs in model_specs:
    print(f"Training {label}...")
    # Register the model before fitting, then fit it with the shared
    # train/validation data plus its own hyper-parameters.
    models[key] = TradingModel(model_type=model_type)
    models[key].fit(X_train, y_train, X_val, y_val, **fit_kwargs)

print("\n✅ All models trained!")

## 3. Feature Importance Analysis

In [None]:
# Side-by-side bar charts of the 15 most important features for the
# RF, XGB and LGBM models (the logistic-regression model is not plotted).
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
axes = axes.flatten()

plotted_models = ('RF', 'XGB', 'LGBM')
for idx, name in enumerate(plotted_models):
    model = models[name]
    ax = axes[idx]
    importance = model.get_feature_importance(top_n=15)
    bar_positions = range(len(importance))

    ax.barh(bar_positions, importance['importance'])
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(importance['feature'])
    ax.set_xlabel('Importance')
    ax.set_title(f'{name} - Top 15 Features', fontsize=12, fontweight='bold')
    # Flip the y-axis so the first row returned is drawn at the top.
    ax.invert_yaxis()
    ax.grid(True, alpha=0.3, axis='x')

# The 2x2 grid has four slots but only three models are drawn; drop the spare.
fig.delaxes(axes[3])

plt.tight_layout()
plt.show()

In [None]:
# Find the features that appear in the top 10 of all three models
# (RF, XGB and LGBM) by intersecting their top-10 feature-name sets.
rf_top = set(models['RF'].get_feature_importance(top_n=10)['feature'].tolist())
xgb_top = set(models['XGB'].get_feature_importance(top_n=10)['feature'].tolist())
lgbm_top = set(models['LGBM'].get_feature_importance(top_n=10)['feature'].tolist())

common_features = rf_top & xgb_top & lgbm_top

print("\n🔥 Top features common to all models:")
if common_features:
    # Sets have no defined iteration order, so sort to keep the listing
    # identical from one kernel session to the next.
    for feat in sorted(common_features, key=str):
        print(f"  - {feat}")
else:
    # Make an empty intersection explicit instead of printing a bare header.
    print("  (none)")

## 4. Model Performance Comparison

In [None]:
# Score every trained model on the validation split and chart the results.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Precision, recall and F1 share the same averaging options.
weighted_opts = {'average': 'weighted', 'zero_division': 0}

results = []
for name, model in models.items():
    y_pred = model.predict(X_val)
    row = {'Model': name}
    row['Accuracy'] = accuracy_score(y_val, y_pred)
    row['Precision'] = precision_score(y_val, y_pred, **weighted_opts)
    row['Recall'] = recall_score(y_val, y_pred, **weighted_opts)
    row['F1'] = f1_score(y_val, y_pred, **weighted_opts)
    results.append(row)

results_df = pd.DataFrame(results)
print("\n📊 Model Comparison (Validation Set):\n")
print(results_df.to_string(index=False))

# Grouped bar chart: one group per model, one bar per metric.
fig, ax = plt.subplots(figsize=(12, 6))
x = np.arange(len(results_df))
width = 0.2

# Offsets (in bar widths) that centre the four bars on each tick.
bar_layout = [
    ('Accuracy', -1.5),
    ('Precision', -0.5),
    ('Recall', 0.5),
    ('F1', 1.5),
]
for metric, offset in bar_layout:
    ax.bar(x + width * offset, results_df[metric], width, label=metric, alpha=0.8)

ax.set_xlabel('Models')
ax.set_ylabel('Score')
ax.set_title('Model Performance Comparison', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(results_df['Model'])
ax.legend()
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

## 5. Confusion Matrices

In [None]:
# One validation-set confusion-matrix heatmap per model on a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(14, 12))
axes = axes.flatten()

# Tick labels applied to both axes of every heatmap.
# NOTE(review): assumes the matrix rows/columns come out in
# Sell, Hold, Buy order and that all three classes occur — confirm.
class_names = ['Sell', 'Hold', 'Buy']

for idx, name in enumerate(models):
    model = models[name]
    ax = axes[idx]

    y_pred = model.predict(X_val)
    cm = confusion_matrix(y_val, y_pred)

    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        ax=ax,
        xticklabels=class_names,
        yticklabels=class_names,
    )
    ax.set_title(f'{name} - Confusion Matrix', fontsize=12, fontweight='bold')
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')

plt.tight_layout()
plt.show()

## 6. Probability Distribution Analysis

In [None]:
# Histogram the predicted class probabilities of each model on the
# validation split, one subplot per model.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
axes = axes.flatten()

# Probability column index -> (legend label, bar colour), in column order.
class_styles = [('Sell', 'red'), ('Hold', 'gray'), ('Buy', 'green')]

for idx, name in enumerate(models):
    model = models[name]
    ax = axes[idx]
    proba = model.predict_proba(X_val)

    # Overlay one semi-transparent histogram per probability column.
    for column, (label, colour) in enumerate(class_styles):
        ax.hist(proba[:, column], bins=30, alpha=0.5, label=label, color=colour)

    ax.set_title(f'{name} - Probability Distribution', fontsize=12, fontweight='bold')
    ax.set_xlabel('Probability')
    ax.set_ylabel('Frequency')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 7. Threshold Optimization

In [None]:
# Sweep the signal threshold for an RF + XGB + LGBM ensemble and backtest
# each setting, collecting one summary row per threshold in `threshold_df`.
ensemble = EnsembleModel([models['RF'], models['XGB'], models['LGBM']])
ensemble.fit(X_train, y_train, X_val, y_val)

# NOTE(review): thresholds are scored on the test split, so whichever one
# is picked afterwards is selected on the same data it is evaluated on;
# treat the reported numbers as optimistic.
test_predictions = ensemble.predict_proba(X_test)
# Assumes the test split is the last len(X_test) rows of data_clean —
# TODO confirm against DataLoader.prepare_training_data.
test_data = data_clean.iloc[-len(X_test):].copy().reset_index(drop=True)

backtest = BacktestEngine(initial_capital=100000, commission=0.001, slippage=0.0005)

# Candidate thresholds: 0.45 up to (not including) 0.70 in steps of 0.02.
thresholds = np.arange(0.45, 0.70, 0.02)
threshold_results = []

print("Testing different thresholds...\n")
for threshold in thresholds:
    signals = backtest.generate_signals_from_predictions(test_predictions, threshold=threshold)
    # Give the signals a fresh 0..n-1 index, the same indexing test_data
    # was reset to above.
    signals_series = pd.Series(signals).reset_index(drop=True)

    try:
        # A dedicated name avoids clobbering the `results` list built by
        # the model-comparison cell.
        bt_result = backtest.run_backtest(test_data, signals_series, price_col='close')
        threshold_results.append({
            'Threshold': threshold,
            'Total Return': bt_result.total_return,
            'Sharpe Ratio': bt_result.sharpe_ratio,
            'Max Drawdown': bt_result.max_drawdown,
            'Win Rate': bt_result.win_rate,
            'Total Trades': bt_result.total_trades
        })
        print(f"Threshold {threshold:.2f}: Return={bt_result.total_return:.2%}, "
              f"Sharpe={bt_result.sharpe_ratio:.2f}, Trades={bt_result.total_trades}")
    except Exception as exc:
        # Stay best-effort (skip this threshold and continue) but let
        # KeyboardInterrupt/SystemExit propagate, and report the real cause
        # rather than assuming the failure was "no valid trades".
        print(f"Threshold {threshold:.2f}: Failed ({type(exc).__name__}: {exc})")

threshold_df = pd.DataFrame(threshold_results)

In [None]:
# Plot how each backtest metric varies with the signal threshold, then
# report the thresholds that maximise total return and Sharpe ratio.
if threshold_df.empty:
    # Every backtest failed, so the frame has no columns to plot or rank.
    print("⚠️ No threshold produced a valid backtest; nothing to plot.")
else:
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # (metric column, line colour); None keeps matplotlib's default colour.
    # Panels are filled row by row in this order.
    panels = [
        ('Total Return', None),
        ('Sharpe Ratio', 'orange'),
        ('Max Drawdown', 'red'),
        ('Total Trades', 'green'),
    ]
    for ax, (metric, colour) in zip(axes.flat, panels):
        line_style = {'color': colour} if colour is not None else {}
        ax.plot(threshold_df['Threshold'], threshold_df[metric], marker='o', **line_style)
        ax.set_title(f'{metric} vs Threshold', fontweight='bold')
        ax.set_xlabel('Threshold')
        ax.set_ylabel(metric)
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Find the optimal threshold per metric. Missing values are dropped
    # first so a metric that is undefined for every threshold is reported
    # instead of breaking idxmax / the .loc lookup.
    best_idx_by_metric = {}
    for metric, short_label, fmt in (
        ('Total Return', 'Return', '.2%'),
        ('Sharpe Ratio', 'Sharpe', '.2f'),
    ):
        valid = threshold_df[metric].dropna()
        if valid.empty:
            print(f"\n⚠️ {metric} is undefined for every threshold; no optimum reported.")
            continue
        best_idx = valid.idxmax()
        best_idx_by_metric[metric] = best_idx
        print(f"\n🎯 Optimal threshold for {metric}: {threshold_df.loc[best_idx, 'Threshold']:.2f}")
        print(f"   {short_label}: {threshold_df.loc[best_idx, metric]:{fmt}}")

    # Keep the original cell-level names; None means "no optimum found".
    best_return_idx = best_idx_by_metric.get('Total Return')
    best_sharpe_idx = best_idx_by_metric.get('Sharpe Ratio')

## 8. Conclusions

### Key Findings:
1. **Feature Importance**: [Analyze which features are most predictive]
2. **Model Performance**: [Compare which model performs best]
3. **Optimal Threshold**: [Determine best threshold for signals]
4. **Trade-offs**: [Balance between return, risk, and number of trades]

### Next Steps:
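- Re-validate the optimal threshold on data that was not used to select it (it is chosen on the test set here) before using it in production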
- Focus on top features for faster training
- Consider ensemble with only best models
- Monitor performance over time