# Análise Avançada de Features e Modelos

Este notebook demonstra:
- Análise de correlação entre features
- Comparação detalhada de modelos
- Análise de confiança nas predições

Pré-requisito: ter completado o tutorial básico (notebook 01).

In [None]:
import sys
import os
sys.path.insert(0, os.path.abspath('../src'))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from utils.data_loader import DataLoader
from features.technical_indicators import TechnicalIndicators
from models.ml_models import TradingModel
from sklearn.metrics import accuracy_score, classification_report

print("Imports successful.")

## 1. Preparar Dados

In [None]:
def generate_synthetic_stock_data(n_days=1000, initial_price=100, volatility=0.02, trend=0.0001, seed=42):
    """Generate a synthetic daily OHLCV price series.

    Closing prices follow a geometric random walk: daily log-returns are drawn
    from a normal distribution with mean ``trend`` and standard deviation
    ``volatility`` and are compounded starting from ``initial_price``.

    Args:
        n_days: Number of consecutive calendar days (rows) to generate,
            starting at 2020-01-01.
        initial_price: Price level the log-returns are compounded from.
        volatility: Standard deviation of the daily log-returns.
        trend: Mean of the daily log-returns.
        seed: Value passed to ``np.random.seed``. The default (42) reproduces
            the series generated when the seed was hard-coded; pass another
            integer for a different reproducible series, or ``None`` to let
            NumPy reseed itself from fresh entropy.

    Returns:
        A ``pandas.DataFrame`` with the columns ``date``, ``open``, ``high``,
        ``low``, ``close`` and ``volume``, where ``high``/``low`` bound
        ``open`` and ``close`` on every row.

    Note:
        This function reseeds NumPy's *global* random state as a side effect.
    """
    np.random.seed(seed)

    # The order of the random draws below (returns, open, high, low, volume)
    # determines the generated values; keep it stable for reproducibility.
    returns = np.random.normal(trend, volatility, n_days)
    prices = initial_price * np.exp(np.cumsum(returns))
    data = pd.DataFrame({
        'date': pd.date_range(start='2020-01-01', periods=n_days, freq='D'),
        'open': prices * (1 + np.random.uniform(-0.005, 0.005, n_days)),
        'high': prices * (1 + np.random.uniform(0, 0.01, n_days)),
        'low': prices * (1 - np.random.uniform(0, 0.01, n_days)),
        'close': prices,
        'volume': np.random.randint(1000000, 5000000, n_days),
    })

    # The open is perturbed independently of high/low, so it can fall outside
    # them; widen high/low so every bar is internally consistent.
    data['high'] = data[['open', 'high', 'close']].max(axis=1)
    data['low'] = data[['open', 'low', 'close']].min(axis=1)
    return data

# Raw synthetic OHLCV series used for the whole notebook.
loader = DataLoader()
data = generate_synthetic_stock_data(n_days=1000)

# Feature engineering runs on a copy so `data` keeps the raw columns only.
indicators = TechnicalIndicators()
data_with_features = indicators.add_all_features(data.copy())

# NOTE(review): horizon/threshold/binary semantics are defined by
# DataLoader.create_target_variable, which is not visible here -- confirm
# there what label values `binary=False` produces.
target_values = loader.create_target_variable(
    data_with_features, horizon=5, threshold=0.01, binary=False
)
data_with_features['target'] = target_values

# Discard incomplete rows before splitting.
data_with_features = data_with_features.dropna()

split_parts = loader.prepare_training_data(data_with_features, target_col='target')
X_train, X_val, X_test, y_train, y_val, y_test = split_parts

n_train, n_val, n_test = (part.shape[0] for part in (X_train, X_val, X_test))
print(f"Data prepared: {n_train} train, {n_val} val, {n_test} test")

## 2. Análise de Correlação entre Features

In [None]:
# Pairwise correlation between all training features.
correlation_matrix = X_train.corr()

# Heatmap with a fixed [-1, 1] colour scale so colours are comparable across runs.
plt.figure(figsize=(16, 14))
plt.imshow(correlation_matrix.values, cmap='coolwarm', aspect='auto', vmin=-1, vmax=1)
plt.colorbar(shrink=0.8)
plt.title('Feature Correlation Matrix', fontsize=14)
plt.tight_layout()
plt.show()

# Find highly correlated pairs
corr_threshold = 0.9   # |r| above this counts as "highly correlated"
max_pairs_shown = 10   # cap on how many pairs are printed

cols = correlation_matrix.columns
corr_values = correlation_matrix.values

# Restrict the search to the strict upper triangle (k=1) so every unordered
# pair is reported once and the diagonal is skipped. NaN correlations
# (e.g. from constant columns) compare False and are therefore ignored.
row_idx, col_idx = np.where(np.triu(np.abs(corr_values) > corr_threshold, k=1))
high_corr_pairs = [
    (cols[i], cols[j], corr_values[i, j])
    for i, j in zip(row_idx, col_idx)
]

# Strongest relationships first, so the truncated listing below shows the
# most redundant pairs rather than whichever columns happen to come first.
high_corr_pairs.sort(key=lambda pair: abs(pair[2]), reverse=True)

if high_corr_pairs:
    print(f"\nHighly correlated pairs (|r| > {corr_threshold}): {len(high_corr_pairs)}")
    for feat1, feat2, corr in high_corr_pairs[:max_pairs_shown]:
        print(f"  {feat1} <-> {feat2}: {corr:.3f}")
else:
    print("No highly correlated features found.")

## 3. Comparação de Modelos com Diferentes Hiperparâmetros

In [None]:
# Each entry: (display label, `model_type` passed to TradingModel,
# extra keyword arguments forwarded to the TradingModel constructor).
configs = [
    ('XGB depth=3', 'xgboost', {'max_depth': 3, 'n_estimators': 100}),
    ('XGB depth=6', 'xgboost', {'max_depth': 6, 'n_estimators': 100}),
    ('XGB depth=10', 'xgboost', {'max_depth': 10, 'n_estimators': 100}),
    ('RF n=50', 'random_forest', {'n_estimators': 50}),
    ('RF n=200', 'random_forest', {'n_estimators': 200}),
    ('LGB lr=0.05', 'lightgbm', {'learning_rate': 0.05, 'n_estimators': 100}),
    ('LGB lr=0.2', 'lightgbm', {'learning_rate': 0.2, 'n_estimators': 100}),
]

# Train every configuration with the same seed and score it on the
# validation and test splits.
comparison = []
for label, kind, hyperparams in configs:
    candidate = TradingModel(model_type=kind, random_state=42, **hyperparams)
    # NOTE(review): how fit() uses the validation split is defined by
    # TradingModel, which is not visible here.
    candidate.fit(X_train, y_train, X_val, y_val)

    val_accuracy = accuracy_score(y_val, candidate.predict(X_val))
    test_accuracy = accuracy_score(y_test, candidate.predict(X_test))
    comparison.append({
        'Config': label,
        'Val Accuracy': val_accuracy,
        'Test Accuracy': test_accuracy,
    })

# One row per configuration, in the same order as `configs`.
comp_df = pd.DataFrame(comparison)
print(comp_df.to_string(index=False))

In [None]:
# Grouped bar chart: validation vs. test accuracy for each configuration.
fig, ax = plt.subplots(figsize=(10, 5))
x = np.arange(len(comp_df))
width = 0.35

# Draw the two series side by side, centred on each tick position.
bar_series = (
    (-width/2, 'Val Accuracy', 'Validation'),
    (width/2, 'Test Accuracy', 'Test'),
)
for offset, column, legend_label in bar_series:
    ax.bar(x + offset, comp_df[column], width, label=legend_label)

ax.set_ylabel('Accuracy')
ax.set_title('Model Comparison')

# Configuration names are long, so tilt them to avoid overlap.
ax.set_xticks(x)
ax.set_xticklabels(comp_df['Config'], rotation=45, ha='right')

ax.legend()
ax.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()

## 4. Treinar Melhor Modelo e Analisar Predições

In [None]:
# Model selection uses validation accuracy only; the test split is kept for
# the final report below.
best_idx = comp_df['Val Accuracy'].idxmax()
# comp_df rows are in the same order as `configs`, so the row label of the
# best score doubles as the position of the winning configuration.
best_name, best_type, best_params = configs[best_idx]
print(f"Best config: {best_name}")

# Refit the winning configuration from scratch with the same seed.
best_model = TradingModel(model_type=best_type, random_state=42, **best_params)
best_model.fit(X_train, y_train, X_val, y_val)

test_preds = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_preds)
print(f"\nTest Accuracy: {test_accuracy:.4f}")

# zero_division=0 reports 0 instead of warning for classes never predicted.
print("\nClassification Report:")
report_text = classification_report(y_test, test_preds, zero_division=0)
print(report_text)

## 5. Análise de Confiança nas Predições

In [None]:
# Confidence of a prediction = largest class probability in its row.
test_probas = best_model.predict_proba(X_test)
confidence = np.max(test_probas, axis=1)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Confidence distribution
axes[0].hist(confidence, bins=30, edgecolor='black', alpha=0.7)
axes[0].axvline(x=confidence.mean(), color='r', linestyle='--', label=f'Mean: {confidence.mean():.3f}')
axes[0].set_xlabel('Confidence')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Prediction Confidence Distribution')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Accuracy by confidence level
conf_bins = [0, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
n_bins = len(conf_bins) - 1

# Assign each sample to a bin using only the interior edges. Bins are
# left-closed [lo, hi), except the last one, which also contains a confidence
# of exactly 1.0. A plain `confidence < 1.0` test would drop those samples
# from every bin, and fully confident predictions are common for tree models.
bin_ids = np.digitize(confidence, conf_bins[1:-1])

# Plain arrays so the boolean masks below index positionally.
y_true = np.asarray(y_test)
y_pred = np.asarray(test_preds)

acc_by_conf = []
n_by_conf = []
for bin_id in range(n_bins):
    in_bin = bin_ids == bin_id
    n_in_bin = int(in_bin.sum())
    n_by_conf.append(n_in_bin)
    # Empty bins are drawn with zero height; the "n=0" label marks them.
    acc_by_conf.append(accuracy_score(y_true[in_bin], y_pred[in_bin]) if n_in_bin else 0)

bin_labels = [f'{conf_bins[i]:.1f}-{conf_bins[i+1]:.1f}' for i in range(n_bins)]
x = np.arange(len(bin_labels))

bars = axes[1].bar(x, acc_by_conf, alpha=0.7)
axes[1].set_xlabel('Confidence Range')
axes[1].set_ylabel('Accuracy')
axes[1].set_title('Accuracy by Confidence Level')
axes[1].set_xticks(x)
axes[1].set_xticklabels(bin_labels, rotation=45)
axes[1].grid(True, alpha=0.3, axis='y')

# Annotate every bar with its sample count so sparse bins are easy to spot.
for bar, count in zip(bars, n_by_conf):
    height = bar.get_height()
    axes[1].text(bar.get_x() + bar.get_width()/2., height, f'n={count}',
                ha='center', va='bottom', fontsize=8)

plt.tight_layout()
plt.show()

print(f"\nMean Confidence: {confidence.mean():.4f}")
print(f"High Confidence (>0.7): {(confidence > 0.7).sum()} samples ({(confidence > 0.7).mean()*100:.1f}%)")

## Conclusão

Neste notebook analisamos:

1. **Correlação entre features** - quais indicadores são redundantes
2. **Comparação de hiperparâmetros** - impacto de profundidade, learning rate, etc.
3. **Confiança do modelo** - como a probabilidade máxima se relaciona com a acurácia

Próximos passos:
- Remover features altamente correlacionadas para reduzir overfitting
- Usar níveis de confiança para filtrar sinais de trading
- Experimentar com diferentes horizontes de predição