# ML Trading Strategies - Tutorial Introdutório

Este notebook demonstra o pipeline completo do framework:

1. Gerar dados sintéticos de mercado
2. Engenharia de features (indicadores técnicos)
3. Treinar modelos de ML
4. Backtesting da estratégia
5. Análise de performance

**Aviso:** Projeto educacional. Não constitui aconselhamento financeiro.

In [None]:
import sys
import os

# Make the project's `src` directory importable by putting it first on
# sys.path. The relative path '../src' is resolved against the current
# working directory, so the later `from features...`, `from utils...` imports
# only work when this runs from a folder that sits next to `src`.
# NOTE(review): confirm the notebook is always launched from such a folder.
sys.path.insert(0, os.path.abspath('../src'))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

print("Libraries imported successfully.")

## 1. Gerar Dados Sintéticos

Usamos dados sintéticos para evitar dependência de internet.

In [None]:
def generate_synthetic_stock_data(n_days=1000, initial_price=100, volatility=0.02, trend=0.0001, seed=42):
    """Generate a synthetic daily OHLCV price history.

    Closing prices follow a geometric random walk: daily log-returns are
    drawn from a normal distribution with mean ``trend`` and standard
    deviation ``volatility`` and compounded from ``initial_price``. Open,
    high and low are random perturbations around the close, and high/low are
    then widened so that every bar satisfies ``low <= open, close <= high``.

    Args:
        n_days: Number of calendar days (rows) to generate, starting at
            2020-01-01.
        initial_price: Price level the random walk is compounded from.
        volatility: Standard deviation of the daily log-return.
        trend: Mean of the daily log-return.
        seed: Seed for the private random generator. The default (42)
            reproduces the same series on every call; pass ``None`` for a
            different series each time.

    Returns:
        A ``pandas.DataFrame`` with columns ``date``, ``open``, ``high``,
        ``low``, ``close`` and ``volume``.
    """
    # A private RandomState keeps the output reproducible without reseeding
    # NumPy's global generator, so random draws made by the caller are not
    # disturbed. RandomState(seed) yields the same stream as
    # np.random.seed(seed) followed by the module-level functions.
    rng = np.random.RandomState(seed)

    returns = rng.normal(trend, volatility, n_days)
    prices = initial_price * np.exp(np.cumsum(returns))

    # The draws below must stay in this order (open, high, low, volume) to
    # keep the generated numbers stable for a given seed.
    data = pd.DataFrame({
        'date': pd.date_range(start='2020-01-01', periods=n_days, freq='D'),
        'open': prices * (1 + rng.uniform(-0.005, 0.005, n_days)),
        'high': prices * (1 + rng.uniform(0, 0.01, n_days)),
        'low': prices * (1 - rng.uniform(0, 0.01, n_days)),
        'close': prices,
        'volume': rng.randint(1000000, 5000000, n_days),
    })

    # The open is perturbed independently, so it can land outside the raw
    # high/low band; widen the band to contain both open and close.
    data['high'] = data[['open', 'high', 'close']].max(axis=1)
    data['low'] = data[['open', 'low', 'close']].min(axis=1)
    return data

# Build the 1000-day demo dataset and report its size and date span.
data = generate_synthetic_stock_data(n_days=1000)

first_day = data['date'].min()
last_day = data['date'].max()
print(f"Generated {len(data)} days of data")
print(f"Date range: {first_day} to {last_day}")

# Last expression of the cell: shows the first rows in the notebook output.
data.head()

In [None]:
# Line chart of the synthetic closing price over time.
fig, ax = plt.subplots(figsize=(14, 5))
ax.plot(data['date'], data['close'], linewidth=1)
ax.set_title('Synthetic Stock Price')
ax.set_xlabel('Date')
ax.set_ylabel('Price ($)')
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## 2. Engenharia de Features

Adicionar indicadores técnicos ao dataset.

In [None]:
from features.technical_indicators import TechnicalIndicators

# Compute the indicator columns on a copy so the raw `data` frame is left
# as generated.
indicators = TechnicalIndicators()
data_with_features = indicators.add_all_features(data.copy())

# Report how many columns the indicator step added.
total_columns = data_with_features.shape[1]
added_columns = total_columns - data.shape[1]
print(f"Features added: {added_columns} indicators")
print(f"Total columns: {total_columns}")

In [None]:
# Three stacked panels: price with moving averages, RSI, and MACD.
fig, axes = plt.subplots(3, 1, figsize=(14, 10))
price_ax, rsi_ax, macd_ax = axes
dates = data_with_features['date']

# Panel 1: close price with the 20- and 50-period simple moving averages.
price_ax.plot(dates, data_with_features['close'], label='Close', linewidth=1.5)
for column, label in (('sma_20', 'SMA 20'), ('sma_50', 'SMA 50')):
    price_ax.plot(dates, data_with_features[column], label=label, alpha=0.7)
price_ax.set_title('Price with Moving Averages')

# Panel 2: RSI with dashed reference lines at 70 and 30.
rsi_ax.plot(dates, data_with_features['rsi_14'], color='purple', linewidth=1.5)
for level, color, label in ((70, 'r', 'Overbought'), (30, 'g', 'Oversold')):
    rsi_ax.axhline(y=level, color=color, linestyle='--', alpha=0.5, label=label)
rsi_ax.set_title('RSI (14)')
rsi_ax.set_ylim(0, 100)

# Panel 3: MACD line, signal line and histogram bars.
for column, label in (('macd', 'MACD'), ('macd_signal', 'Signal')):
    macd_ax.plot(dates, data_with_features[column], label=label, linewidth=1.5)
macd_ax.bar(dates, data_with_features['macd_hist'], label='Histogram', alpha=0.3)
macd_ax.set_title('MACD')

# Shared cosmetics, applied once every labelled artist has been drawn.
for panel in axes:
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 3. Criar Variável Alvo e Dividir Dados

In [None]:
from utils.data_loader import DataLoader

loader = DataLoader()

# Multi-class target requested with binary=False; per the author's note the
# classes are 0=down, 1=neutral, 2=up.
target = loader.create_target_variable(
    data_with_features, horizon=5, threshold=0.01, binary=False
)
data_with_features['target'] = target

# Drop every row that still contains a missing value in any column.
data_with_features = data_with_features.dropna()

class_counts = data_with_features['target'].value_counts().sort_index()
print(f"Dataset: {len(data_with_features)} rows")
print(f"\nTarget distribution:")
print(class_counts)

In [None]:
# Split into train / validation / test parts. test_size and validation_size
# are passed straight through; how they are applied is up to DataLoader.
splits = loader.prepare_training_data(
    data_with_features, target_col='target', test_size=0.2, validation_size=0.1
)
X_train, X_val, X_test, y_train, y_val, y_test = splits

# Labels are left-padded to 16 characters so the numbers line up in a column.
for label, features in (
    ("Training set:", X_train),
    ("Validation set:", X_val),
    ("Test set:", X_test),
):
    print(f"{label:<16}{features.shape[0]} samples")
print(f"{'Features:':<16}{X_train.shape[1]}")

## 4. Treinar Modelos de ML

In [None]:
from models.ml_models import TradingModel, EnsembleModel
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


def _train_model(label, model_type):
    """Announce, build and fit one TradingModel with the shared settings."""
    print(f"Training {label}...")
    fitted = TradingModel(model_type=model_type, n_estimators=100, random_state=42)
    fitted.fit(X_train, y_train, X_val, y_val)
    return fitted


# Same hyper-parameters and data for all three model types.
rf_model = _train_model("Random Forest", 'random_forest')
xgb_model = _train_model("XGBoost", 'xgboost')
lgb_model = _train_model("LightGBM", 'lightgbm')

print("All models trained.")

In [None]:
# Score every trained model on the validation split.
models = {'Random Forest': rf_model, 'XGBoost': xgb_model, 'LightGBM': lgb_model}


def _validation_scores(label, estimator):
    """Return one result row: accuracy plus weighted precision/recall/F1."""
    predicted = estimator.predict(X_val)
    # Weighted averaging; zero_division=0 is passed to every averaged metric.
    weighted = {'average': 'weighted', 'zero_division': 0}
    return {
        'Model': label,
        'Accuracy': accuracy_score(y_val, predicted),
        'Precision': precision_score(y_val, predicted, **weighted),
        'Recall': recall_score(y_val, predicted, **weighted),
        'F1 Score': f1_score(y_val, predicted, **weighted),
    }


results = [_validation_scores(label, estimator) for label, estimator in models.items()]

results_df = pd.DataFrame(results)
print("Validation Performance:")
print(results_df.to_string(index=False))

## 5. Ensemble e Feature Importance

In [None]:
# Combine the three models into one ensemble and fit it on the same splits.
ensemble = EnsembleModel([rf_model, xgb_model, lgb_model])
ensemble.fit(X_train, y_train, X_val, y_val)

# Validation-set accuracy and weighted F1 of the ensemble's predictions.
ens_preds = ensemble.predict(X_val)
ens_accuracy = accuracy_score(y_val, ens_preds)
ens_f1 = f1_score(y_val, ens_preds, average='weighted', zero_division=0)
print(f"Ensemble Accuracy: {ens_accuracy:.4f}")
print(f"Ensemble F1:       {ens_f1:.4f}")

In [None]:
# Horizontal bar chart of the Random Forest's 15 highest-ranked features.
fi = rf_model.get_feature_importance(top_n=15)

# Reverse the row order before plotting; barh draws the first entry at the
# bottom of the axis.
feature_names = fi['feature'][::-1]
importances = fi['importance'][::-1]

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(feature_names, importances)
ax.set_xlabel('Importance')
ax.set_title('Top 15 Features (Random Forest)')
ax.grid(True, alpha=0.3, axis='x')
fig.tight_layout()
plt.show()

## 6. Backtesting

In [None]:
from backtesting.backtest_engine import BacktestEngine

backtest = BacktestEngine(initial_capital=100000, commission=0.001, slippage=0.0005)

# Turn the ensemble's test-set probabilities into trading signals.
predictions = ensemble.predict_proba(X_test)
signals = backtest.generate_signals_from_predictions(predictions, threshold=0.55)

# Price rows for the backtest: the last len(X_test) rows of the feature frame.
# NOTE(review): assumes prepare_training_data placed the test split at the
# end of the frame -- confirm against DataLoader.
n_test_rows = len(X_test)
test_data = data_with_features.iloc[-n_test_rows:].copy().reset_index(drop=True)
# Both inputs get a fresh 0..n-1 index so prices and signals line up by position.
signals_series = pd.Series(signals).reset_index(drop=True)

results = backtest.run_backtest(test_data, signals_series, price_col='close')

# Summary report, framed by separator rules.
separator = "=" * 50
report_lines = [
    separator,
    "BACKTEST RESULTS",
    separator,
    f"Initial Capital:      ${backtest.initial_capital:,.2f}",
    f"Final Portfolio:      ${results.equity_curve.iloc[-1]:,.2f}",
    f"Total Return:         {results.total_return:.2%}",
    f"Annualized Return:    {results.annualized_return:.2%}",
    f"Sharpe Ratio:         {results.sharpe_ratio:.2f}",
    f"Max Drawdown:         {results.max_drawdown:.2%}",
    f"Win Rate:             {results.win_rate:.2%}",
    f"Profit Factor:        {results.profit_factor:.2f}",
    f"Total Trades:         {results.total_trades}",
    separator,
]
print("\n".join(report_lines))

# Benchmark: return of holding from the first to the last test-period close.
closes = test_data['close']
bh_return = (closes.iloc[-1] / closes.iloc[0]) - 1
outperformance = results.total_return - bh_return
print(f"\nBuy & Hold Return:    {bh_return:.2%}")
print(f"Outperformance:       {outperformance:.2%}")

In [None]:
# Plot the equity curve (top) and its drawdown (bottom).
fig, axes = plt.subplots(2, 1, figsize=(14, 8))

equity = results.equity_curve

axes[0].plot(equity)
axes[0].set_title('Equity Curve')
axes[0].set_ylabel('Portfolio Value ($)')
axes[0].grid(True, alpha=0.3)

# Drawdown = fractional distance of equity below its running peak.
# It is computed from the equity values themselves so that the starting value
# counts as the first peak. Going through pct_change()/cumprod() instead
# leaves NaN in the first slot, which cummax() skips; a portfolio that loses
# money from day one would then be drawn with zero drawdown.
running_max = equity.cummax()
drawdown = (equity - running_max) / running_max

axes[1].fill_between(drawdown.index, drawdown, 0, alpha=0.5, color='red')
axes[1].set_title('Drawdown')
axes[1].set_ylabel('Drawdown')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Conclusão

Este tutorial demonstrou o pipeline completo:
- Geração de dados sintéticos OHLCV
- Cálculo de indicadores técnicos
- Treinamento de modelos (Random Forest, XGBoost, LightGBM)
- Ensemble com majority voting
- Backtesting com custos de transação

**Aviso:** Resultados em dados sintéticos não representam performance real.
Para dados reais, use `DataLoader.download_stock_data()` ou o exemplo em `examples/complete_strategy.py`.