# 🤖 Experimentos de Machine Learning Clássico

## Objetivo
Avaliar modelos de ML clássico com features extraídas via wavelets (db2):
- **SVM** (Support Vector Machine)
- **Random Forest**
- **XGBoost**
- **LightGBM**

## Pipeline
1. Carregar dados
2. Extrair features wavelet (estatísticas dos coeficientes)
3. Otimizar hiperparâmetros com GridSearchCV
4. Avaliar no conjunto de teste
5. Salvar resultados

In [3]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import time
import warnings
warnings.filterwarnings('ignore')

# ML
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import lightgbm as lgb

# Imports locais
import sys
sys.path.append('.')
from src.feature_extraction import WaveletFeatureExtractor
from src.evaluation import RegressionEvaluator, ResultsManager
from src.visualization import ExperimentVisualizer
from config.experiment_config import (
    DATA_DIR, RESULTS_DIR, ML_MODELS_CONFIG, WAVELET_CONFIG
)

# Notebook-wide setup: choose the matplotlib theme and make sure the directory
# that receives every artefact exists (missing parents are created, and an
# already existing directory is not an error).
plt.style.use("seaborn-v0_8-whitegrid")
RESULTS_DIR.mkdir(exist_ok=True, parents=True)

print("‚úÖ Imports realizados com sucesso!")

‚úÖ Imports realizados com sucesso!


## 1. Carregar Dados

In [4]:
# Read the six pre-split arrays from DATA_DIR. The files are visited in
# train -> val -> test order, with the X array loaded before the y array of
# each split, and unpacked straight into the names the rest of the notebook uses.
X_train, y_train, X_val, y_val, X_test, y_test = (
    np.load(DATA_DIR / file_name)
    for file_name in (
        "X_train.npy", "y_train.npy",
        "X_val.npy", "y_val.npy",
        "X_test.npy", "y_test.npy",
    )
)

# Print the shape of every split as a quick sanity check.
print(f"üì¶ Dados Carregados:")
for split_label, X_split, y_split in (
    ("Train:", X_train, y_train),
    ("Val:  ", X_val, y_val),
    ("Test: ", X_test, y_test),
):
    print(f"  {split_label} X={X_split.shape}, y={y_split.shape}")

üì¶ Dados Carregados:
  Train: X=(69820, 256), y=(69820,)
  Val:   X=(14962, 256), y=(14962,)
  Test:  X=(14962, 256), y=(14962,)


## 2. Extração de Features Wavelet

In [5]:
# Build the feature extractor from the shared wavelet configuration
# (wavelet family, decomposition level and signal-extension mode).
feature_extractor = WaveletFeatureExtractor(
    wavelet=WAVELET_CONFIG['wavelet_type'],
    level=WAVELET_CONFIG['decomposition_level'],
    mode=WAVELET_CONFIG['mode'],
)

# Echo the configuration that is about to be used.
print(f"Extrator Wavelet:")
print(f"  Wavelet: {WAVELET_CONFIG['wavelet_type']}")
print(f"  N√≠veis: {WAVELET_CONFIG['decomposition_level']}")
print(f"  Features por n√≠vel: {len(feature_extractor.features)}")

# Run the extractor over the three splits (train, val, test - in that order)
# and time the whole pass; this is the slow step of the notebook.
print("\nExtraindo features...")
t0 = time.time()

X_train_feat, X_val_feat, X_test_feat = (
    feature_extractor.extract_features(raw_split)
    for raw_split in (X_train, X_val, X_test)
)

print(f"  Tempo: {time.time()-t0:.2f}s")

# Report the resulting feature matrices and the number of feature names.
print(f"\nüìä Features Extra√≠das:")
for split_label, split_feat in (
    ("Train:", X_train_feat),
    ("Val:  ", X_val_feat),
    ("Test: ", X_test_feat),
):
    print(f"  {split_label} {split_feat.shape}")
print(f"  Nomes das features: {len(feature_extractor.get_feature_names())}")

Extrator Wavelet:
  Wavelet: db2
  N√≠veis: 4
  Features por n√≠vel: 17

Extraindo features...
  Tempo: 314.44s

üìä Features Extra√≠das:
  Train: (69820, 85)
  Val:   (14962, 85)
  Test:  (14962, 85)
  Nomes das features: 85


In [6]:
# Standardise the features: the scaler learns its statistics from the training
# split only and is then applied unchanged to all three splits.
# NOTE(review): the hyper-parameter search below cross-validates on the
# train+val stack using this single pre-fitted scaler, so the scaler is not
# refit per fold - confirm this is intended (a Pipeline would refit it).
scaler = StandardScaler().fit(X_train_feat)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split_feat)
    for split_feat in (X_train_feat, X_val_feat, X_test_feat)
)

# Stack train on top of val (rows appended, order preserved) so the
# hyper-parameter search can use both splits.
X_trainval = np.concatenate((X_train_scaled, X_val_scaled), axis=0)
y_trainval = np.concatenate((y_train, y_val))

print(f"‚úÖ Features normalizadas")
print(f"  TrainVal combinado: {X_trainval.shape}")

‚úÖ Features normalizadas
  TrainVal combinado: (84782, 85)


## 3. Configuração de Experimentos

In [7]:
# Shared experiment bookkeeping used by every model cell below.

# Persists each experiment under the "ml_experiments" results sub-directory.
results_manager = ResultsManager(RESULTS_DIR / "ml_experiments")

# Computes the regression metrics reported for every model.
evaluator = RegressionEvaluator()

# In-memory registry: one entry per model, filled in by the experiment cells.
all_results = dict()

# Cross-validation splitter handed to every grid search (5 time-ordered splits).
tscv = TimeSeriesSplit(n_splits=5)

## 4. Experimento 1: SVM

In [None]:
print("=" * 60)
print("üîµ Experimento: SVM com Features Wavelet")
print("=" * 60)

# Candidate hyper-parameters, kept small for speed: 3 x 2 x 1 x 2 = 12 combinations.
svm_param_grid = dict(
    C=[1, 10, 100],
    gamma=['scale', 0.1],
    kernel=['rbf'],
    epsilon=[0.01, 0.1],
)

# Exhaustive search over the grid, scored by negative MSE on the time-series
# folds. The timer started here also covers the test prediction and scoring.
t0 = time.time()
svm = SVR()
svm_grid = GridSearchCV(
    estimator=svm,
    param_grid=svm_param_grid,
    cv=tscv,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
).fit(X_trainval, y_trainval)

# Estimator carrying the winning parameter combination.
best_svm = svm_grid.best_estimator_
print(f"\nMelhores par√¢metros: {svm_grid.best_params_}")

# Predict on the held-out test split and compute the regression metrics.
y_pred_svm = best_svm.predict(X_test_scaled)
svm_metrics = evaluator.evaluate(y_test, y_pred_svm)
elapsed = time.time() - t0

print(f"\nüìä Resultados SVM:")
print(f"  RMSE: {svm_metrics['rmse']:.6f}")
print(f"  MAE:  {svm_metrics['mae']:.6f}")
print(f"  R¬≤:   {svm_metrics['r2']:.6f}")
print(f"  Tempo: {elapsed:.2f}s")

# Keep everything the comparison and plotting cells need for this model.
all_results['Wavelet_SVM'] = dict(
    metrics=svm_metrics,
    best_params=svm_grid.best_params_,
    time=elapsed,
    y_pred=y_pred_svm,
)

# Persist the run through the results manager.
results_manager.log_experiment(
    'ML_Wavelet', 'SVM', svm_metrics,
    {'params': svm_grid.best_params_},
)

üîµ Experimento: SVM com Features Wavelet
Fitting 5 folds for each of 12 candidates, totalling 60 fits


## 5. Experimento 2: Random Forest

In [None]:
print("=" * 60)
print("üå≤ Experimento: Random Forest com Features Wavelet")
print("=" * 60)

# Candidate hyper-parameters: 2 x 3 x 2 x 2 = 24 combinations.
rf_param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Exhaustive search over the grid, scored by negative MSE on the time-series
# folds. The timer started here also covers the test prediction and scoring.
t0 = time.time()
rf = RandomForestRegressor(random_state=42, n_jobs=-1)
rf_grid = GridSearchCV(
    estimator=rf,
    param_grid=rf_param_grid,
    cv=tscv,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
).fit(X_trainval, y_trainval)

# Estimator carrying the winning parameter combination.
best_rf = rf_grid.best_estimator_
print(f"\nMelhores par√¢metros: {rf_grid.best_params_}")

# Predict on the held-out test split and compute the regression metrics.
y_pred_rf = best_rf.predict(X_test_scaled)
rf_metrics = evaluator.evaluate(y_test, y_pred_rf)
elapsed = time.time() - t0

print(f"\nüìä Resultados Random Forest:")
print(f"  RMSE: {rf_metrics['rmse']:.6f}")
print(f"  MAE:  {rf_metrics['mae']:.6f}")
print(f"  R¬≤:   {rf_metrics['r2']:.6f}")
print(f"  Tempo: {elapsed:.2f}s")

# Keep everything the comparison and plotting cells need for this model.
all_results['Wavelet_RF'] = dict(
    metrics=rf_metrics,
    best_params=rf_grid.best_params_,
    time=elapsed,
    y_pred=y_pred_rf,
)

# Persist the run through the results manager.
results_manager.log_experiment(
    'ML_Wavelet', 'RandomForest', rf_metrics,
    {'params': rf_grid.best_params_},
)

## 6. Experimento 3: XGBoost

In [None]:
print("=" * 60)
print("üöÄ Experimento: XGBoost com Features Wavelet")
print("=" * 60)

# Candidate hyper-parameters: 2 x 3 x 2 x 2 = 24 combinations.
xgb_param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[3, 5, 7],
    learning_rate=[0.05, 0.1],
    subsample=[0.8, 1.0],
)

# Exhaustive search over the grid, scored by negative MSE on the time-series
# folds. The timer started here also covers the test prediction and scoring.
t0 = time.time()
xgb_model = xgb.XGBRegressor(random_state=42, n_jobs=-1, verbosity=0)
xgb_grid = GridSearchCV(
    estimator=xgb_model,
    param_grid=xgb_param_grid,
    cv=tscv,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
).fit(X_trainval, y_trainval)

# Estimator carrying the winning parameter combination.
best_xgb = xgb_grid.best_estimator_
print(f"\nMelhores par√¢metros: {xgb_grid.best_params_}")

# Predict on the held-out test split and compute the regression metrics.
y_pred_xgb = best_xgb.predict(X_test_scaled)
xgb_metrics = evaluator.evaluate(y_test, y_pred_xgb)
elapsed = time.time() - t0

print(f"\nüìä Resultados XGBoost:")
print(f"  RMSE: {xgb_metrics['rmse']:.6f}")
print(f"  MAE:  {xgb_metrics['mae']:.6f}")
print(f"  R¬≤:   {xgb_metrics['r2']:.6f}")
print(f"  Tempo: {elapsed:.2f}s")

# Keep everything the comparison and plotting cells need for this model.
all_results['Wavelet_XGB'] = dict(
    metrics=xgb_metrics,
    best_params=xgb_grid.best_params_,
    time=elapsed,
    y_pred=y_pred_xgb,
)

# Persist the run through the results manager.
results_manager.log_experiment(
    'ML_Wavelet', 'XGBoost', xgb_metrics,
    {'params': xgb_grid.best_params_},
)

## 7. Experimento 4: LightGBM

In [None]:
print("=" * 60)
print("üí° Experimento: LightGBM com Features Wavelet")
print("=" * 60)

# Candidate hyper-parameters: 2 x 3 x 2 x 2 = 24 combinations.
lgb_param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[5, 7, 10],
    learning_rate=[0.05, 0.1],
    num_leaves=[31, 50],
)

# Exhaustive search over the grid, scored by negative MSE on the time-series
# folds. The timer started here also covers the test prediction and scoring.
t0 = time.time()
lgb_model = lgb.LGBMRegressor(random_state=42, n_jobs=-1, verbose=-1)
lgb_grid = GridSearchCV(
    estimator=lgb_model,
    param_grid=lgb_param_grid,
    cv=tscv,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
).fit(X_trainval, y_trainval)

# Estimator carrying the winning parameter combination.
best_lgb = lgb_grid.best_estimator_
print(f"\nMelhores par√¢metros: {lgb_grid.best_params_}")

# Predict on the held-out test split and compute the regression metrics.
y_pred_lgb = best_lgb.predict(X_test_scaled)
lgb_metrics = evaluator.evaluate(y_test, y_pred_lgb)
elapsed = time.time() - t0

print(f"\nüìä Resultados LightGBM:")
print(f"  RMSE: {lgb_metrics['rmse']:.6f}")
print(f"  MAE:  {lgb_metrics['mae']:.6f}")
print(f"  R¬≤:   {lgb_metrics['r2']:.6f}")
print(f"  Tempo: {elapsed:.2f}s")

# Keep everything the comparison and plotting cells need for this model.
all_results['Wavelet_LGBM'] = dict(
    metrics=lgb_metrics,
    best_params=lgb_grid.best_params_,
    time=elapsed,
    y_pred=y_pred_lgb,
)

# Persist the run through the results manager.
results_manager.log_experiment(
    'ML_Wavelet', 'LightGBM', lgb_metrics,
    {'params': lgb_grid.best_params_},
)

## 8. Comparação dos Resultados

In [None]:
# Flatten the per-model results into one record per model: the three metrics
# plus the wall-clock time measured in the experiment cell.
comparison_data = [
    {
        'Model': model_name,
        'RMSE': result['metrics']['rmse'],
        'MAE': result['metrics']['mae'],
        'R¬≤': result['metrics']['r2'],
        'Time (s)': result['time'],
    }
    for model_name, result in all_results.items()
]

# Lowest RMSE first, so row 0 is the best model for the cells that follow.
comparison_df = pd.DataFrame(comparison_data).sort_values('RMSE')

print("\n" + "=" * 70)
print("üìä COMPARA√á√ÉO FINAL - ML com Features Wavelet (db2)")
print("=" * 70)
print(comparison_df.to_string(index=False))

# Write the table next to the other experiment artefacts.
comparison_df.to_csv(RESULTS_DIR / "ml_experiments" / "comparison_ml.csv", index=False)

In [None]:
# Comparison figure: one horizontal bar chart per metric (RMSE, MAE, R²).
# `visualizer` is created here and reused by the prediction-plot cell.
visualizer = ExperimentVisualizer()

fig, axes = plt.subplots(1, 3, figsize=(15, 5))

metrics_to_plot = ['RMSE', 'MAE', 'R¬≤']
colors = ['steelblue', 'coral', 'seagreen', 'gold']

# Bind every model to one fixed colour. Each panel sorts its bars by its own
# metric, so colouring by bar position would give the same model different
# colours in different panels. The palette is cycled if there are more models
# than colours.
model_colors = {
    model: colors[position % len(colors)]
    for position, model in enumerate(comparison_df['Model'])
}

for idx, metric in enumerate(metrics_to_plot):
    ax = axes[idx]

    # Errors are sorted ascending and R² descending, so the best model is
    # always the first (bottom) bar of the panel.
    data = comparison_df.set_index('Model')[metric].sort_values(
        ascending=(metric != 'R¬≤')
    )
    bars = ax.barh(
        data.index, data.values,
        color=[model_colors[model] for model in data.index],
    )
    ax.set_xlabel(metric)
    ax.set_title(f'Compara√ß√£o: {metric}')
    ax.grid(True, alpha=0.3, axis='x')

    # Value labels at the tip of each bar. A negative value (possible for R²)
    # grows to the left, so its label is right-aligned to stay off the bar.
    for bar, val in zip(bars, data.values):
        ax.text(val, bar.get_y() + bar.get_height()/2,
                f'{val:.4f}', va='center',
                ha='left' if val >= 0 else 'right', fontsize=9)

plt.suptitle('ML Cl√°ssico com Features Wavelet (db2)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(RESULTS_DIR / "ml_experiments" / "comparison_ml.png", dpi=150, bbox_inches='tight')
plt.show()

## 9. Análise de Predições do Melhor Modelo

In [None]:
# The comparison table is sorted by RMSE, so its first row names the best model.
best_model_name = comparison_df['Model'].iloc[0]
best_result = all_results[best_model_name]

print(f"\nüèÜ Melhor Modelo: {best_model_name}")

# Plot test targets against the best model's predictions and save the figure
# under a file name that carries the model name.
predictions_path = RESULTS_DIR / "ml_experiments" / f"predictions_{best_model_name}.png"
fig = visualizer.plot_prediction_comparison(
    y_test, best_result['y_pred'],
    model_name=best_model_name,
    n_samples=500,
    save_path=predictions_path,
)
plt.show()

## 10. Feature Importance (para modelos baseados em árvore)

In [None]:
# Top-20 feature importances of the three tree-based models, one panel each.
# NOTE(review): the libraries may report importances on different scales
# (LightGBM's default is presumably split counts) - compare rankings rather
# than magnitudes across panels; confirm.
feature_names = feature_extractor.get_feature_names()

fig, axes = plt.subplots(1, 3, figsize=(18, 8))

tree_models = (
    ('Random Forest', best_rf),
    ('XGBoost', best_xgb),
    ('LightGBM', best_lgb),
)

for ax, (model_label, fitted_model) in zip(axes, tree_models):
    importance = fitted_model.feature_importances_
    # Indices of the 20 largest importances, most important first.
    top_indices = np.argsort(importance)[::-1][:20]
    ax.barh([feature_names[i] for i in top_indices], importance[top_indices])
    ax.set_title(f'{model_label} - Top 20 Features')
    # barh draws the first entry at the bottom; flip so the top feature is on top.
    ax.invert_yaxis()

plt.suptitle('Import√¢ncia das Features Wavelet', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(RESULTS_DIR / "ml_experiments" / "feature_importance.png", dpi=150, bbox_inches='tight')
plt.show()

## 11. Resumo

In [None]:
# Final recap: number of models evaluated, the winner (first row of the
# RMSE-sorted comparison table) and where the artefacts were written.
best_row = comparison_df.iloc[0]

print("\n" + "=" * 70)
print("üìã RESUMO - Experimentos ML com Features Wavelet")
print("=" * 70)
print(f"\n‚úÖ Modelos avaliados: {len(all_results)}")
print(f"‚úÖ Melhor modelo: {best_model_name}")
print(f"‚úÖ Melhor RMSE: {best_row['RMSE']:.6f}")
print(f"‚úÖ Melhor R¬≤: {best_row['R¬≤']:.6f}")
print(f"\nüìÅ Resultados salvos em: {RESULTS_DIR / 'ml_experiments'}")
print("\nüéâ Notebook conclu√≠do com sucesso!")