# 🚀 Tutorial 2: Pré-processamento Avançado e Feature Engineering

Este notebook cobre técnicas avançadas de pré-processamento:
1. Decomposição sazonal
2. Transformações de dados
3. Feature engineering
4. Tratamento de outliers
5. Estacionariedade

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams['figure.figsize'] = (14, 6)

import sys
sys.path.insert(0, '../')
from src.preprocessing import TimeSeriesPreprocessor
from src.visualization import TimeSeriesVisualizer

# Shared helper instances reused by the cells below.
preprocessor = TimeSeriesPreprocessor()
visualizer = TimeSeriesVisualizer()

print("✓ Bibliotecas carregadas!")

## 1. Geração de Dados Complexos

Vamos criar dados com múltiplas sazonalidades e outliers.

In [None]:
def generate_complex_data(n_points=365*3, seed=None):
    """Generate a synthetic daily series with trend, seasonality, outliers and gaps.

    The series is the sum of an exponential trend, three sine-wave
    seasonalities (periods of 365, 30 and 7 days) and Gaussian noise.
    Up to 10 points then receive a large positive shock and up to 15
    points are replaced by NaN.

    Args:
        n_points: Number of daily observations, starting at 2020-01-01.
        seed: Optional seed for a private ``RandomState``. When ``None``
            (the default) the global NumPy RNG is used, so a preceding
            ``np.random.seed(...)`` call still controls the output.

    Returns:
        A ``pandas.Series`` named ``'vendas'`` indexed by a daily
        ``DatetimeIndex`` of length ``n_points``.
    """
    # The np.random module and a RandomState expose the same normal()/choice()
    # API, so either one can serve as the random source below.
    rng = np.random if seed is None else np.random.RandomState(seed)

    dates = pd.date_range('2020-01-01', periods=n_points, freq='D')
    t = np.arange(n_points)

    # Non-linear (exponential) trend
    trend = 100 * np.exp(t / (n_points * 2))

    # Yearly, monthly and weekly seasonal components
    seasonal_yearly = 30 * np.sin(2 * np.pi * t / 365)
    seasonal_monthly = 15 * np.sin(2 * np.pi * t / 30)
    seasonal_weekly = 10 * np.sin(2 * np.pi * t / 7)

    # Gaussian noise
    noise = rng.normal(0, 5, n_points)

    values = trend + seasonal_yearly + seasonal_monthly + seasonal_weekly + noise

    # Outliers ("special events"). Sampling without replacement cannot draw
    # more indices than exist, so clamp the count for very short series.
    n_outliers = min(10, n_points)
    outlier_indices = rng.choice(n_points, size=n_outliers, replace=False)
    values[outlier_indices] += rng.normal(50, 20, n_outliers)

    # Missing values, clamped for the same reason as the outliers
    n_missing = min(15, n_points)
    missing_indices = rng.choice(n_points, size=n_missing, replace=False)
    values[missing_indices] = np.nan

    return pd.Series(values, index=dates, name='vendas')

# Seed NumPy's global RNG so the noise, outliers and gaps drawn by
# generate_complex_data() are the same on every run of the notebook.
np.random.seed(42)
data = generate_complex_data()

print(f"Dados gerados: {len(data)} pontos")
print(f"Valores faltantes: {data.isna().sum()}")
print(f"Período: {data.index[0]} a {data.index[-1]}")

# Plot the raw series (outliers and NaN gaps still present)
visualizer.plot_time_series(data, title="Série Temporal Complexa (com outliers e NaN)")
plt.show()

## 2. Tratamento de Valores Faltantes

In [None]:
# Compare imputation methods side by side, one subplot per method
methods = ['interpolate', 'ffill', 'bfill', 'mean']
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
axes = axes.flatten()

for ax, method in zip(axes, methods):
    # Pass a copy so each method starts from the untouched series
    filled = preprocessor.handle_missing_values(data.copy(), method=method)

    ax.plot(data.index, data.values, 'o', alpha=0.3, label='Original (com NaN)', markersize=2)
    ax.plot(filled.index, filled.values, '-', label=f'Preenchido ({method})', linewidth=1)
    ax.set_title(f'Método: {method}')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Continue the tutorial with the interpolated series
data_filled = preprocessor.handle_missing_values(data, method='interpolate')
# The value printed is the number of NaNs still present, so label it as such
print(f"✓ Valores faltantes restantes após imputação: {data_filled.isna().sum()}")

## 3. Detecção e Tratamento de Outliers

In [None]:
# Compare outlier-detection methods, one subplot per method
methods = ['iqr', 'zscore', 'modified_zscore']

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, method in zip(axes, methods):
    # Mask is used below for boolean indexing and .sum(), i.e. True marks an outlier
    outliers_mask = preprocessor.detect_outliers(data_filled, method=method)

    ax.plot(data_filled.index, data_filled.values, '-', label='Dados', linewidth=1)
    ax.scatter(
        data_filled.index[outliers_mask],
        data_filled.values[outliers_mask],
        color='red',
        s=50,
        label=f'Outliers ({outliers_mask.sum()})',
        zorder=5  # draw the markers on top of the line
    )
    ax.set_title(f'Método: {method}')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Remove outliers using the IQR rule
data_clean = preprocessor.remove_outliers(data_filled, method='iqr', threshold=3.0)
print("✓ Outliers removidos")
print(f"  Dados originais: {len(data_filled)}")
print(f"  Dados limpos: {data_clean.notna().sum()}")

## 4. Decomposição Sazonal

In [None]:
# Additive decomposition with a yearly period (365 daily observations)
decomposition = preprocessor.decompose(data_clean, model='additive', period=365)

# Plot the three extracted components
fig = visualizer.plot_decomposition(
    trend=decomposition.trend,
    seasonal=decomposition.seasonal,
    residual=decomposition.residual
)
plt.show()

# Report how many non-NaN points each component holds
print("Componentes extraídos:")
print(f"  Tendência: {decomposition.trend.notna().sum()} pontos")
print(f"  Sazonal: {decomposition.seasonal.notna().sum()} pontos")
print(f"  Residual: {decomposition.residual.notna().sum()} pontos")

## 5. Transformações de Dados

In [None]:
# Compare the cleaned series against three transformations in a 2x2 grid
fig, axes = plt.subplots(2, 2, figsize=(16, 10))

# Original series
axes[0, 0].plot(data_clean.index, data_clean.values, linewidth=1)
axes[0, 0].set_title('Original')
axes[0, 0].grid(True, alpha=0.3)

# Log transform
data_log = preprocessor.transform(data_clean, method='log')
axes[0, 1].plot(data_log.index, data_log.values, linewidth=1, color='orange')
axes[0, 1].set_title('Transformação Logarítmica')
axes[0, 1].grid(True, alpha=0.3)

# Box-Cox transform
data_boxcox = preprocessor.transform(data_clean, method='boxcox')
axes[1, 0].plot(data_boxcox.index, data_boxcox.values, linewidth=1, color='green')
axes[1, 0].set_title('Transformação Box-Cox')
axes[1, 0].grid(True, alpha=0.3)

# First-order differencing, with a zero reference line
data_diff = preprocessor.difference(data_clean, periods=1)
axes[1, 1].plot(data_diff.index, data_diff.values, linewidth=1, color='red')
axes[1, 1].set_title('Diferenciação (1ª ordem)')
axes[1, 1].axhline(0, color='black', linestyle='--', alpha=0.5)
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Teste de Estacionariedade

In [None]:
# Run the stationarity test on the cleaned series; `result` is read below
# through the keys 'adf_statistic', 'p_value' and 'critical_values'.
is_stationary, result = preprocessor.test_stationarity(data_clean)

print("Teste ADF (Augmented Dickey-Fuller):")
print("  Dados originais:")
print(f"    Estacionário: {is_stationary}")
print(f"    Estatística: {result['adf_statistic']:.4f}")
print(f"    p-valor: {result['p_value']:.4f}")
print("    Valores críticos:")
for key, value in result['critical_values'].items():
    print(f"      {key}: {value:.4f}")

# If the series is not stationary, transform it and test again
if not is_stationary:
    print("\n→ Tornando série estacionária...")
    data_stationary = preprocessor.make_stationary(data_clean)

    is_stat_after, result_after = preprocessor.test_stationarity(data_stationary)
    print("\n  Após transformação:")
    print(f"    Estacionário: {is_stat_after}")
    print(f"    p-valor: {result_after['p_value']:.4f}")

    # Plot the series before and after the transformation
    fig, axes = plt.subplots(2, 1, figsize=(14, 10))

    axes[0].plot(data_clean.index, data_clean.values, linewidth=1)
    axes[0].set_title('Série Original (Não-Estacionária)')
    axes[0].grid(True, alpha=0.3)

    axes[1].plot(data_stationary.index, data_stationary.values, linewidth=1, color='green')
    axes[1].set_title('Série Estacionária')
    axes[1].axhline(0, color='red', linestyle='--', alpha=0.5)
    axes[1].grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

## 7. Feature Engineering

In [None]:
# Create lag features for lags of 1, 7, 14 and 30 steps
features_lag = preprocessor.create_lag_features(
    data_clean,
    lags=[1, 7, 14, 30]
)

print("Features de Lag criadas:")
# Show 40 rows so rows beyond the largest lag (30) are visible
print(features_lag.head(40))
print(f"\nDimensões: {features_lag.shape}")

In [None]:
# Create rolling-window statistics over 7-, 14- and 30-step windows
features_rolling = preprocessor.create_rolling_features(
    data_clean,
    windows=[7, 14, 30],
    statistics=['mean', 'std', 'min', 'max']
)

print("Features Rolling criadas:")
print(features_rolling.columns.tolist())
print(f"\nDimensões: {features_rolling.shape}")

# Plot a subset of the features: rolling means on top, rolling std below
fig, axes = plt.subplots(2, 1, figsize=(14, 10))

axes[0].plot(features_rolling.index, features_rolling['value'], label='Original', linewidth=1)
axes[0].plot(features_rolling.index, features_rolling['rolling_mean_7'], label='Média Móvel 7d', linewidth=2)
axes[0].plot(features_rolling.index, features_rolling['rolling_mean_30'], label='Média Móvel 30d', linewidth=2)
axes[0].set_title('Médias Móveis')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

axes[1].plot(features_rolling.index, features_rolling['rolling_std_7'], label='Desvio Padrão 7d', linewidth=2)
axes[1].plot(features_rolling.index, features_rolling['rolling_std_30'], label='Desvio Padrão 30d', linewidth=2)
axes[1].set_title('Volatilidade (Desvio Padrão Móvel)')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Derive time features from the cleaned series via create_time_features()
features_time = preprocessor.create_time_features(data_clean)
time_columns = features_time.columns.tolist()

# Preview the first rows, then list every generated column
print("Features Temporais criadas:")
print(features_time.head(10))
print(f"\nColunas: {time_columns}")

In [None]:
# Build the full feature set in a single create_features() call
feature_options = dict(
    lags=[1, 7, 14, 30],
    rolling_windows=[7, 14, 30],
    time_features=True,
    seasonal_features=True,
)
features_complete = preprocessor.create_features(data_clean, **feature_options)

# Report the number of columns, then print one bullet per column name
n_features = features_complete.shape[1]
print(f"Features completas criadas: {n_features} features")
print("\nLista de features:")
for feature_name in features_complete.columns:
    print(f"  - {feature_name}")

## 8. Análise de Correlação

In [None]:
# Select a subset of features and drop rows with NaN before correlating
features_subset = features_complete[[
    'value', 'lag_1', 'lag_7', 'lag_30',
    'rolling_mean_7', 'rolling_std_7',
    'day_of_week', 'month'
]].dropna()

correlation_matrix = features_subset.corr()

# Draw the correlation matrix as an annotated heatmap centred on zero
plt.figure(figsize=(10, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    square=True,
    linewidths=1
)
plt.title('Matriz de Correlação das Features', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# Rank features by their correlation with the target column 'value'
# (the target's correlation with itself is dropped first)
target_corr = correlation_matrix['value'].drop('value').sort_values(ascending=False)
print("\nFeatures mais correlacionadas com target:")
print(target_corr)

## 9. Escalonamento

In [None]:
# Compare scaling methods: three histograms plus one time-domain overlay
fig, axes = plt.subplots(2, 2, figsize=(16, 10))

# NOTE(review): data_clean may still contain NaN after outlier removal
# (an earlier cell counts it with notna()) — confirm. Histograms cannot bin
# NaN, so they are dropped for the distribution plots only.

# Original distribution
axes[0, 0].hist(data_clean.dropna().values, bins=50, edgecolor='black', alpha=0.7)
axes[0, 0].set_title('Original')
axes[0, 0].set_xlabel('Valor')
axes[0, 0].set_ylabel('Frequência')

# Min-Max scaling
data_minmax = preprocessor.scale(data_clean, method='minmax')
axes[0, 1].hist(data_minmax.dropna().values, bins=50, edgecolor='black', alpha=0.7, color='orange')
axes[0, 1].set_title('Min-Max Scaling [0, 1]')
axes[0, 1].set_xlabel('Valor')
axes[0, 1].set_ylabel('Frequência')

# Standard scaling
data_standard = preprocessor.scale(data_clean, method='standard')
axes[1, 0].hist(data_standard.dropna().values, bins=50, edgecolor='black', alpha=0.7, color='green')
axes[1, 0].set_title('Standard Scaling (μ=0, σ=1)')
axes[1, 0].set_xlabel('Valor')
axes[1, 0].set_ylabel('Frequência')

# Time-domain comparison; the scaled series are multiplied/shifted only so
# that all three curves are visible on the same axis.
axes[1, 1].plot(data_clean.index, data_clean.values, label='Original', alpha=0.5)
axes[1, 1].plot(data_minmax.index, data_minmax.values * 100, label='Min-Max (×100)', alpha=0.7)
axes[1, 1].plot(data_standard.index, data_standard.values * 10 + 100, label='Standard (×10+100)', alpha=0.7)
axes[1, 1].set_title('Comparação Visual')
axes[1, 1].legend()
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("Estatísticas dos dados escalonados:")
print("\nMin-Max:")
print(f"  Mín: {data_minmax.min():.4f}, Máx: {data_minmax.max():.4f}")
print("\nStandard:")
print(f"  Média: {data_standard.mean():.4f}, Desvio: {data_standard.std():.4f}")

## 10. Pipeline Completo de Pré-processamento

In [None]:
# End-to-end pipeline: impute -> remove outliers -> log -> difference -> scale
print("Executando pipeline completo de pré-processamento...\n")

# 1. Raw data
print(f"1. Dados originais: {len(data)} pontos")
print(f"   Valores faltantes: {data.isna().sum()}")

# 2. Fill missing values
data_step1 = preprocessor.handle_missing_values(data, method='interpolate')
print(f"\n2. Após imputação: {data_step1.isna().sum()} valores faltantes")

# 3. Remove outliers
# NOTE(review): detection here relies on detect_outliers' default threshold,
# while removal passes threshold=3.0, so the printed count may differ from
# what is actually removed — confirm against the preprocessor API.
outliers_before = preprocessor.detect_outliers(data_step1, method='iqr').sum()
data_step2 = preprocessor.remove_outliers(data_step1, method='iqr', threshold=3.0)
print(f"\n3. Remoção de outliers: {outliers_before} outliers detectados")

# 4. Log transform
data_step3 = preprocessor.transform(data_step2, method='log')
print("\n4. Transformação logarítmica aplicada")

# 5. First-order differencing
data_step4 = preprocessor.difference(data_step3, periods=1)
print("\n5. Diferenciação aplicada")

# 6. Standard scaling
data_final = preprocessor.scale(data_step4, method='standard')
print("\n6. Escalonamento padrão aplicado")

print("\n✓ Pipeline concluído!")
print(f"  Dados finais: {data_final.notna().sum()} pontos")
print(f"  Média: {data_final.mean():.4f}")
print(f"  Desvio: {data_final.std():.4f}")

# Plot the series before and after the pipeline
fig, axes = plt.subplots(2, 1, figsize=(14, 10))

axes[0].plot(data.index, data.values, linewidth=1)
axes[0].set_title('Dados Originais', fontsize=14, fontweight='bold')
axes[0].grid(True, alpha=0.3)

axes[1].plot(data_final.index, data_final.values, linewidth=1, color='green')
axes[1].set_title('Dados Após Pipeline Completo', fontsize=14, fontweight='bold')
axes[1].axhline(0, color='red', linestyle='--', alpha=0.5)
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Conclusão

Neste tutorial avançado, você aprendeu:

1. ✅ Métodos de imputação de valores faltantes
2. ✅ Detecção e tratamento de outliers
3. ✅ Decomposição sazonal
4. ✅ Transformações para estabilizar variância
5. ✅ Testes de estacionariedade
6. ✅ Feature engineering (lags, rolling stats, features temporais)
7. ✅ Análise de correlação
8. ✅ Escalonamento de dados
9. ✅ Pipeline completo de pré-processamento

### Próximos Passos

- **Tutorial 3:** Deep Learning com LSTM e GRU
- **Tutorial 4:** Modelos Ensemble Avançados
- **Tutorial 5:** Otimização de Hiperparâmetros