1. Importação de Bibliotecas e Dados

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
from xgboost import XGBRegressor
from statsmodels.tsa.arima.model import ARIMA
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks
from prophet import Prophet
from pathlib import Path
import joblib

  from .autonotebook import tqdm as notebook_tqdm
Importing plotly failed. Interactive plots will not work.


In [2]:
DATA_PATH = Path("../src/data")
ticker = "PETR4.SA"
df = pd.read_parquet(DATA_PATH / f"{ticker}_features.parquet")
df = df.sort_index()


2. Preparação dos Dados para Modelagem
- Separação da variável alvo:

In [3]:
target = 'Close'
X = df.drop(columns=[target])
y = df[target]

- Divisão temporal:
70% treino, 15% validação, 15% teste.

In [4]:
train_size = int(len(df) * 0.7)
val_size = int(len(df) * 0.15)
X_train, y_train = X.iloc[:train_size], y.iloc[:train_size]
X_val, y_val = X.iloc[train_size:train_size+val_size], y.iloc[train_size:train_size+val_size]
X_test, y_test = X.iloc[train_size+val_size:], y.iloc[train_size+val_size:]


3. Definição e Treinamento dos Modelos
- 3.1. Baseline: XGBoost Regressor

In [6]:
xgb = XGBRegressor(n_estimators=100, learning_rate=0.05, random_state=42)
xgb.fit(X_train, y_train)
y_val_pred = xgb.predict(X_val)

- 3.2. ARIMA
    - Use apenas a série univariada de fechamento.

In [7]:
arima_model = ARIMA(y_train, order=(5,1,0))
arima_fitted = arima_model.fit()
y_val_pred_arima = arima_fitted.forecast(steps=len(y_val))

  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(


- 3.3. Prophet
    - Estruture o dataframe no formato esperado (ds, y).

In [8]:
prophet_df = pd.DataFrame({'ds': y_train.index, 'y': y_train.values})
model_prophet = Prophet()
model_prophet.fit(prophet_df)
future = pd.DataFrame({'ds': y_val.index})
y_val_pred_prophet = model_prophet.predict(future)['yhat'].values

17:54:36 - cmdstanpy - INFO - Chain [1] start processing
17:54:41 - cmdstanpy - INFO - Chain [1] done processing


- 3.4. LSTM (TensorFlow/Keras)
    - Estruture entradas como sequência.

In [9]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

timesteps = 10  # Usar janelas de 10 dias
def create_sequences(X, y, ts):
    Xs, ys = [], []
    for i in range(len(X) - ts):
        Xs.append(X[i:i+ts])
        ys.append(y[i+ts])
    return np.array(Xs), np.array(ys)

X_seq_train, y_seq_train = create_sequences(X_train_scaled, y_train.values, timesteps)
X_seq_val, y_seq_val = create_sequences(X_val_scaled, y_val.values, timesteps)

model_lstm = models.Sequential([
    layers.LSTM(50, input_shape=(timesteps, X_train.shape[1])),
    layers.Dense(1)
])
model_lstm.compile(optimizer='adam', loss='mse')
model_lstm.fit(X_seq_train, y_seq_train, epochs=20, validation_data=(X_seq_val, y_seq_val),
               callbacks=[callbacks.EarlyStopping(patience=3)])
y_val_pred_lstm = model_lstm.predict(X_seq_val).flatten()


Epoch 1/20


  super().__init__(**kwargs)


[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 9ms/step - loss: 141.9622 - val_loss: 691.5257
Epoch 2/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 63.4597 - val_loss: 521.7568
Epoch 3/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 43.2637 - val_loss: 448.8982
Epoch 4/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 38.0325 - val_loss: 415.7565
Epoch 5/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 32.9454 - val_loss: 385.9911
Epoch 6/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 22.0604 - val_loss: 339.0117
Epoch 7/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 16.9026 - val_loss: 302.2913
Epoch 8/20
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 13.6072 - val_loss: 274.1652
Epoch 9/20
[1m30/30[0m [32m━━━━━━━━━━━━

- 4. Otimização e Avaliação
    - Métricas a serem avaliadas:

        - Mean Absolute Error (MAE)

        - Root Mean Squared Error (RMSE)

        - Mean Absolute Percentage Error (MAPE)

In [13]:
def print_metrics(y_true, y_pred, model_name):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mape = mean_absolute_percentage_error(y_true, y_pred)
    print(f"{model_name}: MAE={mae:.4f} | RMSE={rmse:.4f} | MAPE={mape:.2%}")


- Exemplo de uso:

In [15]:
print_metrics(y_val, y_val_pred, "XGBoost")
print_metrics(y_val, y_val_pred_arima, "ARIMA")
print_metrics(y_val, y_val_pred_prophet, "Prophet")
print_metrics(y_seq_val, y_val_pred_lstm, "LSTM")

XGBoost: MAE=3.3407 | RMSE=3.8341 | MAPE=10.03%
ARIMA: MAE=3.4559 | RMSE=3.9155 | MAPE=10.41%
Prophet: MAE=4.3091 | RMSE=5.3807 | MAPE=13.24%
LSTM: MAE=10.2448 | RMSE=10.4054 | MAPE=31.34%
