## importações e configurações

In [None]:
!pip uninstall -y prophet cmdstanpy pystan
!pip install -U prophet cmdstanpy

In [None]:
import os, sys
from pathlib import Path
import importlib
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, List, Tuple, Optional, Any
import numpy as np
from sklearn.model_selection import TimeSeriesSplit, RandomizedSearchCV, GridSearchCV

import shap

# modelos
from lightgbm import LGBMRegressor
import lightgbm as lgb

from prophet import Prophet

In [None]:
# Mount Google Drive; the project folder referenced below lives under /content/drive.
from google.colab import drive

drive.mount("/content/drive")

# Root folder of the project.
PROJ = Path("/content/drive/MyDrive/tcc-modelo/3-tcc-demand-forecasting")

# Make the project root importable (the `src` package is imported later on),
# without appending a duplicate entry when this cell is re-run.
if not any(entry == str(PROJ) for entry in sys.path):
    sys.path.append(str(PROJ))

print("Repositório ativo em:", PROJ)

In [None]:
# importação de módulos

from src.evaluations.models_metrics import calculate_metrics, compare_models # gera métricas de avaliação
from src.training_schema.split_rolling import split_rolling # gera as bases de treino e teste no esquema expansão de janela temporal
from src.evaluations.plot_real_pred import plot_real_pred # plota gráfico de linhas comparando os valores reais x preditos
from src.models.lgbm_tunning import run_lgbm_hpo_static
from src.models.lgbm_fit_predict import fit_predict_lgbm_fixed
from src.models.prophet_fit_predict import fit_predict_prophet_fixed
from src.models.lstm_fit_predict import fit_predict_lstm_fixed
from src.models.sarima_fit_predict import fit_predict_sarima_fixed

In [None]:
# Location of the final dataframe (parquet file) inside the project tree.
output_name_imputed = "olist_weekly_agg_withlags_imputed2.parquet"
interim_dir = PROJ.joinpath("data", "interim")
df_path = interim_dir.joinpath(output_name_imputed)

In [None]:
# Path to feature_rank.csv, the table holding the per-feature analysis.
rank_name = "feature_rank.csv"
reports_dir = PROJ.joinpath("reports", "tables")
rank_path = reports_dir.joinpath(rank_name)

In [None]:
# Output locations for the charts produced by this notebook.
figures_dir = PROJ.joinpath("reports", "figures")

# One real-vs-predicted chart per model.
lgbm_plot_path = figures_dir.joinpath("lgbm_real_pred_plot.png")
prophet_plot_path = figures_dir.joinpath("prophet_real_pred_plot.png")
lstm_plot_path = figures_dir.joinpath("lstm_real_pred_plot.png")
sarima_plot_path = figures_dir.joinpath("sarima_real_pred_plot.png")

# Feature-importance chart.
feature_importance_path = figures_dir.joinpath("feature_importance.png")

# Output location for the table with all models' metrics concatenated.
metrics_path = PROJ.joinpath("reports", "tables", "metrics_all.csv")

In [None]:
# Key column names.
date_col = "order_week"  # also the column used to timestamp the predictions
target_col = "sales_qty"  # forecasting target
id_col = "id"  # optional

# Evaluation periods.
train_start = pd.to_datetime("2017-04-01")
first_train_end = pd.to_datetime("2018-03-18")
test_start = pd.to_datetime("2018-03-19")
test_end = pd.to_datetime("2018-07-31")

# Rolling step passed to split_rolling: one week.
step = pd.Timedelta(weeks=1)

## definição do df

In [None]:
# Load the dataset that every model section below passes to split_rolling.
meu_df_all = pd.read_parquet(path=df_path)

## lista de features

In [None]:
# Candidate features: every column whose name contains "roll" or "lag".
selected_features = [
    column
    for column in meu_df_all.columns
    if any(marker in column for marker in ("roll", "lag"))
]

## modelos

### LGBM

#### params

In [None]:
# Hyper-parameter search only sees rows dated up to the end of the first
# training window, i.e. strictly before the test period starts.
df_hpo_all = meu_df_all.loc[meu_df_all[date_col].le(first_train_end)]

In [None]:
# Run the LightGBM hyper-parameter search a single time, on the full lag/roll
# feature set; the result is reused for every rolling window afterwards.
best_params = run_lgbm_hpo_static(
    df=df_hpo_all,
    date_col=date_col,
    target_col=target_col,
    features=selected_features,
    first_train_end=first_train_end,
    n_iter=50,
    n_splits=3,
)

#### teste com todas features

In [None]:
# ----------------------------------------------------------
# LightGBM with every lag/roll feature, evaluated on the whole dataset
# ----------------------------------------------------------

# Train/validation pairs for the whole dataset (rolling scheme).
pairs = split_rolling(
    df=meu_df_all, date_col=date_col, first_train_end=first_train_end, step=step
)

y_true_all, y_pred_all, dates_all = [], [], []

for i, (train_i, valid_i) in enumerate(pairs):
    # Keep only validation rows inside the global test period (both ends inclusive).
    valid_i = valid_i[valid_i[date_col].between(test_start, test_end)]
    if valid_i.empty:
        continue

    preds_i, mdl_lgbm, features_importance = fit_predict_lgbm_fixed(
        train=train_i,
        valid=valid_i,
        features=selected_features,
        date_col=date_col,
        target_col=target_col,
        best_params=best_params,
    )

    y_true_all += valid_i[target_col].tolist()
    y_pred_all += preds_i.tolist()
    dates_all += valid_i[date_col].tolist()

    # Overwritten on each pass, so the importances of the last fitted window are kept.
    features_importance_all = features_importance

if not y_true_all:
    print(">> Sem janelas válidas no período de teste para a base inteira.")
else:
    y_true_all = np.asarray(y_true_all)
    y_pred_all = np.asarray(y_pred_all)

    # Aggregated results for the whole dataset.
    df_all = pd.DataFrame(
        {
            "y_true": y_true_all,
            "y_pred": y_pred_all,
            "date": dates_all,
            # Constant 'categoria' = 'all' column kept so older code can be reused.
            "categoria": ["all"] * len(y_true_all),
        }
    )

In [None]:
# Metrics for the run that used ALL lag/roll features (rolls 2, 3 and 4 included),
# stored as a one-row dataframe tagged with categoria='all'.
metrics_all = pd.DataFrame(
    [{**calculate_metrics(df_all, "y_true", "y_pred").to_dict(), "categoria": "all"}]
)

metrics_all

#### teste seleção de features

In [None]:
# Rank the features from most to least important.
features_importance_all = features_importance_all.sort_values(
    "importance", ascending=False
)

In [None]:
# Sizes of the feature subsets to try.
k_list = [10, 20, 30, 40, 50, 60]

# Feature names in their current (importance-ranked) row order.
ranked_feature_names = features_importance_all["feature"].tolist()

# Mapping {k: [the first k ranked features]}.
conjunto_features = {k: ranked_feature_names[:k] for k in k_list}


In [None]:
resultados_k = []  # one metrics dict per feature-subset size

for k, feats_k in conjunto_features.items():
    print(f"\n========== Testando k={k} features ==========")
    print(f"Qtd de features: {len(feats_k)}")

    y_true_all, y_pred_all, dates_all = [], [], []

    # Train/validation pairs for the whole dataset (rolling scheme).
    pairs = split_rolling(
        df=meu_df_all, date_col=date_col, first_train_end=first_train_end, step=step
    )

    for i, (train_i, valid_i) in enumerate(pairs):
        # Keep only validation rows inside the global test period (both ends inclusive).
        valid_i = valid_i[valid_i[date_col].between(test_start, test_end)]
        if valid_i.empty:
            continue

        preds_i, _, _ = fit_predict_lgbm_fixed(
            train=train_i,
            valid=valid_i,
            features=feats_k,  # the only thing that varies between runs
            date_col=date_col,
            target_col=target_col,
            best_params=best_params,
        )

        y_true_all += valid_i[target_col].tolist()
        y_pred_all += preds_i.tolist()
        dates_all += valid_i[date_col].tolist()

    if not y_true_all:
        print(f">> Sem janelas válidas no período de teste para k={k}.")
        continue

    # Results for this k.
    df_all_k = pd.DataFrame(
        {
            "y_true": np.asarray(y_true_all),
            "y_pred": np.asarray(y_pred_all),
            "date": dates_all,
            "categoria": ["all"] * len(y_true_all),
        }
    )

    # Metrics for this k, tagged with the subset size.
    metrics_k = {
        **calculate_metrics(df_all_k, "y_true", "y_pred").to_dict(),
        "n_features": k,
    }
    resultados_k.append(metrics_k)

# Final table: one row of metrics per subset size, ordered by size.
df_resultados_k = pd.DataFrame(resultados_k).sort_values("n_features")
df_resultados_k = df_resultados_k.reset_index(drop=True)
print(df_resultados_k)


In [None]:
# Display the metrics-per-subset-size table as the cell output.
df_resultados_k

In [None]:
# Feature subset stored under k=10; used by the tuned LightGBM run below
# and by the feature analysis at the end of the notebook.
top_10_features = conjunto_features[10]
top_10_features

#### teste top10 features

In [None]:
# Tuning: re-run the hyper-parameter search a single time, now restricted to the
# top-10 features. This overwrites the parameters found for the full feature set.
best_params = run_lgbm_hpo_static(
    df=df_hpo_all,
    date_col=date_col,
    target_col=target_col,
    features=top_10_features,
    first_train_end=first_train_end,
    n_iter=50,
    n_splits=3,
)


In [None]:
# Rolling evaluation of LightGBM restricted to the top-10 features, using the
# hyper-parameters tuned for that subset.

# Rebuild the train/validation pairs instead of reusing the `pairs` object left
# over from the feature-selection loop. If split_rolling returns a one-shot
# iterator, that object is already exhausted and the loop below would silently
# produce no predictions. Every other model section rebuilds it the same way.
pairs = split_rolling(
    df=meu_df_all,
    date_col=date_col,
    first_train_end=first_train_end,
    step=step,
)

y_true_all = []
y_pred_all = []
dates_all = []

for train_i, valid_i in pairs:
    # Keep only validation rows inside the global test period.
    valid_i = valid_i[
        (valid_i[date_col] >= test_start) &
        (valid_i[date_col] <= test_end)
    ]
    if valid_i.empty:
        continue

    preds_i, mdl_lgbm, features_importance = fit_predict_lgbm_fixed(
        train=train_i,
        valid=valid_i,
        features=top_10_features,  # top-10 features only
        date_col=date_col,
        target_col=target_col,
        best_params=best_params,
    )

    y_true_all.extend(valid_i[target_col].tolist())
    y_pred_all.extend(preds_i.tolist())
    dates_all.extend(valid_i[date_col].tolist())

    # Overwritten on each pass, so the importances of the last fitted window are kept.
    features_importance_all = features_importance

if not y_true_all:
    print(">> Sem janelas válidas no período de teste para a base inteira.")
else:
    y_true_all = np.array(y_true_all)
    y_pred_all = np.array(y_pred_all)

    # Aggregated results for the whole dataset.
    df_all = pd.DataFrame({
        "y_true": y_true_all,
        "y_pred": y_pred_all,
        "date": dates_all,
        # Constant 'categoria' = 'all' column kept so older code can be reused.
        "categoria": ["all"] * len(y_true_all),
    })

In [None]:
# Overall metrics of the top-10 LightGBM run, as a one-row dataframe tagged
# with the algorithm name.
metrics_lgbm = pd.DataFrame(
    [{**calculate_metrics(df_all, "y_true", "y_pred").to_dict(), "algoritimo": "lgbm"}]
)

metrics_lgbm

In [None]:
# Actual vs. predicted chart for LightGBM (fig_path is presumably where the
# helper saves the figure).
plot_real_pred(
    df_all["y_true"],
    df_all["y_pred"],
    date=df_all["date"],
    title="LGBM",
    fig_path=lgbm_plot_path,
)

### Prophet

In [None]:
# Rolling evaluation of Prophet (called with an empty feature list).
pairs = split_rolling(
    df=meu_df_all, date_col=date_col, first_train_end=first_train_end, step=step
)

y_true_cat, y_pred_cat, date = [], [], []

for i, (train_i, valid_i) in enumerate(pairs):
    # Keep only validation rows inside the global test period (both ends inclusive).
    valid_i = valid_i[valid_i[date_col].between(test_start, test_end)]
    if valid_i.empty:
        continue

    preds_i, _ = fit_predict_prophet_fixed(
        train=train_i,
        valid=valid_i,
        features=[],
        date_col=date_col,
        target_col=target_col,
    )

    y_true_cat += valid_i[target_col].tolist()
    y_pred_cat += preds_i.tolist()
    date += valid_i[date_col].tolist()

y_true_cat = np.asarray(y_true_cat)
y_pred_cat = np.asarray(y_pred_cat)

# Actual, predicted and date for every evaluated row.
results = pd.DataFrame({"y_true": y_true_cat, "y_pred": y_pred_cat, "date": date})

In [None]:
# Weekly aggregation: overall Prophet metrics as a one-row dataframe tagged
# with the algorithm name.
metrics_prophet = pd.DataFrame(
    [{**calculate_metrics(results, "y_true", "y_pred").to_dict(), "algoritimo": "prophet"}]
)

metrics_prophet

In [None]:
# Actual vs. predicted chart for Prophet (fig_path is presumably where the
# helper saves the figure).
plot_real_pred(
    results["y_true"],
    results["y_pred"],
    date=results["date"],
    title="prophet",
    fig_path=prophet_plot_path,
)

### LSTM

In [None]:
# LSTM settings for the weekly aggregation.
lookback, hidden_size, epochs = 4, 32, 30  # lookback: smaller window
# NOTE(review): `lr` is defined here but the fit_predict_lstm_fixed call in this
# notebook does not pass it -- confirm whether the helper should receive it.
lr = 1e-3

In [None]:
# Rolling evaluation of the LSTM.
pairs = split_rolling(
    df=meu_df_all, date_col=date_col, first_train_end=first_train_end, step=step
)

y_true = []
y_pred = []
date = []

for train_i, valid_i in pairs:
    # Keep only validation rows inside the global test period (both ends inclusive).
    valid_i = valid_i[valid_i[date_col].between(test_start, test_end)]
    if valid_i.empty:
        continue

    preds_i, _ = fit_predict_lstm_fixed(
        train=train_i,
        valid=valid_i,
        date_col=date_col,
        target_col=target_col,
        lookback=lookback,
        hidden_size=hidden_size,
        epochs=epochs,
    )

    y_true += valid_i[target_col].tolist()
    y_pred += preds_i.tolist()
    date += valid_i[date_col].tolist()

# Actual, predicted and date for every evaluated row.
results = pd.DataFrame({"y_true": y_true, "y_pred": y_pred, "date": date})


In [None]:
# Weekly aggregation: overall LSTM metrics as a one-row dataframe tagged
# with the algorithm name.
metrics_lstm = pd.DataFrame(
    [{**calculate_metrics(results, "y_true", "y_pred").to_dict(), "algoritimo": "lstm"}]
)

metrics_lstm

In [None]:
# Actual vs. predicted chart for the LSTM (fig_path is presumably where the
# helper saves the figure).
plot_real_pred(
    results["y_true"],
    results["y_pred"],
    date=results["date"],
    title="lstm",
    fig_path=lstm_plot_path,
)

### SARIMA

In [None]:
# SARIMA settings for the weekly aggregation.
date_col = "order_week"
# Presumably (p, d, q) and (P, D, Q, s) in the usual SARIMA convention, with
# s=52 weekly periods -- confirm against fit_predict_sarima_fixed.
order, seasonal_order = (1, 1, 1), (0, 1, 1, 52)

In [None]:
# Rolling evaluation of SARIMA on the weekly series.
pairs = split_rolling(
    df=meu_df_all,
    date_col=date_col,
    first_train_end=first_train_end,
    step=step,
)

y_true = []
y_pred = []
date = []

for train_i, valid_i in pairs:
    # Keep only validation rows inside the global test period.
    valid_i = valid_i[
        (valid_i[date_col] >= test_start) &
        (valid_i[date_col] <= test_end)
    ]
    if valid_i.empty:
        continue

    # Use the `order` / `seasonal_order` declared for the weekly aggregation.
    # The call used to hard-code seasonal_order=(0, 1, 1, 7), a leftover from a
    # daily setup, which ignored the declared weekly configuration.
    # NOTE(review): a seasonal period of 52 presumably needs more than one full
    # year of training history -- confirm the first window is long enough.
    preds_i, _ = fit_predict_sarima_fixed(
        train=train_i,
        valid=valid_i,
        date_col=date_col,
        target_col=target_col,
        order=order,
        seasonal_order=seasonal_order,
    )

    y_true.extend(valid_i[target_col].tolist())
    y_pred.extend(preds_i.tolist())
    date.extend(valid_i[date_col].tolist())

# Actual, predicted and date for every evaluated row.
results_sarima = pd.DataFrame({
    "y_true": y_true,
    "y_pred": y_pred,
    "date": date,
})


In [None]:
# Weekly aggregation: overall SARIMA metrics as a one-row dataframe tagged
# with the algorithm name.
metrics_sarima = pd.DataFrame(
    [{**calculate_metrics(results_sarima, "y_true", "y_pred").to_dict(), "algoritimo": "sarima"}]
)

metrics_sarima

In [None]:
# Actual vs. predicted chart for SARIMA (fig_path is presumably where the
# helper saves the figure).
plot_real_pred(
    results_sarima["y_true"],
    results_sarima["y_pred"],
    date=results_sarima["date"],
    title="sarima",
    fig_path=sarima_plot_path,
)

## Concatenação dos resultados

In [None]:
# Stack the one-row metric tables of the four models into a single table
# with a fresh 0..n-1 index.
per_model_metrics = [metrics_lgbm, metrics_prophet, metrics_lstm, metrics_sarima]
df_metrics = pd.concat(per_model_metrics, ignore_index=True)
df_metrics

In [None]:
# Write the combined metrics table to metrics_path as CSV, without the index column.
df_metrics.to_csv(metrics_path, index=False)

## Análise das features

In [None]:
# Load the feature-ranking table and filter it down to the top-10 features used here.
# Plot charts to analyse how those features correlate with the target.

In [None]:
# Load the per-feature analysis table (feature_rank.csv).
rank_features = pd.read_csv(filepath_or_buffer=rank_path)

In [None]:
# Rows of the ranking table that belong to the top-10 features.
rank_features.loc[rank_features["feature"].isin(top_10_features)]

In [None]:
# Rows of the ranking table that belong to the top-10 features
# (same filter as the previous cell).
rank_features.loc[rank_features["feature"].isin(top_10_features)]

In [None]:
# Bar chart of the feature importances, most important first.
features_importance_all = features_importance_all.sort_values(
    "importance", ascending=False
)
plt.figure(figsize=(10, 6))
sns.barplot(data=features_importance_all, x="importance", y="feature")

In [None]:
# Bar chart of the LightGBM feature importances (most important first), also saved to disk.
features_importance_all = features_importance_all.sort_values(by='importance', ascending=False)

fig = plt.figure(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=features_importance_all)

# feature_importance_path is declared with the other figure paths but was never
# written to. Save the chart there; bbox_inches="tight" keeps the feature-name
# labels from being clipped.
feature_importance_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(feature_importance_path, bbox_inches="tight")