# 01 - Coleta e Pré-Processamento

Este notebook carrega os dados brutos e realiza o pré-processamento completo, incluindo preenchimento de lacunas e geração de features ricas.

In [10]:
# Importar bibliotecas
from pathlib import Path

import numpy as np
import pandas as pd

# Pre-processing function
def gerar_demanda_completa(path_raw_pedidos: str, path_raw_eventos: str, path_output: str) -> pd.DataFrame:
    """Build the daily national demand table with features and save it as CSV.

    Steps:
      1. Read the orders CSV, keep rows whose ``order_status`` is
         ``"Delivered"`` and sum ``quantity`` per calendar day.
      2. Reindex to a continuous daily range between the first and last
         observed day, filling days without orders with 0.
      3. Read the events CSV and flag every day that appears in its ``date``
         column (``evento_na_data``).
      4. Rename the demand column to ``y`` and add lag, rolling-mean and
         calendar features; rows with any missing feature are dropped.
      5. Write the result to ``path_output`` (index included) and return it.

    Args:
        path_raw_pedidos: Path to the orders CSV. The code reads the columns
            ``date``, ``order_status`` and ``quantity``.
        path_raw_eventos: Path to the events CSV. Only ``date`` is read.
        path_output: Destination CSV path. Missing parent directories are
            created.

    Returns:
        DataFrame indexed by ``date`` with ``y``, ``evento_na_data`` and the
        generated feature columns.

    Raises:
        ValueError: If no delivered order with a valid date is found, since
            no date range can be built in that case.
    """
    # Load orders and keep only the delivered ones
    pedidos = pd.read_csv(path_raw_pedidos, parse_dates=["date"])
    pedidos = pedidos[pedidos["order_status"] == "Delivered"].copy()

    # Fail early with an actionable message instead of letting
    # pd.date_range raise on NaT bounds further down.
    if pedidos["date"].isna().all():
        raise ValueError(
            f"No 'Delivered' orders with a valid date found in {path_raw_pedidos!r}; "
            "cannot build the daily demand series."
        )

    # Drop the time-of-day component so that orders group by calendar day
    pedidos["date"] = pd.to_datetime(pedidos["date"].dt.date)

    # Aggregate national demand per day
    demanda_nacional = (
        pedidos
        .groupby("date", as_index=False)
        .agg(total_nacional=("quantity", "sum"))
        .set_index("date")
    )

    # Fill missing days with zero demand
    datas_completas = pd.date_range(
        start=demanda_nacional.index.min(),
        end=demanda_nacional.index.max(),
        freq="D",
    )
    demanda_nacional = demanda_nacional.reindex(datas_completas, fill_value=0)
    demanda_nacional.index.name = "date"

    # Load events and mark the days that have at least one event.
    # The flag is kept as float (0.0 / 1.0) so the CSV output keeps the same
    # format the previous merge-based implementation produced.
    eventos = pd.read_csv(path_raw_eventos, parse_dates=["date"])
    datas_com_evento = pd.to_datetime(eventos["date"].dt.date)
    demanda_nacional["evento_na_data"] = (
        demanda_nacional.index.isin(datas_com_evento).astype(float)
    )

    # Rename the demand column to 'y'
    demanda_nacional = demanda_nacional.rename(columns={"total_nacional": "y"})

    # Build the feature table
    demanda_features = _criar_features_ricos(demanda_nacional)

    # Save to CSV, creating the destination directory when it does not exist
    Path(path_output).parent.mkdir(parents=True, exist_ok=True)
    demanda_features.to_csv(path_output, index=True)
    return demanda_features


def _criar_features_ricos(
    df: pd.DataFrame,
    lags=(1, 7, 14, 21, 30),
    janelas=(7, 30),
) -> pd.DataFrame:
    """Return a copy of ``df`` with lag, rolling-mean and calendar features.

    Expects a DatetimeIndex and a ``y`` column. Rows containing any NaN
    (the warm-up rows of the longest lag / window) are dropped.
    """
    df = df.copy()

    # Lagged values of the demand
    for lag in lags:
        df[f"lag_{lag}"] = df["y"].shift(lag)

    # Rolling means.
    # NOTE(review): each window includes the current row's ``y``. If ``y`` is
    # the forecasting target this may leak it into the features; consider
    # ``df["y"].shift(1).rolling(...)`` - confirm with the modelling notebook
    # before changing, since it alters the feature values.
    for janela in janelas:
        df[f"rolling_mean_{janela}"] = df["y"].rolling(window=janela).mean()

    # Calendar variables (day_of_week: Monday=0 ... Sunday=6)
    df["day_of_week"] = df.index.dayofweek
    df["is_weekend"] = (df["day_of_week"] >= 5).astype(int)
    df["month"] = df.index.month
    df["day"] = df.index.day
    df["is_month_start"] = df.index.is_month_start.astype(int)
    df["is_month_end"] = df.index.is_month_end.astype(int)

    return df.dropna()

# Run the pre-processing pipeline: raw orders + contextual events in,
# feature table out (also written to the processed-data folder).
df_demanda_features = gerar_demanda_completa(
    path_raw_pedidos="../data/raw/dados_pedidos.csv",       # raw orders
    path_raw_eventos="../data/raw/dados_contextuais.csv",   # contextual events
    path_output="../data/processed/demanda_features.csv",   # generated features
)

# Preview the first five rows of the resulting feature table
df_demanda_features.head(n=5)

date,y,evento_na_data,lag_1,lag_7,lag_14,lag_21,lag_30,rolling_mean_7,rolling_mean_30,day_of_week,is_weekend,month,day,is_month_start,is_month_end
2023-01-31,66,0.0,161.0,70.0,212.0,195.0,259.0,173.714286,145.066667,1,0,1,31,0,1
2023-02-01,235,1.0,66.0,139.0,36.0,85.0,238.0,187.428571,144.966667,2,0,2,1,1,0
2023-02-02,257,0.0,235.0,269.0,136.0,139.0,144.0,185.714286,148.733333,3,0,2,2,0,0
2023-02-03,18,0.0,257.0,283.0,11.0,113.0,152.0,147.857143,144.266667,4,0,2,3,0,0
2023-02-04,154,0.0,18.0,281.0,87.0,301.0,58.0,129.714286,147.466667,5,1,2,4,0,0
