## importações e configurações

In [None]:
import os, sys
from pathlib import Path
import importlib
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Mount Google Drive first so the project folder below becomes reachable.
from google.colab import drive
drive.mount('/content/drive')

# Project root inside the mounted drive.
PROJ = Path("/content/drive/MyDrive/tcc-modelo/2-tcc-demand-forecasting")

# Put the project root on sys.path (needed for the `src.*` imports used in
# this notebook); the membership check avoids duplicates on cell re-runs.
_proj_entry = str(PROJ)
if _proj_entry not in sys.path:
    sys.path.append(_proj_entry)

print("Repositório ativo em:", PROJ)

In [None]:
# exibe todo o display do pandas, sem truncar
#pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)

In [None]:
from src.data import preprocessing
from src.data.aggregation import build_weekly_aggregation, build_daily_aggregation
from src.features.build import build_features
from src.data.imputation import build_imputed_dataset
importlib.reload(preprocessing)

# Data config file and the folder holding the intermediate parquet files.
cfg_path = PROJ.joinpath("configs", "data.yaml")
interim_dir = PROJ.joinpath("data", "interim")

# File names handed from one pipeline stage to the next.
output_name_merged = "olist_merged.parquet"
output_name_aggregation = "olist_weekly_agg.parquet"
# Per the original note, the "2" variants are the ones built with rolls 2, 3, 4.
output_name_lags = "olist_weekly_agg_withlags2.parquet"
output_name_imputed = "olist_weekly_agg_withlags_imputed2.parquet"

## funções úteis

In [None]:
import pandas as pd
import numpy as np

def summarize_features(df: pd.DataFrame,
                       exclude: "list[str] | None" = None) -> pd.DataFrame:
    """
    Build a table of descriptive statistics for the numeric columns of ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to summarize (e.g. the output of the weekly aggregation).
    exclude : list[str] | None
        Columns that must not be analysed (e.g. key columns). When ``None``,
        ``["product_category_name", "order_week"]`` is used. Names that are
        not present in ``df`` are ignored.

    Returns
    -------
    pd.DataFrame
        One row per numeric column, sorted by ``feature``, with the columns
        ``feature``, ``dtype``, ``count_notnull``, ``pct_notnull``, ``min``,
        ``p25``, ``p50``, ``p75``, ``max``, ``mean``, ``std``, ``skew``,
        ``kurtosis`` and ``n_unique``. When ``df`` has no numeric columns the
        result is an empty frame that still carries those columns.
    """
    if exclude is None:
        exclude = ["product_category_name", "order_week"]

    df_num = df.drop(columns=exclude, errors="ignore").select_dtypes(include=[np.number])
    # Row count is the same for every column, so compute it once.
    n_total = len(df_num)

    # Explicit column list: without it an empty summary would have no
    # "feature" column and the sort below would raise KeyError.
    summary_columns = [
        "feature", "dtype", "count_notnull", "pct_notnull",
        "min", "p25", "p50", "p75", "max",
        "mean", "std", "skew", "kurtosis", "n_unique",
    ]
    summary = []

    for col in df_num.columns:
        s = df_num[col].dropna()
        n_nonnull = s.shape[0]
        pct_nonnull = 100 * n_nonnull / n_total if n_total > 0 else np.nan

        # Each statistic is only computed when there are enough non-null
        # observations for it to be defined; otherwise NaN is reported.
        stats = {
            "feature": col,
            "dtype": df_num[col].dtype,
            "count_notnull": n_nonnull,
            "pct_notnull": round(pct_nonnull, 2),
            "min": s.min() if n_nonnull > 0 else np.nan,
            "p25": s.quantile(0.25) if n_nonnull > 0 else np.nan,
            "p50": s.quantile(0.50) if n_nonnull > 0 else np.nan,
            "p75": s.quantile(0.75) if n_nonnull > 0 else np.nan,
            "max": s.max() if n_nonnull > 0 else np.nan,
            "mean": s.mean() if n_nonnull > 0 else np.nan,
            "std": s.std() if n_nonnull > 1 else np.nan,
            "skew": s.skew() if n_nonnull > 2 else np.nan,
            "kurtosis": s.kurtosis() if n_nonnull > 3 else np.nan,
            "n_unique": s.nunique() if n_nonnull > 0 else 0,
        }
        summary.append(stats)

    df_summary = (
        pd.DataFrame(summary, columns=summary_columns)
        .sort_values("feature")
        .reset_index(drop=True)
    )
    return df_summary


## roda o pipeline de unificação e limpeza dos datasets

In [None]:
# Arguments for the merge/cleaning step; paths are passed as plain strings.
merge_kwargs = {
    "cfg_path": str(cfg_path),
    "interim_dir": str(interim_dir),
    "output_name": output_name_merged,
    "project_dir": str(PROJ),
}
out_path = preprocessing.build_olist_merged(**merge_kwargs)

print(f"Parquet gerado em: {out_path}")

## roda o pipeline de agregação

In [None]:
# Run the weekly aggregation helper with the merged file as input and
# `output_name_aggregation` as output name.
weekly_agg_kwargs = {
    "interim_dir": interim_dir,
    "input_name": output_name_merged,
    "output_name": output_name_aggregation,
    # NOTE(review): "." is the current working directory, whereas the merge
    # step passes str(PROJ) — confirm which one this helper expects.
    "project_dir": ".",
}
out_path = build_weekly_aggregation(**weekly_agg_kwargs)
out_path

In [None]:
# Load the aggregation output from the interim folder and plot its sales column.
weekly_agg_path = interim_dir / output_name_aggregation
df_weekly = pd.read_parquet(weekly_agg_path)
df_weekly["sales_qty"].plot()

In [None]:
out_path = build_daily_aggregation(
    interim_dir=interim_dir,
    input_name=output_name_merged,
    output_name=output_name_aggregation,
    project_dir="."   # raiz do projeto
)
out_path

In [None]:
df_daily = pd.read_parquet(interim_dir/output_name_aggregation)
df_daily['sales_qty'].plot()

## roda o pipeline de lags

In [None]:
# Build the lag/rolling features from the aggregated file.
feature_kwargs = {
    "interim_dir": interim_dir,
    "input_name": output_name_aggregation,
    "output_name": output_name_lags,
    "project_dir": ".",
}
outp = build_features(**feature_kwargs)

In [None]:
# Feature build with the aggregated file as input and the lags file as output
# (same arguments as the preceding cell).
feature_kwargs = {
    "interim_dir": interim_dir,
    "input_name": output_name_aggregation,
    "output_name": output_name_lags,
    "project_dir": ".",
}
outp = build_features(**feature_kwargs)

## roda o pipeline de tratamento de nulos

In [None]:
# Null-handling step: takes the lags file as input and names the imputed output.
impute_kwargs = {
    "interim_dir": interim_dir,
    "input_name": output_name_lags,
    "output_name": output_name_imputed,
    "project_dir": ".",
}
out_path = build_imputed_dataset(**impute_kwargs)


In [None]:
# Imputation with the lags file as input (same arguments as the preceding
# cell); `out_path` is what the results check below reads.
impute_kwargs = {
    "interim_dir": interim_dir,
    "input_name": output_name_lags,
    "output_name": output_name_imputed,
    "project_dir": ".",
}
out_path = build_imputed_dataset(**impute_kwargs)

## conferência dos resultados

In [None]:
# Read back the file produced by the last pipeline step and summarize
# its numeric columns.
df_features = pd.read_parquet(path=out_path)
sumario = summarize_features(df=df_features)
sumario

In [None]:
# Show the names of all features present in the summary.
sumario['feature'].values

## entendendo os nulos da variação de preço

In [None]:
# Pick a case with null price variation that happened after the initial
# weeks: keep the null rows, order them by week and show the latest ones.
null_price_var = df_weekly["price_var_m4_vs_prev4_mean"].isnull()
df_weekly.loc[null_price_var].sort_values("order_week", ascending=True).tail()



In [None]:
# Column dtypes and non-null counts for the whole weekly frame.
df_weekly.info()

In [None]:
# Same overview, restricted to the 'artes' category.
is_artes = df_weekly["product_category_name"] == "artes"
df_weekly.loc[is_artes].info()

In [None]:
# 'artes' rows from the week of 2018-06-25 onwards. The date literal is a
# clean ISO date so the comparison does not depend on how stray whitespace
# is parsed.
is_artes = df_weekly['product_category_name'] == 'artes'
from_late_june = df_weekly['order_week'] >= '2018-06-25'
df_weekly[is_artes & from_late_june]

In [None]:
# 'artes' on 2018-08-20 has a null price_var_m4_vs_prev4_mean even though
# the category has earlier sales history.

# Load the row-level dataset (before the weekly aggregation) from the same
# interim folder the pipeline in this notebook writes to, and derive the
# start of each order's week.
df_rows = pd.read_parquet(interim_dir / output_name_merged)
df_rows["order_week"] = df_rows["order_purchase_timestamp"].dt.to_period("W").dt.start_time

# Keep only 'artes' rows from roughly three months before the week under
# investigation, with just the columns needed for the price analysis.
artes_recent = (df_rows['product_category_name'] == 'artes') & (df_rows['order_week'] >= '2018-05-20')
df_rows = df_rows.loc[
    artes_recent,
    ['product_category_name', 'price', 'product_id', 'order_id', 'order_week'],
].sort_values('order_week', ascending=True)
df_rows.head(10)

In [None]:
# Mean price per product and week, ordered by product and then by week.
weekly_price = df_rows.groupby(["product_id", "order_week"], as_index=False).agg(
    price_mean=("price", "mean")
)
grp = weekly_price.sort_values(["product_id", "order_week"])
grp

In [None]:
# Rolling mean of the weekly price per product. The window covers 4 rows of
# `grp` (one row per product/week present in df_rows) and yields NaN until a
# product has 4 rows.
grp["price_roll4_mean"] = grp.groupby("product_id")["price_mean"].transform(
        lambda s: s.rolling(4, min_periods=4).mean()
    )

# Readable alias per product: each product code becomes produto1..produtoN,
# numbered by order of first appearance. The dict is deliberately not called
# `map`, which would shadow the builtin for the rest of the session.
product_labels = {k: f"produto{i+1}" for i, k in enumerate(grp["product_id"].unique())}
grp["produto_n"] = grp["product_id"].map(product_labels)

# Order by alias and week now that the alias column exists.
grp = grp.sort_values(['produto_n', 'order_week'], ascending=True)

In [None]:
# Look at two specific products only.
picked_products = grp["produto_n"].isin(["produto12", "produto4"])
grp.loc[picked_products]

In [None]:
# Relative change of the rolling mean versus the value 4 rows earlier,
# computed within each product.
roll4_by_product = grp.groupby("product_id")["price_roll4_mean"]
grp["price_var_m4_vs_prev4"] = roll4_by_product.pct_change(periods=4)

# Re-order by week, then alias, and show the same two products as before.
grp = grp.sort_values(["order_week", "produto_n"], ascending=True)
grp.loc[grp["produto_n"].isin(["produto12", "produto4"])]

In [None]:
# Display the full per-product table built in the previous cells.
grp

In [None]:
# Rows of every product for the week under investigation.
in_target_week = grp["order_week"] == "2018-08-20"
grp.loc[in_target_week]

In [None]:
# agregação por categoria
cols_to_merge = [
    "product_id",
    "order_week",
    "price_var_m4_vs_prev4",
]
price_vars = grp[cols_to_merge].copy()

base = df_rows[["product_id", "product_category_name", "order_week"]].drop_duplicates()
base = base.merge(price_vars, on=["product_id", "order_week"], how="left")

In [None]:
#aplicando a agregação por categoria
cat_week = (
    base.groupby(["product_category_name", "order_week"], as_index=False)
        .agg(
            price_var_m4_vs_prev4_mean=("price_var_m4_vs_prev4", "mean"),
        )
)

cat_week