# 02 — Clusterização de Séries (Item × Loja)
Objetivos:
- Criar **clusters** de séries parecidas (item×loja) com base em estatísticas de comportamento (média, variação, tendência, sazonalidade, zeros, preço).
- Salvar o `cluster` e um **pipeline** (scaler + k-means) para reutilizar na previsão.


In [None]:

# ============================
# 0) IMPORTS & CONFIG
# ============================
import os, sys, gc, json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import MiniBatchKMeans
import joblib

# Show up to 120 columns when a DataFrame is printed.
pd.set_option("display.max_columns", 120)

# Candidate input files, listed in priority order (consumed by try_load below).
INPUT_PATHS = [
    "./outputs/base_mensal_clean.parquet",
    "./base_mensal.csv",
    "/mnt/data/base_mensal.csv"
]

# Directory where this notebook writes its outputs; created here if missing.
OUTPUT_DIR = "./outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Same aliases as notebook 01.
# Maps each logical column name to the physical column names accepted for it;
# within each list, earlier aliases take precedence over later ones.
COLUMN_ALIASES = {
    "date": ["year_month","data","periodo","period","mes_ano","dt","date","ym","month_year"],
    "item": ["item","sku","produto","product_id","id_item"],
    "store": ["store","loja","id_loja","shop","filial"],
    "sales": ["sales","vendas","qtd_vendas","valor_vendas","demand","y"],
    "price": ["mean_price","price","preco","preço","avg_price","unit_price"],
}

def _infer_col(cols, aliases):
    """Return the column in *cols* matching the first applicable alias.

    Matching is case-insensitive. Aliases are tried in the order given, so
    earlier aliases take precedence. If two columns differ only by case, the
    one appearing later in *cols* is the one returned.

    Returns ``None`` when no alias matches any column.
    """
    # Lower-cased name -> original spelling (later duplicates overwrite).
    by_lower = {}
    for name in cols:
        by_lower[name.lower()] = name

    for alias in aliases:
        hit = by_lower.get(alias.lower())
        if hit is not None:
            return hit
    return None

def infer_columns(df):
    """Resolve every logical column in ``COLUMN_ALIASES`` to a column of *df*.

    Returns a dict ``{logical_name: actual_column_or_None}``. The logical
    columns ``date``, ``item``, ``store`` and ``sales`` are mandatory; any
    other logical column (e.g. ``price``) may resolve to ``None``.

    Raises:
        ValueError: if at least one mandatory logical column has no match.
    """
    mapping = {
        logical: _infer_col(df.columns, aliases)
        for logical, aliases in COLUMN_ALIASES.items()
    }

    mandatory = ("date", "item", "store", "sales")
    missing = [key for key in mandatory if mapping.get(key) is None]
    if missing:
        raise ValueError(f"Faltam colunas {missing}.")
    return mapping

def try_load(paths):
    """Load the first readable dataset among *paths*.

    Paths are tried in order. A path that does not exist is skipped. A path
    that exists but cannot be parsed is reported with a warning and skipped
    as well (best-effort), so that a later candidate can still be used.

    Files whose name ends in ``.parquet`` (case-insensitive) are read with
    ``pandas.read_parquet``; everything else is read with ``pandas.read_csv``.

    Args:
        paths: iterable of file paths (``str`` or ``os.PathLike``).

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        FileNotFoundError: if no path could be loaded. When some files did
            exist but failed to parse, those failures are listed in the
            message so the real cause is not hidden.
    """
    failures = []
    for p in paths:
        if not os.path.exists(p):
            continue
        try:
            # Case-insensitive so that e.g. "BASE.PARQUET" is not parsed as CSV.
            is_parquet = os.fspath(p).lower().endswith(".parquet")
            reader = pd.read_parquet if is_parquet else pd.read_csv
            df = reader(p)
        except Exception as e:  # deliberate best-effort: fall through to the next path
            print(f"[WARN] Falha ao ler {p}: {e}")
            failures.append(f"{p}: {e}")
            continue
        print(f"[OK] Lido: {p} -> shape {df.shape}")
        return df

    detail = f" (falhas de leitura: {failures})" if failures else ""
    raise FileNotFoundError(f"Não encontrei dataset em {paths}{detail}")

# Clustering hyperparameters (passed to MiniBatchKMeans further down)
# Number of clusters to fit.
N_CLUSTERS = 5
# Seed for the clustering's random state.
RANDOM_STATE = 42
# Mini-batch size.
BATCH_SIZE = 2048


## 1) Carregar e preparar

In [None]:

# Load the first available dataset and resolve its actual column names.
df = try_load(INPUT_PATHS)
mapping = infer_columns(df)

DATE_COL, ITEM_COL, STORE_COL, SALES_COL, PRICE_COL = (
    mapping[key] for key in ("date", "item", "store", "sales", "price")
)

# Basic type conversion: snap dates to the first day of their month and
# keep the series keys (item, store) as strings.
df[DATE_COL] = pd.to_datetime(df[DATE_COL]).dt.to_period("M").dt.to_timestamp()
df = df.astype({ITEM_COL: "string", STORE_COL: "string"})

# Order rows chronologically inside each (store, item) series.
df = df.sort_values(by=[STORE_COL, ITEM_COL, DATE_COL], ignore_index=True)
print(df.head(3))


## 2) Features por série (item×loja)

In [None]:

# Per-series time index: 0, 1, 2, ... in row order within each (store, item)
# group. Row order is chronological because df was sorted by
# (store, item, date) when it was prepared.
df["_time_index"] = df.groupby([STORE_COL, ITEM_COL]).cumcount()

def per_series_features(pdf, sales_col=None, price_col=None):
    """Summarise one item×store series as a flat vector of behaviour features.

    Parameters
    ----------
    pdf : pandas.DataFrame
        Rows of a single series in chronological order. Must contain the
        sales column and a ``_time_index`` column.
    sales_col : str, optional
        Name of the sales column. Defaults to the module-level ``SALES_COL``.
    price_col : str, optional
        Name of the price column. Defaults to the module-level ``PRICE_COL``
        (which may itself be ``None`` when the dataset has no price).

    Returns
    -------
    pandas.Series
        ``n_months``, ``mean_sales``, ``std_sales``, ``cv_sales``,
        ``zero_rate``, ``trend_slope``, ``acf12``, ``mean_price``,
        ``corr_price_sales``. The two price features are NaN when no price
        column is available.

    Notes
    -----
    Missing sales values (NaN) are ignored by every statistic except
    ``n_months`` (row count) and ``zero_rate`` (share of all rows equal to 0).
    """
    if sales_col is None:
        sales_col = SALES_COL
    if price_col is None:
        price_col = PRICE_COL

    y = pdf[sales_col].astype(float).values
    t = pdf["_time_index"].astype(float).values
    observed = ~np.isnan(y)

    feats = {}
    feats["n_months"] = len(pdf)
    feats["mean_sales"] = float(np.nanmean(y))
    feats["std_sales"] = float(np.nanstd(y))
    # Coefficient of variation; defined as 0 when the mean is not positive.
    feats["cv_sales"] = (
        float(feats["std_sales"] / feats["mean_sales"]) if feats["mean_sales"] > 0 else 0.0
    )
    feats["zero_rate"] = float((y == 0).mean())

    # Trend: slope b of the least-squares line y ~ a + b*t. Fit only the
    # observed points: a NaN in y would otherwise turn the slope into NaN.
    t_obs, y_obs = t[observed], y[observed]
    if len(t_obs) >= 2 and np.std(t_obs) > 0:
        slope = np.polyfit(t_obs, y_obs, 1)[0]
    else:
        slope = 0.0
    feats["trend_slope"] = float(slope)

    # Seasonality proxy: correlation between the series and itself 12 rows
    # earlier, over the pairs where both values are observed.
    acf12 = 0.0
    if len(y) > 12:
        current, lagged = y[12:], y[:-12]
        both = ~np.isnan(current) & ~np.isnan(lagged)
        current, lagged = current[both], lagged[both]
        if len(current) >= 2 and np.std(current) > 0 and np.std(lagged) > 0:
            acf12 = float(np.corrcoef(current, lagged)[0, 1])
    feats["acf12"] = acf12

    # Mean price and price–sales correlation, when a price column exists.
    if price_col and price_col in pdf.columns:
        p = pdf[price_col].astype(float).values
        feats["mean_price"] = float(np.nanmean(p))
        valid = observed & (~np.isnan(p))
        # Require more than 3 paired points and non-constant inputs.
        if valid.sum() > 3 and np.std(y[valid]) > 0 and np.std(p[valid]) > 0:
            feats["corr_price_sales"] = float(np.corrcoef(y[valid], p[valid])[0, 1])
        else:
            feats["corr_price_sales"] = 0.0
    else:
        feats["mean_price"] = np.nan
        feats["corr_price_sales"] = np.nan
    return pd.Series(feats)

# One feature row per (store, item) series.
series_keys = [STORE_COL, ITEM_COL]
per_series = df.groupby(series_keys).apply(per_series_features)
series_feats = per_series.reset_index()
print(series_feats.head())

# Replace missing values with 0 so every row can be fed to the scaler.
series_feats = series_feats.fillna(0.0)


## 3) Padronizar e treinar MiniBatchKMeans

In [None]:

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import MiniBatchKMeans

# Columns of series_feats used as clustering inputs (order is part of the
# saved pipeline contract).
feature_cols = [
    "n_months",
    "mean_sales",
    "std_sales",
    "cv_sales",
    "zero_rate",
    "trend_slope",
    "acf12",
    "mean_price",
    "corr_price_sales",
]
X = series_feats[feature_cols].values

# Standardise features, then cluster in the standardised space.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

kmeans = MiniBatchKMeans(
    n_clusters=N_CLUSTERS,
    random_state=RANDOM_STATE,
    batch_size=BATCH_SIZE,
)
cluster_labels = kmeans.fit_predict(X_scaled)
series_feats["cluster"] = cluster_labels

# Number of series per cluster, ordered by cluster id.
counts = series_feats["cluster"].value_counts().sort_index()
print(counts)

# Simple bar chart of the cluster size distribution.
fig = plt.figure(figsize=(6,3))
plt.bar(counts.index.astype(str), counts.values)
plt.title("Distribuição de séries por cluster")
plt.xlabel("cluster")
plt.ylabel("contagem")
plt.tight_layout()
plt.show()


## 4) Salvar pipeline e anexar cluster ao DF completo

In [None]:

import joblib, os
# Persist the fitted pipeline (scaler + k-means + feature order).
pipe_path = os.path.join(OUTPUT_DIR, "cluster_pipeline.pkl")
pipeline_artifact = {"scaler": scaler, "kmeans": kmeans, "features": feature_cols}
joblib.dump(pipeline_artifact, pipe_path)
print(f"[OK] Pipeline salvo em: {pipe_path}")

# Attach each series' cluster to every row of the full dataset, keyed by (store, item).
merge_keys = [STORE_COL, ITEM_COL]
cluster_lookup = series_feats[merge_keys + ["cluster"]]
df_cluster = df.merge(cluster_lookup, on=merge_keys, how="left")
out_parquet = os.path.join(OUTPUT_DIR, "base_mensal_with_clusters.parquet")
df_cluster.to_parquet(out_parquet, index=False)
print(f"[OK] Base com clusters: {out_parquet}")

# Export the per-series features (with cluster label) for offline analysis.
feat_csv = os.path.join(OUTPUT_DIR, "series_features_by_cluster.csv")
series_feats.to_csv(feat_csv, index=False)
print(f"[OK] Features por série: {feat_csv}")
