In [28]:
# Jupyter Notebook 02 — Dummy Scoring Engine + UI Snapshots
# ---------------------------------------------------------
# Objetivo:
# 1) Cargar los CSV del Notebook 01 (factories.csv y top10_skus.csv)
# 2) Recalcular métricas derivadas (days_of_supply, stockout_date_est, stock_needed_next_7d)
# 3) Generar risk_oos_7d con reglas dummy (explicables)
# 4) Seleccionar Top 10 SKUs por fábrica (aunque ya vengan, aquí se recalcula)
# 5) Recalcular factory overall risk (max SKU risk) + critical SKU
# 6) Exportar snapshots listos para GitHub Pages:
#    - public/data/factories.json
#    - public/data/predictions_latest.json

import json
from pathlib import Path
from datetime import datetime, timedelta
import numpy as np
import pandas as pd


In [29]:
# === CONFIG ===
PROCESSED_DIR = Path("data/processed")
FACTORIES_CSV = PROCESSED_DIR / "factories.csv"
TOP10_CSV = PROCESSED_DIR / "top10_skus.csv"

PUBLIC_DIR = Path("public/data")  # para GitHub Pages con Vite/React suele ser /public/data
PUBLIC_DIR.mkdir(parents=True, exist_ok=True)

AS_OF_DATE = "2026-01-14"  # usa la del JSON/CSV; si quieres la inferimos automáticamente abajo
HORIZON_DAYS = 7
TOP_K = 10

assert FACTORIES_CSV.exists(), f"No encuentro {FACTORIES_CSV.resolve()}"
assert TOP10_CSV.exists(), f"No encuentro {TOP10_CSV.resolve()}"

FACTORIES_CSV, TOP10_CSV


(PosixPath('data/processed/factories.csv'),
 PosixPath('data/processed/top10_skus.csv'))

In [30]:
# === LOAD ===
df_fact = pd.read_csv(FACTORIES_CSV)
df = pd.read_csv(TOP10_CSV)

# inferir as_of_date si viene en el csv
if "as_of_date" in df.columns and df["as_of_date"].notna().any():
    AS_OF_DATE = str(df["as_of_date"].dropna().iloc[0])

as_of = pd.to_datetime(AS_OF_DATE)
as_of


Timestamp('2026-01-14 00:00:00')

In [31]:
# === HELPERS: reglas dummy coherentes ===

def clamp01(x):
    return float(max(0.0, min(1.0, x)))

def compute_days_of_supply(on_hand, in_transit, on_order, daily_demand):
    inv = (on_hand or 0) + (in_transit or 0) + (on_order or 0)
    dd = daily_demand if daily_demand and daily_demand > 0 else np.nan
    return inv / dd

def compute_stockout_date(as_of_dt, days_of_supply):
    if pd.isna(days_of_supply):
        return None
    # redondeo hacia abajo para fecha conservadora
    d = int(np.floor(days_of_supply))
    return (as_of_dt + pd.Timedelta(days=d)).strftime("%Y-%m-%d")

def compute_stock_needed_next_7d(on_hand, in_transit, on_order, daily_demand, safety_factor=1.10):
    inv = (on_hand or 0) + (in_transit or 0) + (on_order or 0)
    dd = daily_demand if daily_demand and daily_demand > 0 else 0
    demand_7d = dd * HORIZON_DAYS
    need = max(0.0, demand_7d * safety_factor - inv)
    return float(np.round(need))

def risk_from_days_of_supply(dos):
    # base risk en función de cobertura (coherente con lo que hablamos)
    if pd.isna(dos):
        return 0.5
    if dos < 4:
        return 0.85
    if dos < 7:
        return 0.60
    if dos < 10:
        return 0.28
    return 0.10

def compute_risk_oos_7d(dos, promo_uplift, lead_time_std, scrap_rate, missingness_ratio=None):
    risk = risk_from_days_of_supply(dos)

    # ajustes simples, explicables
    if promo_uplift is not None and promo_uplift > 1.10:
        risk += 0.10
    if lead_time_std is not None and lead_time_std > 1.5:
        risk += 0.10
    if scrap_rate is not None and scrap_rate > 0.02:
        risk += 0.05
    if missingness_ratio is not None and missingness_ratio > 0.03:
        risk += 0.05

    return clamp01(risk)

def compute_confidence(missingness_ratio=None, drift_score=None):
    # si no tienes missingness/drift, regresa un valor alto por default
    conf = 0.90
    if missingness_ratio is not None and not pd.isna(missingness_ratio):
        conf -= min(0.25, float(missingness_ratio) * 3.0)  # 0.05 -> -0.15 aprox
    if drift_score is not None and not pd.isna(drift_score):
        conf -= min(0.20, float(drift_score) * 1.5)        # 0.10 -> -0.15 aprox
    return clamp01(conf)

def build_drivers_row(dos, promo_uplift, lead_time_std, scrap_rate, missingness_ratio=None):
    # Genera 3 drivers máximos, con "impactos" dummy pero consistentes
    drivers = []

    # dos bajo => sube riesgo
    if not pd.isna(dos):
        drivers.append(("days_of_supply", float(np.round(max(0.0, (7 - dos) / 10), 3))))

    if promo_uplift is not None and not pd.isna(promo_uplift):
        drivers.append(("promo_uplift_index", float(np.round(max(0.0, promo_uplift - 1.0), 3))))

    if lead_time_std is not None and not pd.isna(lead_time_std):
        drivers.append(("lead_time_days_std", float(np.round(max(0.0, (lead_time_std - 1.0) / 5), 3))))

    if scrap_rate is not None and not pd.isna(scrap_rate):
        drivers.append(("scrap_rate_7d", float(np.round(max(0.0, (scrap_rate - 0.01)), 3))))

    if missingness_ratio is not None and not pd.isna(missingness_ratio):
        drivers.append(("missingness_ratio_24h", float(np.round(missingness_ratio, 3))))

    # ordenar por impacto desc y quedarnos con top3
    drivers = sorted(drivers, key=lambda x: x[1], reverse=True)[:3]
    return [{"feature": f, "impact": imp} for f, imp in drivers]


In [32]:
# === NORMALIZAR COLUMNAS (por si tu CSV no trae algunas) ===
# En tu top10_skus.csv del notebook 1, quizá no incluiste missingness/drift.
# Si no existen, las ponemos como NaN para que el código funcione igual.

for col in ["missingness_ratio_24h", "drift_score_7d"]:
    if col not in df.columns:
        df[col] = np.nan

# Asegurar numéricos
num_cols = [
    "on_hand_units","in_transit_units","on_order_units","daily_demand_units",
    "lead_time_days_mean","lead_time_days_std","promo_uplift_index","scrap_rate_7d",
    "missingness_ratio_24h","drift_score_7d"
]
for c in num_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce")

df.head(3)


Unnamed: 0,dataset_version,as_of_date,factory_id,factory_name,state,city,lat,lon,factory_overall_risk_oos_7d,rank_in_factory,...,scrap_rate_7d,driver1_feature,driver1_impact,driver2_feature,driver2_impact,driver3_feature,driver3_impact,drivers_json,missingness_ratio_24h,drift_score_7d
0,mx_pilot_sku_v1,2026-01-14,MX_TOL_01,Planta Toluca,Estado de México,Toluca,19.2826,-99.6557,0.82,1,...,0.006,days_of_supply,0.32,lead_time_days_std,0.22,promo_uplift_index,0.12,"[{""feature"": ""days_of_supply"", ""impact"": 0.32}...",,
1,mx_pilot_sku_v1,2026-01-14,MX_TOL_01,Planta Toluca,Estado de México,Toluca,19.2826,-99.6557,0.82,2,...,0.019,days_of_supply,0.21,promo_uplift_index,0.18,,,"[{""feature"": ""promo_uplift_index"", ""impact"": 0...",,
2,mx_pilot_sku_v1,2026-01-14,MX_IRA_02,Planta Irapuato,Guanajuato,Irapuato,20.6767,-101.3563,0.46,1,...,0.017,days_of_supply,0.16,promo_uplift_index,0.1,,,"[{""feature"": ""days_of_supply"", ""impact"": 0.16}...",,


In [33]:
# === RECALC DERIVED FIELDS + RISK + CONFIDENCE + DRIVERS ===

df2 = df.copy()

df2["days_of_supply_calc"] = df2.apply(
    lambda r: compute_days_of_supply(r["on_hand_units"], r["in_transit_units"], r["on_order_units"], r["daily_demand_units"]),
    axis=1
)

df2["stockout_date_est_calc"] = df2["days_of_supply_calc"].apply(lambda dos: compute_stockout_date(as_of, dos))

df2["stock_needed_next_7d_calc"] = df2.apply(
    lambda r: compute_stock_needed_next_7d(r["on_hand_units"], r["in_transit_units"], r["on_order_units"], r["daily_demand_units"]),
    axis=1
)

df2["risk_oos_7d_calc"] = df2.apply(
    lambda r: compute_risk_oos_7d(
        r["days_of_supply_calc"],
        r["promo_uplift_index"],
        r["lead_time_days_std"],
        r["scrap_rate_7d"],
        r["missingness_ratio_24h"]
    ),
    axis=1
)

df2["confidence_calc"] = df2.apply(
    lambda r: compute_confidence(r["missingness_ratio_24h"], r["drift_score_7d"]),
    axis=1
)

df2["drivers_calc_json"] = df2.apply(
    lambda r: json.dumps(
        build_drivers_row(
            r["days_of_supply_calc"],
            r["promo_uplift_index"],
            r["lead_time_days_std"],
            r["scrap_rate_7d"],
            r["missingness_ratio_24h"]
        ),
        ensure_ascii=False
    ),
    axis=1
)

df2[[
    "factory_id","sku_id","sku_name",
    "days_of_supply_calc","stockout_date_est_calc","stock_needed_next_7d_calc",
    "risk_oos_7d_calc","confidence_calc"
]].head(10)


Unnamed: 0,factory_id,sku_id,sku_name,days_of_supply_calc,stockout_date_est_calc,stock_needed_next_7d_calc,risk_oos_7d_calc,confidence_calc
0,MX_TOL_01,TOL-INF-004,Infant Formula Stage 1 800g,3.393939,2026-01-17,71050.0,0.95,0.9
1,MX_TOL_01,TOL-DAI-002,Milk Whole 1L,4.076923,2026-01-18,94200.0,0.7,0.9
2,MX_IRA_02,IRA-DAI-006,Yogurt Strawberry 1L,6.193548,2026-01-20,23350.0,0.6,0.9
3,MX_VDM_03,VDM-WAT-003,Water 600ml 24-pack,4.043478,2026-01-18,84100.0,0.6,0.9


In [34]:
# === RE-TOP10 POR FÁBRICA (basado en risk_oos_7d_calc) ===
df_topk = (
    df2.sort_values(["factory_id", "risk_oos_7d_calc"], ascending=[True, False])
       .groupby("factory_id", as_index=False, group_keys=False)
       .head(TOP_K)
       .copy()
)

# rank
df_topk["rank_in_factory_calc"] = (
    df_topk.groupby("factory_id").cumcount() + 1
)

df_topk.sort_values(["factory_id", "rank_in_factory_calc"]).head(15)


Unnamed: 0,dataset_version,as_of_date,factory_id,factory_name,state,city,lat,lon,factory_overall_risk_oos_7d,rank_in_factory,...,drivers_json,missingness_ratio_24h,drift_score_7d,days_of_supply_calc,stockout_date_est_calc,stock_needed_next_7d_calc,risk_oos_7d_calc,confidence_calc,drivers_calc_json,rank_in_factory_calc
2,mx_pilot_sku_v1,2026-01-14,MX_IRA_02,Planta Irapuato,Guanajuato,Irapuato,20.6767,-101.3563,0.46,1,...,"[{""feature"": ""days_of_supply"", ""impact"": 0.16}...",,,6.193548,2026-01-20,23350.0,0.6,0.9,"[{""feature"": ""days_of_supply"", ""impact"": 0.081...",1
0,mx_pilot_sku_v1,2026-01-14,MX_TOL_01,Planta Toluca,Estado de México,Toluca,19.2826,-99.6557,0.82,1,...,"[{""feature"": ""days_of_supply"", ""impact"": 0.32}...",,,3.393939,2026-01-17,71050.0,0.95,0.9,"[{""feature"": ""days_of_supply"", ""impact"": 0.361...",1
1,mx_pilot_sku_v1,2026-01-14,MX_TOL_01,Planta Toluca,Estado de México,Toluca,19.2826,-99.6557,0.82,2,...,"[{""feature"": ""promo_uplift_index"", ""impact"": 0...",,,4.076923,2026-01-18,94200.0,0.7,0.9,"[{""feature"": ""days_of_supply"", ""impact"": 0.292...",2
3,mx_pilot_sku_v1,2026-01-14,MX_VDM_03,Planta Valle de México,Estado de México,Cuautitlán Izcalli,19.6469,-99.246,0.68,1,...,"[{""feature"": ""days_of_supply"", ""impact"": 0.24}...",,,4.043478,2026-01-18,84100.0,0.6,0.9,"[{""feature"": ""days_of_supply"", ""impact"": 0.296...",1


In [35]:
# === FACTORY SUMMARY RECALC (overall risk = max SKU risk) + critical SKU ===
df_factory_summary = (
    df_topk.sort_values(["factory_id", "risk_oos_7d_calc"], ascending=[True, False])
           .groupby("factory_id", as_index=False)
           .first()[["factory_id","sku_id","sku_name","risk_oos_7d_calc"]]
           .rename(columns={
               "sku_id":"critical_sku_id",
               "sku_name":"critical_sku_name",
               "risk_oos_7d_calc":"overall_risk_oos_7d"
           })
)

# unir con df_fact (mantener lat/lon, nombre, etc.)
keep_cols = ["factory_id","factory_name","state","city","lat","lon"]
df_fact_min = df_fact[keep_cols].drop_duplicates("factory_id")

df_fact2 = df_fact_min.merge(df_factory_summary, on="factory_id", how="left")

df_fact2


Unnamed: 0,factory_id,factory_name,state,city,lat,lon,critical_sku_id,critical_sku_name,overall_risk_oos_7d
0,MX_TOL_01,Planta Toluca,Estado de México,Toluca,19.2826,-99.6557,TOL-INF-004,Infant Formula Stage 1 800g,0.95
1,MX_IRA_02,Planta Irapuato,Guanajuato,Irapuato,20.6767,-101.3563,IRA-DAI-006,Yogurt Strawberry 1L,0.6
2,MX_VDM_03,Planta Valle de México,Estado de México,Cuautitlán Izcalli,19.6469,-99.246,VDM-WAT-003,Water 600ml 24-pack,0.6


In [36]:
# === SEVERITY LABEL (útil para UI) ===
def severity_label(risk, red=0.7, yellow=0.35):
    if pd.isna(risk):
        return "UNKNOWN"
    if risk >= red:
        return "RED"
    if risk >= yellow:
        return "YELLOW"
    return "GREEN"

df_fact2["severity"] = df_fact2["overall_risk_oos_7d"].apply(severity_label)
df_fact2


Unnamed: 0,factory_id,factory_name,state,city,lat,lon,critical_sku_id,critical_sku_name,overall_risk_oos_7d,severity
0,MX_TOL_01,Planta Toluca,Estado de México,Toluca,19.2826,-99.6557,TOL-INF-004,Infant Formula Stage 1 800g,0.95,RED
1,MX_IRA_02,Planta Irapuato,Guanajuato,Irapuato,20.6767,-101.3563,IRA-DAI-006,Yogurt Strawberry 1L,0.6,YELLOW
2,MX_VDM_03,Planta Valle de México,Estado de México,Cuautitlán Izcalli,19.6469,-99.246,VDM-WAT-003,Water 600ml 24-pack,0.6,YELLOW


In [37]:
# === BUILD UI SNAPSHOTS (JSON) ===

# factories.json: para mapa/lista
factories_snapshot = []
for _, r in df_fact2.iterrows():
    factories_snapshot.append({
        "factory_id": r["factory_id"],
        "factory_name": r["factory_name"],
        "state": r["state"],
        "city": r["city"],
        "geo": {"lat": float(r["lat"]), "lon": float(r["lon"])},
        "overall_risk_oos_7d": float(r["overall_risk_oos_7d"]) if not pd.isna(r["overall_risk_oos_7d"]) else None,
        "severity": r["severity"],
        "critical_sku": {
            "sku_id": r["critical_sku_id"],
            "sku_name": r["critical_sku_name"]
        },
        "as_of_date": AS_OF_DATE
    })

# predictions_latest.json: detalle por fábrica con top10
predictions_snapshot = {
    "dataset_version": "mx_pilot_dummy_scoring_v1",
    "generated_at_utc": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),
    "as_of_date": AS_OF_DATE,
    "horizon_days": HORIZON_DAYS,
    "top_k": TOP_K,
    "factories": []
}

for fid, g in df_topk.groupby("factory_id"):
    g = g.sort_values("rank_in_factory_calc")
    fac_row = df_fact2[df_fact2["factory_id"] == fid].iloc[0]

    top10_list = []
    for _, s in g.iterrows():
        drivers_list = json.loads(s["drivers_calc_json"]) if isinstance(s["drivers_calc_json"], str) else []
        top10_list.append({
            "rank": int(s["rank_in_factory_calc"]),
            "sku_id": s["sku_id"],
            "sku_name": s["sku_name"],
            "sku_family": s.get("sku_family", None),

            "risk_oos_7d": float(s["risk_oos_7d_calc"]) if not pd.isna(s["risk_oos_7d_calc"]) else None,
            "confidence": float(s["confidence_calc"]) if not pd.isna(s["confidence_calc"]) else None,

            "derived": {
                "days_of_supply": float(s["days_of_supply_calc"]) if not pd.isna(s["days_of_supply_calc"]) else None,
                "stockout_date_est": s["stockout_date_est_calc"],
                "stock_needed_next_7d": float(s["stock_needed_next_7d_calc"]) if not pd.isna(s["stock_needed_next_7d_calc"]) else None
            },
            "current_state": {
                "on_hand_units": float(s["on_hand_units"]) if not pd.isna(s["on_hand_units"]) else None,
                "in_transit_units": float(s["in_transit_units"]) if not pd.isna(s["in_transit_units"]) else None,
                "on_order_units": float(s["on_order_units"]) if not pd.isna(s["on_order_units"]) else None,
                "daily_demand_units": float(s["daily_demand_units"]) if not pd.isna(s["daily_demand_units"]) else None,
                "lead_time_days_mean": float(s["lead_time_days_mean"]) if "lead_time_days_mean" in s and not pd.isna(s["lead_time_days_mean"]) else None,
                "lead_time_days_std": float(s["lead_time_days_std"]) if not pd.isna(s["lead_time_days_std"]) else None,
                "promo_uplift_index": float(s["promo_uplift_index"]) if not pd.isna(s["promo_uplift_index"]) else None,
                "scrap_rate_7d": float(s["scrap_rate_7d"]) if not pd.isna(s["scrap_rate_7d"]) else None
            },
            "drivers": drivers_list
        })

    predictions_snapshot["factories"].append({
        "factory_id": fid,
        "factory_name": fac_row["factory_name"],
        "state": fac_row["state"],
        "city": fac_row["city"],
        "geo": {"lat": float(fac_row["lat"]), "lon": float(fac_row["lon"])},
        "overall_risk_oos_7d": float(fac_row["overall_risk_oos_7d"]) if not pd.isna(fac_row["overall_risk_oos_7d"]) else None,
        "severity": fac_row["severity"],
        "critical_sku": {"sku_id": fac_row["critical_sku_id"], "sku_name": fac_row["critical_sku_name"]},
        "top_10_skus": top10_list
    })

len(factories_snapshot), len(predictions_snapshot["factories"])


  "generated_at_utc": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ"),


(3, 3)

In [38]:
# === SAVE SNAPSHOTS ===
factories_json_path = PUBLIC_DIR / "factories.json"
preds_json_path = PUBLIC_DIR / "predictions_latest.json"

with open(factories_json_path, "w", encoding="utf-8") as f:
    json.dump(factories_snapshot, f, ensure_ascii=False, indent=2)

with open(preds_json_path, "w", encoding="utf-8") as f:
    json.dump(predictions_snapshot, f, ensure_ascii=False, indent=2)

factories_json_path, preds_json_path


(PosixPath('public/data/factories.json'),
 PosixPath('public/data/predictions_latest.json'))

In [39]:
# === QUICK SANITY CHECKS ===
print("factories.json factories:", len(factories_snapshot))
print("predictions_latest.json factories:", len(predictions_snapshot["factories"]))

# ver top 3 skus de una fábrica
example = predictions_snapshot["factories"][0]
example["factory_name"], [(x["rank"], x["sku_id"], x["risk_oos_7d"]) for x in example["top_10_skus"][:3]]


factories.json factories: 3
predictions_latest.json factories: 3


('Planta Irapuato', [(1, 'IRA-DAI-006', 0.6)])

In [40]:
# === OPTIONAL: también guardar CSV recalculados (para debug) ===
OUT_DEBUG = Path("data/processed_scored")
OUT_DEBUG.mkdir(parents=True, exist_ok=True)

df_fact2.to_csv(OUT_DEBUG / "factories_scored.csv", index=False)
df_topk.to_csv(OUT_DEBUG / "top10_skus_scored.csv", index=False)

OUT_DEBUG / "factories_scored.csv", OUT_DEBUG / "top10_skus_scored.csv"


(PosixPath('data/processed_scored/factories_scored.csv'),
 PosixPath('data/processed_scored/top10_skus_scored.csv'))

In [41]:
# === END ===
# Ahora tu UI puede leer:
# - /data/factories.json  (mapa + lista)
# - /data/predictions_latest.json (detalle + top10)
#
# Siguiente: Notebook 03 (opcional) para preparar estructura de UI en React (Vite)
# o para convertir esto a un endpoint si algún día lo quieren en backend.

print("Listo. Snapshots generados en:", PUBLIC_DIR.resolve())


Listo. Snapshots generados en: /Users/davidbazalduamendez/Documents/GitHub/Danone_dummy/public/data


In [42]:
import numpy as np
import pandas as pd
import json
from pathlib import Path
from datetime import datetime, timezone

# ==========
# AJUSTA ESTO A TUS DATAFRAMES REALES
# ==========
# df_factories: columnas mínimas: plant_id, name, lat, lon
# df_pred_latest: columnas mínimas: plant_id, sku_id, sku_name (si no, lo armamos), predicted_days_of_coverage (o algo equivalente)
df_factories = df_factories.copy()
df_pred_latest = df_pred_latest.copy()

AS_OF_UTC = datetime.now(timezone.utc).isoformat(timespec="seconds").replace("+00:00","Z")

# ==========
# Helpers
# ==========
def risk_band(score: float) -> str:
    if score >= 90: return "Critical"
    if score >= 70: return "High"
    if score >= 40: return "Medium"
    return "Low"

def clamp(x, lo, hi):
    return max(lo, min(hi, x))

# Si no tienes sku_name, intenta armarlo por catálogo/merge antes.
if "sku_name" not in df_pred_latest.columns:
    df_pred_latest["sku_name"] = df_pred_latest["sku_id"].astype(str)

# ==========
# 1) Enriquecer prediction_latest
# ==========
# Si no tienes stockout_probability / risk_score, los derivamos desde predicted_days_of_coverage (demo)
if "predicted_days_of_coverage" not in df_pred_latest.columns:
    # si tú tienes "days_of_coverage" o "coverage_days", ajusta aquí:
    if "days_of_coverage" in df_pred_latest.columns:
        df_pred_latest["predicted_days_of_coverage"] = df_pred_latest["days_of_coverage"]
    elif "coverage_days" in df_pred_latest.columns:
        df_pred_latest["predicted_days_of_coverage"] = df_pred_latest["coverage_days"]
    else:
        # demo: inventar coverage si no existe (no ideal, pero evita romper)
        df_pred_latest["predicted_days_of_coverage"] = np.random.uniform(2.0, 7.0, size=len(df_pred_latest)).round(2)

# stockout_probability: alta si cobertura baja
if "stockout_probability" not in df_pred_latest.columns:
    doc = df_pred_latest["predicted_days_of_coverage"].astype(float)
    df_pred_latest["stockout_probability"] = (1 - (doc / 6)).clip(0.05, 0.95).round(2)

# risk_score: mezcla prob + cobertura invertida
if "risk_score" not in df_pred_latest.columns:
    doc = df_pred_latest["predicted_days_of_coverage"].astype(float)
    prob = df_pred_latest["stockout_probability"].astype(float)
    score = 100 * (0.65 * prob + 0.35 * ((6 - doc) / 6).clip(0,1))
    df_pred_latest["risk_score"] = score.round().astype(int)

df_pred_latest["risk_band"] = df_pred_latest["risk_score"].apply(risk_band)

# t+1 y t+7 (si no existen, generamos demo proporcional)
if "predicted_closing_stock_units_t1" not in df_pred_latest.columns:
    df_pred_latest["predicted_closing_stock_units_t1"] = (np.random.uniform(2000, 50000, size=len(df_pred_latest))).round().astype(int)

if "predicted_closing_stock_units_t7" not in df_pred_latest.columns:
    df_pred_latest["predicted_closing_stock_units_t7"] = (df_pred_latest["predicted_closing_stock_units_t1"] * np.random.uniform(0.2, 0.9, size=len(df_pred_latest))).round().astype(int)

# expected_stockout_date (si prob alta y cobertura baja)
if "expected_stockout_date" not in df_pred_latest.columns:
    # demo: si cobertura < 3.0 -> stockout en 2-4 días
    days = np.where(df_pred_latest["predicted_days_of_coverage"] < 3.0,
                    np.random.randint(2,5,size=len(df_pred_latest)),
                    np.random.randint(5,10,size=len(df_pred_latest)))
    base = pd.Timestamp(AS_OF_UTC)
    df_pred_latest["expected_stockout_date"] = [(base + pd.Timedelta(days=int(d))).date().isoformat() for d in days]

# drivers (si no tienes, genera 3 drivers fijos con pesos que sumen 1)
def make_drivers(row):
    # pesos demo pero creíbles
    w1 = clamp(0.35 + (3.5 - float(row["predicted_days_of_coverage"])) * 0.08, 0.25, 0.65)
    w2 = clamp(0.25 + float(row["stockout_probability"]) * 0.15, 0.15, 0.45)
    w3 = max(0.05, 1 - w1 - w2)
    # normaliza
    s = w1 + w2 + w3
    w1, w2, w3 = w1/s, w2/s, w3/s
    return [
        {"driver": "Low Days of Coverage", "weight": round(w1, 2)},
        {"driver": "Demand Spike", "weight": round(w2, 2)},
        {"driver": "Lead Time Variability", "weight": round(w3, 2)}
    ]

if "drivers" not in df_pred_latest.columns:
    df_pred_latest["drivers"] = df_pred_latest.apply(make_drivers, axis=1)

# recommended_actions (simple rules)
def make_actions(row):
    actions = []
    if row["risk_band"] in ["Critical","High"]:
        actions.append("Reallocate from lower-risk plant within 48–72h")
        actions.append("Expedite production / prioritise next slot")
    if float(row["predicted_days_of_coverage"]) < 3.5:
        actions.append("Increase replenishment frequency for next 7 days")
    # evita demasiadas
    return actions[:3]

if "recommended_actions" not in df_pred_latest.columns:
    df_pred_latest["recommended_actions"] = df_pred_latest.apply(make_actions, axis=1)

# ==========
# 2) Enriquecer factories agregando agregados desde predictions
# ==========
agg = (df_pred_latest
       .groupby("plant_id")
       .agg(
           global_risk_score=("risk_score","max"),
           skus_at_risk_count=("risk_score", lambda s: int((s>=40).sum())),
           critical_skus_count=("risk_score", lambda s: int((s>=90).sum())),
           avg_stockout_probability=("stockout_probability","mean")
       )
       .reset_index())

# next_stockout_date por planta: el mínimo entre SKUs con riesgo alto
tmp = df_pred_latest.copy()
tmp["expected_stockout_date"] = pd.to_datetime(tmp["expected_stockout_date"], errors="coerce")
next_so = (tmp[tmp["risk_score"]>=70]
           .groupby("plant_id")["expected_stockout_date"]
           .min()
           .reset_index()
           .rename(columns={"expected_stockout_date":"next_stockout_date"}))
next_so["next_stockout_date"] = next_so["next_stockout_date"].dt.date.astype(str)

# top_skus (top 5)
top_skus = []
for pid, g in df_pred_latest.groupby("plant_id"):
    top = (g.sort_values(["risk_score","stockout_probability"], ascending=False)
            .head(5)[["sku_id","sku_name","risk_score","risk_band"]])
    top_skus.append({"plant_id": pid, "top_skus": top.to_dict(orient="records")})
top_skus_df = pd.DataFrame(top_skus)

df_factories = df_factories.merge(agg, on="plant_id", how="left")
df_factories = df_factories.merge(next_so, on="plant_id", how="left")
df_factories = df_factories.merge(top_skus_df, on="plant_id", how="left")

df_factories["global_risk_score"] = df_factories["global_risk_score"].fillna(0).astype(int)
df_factories["risk_band"] = df_factories["global_risk_score"].apply(risk_band)

# revenue/volume demo (si no tienes: derivado del #SKUs en riesgo)
if "estimated_revenue_at_risk_mxn" not in df_factories.columns:
    df_factories["estimated_revenue_at_risk_mxn"] = (df_factories["skus_at_risk_count"].fillna(0) * np.random.uniform(250000, 900000, size=len(df_factories))).round().astype(int)

if "estimated_volume_at_risk_units" not in df_factories.columns:
    df_factories["estimated_volume_at_risk_units"] = (df_factories["skus_at_risk_count"].fillna(0) * np.random.uniform(15000, 90000, size=len(df_factories))).round().astype(int)

# trend + drivers (demo)
def make_trend(score):
    # sparkline que termina en score
    start = clamp(score - np.random.randint(-6, 14), 0, 100)
    vals = np.linspace(start, score, 7) + np.random.normal(0, 1.2, 7)
    vals = np.clip(vals, 0, 100).round().astype(int).tolist()
    vs = int(vals[-1] - vals[0])
    direction = "up" if vs > 0 else ("down" if vs < 0 else "flat")
    return {"vs_prev_7d_pp": vs, "direction": direction, "sparkline_7d": vals}

def make_drivers_plant(score):
    # distribuciones distintas por severidad
    if score >= 90:
        return [
            {"driver":"Low Days of Coverage","contribution_pct":45},
            {"driver":"Demand Spike","contribution_pct":25},
            {"driver":"Lead Time Variability","contribution_pct":20},
            {"driver":"Fill Rate Drop","contribution_pct":10}
        ]
    if score >= 70:
        return [
            {"driver":"Demand Volatility","contribution_pct":34},
            {"driver":"Low Days of Coverage","contribution_pct":28},
            {"driver":"Lead Time Variability","contribution_pct":22},
            {"driver":"Distribution Constraints","contribution_pct":16}
        ]
    return [
        {"driver":"Lead Time Variability","contribution_pct":35},
        {"driver":"Capacity Constraints","contribution_pct":25},
        {"driver":"Low Days of Coverage","contribution_pct":22},
        {"driver":"Demand Volatility","contribution_pct":18}
    ]

df_factories["trend"] = df_factories["global_risk_score"].apply(make_trend)
df_factories["drivers"] = df_factories["global_risk_score"].apply(make_drivers_plant)

# ==========
# 3) Export a Vite public/data
# ==========
# Ajusta esta ruta a tu repo real. Esta asume que tu notebook está en una carpeta hermana a ui-dashboard.
OUT = Path("../ui-dashboard/public/data")
OUT.mkdir(parents=True, exist_ok=True)

# factories.json
factories_out = []
for _, r in df_factories.iterrows():
    factories_out.append({
        "plant_id": r["plant_id"],
        "name": r["name"],
        "location": {"lat": float(r["lat"]), "lon": float(r["lon"])},
        "global_risk_score": int(r["global_risk_score"]),
        "risk_band": r["risk_band"],
        "skus_at_risk_count": int(r.get("skus_at_risk_count", 0) or 0),
        "critical_skus_count": int(r.get("critical_skus_count", 0) or 0),
        "next_stockout_date": r.get("next_stockout_date", None),
        "estimated_revenue_at_risk_mxn": int(r["estimated_revenue_at_risk_mxn"]),
        "estimated_volume_at_risk_units": int(r["estimated_volume_at_risk_units"]),
        "trend": r["trend"],
        "drivers": r["drivers"],
        "top_skus": r["top_skus"] if isinstance(r["top_skus"], list) else []
    })

with open(OUT/"factories.json", "w", encoding="utf-8") as f:
    json.dump(factories_out, f, ensure_ascii=False, indent=2)

# prediction_latest.json
pred_out = {
    "as_of_utc": AS_OF_UTC,
    "horizon_days": 7,
    "rows": []
}

cols_needed = [
    "plant_id","sku_id","sku_name",
    "risk_score","risk_band",
    "predicted_days_of_coverage",
    "expected_stockout_date",
    "stockout_probability",
    "predicted_closing_stock_units_t1",
    "predicted_closing_stock_units_t7",
    "drivers","recommended_actions"
]

for _, r in df_pred_latest[cols_needed].iterrows():
    pred_out["rows"].append({
        "plant_id": r["plant_id"],
        "sku_id": r["sku_id"],
        "sku_name": r["sku_name"],
        "risk_score": int(r["risk_score"]),
        "risk_band": r["risk_band"],
        "predicted_days_of_coverage": float(r["predicted_days_of_coverage"]),
        "expected_stockout_date": r["expected_stockout_date"],
        "stockout_probability": float(r["stockout_probability"]),
        "predicted_closing_stock_units_t1": int(r["predicted_closing_stock_units_t1"]),
        "predicted_closing_stock_units_t7": int(r["predicted_closing_stock_units_t7"]),
        "drivers": r["drivers"],
        "recommended_actions": r["recommended_actions"]
    })

with open(OUT/"prediction_latest.json", "w", encoding="utf-8") as f:
    json.dump(pred_out, f, ensure_ascii=False, indent=2)

print("Wrote:", OUT/"factories.json", "and", OUT/"prediction_latest.json")


NameError: name 'df_factories' is not defined