### 03 - Dataset para modelos de consumo y tiempos

Objetivo: construir un dataset listo para modelar (targets de consumo, scrap y tiempos) a partir de `consumo_filtrado.csv`.


#### Imports y carga

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

pd.set_option("display.max_columns", 120)
pd.set_option("display.width", 160)

PATH = "../../data/proccessed/consumo_filtrado.csv"

df = pd.read_csv(PATH, dtype=str)

# Fechas
date_cols = ["ts_ini","ts_fin","fecha","fecha_recepcion_ts"]
for c in date_cols:
    if c in df.columns:
        df[c] = pd.to_datetime(df[c], errors="coerce")

# Numéricos
num_cols = [
    "piezas_ok","piezas_scrap","qty_plan","qty_estimado","qty_in_almacen_dia",
    "horas_teoricas","reduccion_tco","horas_ajustadas","horas_enfermedad",
    "horas_accidente","horas_permiso","horas_netas","qty_recibida",
    "peso_bruto","uds","throughput_uph","scrap_rate","duracion_min",
    "downtime_min","consumo_materia_kg","lead_time_al_almacen_dias","peso_neto_kg",
    "flag_sin_peso","flag_con_peso"
]
for c in num_cols:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

# Normalizar referencias
if "ref_id_str" in df.columns:
    df["ref_id_str"] = (df["ref_id_str"].astype(str)
                           .str.replace(r"\.0$", "", regex=True)
                           .str.zfill(6))

print("Filas, columnas:", df.shape)
df.head(3)


Filas, columnas: (30948, 40)


Unnamed: 0,work_order_id,op_id,machine_id,machine_name,planta,op_text,ref_id_str,familia,peso_neto_kg,material_lot_id,ref_materia_str,ts_ini,ts_fin,fecha,duracion_min,evento,tipo_incidencia,piezas_ok,piezas_scrap,qty_plan,qty_estimado,qty_in_almacen_dia,año_mes,horas_teoricas,reduccion_tco,horas_ajustadas,horas_enfermedad,horas_accidente,horas_permiso,horas_netas,qty_recibida,peso_bruto,uds,fecha_recepcion_ts,throughput_uph,scrap_rate,downtime_min,consumo_materia_kg,flag_sin_peso,flag_con_peso
0,24/0767,TALLADO,49,Talladora49,Abadiño,TALLADO,305,CORONA DE ARRANQUE,5.0,,,2025-01-28 00:50:00,2025-01-28 01:39:00,2025-01-28,49.0,Preparación,,0,0,0.0,592.0,,2025-01,12350.0,788.0,11562.0,752.0,0.0,390.0,10420.0,,,,NaT,0.0,,0.0,0.0,0,1
1,24/0767,TALLADO,49,Talladora49,Abadiño,TALLADO,305,CORONA DE ARRANQUE,5.0,,,2025-01-28 05:17:00,2025-01-28 05:49:00,2025-01-28,32.0,Incidencia,AUSENCIA,0,0,0.0,592.0,,2025-01,12350.0,788.0,11562.0,752.0,0.0,390.0,10420.0,,,,NaT,0.0,,32.0,0.0,0,1
2,24/0767,TALLADO,49,Talladora49,Abadiño,TALLADO,305,CORONA DE ARRANQUE,5.0,,,2025-01-28 01:39:00,2025-01-28 06:29:00,2025-01-28,290.0,Producción,,105,3,0.0,592.0,,2025-01,12350.0,788.0,11562.0,752.0,0.0,390.0,10420.0,,,,NaT,21.724138,0.027778,0.0,525.0,0,1


#### Targets

In [2]:
df_model = df.copy()

# Targets
df_model["consumo_materia_kg_target"] = df_model["consumo_materia_kg"]
# Scrap en kg estimado: consumo - peso*n_ok (si hay peso), si no usa piezas_scrap * peso
df_model["kg_scrap_target"] = np.nan
if "peso_neto_kg" in df_model:
    kg_ok = df_model["piezas_ok"] * df_model["peso_neto_kg"]
    df_model["kg_scrap_target"] = df_model["consumo_materia_kg"] - kg_ok
    alt = df_model["piezas_scrap"] * df_model["peso_neto_kg"]
    df_model["kg_scrap_target"] = df_model["kg_scrap_target"].fillna(alt)

# Horas de fabricación a partir de duracion_min
df_model["horas_fabricacion_target"] = np.where(df_model["duracion_min"].notna(), df_model["duracion_min"] / 60.0, np.nan)

# Lead time si existe la columna
if "lead_time_al_almacen_dias" in df_model.columns:
    df_model["lead_time_al_almacen_dias_target"] = df_model["lead_time_al_almacen_dias"]

# Drop filas sin consumo o sin ref/peso
df_model = df_model[df_model["ref_id_str"].notna()]
df_model = df_model[df_model["flag_sin_peso"] != 1]

print(df_model[["consumo_materia_kg_target","kg_scrap_target","horas_fabricacion_target"]].describe())


       consumo_materia_kg_target  kg_scrap_target  horas_fabricacion_target
count               30948.000000     3.094800e+04              30948.000000
mean                  743.784050     5.314276e-15                  1.941255
std                  3147.588249     1.817875e-13                  2.850107
min                     0.000000    -1.818989e-12                  0.000000
25%                     0.000000     0.000000e+00                  0.133333
50%                     0.000000     0.000000e+00                  0.350000
75%                   164.116250     0.000000e+00                  2.800000
max                 73440.000000     7.275958e-12                 23.983333


#### Selección de features

In [3]:
# Columnas de entrada (sin targets)
features_cat = [c for c in ["ref_id_str","familia","machine_id","machine_name","op_id","año_mes"] if c in df_model.columns]
features_num = [c for c in ["peso_neto_kg","qty_plan","qty_estimado","throughput_uph","scrap_rate","downtime_min",
                            "piezas_ok","piezas_scrap","qty_in_almacen_dia","horas_teoricas","reduccion_tco",
                            "horas_ajustadas","horas_netas"] if c in df_model.columns]

X = df_model[features_cat + features_num].copy()

# One-hot de categóricas
X = pd.get_dummies(X, columns=features_cat, dummy_na=True)

# Targets a conservar
targets = [c for c in [
    "consumo_materia_kg_target",
    "kg_scrap_target",
    "horas_fabricacion_target",
    "lead_time_al_almacen_dias_target" if "lead_time_al_almacen_dias_target" in df_model.columns else None
] if c is not None]

dataset = pd.concat([X, df_model[targets].reset_index(drop=True)], axis=1)
print("Dataset modelable shape:", dataset.shape)
print("Targets:", targets)


Dataset modelable shape: (30948, 250)
Targets: ['consumo_materia_kg_target', 'kg_scrap_target', 'horas_fabricacion_target']


#### Train/Val/Test split

In [4]:
# Split 70/15/15 con semilla fija
seed = 42
dataset_shuffled = dataset.sample(frac=1, random_state=seed).reset_index(drop=True)

n = len(dataset_shuffled)
train_end = int(0.7 * n)
val_end = int(0.85 * n)

train = dataset_shuffled.iloc[:train_end]
val = dataset_shuffled.iloc[train_end:val_end]
test = dataset_shuffled.iloc[val_end:]

print({
    "train": len(train),
    "val": len(val),
    "test": len(test),
})

base_out = Path("../../data/proccessed/consumo")
base_out.mkdir(parents=True, exist_ok=True)
train.to_csv(base_out / "consumo_model_train.csv", index=False)
val.to_csv(base_out / "consumo_model_val.csv", index=False)
test.to_csv(base_out / "consumo_model_test.csv", index=False)

dataset.to_csv(base_out / "consumo_model_dataset.csv", index=False)
print("Guardados dataset y splits en data/proccessed")


{'train': 21663, 'val': 4642, 'test': 4643}
Guardados dataset y splits en data/proccessed
