# TPST – Notebook local (PySpark) para ML en siniestros (Perú)

Este cuaderno hace **todo** localmente en tu Mac (on‑prem) usando **PySpark** en `local[*]`:

1) Arranque de Spark (local).  
2) Carga del **Parquet largo** (`year, region, metric, dim_name, dim_value, value`).  
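3) **Limpieza**: elimina las filas cuya `region` es en realidad un nombre de mes o un total (`TOTAL`, `TOTAL NACIONAL`, `TOTAL GENERAL`) y tipifica `year` y `value`.  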
4) Persistencia **Silver** (limpio).  
5) Construcción **Gold (wide)**: tablón pivoteado `year, region` + columnas por métrica/categoría.  
6) **Features**: proporciones, índices (noche/fin de semana) y **lags** por región.  
7) **Entrenamiento** (baseline): GLM **Poisson** con Spark ML + métricas (RMSE, MAE).  
8) Escritura de salidas **Parquet** locales (y CSV opcional para inspección rápida).

> Requisitos previos (una sola vez):
> ```bash
> python -m venv .venv && source .venv/bin/activate
> python -m pip install -U pip
> python -m pip install pyspark==3.5.1 pyarrow==15.0.2 pandas==2.2.2
> ```

Configura la ruta al Parquet **de entrada** en la celda siguiente.


In [None]:
# === Path configuration ===
from pathlib import Path

# Consolidated long-format Parquet produced by the upstream pipeline
PARQUET_IN = Path("data/processed/siniestros_normalizado.parquet")

# Output folders, one per layer; each is created below when missing
DIR_BRONZE, DIR_SILVER, DIR_GOLD = (
    Path(folder) for folder in ("bronze_local", "silver_local", "gold_local")
)

for layer_dir in [DIR_BRONZE, DIR_SILVER, DIR_GOLD]:
    layer_dir.mkdir(parents=True, exist_ok=True)

# Last expression of the cell: displays the absolute input path
PARQUET_IN.resolve()

## 1) Inicializar Spark (local)

In [None]:
from pyspark.sql import SparkSession, functions as F, Window

# Local session using every available core; settings are applied one by one
# before the session is created (or an already running one is reused).
_builder = SparkSession.builder.appName("TPST-local").master("local[*]")
for _key, _value in (
    ("spark.sql.shuffle.partitions", "8"),
    ("spark.driver.memory", "4g"),
):
    _builder = _builder.config(_key, _value)
spark = _builder.getOrCreate()

# Last expression of the cell: displays the Spark version in use
spark.version

## 2) Cargar Parquet largo → Bronze

In [None]:
# Read the long table, keeping only the six expected columns in a fixed order
bronze_columns = ["year", "region", "metric", "dim_name", "dim_value", "value"]
df_bronze = spark.read.parquet(str(PARQUET_IN)).select(*bronze_columns)
df_bronze.printSchema()
df_bronze.show(5, truncate=False)

# Optional: keep a single-partition Parquet copy under bronze_local
bronze_out = str(DIR_BRONZE / "siniestros_long_raw.parquet")
df_bronze.repartition(1).write.mode("overwrite").parquet(bronze_out)

# Last expression of the cell: displays the raw row count
df_bronze.count()

## 3) Limpieza → Silver (quitar meses/TOTAL y tipificar)

In [None]:
# Month names that show up in the `region` column (monthly sub-total rows) and
# must be removed. "SETIEMBRE" is listed next to "SEPTIEMBRE" because it is an
# accepted variant spelling that is widely used in Peru; without it those rows
# would survive the filter and be treated as a region.
MESES = ["ENERO","FEBRERO","MARZO","ABRIL","MAYO","JUNIO","JULIO","AGOSTO",
         "SEPTIEMBRE","SETIEMBRE","OCTUBRE","NOVIEMBRE","DICIEMBRE"]

df_silver = (df_bronze
    # Upper-case + trim so the comparisons below ignore case and padding
    .withColumn("region_norm", F.upper(F.trim(F.col("region"))))
    # Drop rows whose "region" is really a month name
    .filter(~F.col("region_norm").isin(MESES))
    # Drop aggregate rows: TOTAL, TOTAL NACIONAL, TOTAL GENERAL
    .filter(~F.col("region_norm").rlike(r'^TOTAL(\s+NACIONAL|\s*GENERAL)?$'))
    # Enforce the types used downstream
    .withColumn("year", F.col("year").cast("int"))
    .withColumn("value", F.col("value").cast("double"))
    # Replace the raw label with the normalised one under the original name
    .drop("region")
    .withColumnRenamed("region_norm","region")
)

df_silver.show(5, truncate=False)
df_silver.count()

# Persist the clean long table as a single-partition Parquet dataset
(df_silver
 .repartition(1)
 .write.mode("overwrite")
 .parquet(str(DIR_SILVER / "siniestros_long_clean.parquet")))

## 4) Gold (Wide): pivot `year, region` con columnas `metric__categoria`

In [None]:
import unicodedata, re
from pyspark.sql.types import StringType

@F.udf(StringType())
def slug(s):
    """Turn a label into a lowercase ASCII identifier made of [a-z0-9_].

    Accents are stripped via NFKD decomposition, every run of other characters
    becomes a single underscore, and leading/trailing underscores are removed.
    Returns "total" for None or when nothing is left after cleaning.
    """
    if s is None:
        return "total"
    ascii_text = unicodedata.normalize("NFKD", s).encode("ascii", "ignore").decode("ascii")
    collapsed = re.sub(r"[^A-Za-z0-9]+", "_", ascii_text.strip().lower())
    cleaned = collapsed.strip("_")
    return cleaned if cleaned else "total"

# Slugify both key parts, then join them as `<metric>__<category>`
df_aug = df_silver.withColumn("dim_value_slug", slug(F.col("dim_value")))
df_aug = df_aug.withColumn("metric_slug", slug(F.col("metric")))
df_aug = df_aug.withColumn("colname", F.concat_ws("__", "metric_slug", "dim_value_slug"))

# One row per (year, region); one column per distinct `colname`, summing values.
# Combinations absent from the data are filled with 0.0 instead of null.
pivoted = df_aug.groupBy("year", "region").pivot("colname").agg(F.sum("value"))
df_wide = pivoted.na.fill(0.0)

df_wide.printSchema()
df_wide.show(5, truncate=False)
df_wide.count()

# Persist the wide table as a single-partition Parquet dataset
gold_out = str(DIR_GOLD / "siniestros_wide.parquet")
df_wide.repartition(1).write.mode("overwrite").parquet(gold_out)

## 5) Features: proporciones, índices y **lags**

In [None]:
w = df_wide

TOTAL_COL = "siniestros_total__total"
has_total = TOTAL_COL in w.columns

franjas = [c for c in w.columns if c.startswith("siniestros_por_franja_horaria__")]
dias    = [c for c in w.columns if c.startswith("siniestros_por_dia__")]


def _col_or_null(df, name):
    """Return column `name` of `df`, or a null double literal when it is absent.

    Lets the index features below be built even when a given category
    (time slot / weekday) never appears in the data, instead of failing with
    an unresolved-column error.
    """
    if name in df.columns:
        return F.col(name)
    return F.lit(None).cast("double")


if has_total:
    total = F.col(TOTAL_COL)
    # Share of the total for every time-slot and weekday column; null when the
    # total is not positive. All columns are added in a single select rather
    # than one withColumn per category, which keeps the query plan small.
    w = w.select(
        "*",
        *[F.when(total > 0, F.col(c) / total).otherwise(F.lit(None)).alias(f"prop__{c}")
          for c in franjas + dias],
    )
    w = (w
         # Night index: share of the 18:00-23:59 slot (null if that slot is absent)
         .withColumn("idx_noche",
                     _col_or_null(w, "prop__siniestros_por_franja_horaria__18_00_23_59"))
         # Weekend index: Saturday + Sunday shares, missing/null parts count as 0
         .withColumn("idx_finde",
                     F.coalesce(_col_or_null(w, "prop__siniestros_por_dia__sabado"), F.lit(0.0)) +
                     F.coalesce(_col_or_null(w, "prop__siniestros_por_dia__domingo"), F.lit(0.0)))
        )

from pyspark.sql import Window
# Per-region window ordered by year, used for the lagged totals
win = Window.partitionBy("region").orderBy("year")
if has_total:
    w = (w
        .withColumn("y_lag1", F.lag(F.col(TOTAL_COL), 1).over(win))
        .withColumn("y_lag2", F.lag(F.col(TOTAL_COL), 2).over(win))
        # Relative growth vs. the previous year. The explicit guard yields null
        # when the previous total is 0 or missing, so the result does not
        # depend on the session's division-by-zero (ANSI) setting.
        .withColumn("growth_y",
                    F.when(F.col("y_lag1") != 0,
                           (F.col(TOTAL_COL) - F.col("y_lag1")) / F.col("y_lag1")))
    )

w.printSchema()
w.show(5, truncate=False)

# Persist the feature table as a single-partition Parquet dataset
(w
 .repartition(1)
 .write.mode("overwrite")
 .parquet(str(DIR_GOLD / "siniestros_wide_features.parquet")))

## 6) Baseline ML (Spark ML – Poisson)

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Reload the persisted feature table; remaining nulls in numeric columns
# (e.g. lags of the first years) are replaced with 0.0.
df_feat = spark.read.parquet(str(DIR_GOLD / "siniestros_wide_features.parquet")).na.fill(0.0)

target = "siniestros_total__total"
if target not in df_feat.columns:
    raise ValueError(f"No se encontró la columna objetivo '{target}'. Ajusta el target o revisa si existe la hoja de totales.")

# Besides the keys and the label itself, leave out every column that is
# computed from the label's same-year value when the feature table is built:
# `prop__*` (category / total), `idx_noche` and `idx_finde` (built from those
# shares) and `growth_y` ((total - y_lag1) / y_lag1). Feeding them to the model
# lets it reconstruct the label, which makes the reported metrics meaningless.
exclude = {"year","region", target, "growth_y", "idx_noche", "idx_finde"}
feature_cols = [c for c in df_feat.columns
                if c not in exclude and not c.startswith("prop__")]
# NOTE(review): the same-year breakdown counts (e.g. per weekday / time slot)
# are still used as features; if they add up to the total they leak the label
# too — confirm whether they should be replaced by lagged versions.
if not feature_cols:
    raise ValueError("No quedan columnas de features tras excluir las derivadas del target.")

assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
data = assembler.transform(df_feat).select("year","region","features", F.col(target).alias("label"))

# Temporal split: fit on years up to 2021, evaluate on 2022 onwards
train = data.filter("year <= 2021")
test  = data.filter("year >= 2022")

# Poisson GLM with log link, no regularisation
glm = GeneralizedLinearRegression(family="poisson", link="log", maxIter=200, regParam=0.0)
model = glm.fit(train)

pred = model.transform(test)
rmse = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="rmse").evaluate(pred)
mae  = RegressionEvaluator(labelCol="label", predictionCol="prediction", metricName="mae").evaluate(pred)

print(f"RMSE = {rmse:.2f} | MAE = {mae:.2f}")
pred.select("year","region","label","prediction").orderBy("year","region").show(20, truncate=False)

## 7) Exportaciones adicionales (opcional)

In [None]:
# Optional single-file CSV dumps (with header) of the key tables for quick inspection
csv_targets = (
    (df_silver, DIR_SILVER / "siniestros_long_clean_csv"),
    (w, DIR_GOLD / "siniestros_wide_features_csv"),
)
for frame, out_dir in csv_targets:
    frame.coalesce(1).write.mode("overwrite").option("header", True).csv(str(out_dir))

# Row counts per year as a sanity check
df_silver.groupBy("year").count().orderBy("year").show(50)