<a href="https://colab.research.google.com/github/gomzkevin/kontempo/blob/main/Early_Warnings_Model_01_de_agosto_2025.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import re
from pathlib import Path
from datetime import timedelta
from typing import List, Dict, Optional

import numpy as np
import pandas as pd

###############################################################################
# CONFIG                                                                      #
###############################################################################

PAYMENTS_PATH = Path("payments.json")  # override with --payments_path
LIMITS_PATH   = Path("Limit - Credit.csv")  # override with --limits_path
OUTPUT_PATH   = Path("snapshots.parquet")  # override with --output_path

WINDOW_LOOKAHEAD_DAYS = 30   # due‑dates inspected after each snapshot
DPD_THRESHOLD         = 35   # default definition: unpaid ≥ 35 DPD
MATERIALITY           = 0.96 # ≥ 96% of Amount Due counts as paid

###############################################################################
# RECEIPTS PARSER                                                             #
###############################################################################

def parse_receipts(raw: str) -> List[Dict[str, Optional[pd.Timestamp]]]:
    """Convert EDN‑style string in *Receipts* column to list of dicts."""
    if not raw or raw.strip() in ("", "[]"):
        return []
    receipts: List[Dict[str, Optional[pd.Timestamp]]] = []
    for block in re.split(r"#ordered/map", raw):
        if not block.strip():
            continue
        kv = dict(re.findall(r"\[:(\w+)\s+([^:\]]+)\]", block))
        try:
            amt = float(str(kv.get("amount_applied", "0")).replace(",", ""))
        except ValueError:
            amt = 0.0
        if "applied_date" in kv:
            try:
                ts = pd.to_datetime(int(str(kv["applied_date"]).replace(",", "")), unit="s")
            except Exception:
                ts = pd.NaT
        else:
            ts = pd.NaT
        receipts.append({"amount_applied": amt, "applied_date": ts})
    return receipts

###############################################################################
# HELPERS                                                                     #
###############################################################################

def _clean_numeric(col: pd.Series) -> pd.Series:
    """Strip commas, convert to float, coerce errors to NaN."""
    return pd.to_numeric(
        col.astype(str)
           .str.replace(",", "", regex=False)
           .replace({"None": np.nan, "": np.nan}),
        errors="coerce",
    )

def _months_between(a: pd.Timestamp, b: pd.Timestamp) -> int:
    """Whole months between two timestamps."""
    return int((b.to_period("M") - a.to_period("M")).n)

###############################################################################
# DATA LOADERS                                                                #
###############################################################################

def load_payments(path: Path) -> pd.DataFrame:
    df = pd.read_json(path, lines=False)
    df = df[df["Status"] != "voided"].copy()

    df["Created_dt"] = pd.to_datetime(
        df["Created"].astype(str)
          .str.replace(",", "", regex=False)
          .astype(int), unit="s"
    )
    df["Due_dt"] = pd.to_datetime(df["Due Date"], errors="coerce")

    num_cols = ["Amount Due", "Amount Paid", "Principal Amount"]
    df[num_cols] = df[num_cols].apply(_clean_numeric)

    df["quota_balance"] = df["Amount Due"] - df["Amount Paid"]
    df["is_unpaid"] = df["Amount Paid"] < MATERIALITY * df["Amount Due"]

    df["_receipts"] = df["Receipts"].apply(parse_receipts)
    return df

def load_limits(path: Path) -> pd.DataFrame:
    df = pd.read_csv(path)
    df = df.groupby("Buyer Account ID", as_index=False)["Limit"].sum()
    df = df.rename(columns={"Buyer Account ID": "Buyer Account", "Limit": "limite_de_credito"})
    df["limite_de_credito"] = _clean_numeric(df["limite_de_credito"])
    return df

###############################################################################
# FEATURE ENGINEERING                                                         #
###############################################################################

def build_snapshot_features(hist: pd.DataFrame, snap: pd.Timestamp) -> pd.DataFrame:
    gb = hist.groupby("Buyer Account")

    feat = pd.DataFrame({
        "snapshot_date": snap,
        "dias_desde_ultimo_prestamo": (snap - gb["Created_dt"].max()).dt.days,
        "acquisition_date": gb["Created_dt"].min(),
        "numero_total_prestamos_historico": gb["Loan ID"].nunique(),
        "saldo_pendiente_actual": gb["quota_balance"].sum(),
        "max_dpd_actual": gb.apply(
            lambda g: ((snap - g.loc[g["is_unpaid"], "Due_dt"]).dt.days).max()
                      if g["is_unpaid"].any() else 0
        ),
    })
    feat["Buyer Account"] = feat.index
    feat.reset_index(drop=True, inplace=True)

    feat["antiguedad_cliente_meses"] = feat.apply(
        lambda r: _months_between(r["acquisition_date"], snap), axis=1
    )
    feat["es_primer_mes_activo"] = (feat["antiguedad_cliente_meses"] == 0).astype(int)

    # Handle installments if present
    if "total_installments" in hist.columns:
        last = hist.sort_values("Created_dt").drop_duplicates("Buyer Account", keep="last").set_index("Buyer Account")
        feat["installments_prestamo_reciente"] = feat["Buyer Account"].map(last["total_installments"])
        feat["_last_principal"] = feat["Buyer Account"].map(last["Principal Amount"])
        feat["promedio_installments_historico"] = feat["Buyer Account"].map(
            gb["total_installments"].mean()
        )
    else:
        feat["installments_prestamo_reciente"] = np.nan
        feat["_last_principal"] = np.nan
        feat["promedio_installments_historico"] = np.nan

    # Rolling windows
    w30 = hist[hist["Created_dt"] >= snap - pd.Timedelta(days=30)].groupby("Buyer Account")
    w60 = hist[(hist["Created_dt"] >= snap - pd.Timedelta(days=60)) &
               (hist["Created_dt"] < snap - pd.Timedelta(days=30))].groupby("Buyer Account")

    feat["monto_dispuesto_ultimos_30d"] = feat["Buyer Account"].map(
        w30["Principal Amount"].sum()
    )
    feat["frecuencia_prestamos_ultimos_30d"] = feat["Buyer Account"].map(
        w30["Loan ID"].nunique()
    )
    feat["monto_dispuesto_31_60d"] = feat["Buyer Account"].map(
        w60["Principal Amount"].sum()
    )
    feat["frecuencia_prestamos_31_60d"] = feat["Buyer Account"].map(
        w60["Loan ID"].nunique()
    )

    feat["aceleracion_monto"] = (
        feat["monto_dispuesto_ultimos_30d"].fillna(0)
        - feat["monto_dispuesto_31_60d"].fillna(0)
    )
    feat["aceleracion_frecuencia"] = (
        feat["frecuencia_prestamos_ultimos_30d"].fillna(0)
        - feat["frecuencia_prestamos_31_60d"].fillna(0)
    )
    feat["cambio_en_installments_reciente"] = (
        feat["installments_prestamo_reciente"]
        - feat["promedio_installments_historico"]
    )

    return feat

###############################################################################
# LABEL GENERATION                                                            #
###############################################################################

def _payment_covered(row: pd.Series, due_date: pd.Timestamp) -> bool:
    deadline = due_date + timedelta(days=DPD_THRESHOLD)
    paid = 0.0
    for rec in row["_receipts"]:
        ts = rec["applied_date"]
        if pd.isna(ts) or ts > deadline:
            continue
        paid += rec["amount_applied"]
    return paid >= MATERIALITY * row["Amount Due"]

def compute_label(future: pd.DataFrame) -> pd.Series:
    return future.groupby("Buyer Account").apply(
        lambda grp: int(not all(_payment_covered(r, r["Due_dt"]) for _, r in grp.iterrows()))
    )

###############################################################################
# MAIN BUILD FUNCTION                                                         #
###############################################################################

def build_snapshot_dataset(
    payments_path: Path = PAYMENTS_PATH,
    limits_path: Path = LIMITS_PATH,
    output_path: Path = OUTPUT_PATH,
) -> None:
    pay = load_payments(payments_path)
    limits = load_limits(limits_path)
    pay = pay.merge(limits, how="left", on="Buyer Account")

    # Impute credit limits
    max_principal = pay.groupby("Buyer Account")["Principal Amount"].transform("max")
    # Impute missing credit limits with max principal
    pay["limite_de_credito"] = pay["limite_de_credito"].fillna(max_principal)
    pay.loc[pay["limite_de_credito"] == 0, "limite_de_credito"] = max_principal

    first_snap = pay["Created_dt"].min().to_period("M").to_timestamp("M")
    last_snap = pay["Created_dt"].max().to_period("M").to_timestamp("M")
    snaps = pd.date_range(first_snap, last_snap, freq="M")

    all_frames: List[pd.DataFrame] = []
    for snap in snaps:
        hist = pay[pay["Created_dt"] <= snap]
        feat = build_snapshot_features(hist, snap)

        # utilisation
        feat = feat.merge(limits, on="Buyer Account", how="left")
        feat["limite_de_credito"] = feat["limite_de_credito"].fillna(feat["_last_principal"])
        feat["porcentaje_utilizacion"] = feat["saldo_pendiente_actual"] / feat["limite_de_credito"]

        # label
        mask_future = (
            (pay["Due_dt"] > snap)
            & (pay["Due_dt"] <= snap + pd.Timedelta(days=WINDOW_LOOKAHEAD_DAYS))
        )
        future = pay[mask_future]
        labels = compute_label(future)
        feat = feat.merge(labels.rename("default_en_35d"),
                          left_on="Buyer Account", right_index=True,
                          how="left")
        feat["default_en_35d"] = feat["default_en_35d"].fillna(0).astype(int)

        # Remove fossil risk
        feat = feat[feat["max_dpd_actual"] < 30]
        all_frames.append(feat.drop(columns=["_last_principal"]))

    snapshots = pd.concat(all_frames, ignore_index=True)
    snapshots.to_parquet(output_path, index=False)
    print(f"✅ Snapshot dataset saved to {output_path} ({len(snapshots):,} rows)")

###############################################################################
# CLI ENTRYPOINT                                                              #
###############################################################################

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Build snapshot dataset for early-warning model"
    )
    parser.add_argument(
        "--payments_path", type=Path, default=PAYMENTS_PATH,
        help="Path to payments.json"
    )
    parser.add_argument(
        "--limits_path", type=Path, default=LIMITS_PATH,
        help="Path to Limit -Credit.csv"
    )
    parser.add_argument(
        "--output_path", type=Path, default=OUTPUT_PATH,
        help="Where to save snapshots.parquet"
    )
    # Use parse_known_args to ignore extra IPython arguments like -f
    args, _ = parser.parse_known_args()
    build_snapshot_dataset(
        payments_path=args.payments_path,
        limits_path=args.limits_path,
        output_path=args.output_path
    )
    build_snapshot_dataset(
        payments_path=args.payments_path,
        limits_path=args.limits_path,
        output_path=args.output_path
    )


  snaps = pd.date_range(first_snap, last_snap, freq="M")
  "max_dpd_actual": gb.apply(
  return future.groupby("Buyer Account").apply(
  "max_dpd_actual": gb.apply(
  return future.groupby("Buyer Account").apply(
  "max_dpd_actual": gb.apply(
  return future.groupby("Buyer Account").apply(
  "max_dpd_actual": gb.apply(
  return future.groupby("Buyer Account").apply(
  "max_dpd_actual": gb.apply(
  return future.groupby("Buyer Account").apply(
  "max_dpd_actual": gb.apply(
  return future.groupby("Buyer Account").apply(
  "max_dpd_actual": gb.apply(
  return future.groupby("Buyer Account").apply(
  "max_dpd_actual": gb.apply(
  return future.groupby("Buyer Account").apply(
  "max_dpd_actual": gb.apply(
  return future.groupby("Buyer Account").apply(
  "max_dpd_actual": gb.apply(
  return future.groupby("Buyer Account").apply(
  "max_dpd_actual": gb.apply(
  return future.groupby("Buyer Account").apply(
  "max_dpd_actual": gb.apply(
  return future.groupby("Buyer Account").apply(
  "max_

✅ Snapshot dataset saved to snapshots.parquet (21,903 rows)


  snaps = pd.date_range(first_snap, last_snap, freq="M")
  "max_dpd_actual": gb.apply(
  return future.groupby("Buyer Account").apply(
  "max_dpd_actual": gb.apply(
  return future.groupby("Buyer Account").apply(
  "max_dpd_actual": gb.apply(
  return future.groupby("Buyer Account").apply(
  "max_dpd_actual": gb.apply(
  return future.groupby("Buyer Account").apply(
  "max_dpd_actual": gb.apply(
  return future.groupby("Buyer Account").apply(
  "max_dpd_actual": gb.apply(
  return future.groupby("Buyer Account").apply(
  "max_dpd_actual": gb.apply(
  return future.groupby("Buyer Account").apply(
  "max_dpd_actual": gb.apply(
  return future.groupby("Buyer Account").apply(
  "max_dpd_actual": gb.apply(
  return future.groupby("Buyer Account").apply(
  "max_dpd_actual": gb.apply(
  return future.groupby("Buyer Account").apply(
  "max_dpd_actual": gb.apply(
  return future.groupby("Buyer Account").apply(
  "max_dpd_actual": gb.apply(
  return future.groupby("Buyer Account").apply(
  "max_

✅ Snapshot dataset saved to snapshots.parquet (21,903 rows)


  return future.groupby("Buyer Account").apply(
