<a href="https://colab.research.google.com/github/gomzkevin/kontempo/blob/main/early_warnings_4_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# -*- coding: utf-8 -*-
"""
Pipeline Completo y Unificado del Modelo de Alertas Tempranas.

Versión Definitiva.

Este script integra la lógica de cálculo validada por la auditoría para
garantizar la precisión y robustez. Resuelve los errores de acumulación
y los KeyErrors.

El proceso es:
1.  Construcción del Dataset de Análisis con lógica de cálculo robusta.
2.  Entrenamiento del Modelo RandomForest.
3.  Scoring del Portafolio Actual con la lógica de negocio final.
"""
import pandas as pd
import numpy as np
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from IPython.display import display

print("Iniciando el pipeline completo del modelo...")

# =============================================================================
# STEP 1: BUILD THE ANALYSIS DATASET
# =============================================================================
print("\n--- [PASO 1] Iniciando la construcción del dataset de análisis... ---")

# --- 1.1 Load and prepare the payments data ---
# Start from an empty frame so the later `payments_df.empty` check also covers
# the case where the input file does not exist.
file_path = 'payments.json'
payments_df = pd.DataFrame()

try:
    print(f"Cargando y preparando los datos desde '{file_path}'...")
    try:
        # First attempt: the file is a single JSON document.
        with open(file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except json.JSONDecodeError:
        # Fallback: parse one JSON value per non-blank line, tolerating a
        # trailing comma; lines that still fail to parse are skipped.
        data = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                stripped = line.strip()
                if not stripped:
                    continue
                try:
                    data.append(json.loads(stripped.rstrip(',')))
                except json.JSONDecodeError:
                    continue

    payments_df_raw = pd.DataFrame(data)
    payments_df = payments_df_raw.copy()

    if not payments_df.empty:
        # Known source headers get explicit names; any other header is
        # lower-cased with spaces turned into underscores.
        rename_map = {
            'ID': 'id',
            'Pkey': 'pkey',
            'Amount Due': 'amount_due',
            'Amount Paid': 'amount_paid',
            'Completed Status': 'completed_status',
            'Created': 'created',
            'Loan ID': 'loan_id',
            'Due Date': 'due_date',
            'Principal Amount': 'principal_amount',
            'Buyer Account': 'buyer_account',
            'Payment Number': 'payment_number',
            'Status': 'status',
        }
        payments_df.columns = [
            rename_map.get(c, c.lower().replace(' ', '_')) for c in payments_df.columns
        ]

        # Rows whose status is 'voided' are excluded from the analysis.
        if 'status' in payments_df.columns:
            not_voided = payments_df['status'] != 'voided'
            payments_df = payments_df[not_voided].copy()

        # Strip thousands separators before coercing; unparseable values become NaN.
        numeric_cols = ['amount_due', 'amount_paid', 'principal_amount', 'payment_number', 'created']
        for col in numeric_cols:
            without_commas = payments_df[col].astype(str).str.replace(',', '')
            payments_df[col] = pd.to_numeric(without_commas, errors='coerce')

        # 'created' is interpreted as seconds since the Unix epoch.
        payments_df['created_date'] = pd.to_datetime(payments_df['created'], unit='s', errors='coerce')
        payments_df['due_date'] = pd.to_datetime(payments_df['due_date'], errors='coerce')

        # Rows without a creation date, loan id or buyer cannot be used downstream.
        payments_df.dropna(subset=['created_date', 'loan_id', 'buyer_account'], inplace=True)
        print("✅ Datos limpios y preparados.")

except FileNotFoundError:
    print(f"❌ ERROR CRÍTICO: No se encontró el archivo de datos '{file_path}'.")
    payments_df = pd.DataFrame()

# The rest of the pipeline only runs when the payments data was loaded.
if not payments_df.empty:
    # --- 1.2 Enrich with credit limit ---
    print("Cargando y fusionando datos de límites de crédito...")

    # Largest principal among each buyer's loans. It is the stand-in limit when
    # the limits file is missing, or when a buyer's limit is absent or zero.
    loan_level_df_temp = payments_df.groupby('loan_id').agg(
        buyer_account=('buyer_account', 'first'),
        principal_amount=('principal_amount', 'first'),
    ).reset_index()
    max_principal_by_buyer = loan_level_df_temp.groupby('buyer_account')['principal_amount'].max()

    try:
        limits_df = pd.read_csv('Limit - Credit.csv')
        limits_df.columns = limits_df.columns.str.strip()
        limits_df = limits_df[['Buyer Account ID', 'Limit']].copy()
        limits_df.columns = ['buyer_account', 'limite_de_credito']

        # Remove currency symbols / thousands separators before coercing to numbers.
        limits_df['limite_de_credito'] = pd.to_numeric(
            limits_df['limite_de_credito'].astype(str).str.replace('[$,]', '', regex=True),
            errors='coerce',
        )
        # A buyer may appear on several rows of the limits file; their limits are summed.
        aggregated_limits = limits_df.groupby('buyer_account')['limite_de_credito'].sum().reset_index()

        payments_df = pd.merge(payments_df, aggregated_limits, on='buyer_account', how='left')

        # Impute missing or zero limits with the buyer's largest principal.
        imputed_limit = payments_df['buyer_account'].map(max_principal_by_buyer)
        needs_imputation = payments_df['limite_de_credito'].isnull() | (payments_df['limite_de_credito'] == 0)
        payments_df.loc[needs_imputation, 'limite_de_credito'] = imputed_limit[needs_imputation]
        print("✅ Límites de crédito enriquecidos y fusionados.")
    except FileNotFoundError:
        print("⚠️ ADVERTENCIA: No se encontró 'Limit - Credit.csv'. Se imputará el límite con el monto máximo dispuesto.")
        # Map through buyer_account: assigning a loan-level Series directly
        # would align on row labels and attach limits to unrelated payment rows.
        payments_df['limite_de_credito'] = payments_df['buyer_account'].map(max_principal_by_buyer)

    # --- 1.3 Iterative computation of monthly metrics ---
    print("Creando la estructura del dataset y calculando métricas mensuales...")
    loan_level_df = payments_df.groupby('loan_id').agg(
        issuance_date=('created_date', 'min'), principal_amount=('principal_amount', 'first'),
        buyer_account=('buyer_account', 'first'), total_installments=('payment_number', 'max')
    ).reset_index()
    loan_level_df['total_installments'] = pd.to_numeric(loan_level_df['total_installments'], errors='coerce').fillna(0).astype(int)

    # Month-end snapshots, from the month of the first record up to the month
    # containing (latest record - 70 days).
    min_date = payments_df['created_date'].min().to_period('M').to_timestamp(how='end')
    max_date = (payments_df['created_date'].max() - pd.DateOffset(days=70)).to_period('M').to_timestamp(how='end')
    snapshot_dates = pd.date_range(min_date, max_date, freq='ME')

    all_snapshots = []
    materiality_threshold = 0.04
    epsilon = 1e-6

    # Per-installment balance flags depend only on the row itself, so they are
    # computed once here instead of on a fresh copy in every loop iteration.
    # NOTE(review): 'amount_paid' looks like the latest known value rather than
    # the value as of each snapshot date, so snapshot balances may already
    # reflect payments made after the snapshot — confirm with the data owner.
    payments_enriched = payments_df.copy()
    payments_enriched['amount_due_clean'] = payments_enriched['amount_due'].fillna(0)
    payments_enriched['amount_paid_clean'] = payments_enriched['amount_paid'].fillna(0)
    payments_enriched['saldo_individual'] = payments_enriched['amount_due_clean'] - payments_enriched['amount_paid_clean']
    with np.errstate(divide='ignore', invalid='ignore'):
        paid_ratio = payments_enriched['amount_paid_clean'].divide(payments_enriched['amount_due_clean'])
    # Assign the cleaned Series back: a chained `df[col].replace(..., inplace=True)`
    # does not update the frame under pandas Copy-on-Write.
    # A zero or missing amount due is treated as 100% paid.
    payments_enriched['paid_ratio'] = paid_ratio.replace([np.inf, -np.inf], np.nan).fillna(1)
    # An installment is open when it has a positive balance, a positive amount
    # due, and less than (1 - materiality_threshold) of the amount due paid.
    payments_enriched['is_open'] = (
        (payments_enriched['saldo_individual'] > epsilon) &
        (payments_enriched['amount_due_clean'] > epsilon) &
        (payments_enriched['paid_ratio'] < (1 - materiality_threshold))
    )

    # A future installment only counts as a default once 35 days have elapsed
    # since its due date, measured against this single timestamp.
    # NOTE(review): this uses the wall clock, whereas scoring uses the latest
    # 'created_date' as "today"; labels therefore depend on when the script runs.
    label_as_of = pd.Timestamp.now()

    for snapshot_date in snapshot_dates:
        history_payments = payments_enriched[payments_enriched['created_date'] <= snapshot_date]
        history_loans = loan_level_df[loan_level_df['issuance_date'] <= snapshot_date]

        # Buyers with at least one loan issued in the 180 days before the snapshot.
        activity_cutoff = snapshot_date - pd.DateOffset(days=180)
        active_buyers_in_snapshot = history_loans.loc[history_loans['issuance_date'] >= activity_cutoff, 'buyer_account'].unique()
        current_snapshot = pd.DataFrame(active_buyers_in_snapshot, columns=['buyer_account'])
        if current_snapshot.empty:
            continue
        current_snapshot['snapshot_date'] = snapshot_date

        # --- Days past due and outstanding balance ---
        open_payments = history_payments[history_payments['is_open']]
        past_due_df = open_payments[open_payments['due_date'] < snapshot_date]
        if not past_due_df.empty:
            # The maximum days past due comes from the OLDEST overdue installment,
            # matching how recalculate_features_for_buyer computes it at scoring time.
            oldest_due_date = past_due_df.groupby('buyer_account')['due_date'].min()
            max_dpd = (snapshot_date - oldest_due_date).dt.days
            current_snapshot = current_snapshot.merge(max_dpd.rename('max_dpd_actual'), on='buyer_account', how='left')

        if 'max_dpd_actual' not in current_snapshot.columns:
            current_snapshot['max_dpd_actual'] = 0
        current_snapshot['max_dpd_actual'] = current_snapshot['max_dpd_actual'].fillna(0)

        # Keep only buyers that are at most 35 days past due at the snapshot.
        current_snapshot = current_snapshot[current_snapshot['max_dpd_actual'] <= 35]
        if current_snapshot.empty:
            continue

        saldo_pendiente = open_payments.groupby('buyer_account')['saldo_individual'].sum()
        current_snapshot = current_snapshot.merge(saldo_pendiente.rename('saldo_pendiente_actual'), on='buyer_account', how='left')

        # --- Remaining features ---
        acquisition_dates = history_payments.groupby('buyer_account')['created_date'].min()
        current_snapshot = current_snapshot.merge(acquisition_dates.rename('acquisition_date'), on='buyer_account', how='left')
        current_snapshot['antiguedad_cliente_meses'] = ((snapshot_date.year - current_snapshot['acquisition_date'].dt.year) * 12 + (snapshot_date.month - current_snapshot['acquisition_date'].dt.month))
        num_loans = history_loans.groupby('buyer_account')['loan_id'].nunique()
        current_snapshot = current_snapshot.merge(num_loans.rename('numero_total_prestamos_historico'), on='buyer_account', how='left')

        # Disbursed amount and loan count over the last 30 days.
        loans_30d = history_loans[history_loans['issuance_date'] >= (snapshot_date - pd.DateOffset(days=30))]
        monto_30d = loans_30d.groupby('buyer_account')['principal_amount'].sum()
        freq_30d = loans_30d.groupby('buyer_account')['loan_id'].nunique()
        current_snapshot = current_snapshot.merge(monto_30d.rename('monto_dispuesto_ultimos_30d'), on='buyer_account', how='left')
        current_snapshot = current_snapshot.merge(freq_30d.rename('frecuencia_prestamos_ultimos_30d'), on='buyer_account', how='left')

        # Same two metrics for the window 31-60 days before the snapshot.
        loans_31_60d = history_loans[(history_loans['issuance_date'] >= (snapshot_date - pd.DateOffset(days=60))) & (history_loans['issuance_date'] < (snapshot_date - pd.DateOffset(days=30)))]
        monto_31_60d = loans_31_60d.groupby('buyer_account')['principal_amount'].sum()
        freq_31_60d = loans_31_60d.groupby('buyer_account')['loan_id'].nunique()
        current_snapshot = current_snapshot.merge(monto_31_60d.rename('monto_dispuesto_31_60d'), on='buyer_account', how='left')
        current_snapshot = current_snapshot.merge(freq_31_60d.rename('frecuencia_prestamos_31_60d'), on='buyer_account', how='left')

        # Utilisation = outstanding balance / credit limit; a zero limit yields NaN.
        limites = history_payments.groupby('buyer_account')['limite_de_credito'].first()
        current_snapshot = current_snapshot.merge(limites.rename('limite_de_credito'), on='buyer_account', how='left')
        current_snapshot['porcentaje_utilizacion'] = (current_snapshot['saldo_pendiente_actual'] / current_snapshot['limite_de_credito'].replace(0, np.nan)) * 100
        if not history_loans.empty:
            most_recent_loan = history_loans.loc[history_loans.groupby('buyer_account')['issuance_date'].idxmax()]
            loan_terms = most_recent_loan[['buyer_account', 'total_installments']].rename(columns={'total_installments': 'installments_prestamo_reciente'})
            current_snapshot = current_snapshot.merge(loan_terms, on='buyer_account', how='left')
            avg_installments = history_loans.groupby('buyer_account')['total_installments'].mean()
            current_snapshot = current_snapshot.merge(avg_installments.rename('promedio_installments_historico'), on='buyer_account', how='left')

        # Buyers absent from an aggregate (e.g. no loans in a window) get 0.
        current_snapshot.fillna(0, inplace=True)
        current_snapshot['es_primer_mes_activo'] = (current_snapshot['antiguedad_cliente_meses'] == 0).astype(int)
        current_snapshot['aceleracion_monto'] = current_snapshot['monto_dispuesto_ultimos_30d'] - current_snapshot['monto_dispuesto_31_60d']
        current_snapshot['aceleracion_frecuencia'] = current_snapshot['frecuencia_prestamos_ultimos_30d'] - current_snapshot['frecuencia_prestamos_31_60d']
        current_snapshot['cambio_en_installments_reciente'] = current_snapshot['installments_prestamo_reciente'] - current_snapshot['promedio_installments_historico']

        # --- Target: an installment due within 90 days after the snapshot is
        # still open more than 35 days after its due date ---
        horizon_end = snapshot_date + pd.Timedelta(days=90)
        future_payments = payments_enriched[(payments_enriched['due_date'] > snapshot_date) & (payments_enriched['due_date'] <= horizon_end)]
        default_check_date = future_payments['due_date'] + pd.Timedelta(days=35)
        defaulted_payments = future_payments[(default_check_date < label_as_of) & (future_payments['is_open'])]
        defaulted_buyers = defaulted_payments['buyer_account'].unique()
        current_snapshot['default_en_35d'] = np.where(current_snapshot['buyer_account'].isin(defaulted_buyers), 1, 0)

        all_snapshots.append(current_snapshot)

    if not all_snapshots:
        # pd.concat([]) would fail with an unhelpful "No objects to concatenate".
        raise ValueError("No se generó ningún snapshot: los datos no cubren el historial mínimo requerido para construir el dataset de análisis.")

    final_dataset = pd.concat(all_snapshots, ignore_index=True)
    final_dataset.fillna(0, inplace=True)
    final_dataset = final_dataset[final_dataset['snapshot_date'] >= final_dataset['acquisition_date']]
    print(f"✅ Dataset de análisis finalizado con {len(final_dataset)} filas.")

    # =============================================================================
    # STEP 2: MODEL TRAINING
    # =============================================================================
    print("\n--- [PASO 2] Iniciando el entrenamiento del modelo... ---")
    # Every column except identifiers, dates and the target is a model feature.
    features = [col for col in final_dataset.columns if col not in ['buyer_account', 'snapshot_date', 'acquisition_date', 'default_en_35d']]
    target = 'default_en_35d'

    X = final_dataset[features]
    y = final_dataset[target]

    # Temporal split: snapshots up to split_date train the model, later ones test it.
    split_date = '2024-12-31'
    train_mask = final_dataset['snapshot_date'] <= pd.to_datetime(split_date)
    test_mask = final_dataset['snapshot_date'] > pd.to_datetime(split_date)

    X_train, X_test = X[train_mask], X[test_mask]
    y_train, y_test = y[train_mask], y[test_mask]

    # Fail early with a clear message: with a single training class,
    # predict_proba has only one column and `[:, 1]` would raise IndexError later.
    if X_train.empty or y_train.nunique() < 2:
        raise ValueError(
            f"El conjunto de entrenamiento (snapshots hasta {split_date}) debe contener "
            "observaciones de ambas clases para entrenar el modelo."
        )

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    model = RandomForestClassifier(n_estimators=100, class_weight='balanced_subsample', random_state=42, n_jobs=-1)
    model.fit(X_train_scaled, y_train)
    print("✅ Modelo entrenado exitosamente.")

    # ROC AUC needs a non-empty hold-out set containing both classes. When that
    # is not available, skip the metric instead of aborting before scoring.
    if len(X_test) > 0 and y_test.nunique() > 1:
        X_test_scaled = scaler.transform(X_test)
        y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]
        auc = roc_auc_score(y_test, y_pred_proba)
        print(f"📈 Desempeño del modelo en datos de prueba (AUC): {auc:.4f}")
    else:
        auc = float('nan')
        print(f"⚠️ ADVERTENCIA: No hay datos de prueba con ambas clases después de {split_date}; se omite el cálculo del AUC.")


    # =============================================================================
    # STEP 3: SCORING THE CURRENT PORTFOLIO
    # =============================================================================
    print("\n--- [PASO 3] Iniciando el scoring del portafolio actual... ---")

    def recalculate_features_for_buyer(buyer_id, all_payments_df, all_loans_df, today_date, materiality_thresh, epsilon_val):
        """Build the single-row feature frame for one buyer as of ``today_date``.

        Args:
            buyer_id: Value matched against the ``buyer_account`` column.
            all_payments_df: Installment-level frame; the columns read are
                ``buyer_account``, ``created_date``, ``due_date``,
                ``amount_due``, ``amount_paid`` and ``limite_de_credito``.
            all_loans_df: Loan-level frame; the columns read are
                ``buyer_account``, ``loan_id``, ``issuance_date``,
                ``principal_amount`` and ``total_installments``.
            today_date: Timestamp used as the reference date for all windows.
            materiality_thresh: An installment counts as open only while less
                than ``1 - materiality_thresh`` of its amount due is paid.
            epsilon_val: Tolerance below which balances and amounts due are
                treated as zero.

        Returns:
            A one-row DataFrame of features, or ``None`` when the buyer has no
            rows in ``all_payments_df``.
        """
        client_payments = all_payments_df[all_payments_df['buyer_account'] == buyer_id].copy()
        client_loans = all_loans_df[all_loans_df['buyer_account'] == buyer_id].copy()
        if client_payments.empty:
            return None

        client_df = pd.DataFrame([{'buyer_account': buyer_id}])

        # Tenure in whole calendar months since the buyer's first record.
        acquisition_date = client_payments['created_date'].min()
        client_df['acquisition_date'] = acquisition_date
        client_df['antiguedad_cliente_meses'] = ((today_date.year - acquisition_date.year) * 12 + (today_date.month - acquisition_date.month))
        client_df['es_primer_mes_activo'] = (client_df['antiguedad_cliente_meses'] == 0).astype(int)
        client_df['numero_total_prestamos_historico'] = client_loans['loan_id'].nunique()

        # Disbursed amount and loan count over the last 30 days.
        loans_30d = client_loans[client_loans['issuance_date'] >= (today_date - pd.DateOffset(days=30))]
        client_df['monto_dispuesto_ultimos_30d'] = loans_30d['principal_amount'].sum()
        client_df['frecuencia_prestamos_ultimos_30d'] = loans_30d['loan_id'].nunique()

        # Same two metrics for the window 31-60 days back.
        loans_31_60d = client_loans[(client_loans['issuance_date'] >= (today_date - pd.DateOffset(days=60))) & (client_loans['issuance_date'] < (today_date - pd.DateOffset(days=30)))]
        client_df['monto_dispuesto_31_60d'] = loans_31_60d['principal_amount'].sum()
        client_df['frecuencia_prestamos_31_60d'] = loans_31_60d['loan_id'].nunique()

        pay_df = client_payments.copy()
        pay_df['amount_due_clean'] = pd.to_numeric(pay_df['amount_due'], errors='coerce').fillna(0.0)
        pay_df['amount_paid_clean'] = pd.to_numeric(pay_df['amount_paid'], errors='coerce').fillna(0.0)
        pay_df['saldo_individual'] = pay_df['amount_due_clean'] - pay_df['amount_paid_clean']
        with np.errstate(divide='ignore', invalid='ignore'):
            paid_ratio = pay_df['amount_paid_clean'].divide(pay_df['amount_due_clean'])
        # Assign back instead of a chained `inplace=True` call, which does not
        # update the frame under pandas Copy-on-Write.
        # A zero amount due is treated as 100% paid.
        pay_df['paid_ratio'] = paid_ratio.replace([np.inf, -np.inf], np.nan).fillna(1)
        pay_df['is_open'] = (
            (pay_df['saldo_individual'] > epsilon_val) &
            (pay_df['amount_due_clean'] > epsilon_val) &
            (pay_df['paid_ratio'] < (1 - materiality_thresh))
        )

        client_df['saldo_pendiente_actual'] = pay_df.loc[pay_df['is_open'], 'saldo_individual'].sum()

        # Maximum days past due over the open installments already due.
        past_due_df = pay_df[(pay_df['is_open']) & (pay_df['due_date'] < today_date)]
        client_df['max_dpd_actual'] = (today_date - past_due_df['due_date']).dt.days.max() if not past_due_df.empty else 0

        # First non-null limit for the buyer, or 0 when none is known.
        known_limits = client_payments['limite_de_credito'].dropna()
        client_df['limite_de_credito'] = known_limits.iloc[0] if not known_limits.empty else 0
        client_df['porcentaje_utilizacion'] = (client_df['saldo_pendiente_actual'] / client_df['limite_de_credito'].replace(0, np.nan)) * 100

        if not client_loans.empty:
            most_recent_loan = client_loans.loc[client_loans['issuance_date'].idxmax()]
            client_df['installments_prestamo_reciente'] = most_recent_loan['total_installments']
            client_df['promedio_installments_historico'] = client_loans['total_installments'].mean()
        else:
            client_df['installments_prestamo_reciente'] = 0
            client_df['promedio_installments_historico'] = 0

        # Undefined ratios (e.g. utilisation with a zero limit) become 0.
        client_df.fillna(0, inplace=True)
        client_df['aceleracion_monto'] = client_df['monto_dispuesto_ultimos_30d'] - client_df['monto_dispuesto_31_60d']
        client_df['aceleracion_frecuencia'] = client_df['frecuencia_prestamos_ultimos_30d'] - client_df['frecuencia_prestamos_31_60d']
        client_df['cambio_en_installments_reciente'] = client_df['installments_prestamo_reciente'] - client_df['promedio_installments_historico']

        return client_df

    all_unique_buyers = payments_df['buyer_account'].unique()
    scoring_results = []
    today = payments_df['created_date'].max()

    print("Calculando features para el portafolio actual...")
    for buyer_id in all_unique_buyers:
        client_features = recalculate_features_for_buyer(buyer_id, payments_df, loan_level_df, today, materiality_threshold, epsilon)
        if client_features is not None:
            scoring_results.append(client_features)

    scoring_df = pd.concat(scoring_results, ignore_index=True)

    scoring_df.sort_values('buyer_account', inplace=True)
    scoring_df.reset_index(drop=True, inplace=True)

    for col in features:
        if col not in scoring_df.columns:
            scoring_df[col] = 0

    X_today = scoring_df[features]
    X_today_scaled = scaler.transform(X_today)
    risk_scores = model.predict_proba(X_today_scaled)[:, 1]
    scoring_df['risk_score'] = risk_scores
    print("✅ Scores de riesgo calculados para todo el portafolio.")

    print("Aplicando segmentación y reglas de negocio...")
    bins = [-0.01, 0.10, 0.40, 0.70, 1.01]
    labels = ['Bajo', 'Medio', 'Alto', 'Crítico']
    scoring_df['nivel_de_riesgo_modelo'] = pd.cut(scoring_df['risk_score'], bins=bins, labels=labels)

    scoring_df['segmento_final'] = scoring_df['nivel_de_riesgo_modelo'].astype(str)

    last_loan_dates = loan_level_df.groupby('buyer_account')['issuance_date'].max()
    scoring_df = pd.merge(scoring_df, last_loan_dates.rename('last_loan_date'), on='buyer_account', how='left')
    scoring_df['dias_desde_ultimo_prestamo'] = (today - scoring_df['last_loan_date']).dt.days
    scoring_df.loc[scoring_df['dias_desde_ultimo_prestamo'] > 180, 'segmento_final'] = 'Inactivo'

    good_history_mask = (scoring_df['segmento_final'] == 'Crítico') & \
                        (scoring_df['max_dpd_actual'] == 0) & \
                        (scoring_df['numero_total_prestamos_historico'] > 5)
    scoring_df.loc[good_history_mask, 'segmento_final'] = 'Alto (Revisión por Regla)'

    scoring_df.loc[scoring_df['max_dpd_actual'] > 35, 'segmento_final'] = 'En Cobranza'
    scoring_df.fillna({'segmento_final': 'Bajo'}, inplace=True)

    all_report_cols = [
        'buyer_account', 'risk_score', 'segmento_final', 'nivel_de_riesgo_modelo', 'dias_desde_ultimo_prestamo',
        'limite_de_credito', 'acquisition_date', 'antiguedad_cliente_meses', 'es_primer_mes_activo',
        'numero_total_prestamos_historico', 'monto_dispuesto_ultimos_30d', 'frecuencia_prestamos_ultimos_30d',
        'monto_dispuesto_31_60d', 'frecuencia_prestamos_31_60d', 'saldo_pendiente_actual',
        'porcentaje_utilizacion', 'max_dpd_actual', 'installments_prestamo_reciente',
        'promedio_installments_historico', 'aceleracion_monto', 'aceleracion_frecuencia',
        'cambio_en_installments_reciente'
    ]
    final_cols_to_save = [col for col in all_report_cols if col in scoring_df.columns]
    scoring_df[final_cols_to_save].to_csv('reporte_scoring_completo_unificado.csv', index=False)
    print("✅ Reporte de scoring completo guardado.")

    print("\n" + "="*20 + " Resumen Final del Portafolio por Riesgo " + "="*20)
    risk_summary = scoring_df.groupby('segmento_final', observed=False).agg(
        Numero_de_Clientes=('buyer_account', 'count'),
        Score_Promedio=('risk_score', 'mean')
    ).sort_index()
    risk_summary['Score_Promedio'] = (risk_summary['Score_Promedio'] * 100).map('{:.2f}%'.format)
    display(risk_summary)

    print("\n\n✅ Pipeline completo finalizado.")

else:
    print("\nPipeline detenido porque no se cargaron datos.")

Iniciando el pipeline completo del modelo...

--- [PASO 1] Iniciando la construcción del dataset de análisis... ---
Cargando y preparando los datos desde 'payments.json'...
❌ ERROR CRÍTICO: No se encontró el archivo de datos 'payments.json'.

Pipeline detenido porque no se cargaron datos.
