In [5]:
#!/usr/bin/env python3
"""
MODELO BASELINE: LightGBM + SMOTE
Entrenar, evaluar y guardar modelos
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (classification_report, confusion_matrix, 
                             roc_auc_score, roc_curve, accuracy_score)
from imblearn.over_sampling import SMOTE
import lightgbm as lgb
import pickle
import os
import warnings
warnings.filterwarnings('ignore')

print("="*80)
print("MODELO BASELINE: LightGBM + SMOTE")
print("="*80)

OUTPUT_DIR = 'output_ml'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# =============================================================================
# PASO 1: CARGAR DATOS
# =============================================================================

print("\n[1/5] Cargando datos preprocesados...")

# Intentar cargar desde pickle primero
try:
    with open(f'{OUTPUT_DIR}/train_test_split.pkl', 'rb') as f:
        data = pickle.load(f)
    
    X_train = data['X_train']
    X_test = data['X_test']
    y_train = data['y_train']
    y_test = data['y_test']
    feature_names = data['feature_names']
    
    print(f" Cargado desde pickle")
except:
    # Cargar desde CSV
    print(f"  Cargando desde CSV...")
    X_train = pd.read_csv(f'{OUTPUT_DIR}/X_train.csv')
    X_test = pd.read_csv(f'{OUTPUT_DIR}/X_test.csv')
    y_train = pd.read_csv(f'{OUTPUT_DIR}/y_train.csv').squeeze()
    y_test = pd.read_csv(f'{OUTPUT_DIR}/y_test.csv').squeeze()
    feature_names = X_train.columns.tolist()
    print(f"  Cargado desde CSV")

print(f"\n  Train: {X_train.shape}")
print(f"  Test: {X_test.shape}")
print(f"  Features: {len(feature_names)}")

print(f"\n  Distribución Train:")
print(f"    Clase 0: {(y_train==0).sum():,} ({(y_train==0).sum()/len(y_train)*100:.1f}%)")
print(f"    Clase 1: {(y_train==1).sum():,} ({(y_train==1).sum()/len(y_train)*100:.1f}%)")

# =============================================================================
# PASO 2: BALANCEO CON SMOTE
# =============================================================================

print("\n[2/5] Balanceo con SMOTE...")

print(f"\n  Antes de SMOTE:")
print(f"    Clase 0: {(y_train==0).sum():,}")
print(f"    Clase 1: {(y_train==1).sum():,}")
print(f"    Ratio: 1:{(y_train==0).sum()/(y_train==1).sum():.2f}")

# Aplicar SMOTE
smote = SMOTE(random_state=42, k_neighbors=5)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print(f"\n  Después de SMOTE:")
print(f"    Clase 0: {(y_train_smote==0).sum():,}")
print(f"    Clase 1: {(y_train_smote==1).sum():,}")
print(f"    Ratio: 1:1.0")

# =============================================================================
# PASO 3: ENTRENAR MODELOS
# =============================================================================

print("\n[3/5] Entrenando modelos...")

# Modelo A: Baseline (sin balanceo)
print("\n  [A] Modelo baseline (sin balanceo)...")

model_baseline = lgb.LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    num_leaves=31,
    random_state=42,
    verbose=-1
)

model_baseline.fit(X_train, y_train)
print(f"      Entrenado")

# Predicciones baseline
y_pred_base = model_baseline.predict(X_test)
y_proba_base = model_baseline.predict_proba(X_test)[:, 1]

# Modelo B: Con SMOTE
print("\n  [B] Modelo con SMOTE (balanceado)...")

model_smote = lgb.LGBMClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=7,
    num_leaves=63,
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbose=-1
)

model_smote.fit(X_train_smote, y_train_smote)
print(f"      Entrenado")

# Predicciones SMOTE
y_pred_smote = model_smote.predict(X_test)
y_proba_smote = model_smote.predict_proba(X_test)[:, 1]

# =============================================================================
# PASO 4: EVALUACION
# =============================================================================

print("\n[4/5] Evaluando modelos...")

# Evaluación baseline
print("\n" + "="*80)
print("MODELO BASELINE (Sin balanceo)")
print("="*80)
print(classification_report(y_test, y_pred_base, digits=4))
acc_base = accuracy_score(y_test, y_pred_base)
auc_base = roc_auc_score(y_test, y_proba_base)
print(f"Accuracy: {acc_base:.4f}")
print(f"ROC-AUC:  {auc_base:.4f}")

# Evaluación SMOTE
print("\n" + "="*80)
print("MODELO CON SMOTE (Balanceado)")
print("="*80)
print(classification_report(y_test, y_pred_smote, digits=4))
acc_smote = accuracy_score(y_test, y_pred_smote)
auc_smote = roc_auc_score(y_test, y_proba_smote)
print(f"Accuracy: {acc_smote:.4f}")
print(f"ROC-AUC:  {auc_smote:.4f}")

# =============================================================================
# PASO 5: VISUALIZACIONES
# =============================================================================

print("\n[5/5] Generando visualizaciones...")

plt.style.use('seaborn-v0_8-whitegrid')

# 1. Confusion Matrices
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

cm_base = confusion_matrix(y_test, y_pred_base)
cm_smote = confusion_matrix(y_test, y_pred_smote)

sns.heatmap(cm_base, annot=True, fmt='d', cmap='Blues', ax=axes[0], 
            xticklabels=['Fracaso', 'Éxito'], yticklabels=['Fracaso', 'Éxito'])
axes[0].set_title('Confusion Matrix - Baseline\n(Sin balanceo)', fontsize=13, fontweight='bold')
axes[0].set_xlabel('Predicho', fontweight='bold')
axes[0].set_ylabel('Real', fontweight='bold')

sns.heatmap(cm_smote, annot=True, fmt='d', cmap='Greens', ax=axes[1],
            xticklabels=['Fracaso', 'Éxito'], yticklabels=['Fracaso', 'Éxito'])
axes[1].set_title('Confusion Matrix - SMOTE\n(Con balanceo)', fontsize=13, fontweight='bold')
axes[1].set_xlabel('Predicho', fontweight='bold')
axes[1].set_ylabel('Real', fontweight='bold')

plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/01_confusion_matrices.png', dpi=300, bbox_inches='tight')
plt.close()
print(f"  01_confusion_matrices.png")

# 2. ROC Curves
fig, ax = plt.subplots(1, 1, figsize=(10, 8))

fpr_base, tpr_base, _ = roc_curve(y_test, y_proba_base)
fpr_smote, tpr_smote, _ = roc_curve(y_test, y_proba_smote)

ax.plot(fpr_base, tpr_base, label=f'Baseline (AUC={auc_base:.3f})', 
        linewidth=2.5, color='#3498db')
ax.plot(fpr_smote, tpr_smote, label=f'SMOTE (AUC={auc_smote:.3f})', 
        linewidth=2.5, color='#2ecc71')
ax.plot([0, 1], [0, 1], 'k--', linewidth=1.5, label='Random', alpha=0.5)

ax.set_xlabel('False Positive Rate (FPR)', fontsize=12, fontweight='bold')
ax.set_ylabel('True Positive Rate (TPR)', fontsize=12, fontweight='bold')
ax.set_title('ROC Curves - Comparación de Modelos', fontsize=14, fontweight='bold')
ax.legend(fontsize=12, loc='lower right')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/02_roc_curves.png', dpi=300, bbox_inches='tight')
plt.close()
print(f"02_roc_curves.png")

# 3. Feature Importance
fig, ax = plt.subplots(1, 1, figsize=(10, 12))

importance = model_smote.feature_importances_
indices = np.argsort(importance)[-20:]  # Top 20

colors = plt.cm.viridis(np.linspace(0.3, 0.9, len(indices)))

ax.barh(range(len(indices)), importance[indices], color=colors)
ax.set_yticks(range(len(indices)))
ax.set_yticklabels([feature_names[i] for i in indices], fontsize=10)
ax.set_xlabel('Importancia', fontsize=12, fontweight='bold')
ax.set_title('Top 20 Features Más Importantes\n(Modelo SMOTE)', fontsize=14, fontweight='bold')
ax.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/03_feature_importance.png', dpi=300, bbox_inches='tight')
plt.close()
print(f"03_feature_importance.png")

# =============================================================================
# GUARDAR MODELOS Y RESULTADOS
# =============================================================================

print("\n" + "="*80)
print("GUARDANDO MODELOS Y RESULTADOS")
print("="*80)

# Guardar modelos
with open(f'{OUTPUT_DIR}/model_baseline.pkl', 'wb') as f:
    pickle.dump(model_baseline, f)
print(f" model_baseline.pkl")

with open(f'{OUTPUT_DIR}/model_smote.pkl', 'wb') as f:
    pickle.dump(model_smote, f)
print(f"  model_smote.pkl")

# Guardar métricas
metricas = {
    'baseline': {
        'accuracy': acc_base,
        'roc_auc': auc_base,
        'confusion_matrix': cm_base.tolist()
    },
    'smote': {
        'accuracy': acc_smote,
        'roc_auc': auc_smote,
        'confusion_matrix': cm_smote.tolist()
    }
}

with open(f'{OUTPUT_DIR}/metricas.pkl', 'wb') as f:
    pickle.dump(metricas, f)
print(f"  ✓ metricas.pkl")

# Feature importance
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': model_smote.feature_importances_
}).sort_values('importance', ascending=False)

importance_df.to_csv(f'{OUTPUT_DIR}/feature_importance.csv', index=False)
print(f"  ✓ feature_importance.csv")

# =============================================================================
# RESUMEN FINAL
# =============================================================================

print("\n" + "="*80)
print("RESUMEN FINAL")
print("="*80)

print(f"""
MODELO BASELINE (Sin balanceo):
  Accuracy:  {acc_base:.4f}
  ROC-AUC:   {auc_base:.4f}

MODELO SMOTE (Balanceado): ⭐
  Accuracy:  {acc_smote:.4f}
  ROC-AUC:   {auc_smote:.4f}
  
Mejora con SMOTE:
  Accuracy:  {(acc_smote-acc_base)*100:+.2f}%
  ROC-AUC:   {(auc_smote-auc_base)*100:+.2f}%

Top 5 Features Más Importantes:
""")

for idx, row in importance_df.head(5).iterrows():
    print(f"  {row['feature']:45s}: {row['importance']:.4f}")

print(f"""
Archivos generados en '{OUTPUT_DIR}/':
  Modelos:
    - model_baseline.pkl
    - model_smote.pkl ⭐ (USAR ESTE)
  
  Métricas:
    - metricas.pkl
    - feature_importance.csv
  
  Visualizaciones:
    - 01_confusion_matrices.png
    - 02_roc_curves.png
    - 03_feature_importance.png

✅ MODELADO BASELINE COMPLETADO
""")

print("="*80)
print("SIGUIENTE PASO: Análisis SHAP (explicabilidad)")
print("  (Te lo generaré cuando estés listo)")
print("="*80)

MODELO BASELINE: LightGBM + SMOTE

[1/5] Cargando datos preprocesados...
 Cargado desde pickle

  Train: (54961, 192)
  Test: (13741, 192)
  Features: 192

  Distribución Train:
    Clase 0: 41,280 (75.1%)
    Clase 1: 13,681 (24.9%)

[2/5] Balanceo con SMOTE...

  Antes de SMOTE:
    Clase 0: 41,280
    Clase 1: 13,681
    Ratio: 1:3.02


AttributeError: 'SMOTE' object has no attribute '_validate_data'

In [3]:
!pip install -U imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.14.1-py3-none-any.whl.metadata (8.9 kB)
Collecting sklearn-compat<0.2,>=0.1.5 (from imbalanced-learn)
  Downloading sklearn_compat-0.1.5-py3-none-any.whl.metadata (20 kB)
Downloading imbalanced_learn-0.14.1-py3-none-any.whl (235 kB)
Downloading sklearn_compat-0.1.5-py3-none-any.whl (20 kB)
Installing collected packages: sklearn-compat, imbalanced-learn
[2K  Attempting uninstall: imbalanced-learn
[2K    Found existing installation: imbalanced-learn 0.12.3
[2K    Uninstalling imbalanced-learn-0.12.3:
[2K      Successfully uninstalled imbalanced-learn-0.12.3
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [imbalanced-learn][imbalanced-learn]
[1A[2KSuccessfully installed imbalanced-learn-0.14.1 sklearn-compat-0.1.5


In [23]:
#!/usr/bin/env python3
"""
MODELO BASELINE: LightGBM + SMOTE
Entrenar, evaluar y guardar modelos
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (classification_report, confusion_matrix, 
                             roc_auc_score, roc_curve, accuracy_score)
from sklearn.utils import resample
try:
    from imblearn.over_sampling import SMOTE
except:
    SMOTE = None
import lightgbm as lgb
import pickle
import os
import warnings
warnings.filterwarnings('ignore')

print("="*80)
print("MODELO BASELINE: LightGBM + SMOTE")
print("="*80)

OUTPUT_DIR = 'output_ml'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# =============================================================================
# PASO 1: CARGAR DATOS
# =============================================================================

print("\n[1/5] Cargando datos preprocesados...")

# Intentar cargar desde pickle primero
try:
    with open(f'{OUTPUT_DIR}/train_test_split.pkl', 'rb') as f:
        data = pickle.load(f)
    
    X_train = data['X_train']
    X_test = data['X_test']
    y_train = data['y_train']
    y_test = data['y_test']
    feature_names = data['feature_names']
    
    print(f"  ✓ Cargado desde pickle")
except:
    # Cargar desde CSV
    print(f"  Cargando desde CSV...")
    X_train = pd.read_csv(f'{OUTPUT_DIR}/X_train.csv')
    X_test = pd.read_csv(f'{OUTPUT_DIR}/X_test.csv')
    y_train = pd.read_csv(f'{OUTPUT_DIR}/y_train.csv').squeeze()
    y_test = pd.read_csv(f'{OUTPUT_DIR}/y_test.csv').squeeze()
    feature_names = X_train.columns.tolist()
    print(f"  ✓ Cargado desde CSV")

print(f"\n  Train: {X_train.shape}")
print(f"  Test: {X_test.shape}")
print(f"  Features: {len(feature_names)}")

print(f"\n  Distribución Train:")
print(f"    Clase 0: {(y_train==0).sum():,} ({(y_train==0).sum()/len(y_train)*100:.1f}%)")
print(f"    Clase 1: {(y_train==1).sum():,} ({(y_train==1).sum()/len(y_train)*100:.1f}%)")

# =============================================================================
# PASO 2: BALANCEO CON SMOTE
# =============================================================================

print("\n[2/5] Balanceo con SMOTE...")

print(f"\n  Antes de SMOTE:")
print(f"    Clase 0: {(y_train==0).sum():,}")
print(f"    Clase 1: {(y_train==1).sum():,}")
print(f"    Ratio: 1:{(y_train==0).sum()/(y_train==1).sum():.2f}")

# Aplicar SMOTE con manejo de errores
try:
    smote = SMOTE(random_state=42, k_neighbors=5)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
    
    print(f"\n  Después de SMOTE:")
    print(f"    Clase 0: {(y_train_smote==0).sum():,}")
    print(f"    Clase 1: {(y_train_smote==1).sum():,}")
    print(f"    Ratio: 1:1.0")
    print(f"  ✓ SMOTE aplicado correctamente")
    
except Exception as e:
    print(f"\n  ⚠️ Error con SMOTE: {e}")
    print(f"  Usando sobremuestreo manual...")
    
    # Alternativa: Sobremuestreo manual
    from sklearn.utils import resample
    
    # Separar clases
    X_train_0 = X_train[y_train == 0]
    X_train_1 = X_train[y_train == 1]
    y_train_0 = y_train[y_train == 0]
    y_train_1 = y_train[y_train == 1]
    
    # Sobremuestrear clase minoritaria
    X_train_1_upsampled = resample(X_train_1, 
                                     replace=True,
                                     n_samples=len(X_train_0),
                                     random_state=42)
    y_train_1_upsampled = resample(y_train_1,
                                     replace=True,
                                     n_samples=len(y_train_0),
                                     random_state=42)
    
    # Combinar
    X_train_smote = pd.concat([X_train_0, X_train_1_upsampled])
    y_train_smote = pd.concat([y_train_0, y_train_1_upsampled])
    
    print(f"\n  Después de sobremuestreo manual:")
    print(f"    Clase 0: {(y_train_smote==0).sum():,}")
    print(f"    Clase 1: {(y_train_smote==1).sum():,}")
    print(f"    Ratio: 1:1.0")
    print(f"  ✓ Sobremuestreo manual aplicado")

# =============================================================================
# PASO 3: ENTRENAR MODELOS
# =============================================================================

print("\n[3/5] Entrenando modelos...")

# Modelo A: Baseline (sin balanceo)
print("\n  [A] Modelo baseline (sin balanceo)...")

model_baseline = lgb.LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    num_leaves=31,
    random_state=42,
    verbose=-1
)

model_baseline.fit(X_train, y_train)
print(f"      ✓ Entrenado")

# Predicciones baseline
y_pred_base = model_baseline.predict(X_test)
y_proba_base = model_baseline.predict_proba(X_test)[:, 1]

# Modelo B: Con SMOTE
print("\n  [B] Modelo con SMOTE (balanceado)...")

model_smote = lgb.LGBMClassifier(
    n_estimators=200,
    learning_rate=0.05,
    max_depth=7,
    num_leaves=63,
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    verbose=-1
)

model_smote.fit(X_train_smote, y_train_smote)
print(f"      ✓ Entrenado")

# Predicciones SMOTE
y_pred_smote = model_smote.predict(X_test)
y_proba_smote = model_smote.predict_proba(X_test)[:, 1]

# =============================================================================
# PASO 4: EVALUACION
# =============================================================================

print("\n[4/5] Evaluando modelos...+++++++++")

# Evaluación baseline
print("\n" + "="*80)
print("MODELO BASELINE (Sin balanceo)")
print("="*80)
print(classification_report(y_test, y_pred_base, digits=4))
acc_base = accuracy_score(y_test, y_pred_base)
auc_base = roc_auc_score(y_test, y_proba_base)
print(f"Accuracy: {acc_base:.4f}")
print(f"ROC-AUC:  {auc_base:.4f}")

# Evaluación SMOTE
print("\n" + "="*80)
print("MODELO CON SMOTE (Balanceado)")
print("="*80)
print(classification_report(y_test, y_pred_smote, digits=4))
acc_smote = accuracy_score(y_test, y_pred_smote)
auc_smote = roc_auc_score(y_test, y_proba_smote)
print(f"Accuracy: {acc_smote:.4f}")
print(f"ROC-AUC:  {auc_smote:.4f}")

# =============================================================================
# PASO 5: VISUALIZACIONES
# =============================================================================

print("\n[5/5] Generando visualizaciones...")

plt.style.use('seaborn-v0_8-whitegrid')

# 1. Confusion Matrices
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

cm_base = confusion_matrix(y_test, y_pred_base)
cm_smote = confusion_matrix(y_test, y_pred_smote)

sns.heatmap(cm_base, annot=True, fmt='d', cmap='Blues', ax=axes[0], 
            xticklabels=['Fracaso', 'Éxito'], yticklabels=['Fracaso', 'Éxito'])
axes[0].set_title('Confusion Matrix - Baseline\n(Sin balanceo)', fontsize=13, fontweight='bold')
axes[0].set_xlabel('Predicho', fontweight='bold')
axes[0].set_ylabel('Real', fontweight='bold')

sns.heatmap(cm_smote, annot=True, fmt='d', cmap='Greens', ax=axes[1],
            xticklabels=['Fracaso', 'Éxito'], yticklabels=['Fracaso', 'Éxito'])
axes[1].set_title('Confusion Matrix - SMOTE\n(Con balanceo)', fontsize=13, fontweight='bold')
axes[1].set_xlabel('Predicho', fontweight='bold')
axes[1].set_ylabel('Real', fontweight='bold')

plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/01_confusion_matrices.png', dpi=300, bbox_inches='tight')
plt.close()
print(f"  ✓ 01_confusion_matrices.png")

# 2. ROC Curves
fig, ax = plt.subplots(1, 1, figsize=(10, 8))

fpr_base, tpr_base, _ = roc_curve(y_test, y_proba_base)
fpr_smote, tpr_smote, _ = roc_curve(y_test, y_proba_smote)

ax.plot(fpr_base, tpr_base, label=f'Baseline (AUC={auc_base:.3f})', 
        linewidth=2.5, color='#3498db')
ax.plot(fpr_smote, tpr_smote, label=f'SMOTE (AUC={auc_smote:.3f})', 
        linewidth=2.5, color='#2ecc71')
ax.plot([0, 1], [0, 1], 'k--', linewidth=1.5, label='Random', alpha=0.5)

ax.set_xlabel('False Positive Rate (FPR)', fontsize=12, fontweight='bold')
ax.set_ylabel('True Positive Rate (TPR)', fontsize=12, fontweight='bold')
ax.set_title('ROC Curves - Comparación de Modelos', fontsize=14, fontweight='bold')
ax.legend(fontsize=12, loc='lower right')
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/02_roc_curves.png', dpi=300, bbox_inches='tight')
plt.close()
print(f"  ✓ 02_roc_curves.png")

# 3. Feature Importance
fig, ax = plt.subplots(1, 1, figsize=(10, 12))

importance = model_smote.feature_importances_
indices = np.argsort(importance)[-20:]  # Top 20

colors = plt.cm.viridis(np.linspace(0.3, 0.9, len(indices)))

ax.barh(range(len(indices)), importance[indices], color=colors)
ax.set_yticks(range(len(indices)))
ax.set_yticklabels([feature_names[i] for i in indices], fontsize=10)
ax.set_xlabel('Importancia', fontsize=12, fontweight='bold')
ax.set_title('Top 20 Features Más Importantes\n(Modelo SMOTE)', fontsize=14, fontweight='bold')
ax.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/03_feature_importance.png', dpi=300, bbox_inches='tight')
plt.close()
print(f"  ✓ 03_feature_importance.png")

# =============================================================================
# GUARDAR MODELOS Y RESULTADOS
# =============================================================================

print("\n" + "="*80)
print("GUARDANDO MODELOS Y RESULTADOS")
print("="*80)

# Guardar modelos
with open(f'{OUTPUT_DIR}/model_baseline.pkl', 'wb') as f:
    pickle.dump(model_baseline, f)
print(f"  ✓ model_baseline.pkl")

with open(f'{OUTPUT_DIR}/model_smote.pkl', 'wb') as f:
    pickle.dump(model_smote, f)
print(f"  ✓ model_smote.pkl")

# Guardar métricas
metricas = {
    'baseline': {
        'accuracy': acc_base,
        'roc_auc': auc_base,
        'confusion_matrix': cm_base.tolist()
    },
    'smote': {
        'accuracy': acc_smote,
        'roc_auc': auc_smote,
        'confusion_matrix': cm_smote.tolist()
    }
}

with open(f'{OUTPUT_DIR}/metricas.pkl', 'wb') as f:
    pickle.dump(metricas, f)
print(f"  ✓ metricas.pkl")

# Feature importance
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': model_smote.feature_importances_
}).sort_values('importance', ascending=False)

importance_df.to_csv(f'{OUTPUT_DIR}/feature_importance.csv', index=False)
print(f"  ✓ feature_importance.csv")

# =============================================================================
# RESUMEN FINAL
# =============================================================================

print("\n" + "="*80)
print("RESUMEN FINAL")
print("="*80)

print(f"""
MODELO BASELINE (Sin balanceo):
  Accuracy:  {acc_base:.4f}
  ROC-AUC:   {auc_base:.4f}

MODELO SMOTE (Balanceado): ⭐
  Accuracy:  {acc_smote:.4f}
  ROC-AUC:   {auc_smote:.4f}
  
Mejora con SMOTE:
  Accuracy:  {(acc_smote-acc_base)*100:+.2f}%
  ROC-AUC:   {(auc_smote-auc_base)*100:+.2f}%

Top 5 Features Más Importantes:
""")

for idx, row in importance_df.head(5).iterrows():
    print(f"  {row['feature']:45s}: {row['importance']:.4f}")

print(f"""
Archivos generados en '{OUTPUT_DIR}/':
  Modelos:
    - model_baseline.pkl
    - model_smote.pkl ⭐ (USAR ESTE)
  
  Métricas:
    - metricas.pkl
    - feature_importance.csv
  
  Visualizaciones:
    - 01_confusion_matrices.png
    - 02_roc_curves.png
    - 03_feature_importance.png

MODELADO BASELINE COMPLETADO
""")

print("="*80)
print("SIGUIENTE PASO: Análisis SHAP (explicabilidad)")
print("  (Te lo generaré cuando estés listo)")
print("="*80)

MODELO BASELINE: LightGBM + SMOTE

[1/5] Cargando datos preprocesados...
  ✓ Cargado desde pickle

  Train: (54961, 192)
  Test: (13741, 192)
  Features: 192

  Distribución Train:
    Clase 0: 41,280 (75.1%)
    Clase 1: 13,681 (24.9%)

[2/5] Balanceo con SMOTE...

  Antes de SMOTE:
    Clase 0: 41,280
    Clase 1: 13,681
    Ratio: 1:3.02

  ⚠️ Error con SMOTE: 'SMOTE' object has no attribute '_validate_data'
  Usando sobremuestreo manual...

  Después de sobremuestreo manual:
    Clase 0: 41,280
    Clase 1: 41,280
    Ratio: 1:1.0
  ✓ Sobremuestreo manual aplicado

[3/5] Entrenando modelos...

  [A] Modelo baseline (sin balanceo)...
      ✓ Entrenado

  [B] Modelo con SMOTE (balanceado)...
      ✓ Entrenado

[4/5] Evaluando modelos...+++++++++

MODELO BASELINE (Sin balanceo)
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000     10321
           1     1.0000    1.0000    1.0000      3420

    accuracy                         1.0000   

In [11]:
#!/usr/bin/env python3
"""
DETECTAR Y CORREGIR DATA LEAKAGE
Identificar columnas que causan overfitting
"""

import pandas as pd
import numpy as np

print("="*80)
print("DETECCION DE DATA LEAKAGE")
print("="*80)

# =============================================================================
# CARGAR DATASET
# =============================================================================

print("\n[1/4] Cargando dataset...")

df = pd.read_csv('FUSION EMICRON 2024 + GEIH 2023/dataset_ml_clean_fixed.csv')
print(f"  Dimensiones: {df.shape}")

# Verificar variable objetivo
if 'exito_ingresos' in df.columns:
    target_col = 'exito_ingresos'
    y = df[target_col]
    X = df.drop(target_col, axis=1)
else:
    print("  ERROR: No se encontró 'exito_ingresos'")
    exit(1)

print(f"  Variable objetivo: {target_col}")
print(f"  Features: {X.shape[1]}")

# =============================================================================
# DETECTAR CORRELACION PERFECTA CON TARGET
# =============================================================================

print("\n[2/4] Detectando correlación perfecta con target...")

# Solo variables numéricas
num_cols = X.select_dtypes(include=[np.number]).columns

correlaciones = []
for col in num_cols:
    try:
        corr = X[col].corr(y)
        if abs(corr) > 0.95:  # Correlación > 95%
            correlaciones.append((col, corr))
    except:
        pass

if correlaciones:
    print(f"\n  ⚠️ COLUMNAS CON CORRELACION PERFECTA (>95%):")
    for col, corr in sorted(correlaciones, key=lambda x: abs(x[1]), reverse=True):
        print(f"    {col:50s}: {corr:.4f}")
else:
    print(f"  ✓ No se encontraron correlaciones perfectas")

# =============================================================================
# DETECTAR COLUMNAS SOSPECHOSAS
# =============================================================================

print("\n[3/4] Detectando columnas sospechosas de leakage...")

# Palabras clave sospechosas
keywords_sospechosas = ['exito', 'objetivo', 'target', 'indice_exito', 
                        'exito_compuesto', 'resultado', 'outcome']

cols_sospechosas = []
for col in X.columns:
    col_lower = col.lower()
    if any(kw in col_lower for kw in keywords_sospechosas):
        cols_sospechosas.append(col)

if cols_sospechosas:
    print(f"\n  ⚠️ COLUMNAS SOSPECHOSAS (contienen keywords):")
    for col in cols_sospechosas:
        print(f"    - {col}")
        if col in X.columns:
            # Ver distribución
            print(f"      Valores únicos: {X[col].nunique()}")
            print(f"      Distribución: {X[col].value_counts().head(3).to_dict()}")

# =============================================================================
# ELIMINAR COLUMNAS PROBLEMÁTICAS
# =============================================================================

print("\n[4/4] Eliminando columnas problemáticas...")

cols_eliminar = []

# 1. Eliminar correlaciones perfectas
if correlaciones:
    cols_eliminar.extend([col for col, _ in correlaciones])

# 2. Eliminar columnas sospechosas
cols_eliminar.extend(cols_sospechosas)

# 3. Eliminar IDs y variables no predictivas
cols_ids = [c for c in X.columns if any(x in c.lower() for x in ['id', 'secuencia', 'orden'])]
cols_eliminar.extend(cols_ids)

# 4. Eliminar indice_exito si existe (es derivada de target)
if 'indice_exito' in X.columns:
    cols_eliminar.append('indice_exito')

# Eliminar duplicados
cols_eliminar = list(set(cols_eliminar))

if cols_eliminar:
    print(f"\n  Eliminando {len(cols_eliminar)} columnas:")
    for col in cols_eliminar[:10]:
        print(f"    - {col}")
    if len(cols_eliminar) > 10:
        print(f"    ... y {len(cols_eliminar)-10} más")
    
    # Filtrar solo las que existen
    cols_eliminar_existentes = [c for c in cols_eliminar if c in X.columns]
    
    X_limpio = X.drop(columns=cols_eliminar_existentes)
    
    print(f"\n  Features antes: {X.shape[1]}")
    print(f"  Features después: {X_limpio.shape[1]}")
    
    # Guardar dataset limpio
    df_limpio = X_limpio.copy()
    df_limpio[target_col] = y
    
    df_limpio.to_csv('FUSION EMICRON 2024 + GEIH 2023/dataset_ml_sin_leakage.csv', index=False)
    print(f"\n  ✓ Guardado: dataset_ml_sin_leakage.csv")
    
    print("\n" + "="*80)
    print("DATASET LIMPIO SIN LEAKAGE")
    print("="*80)
    print(f"""
Columnas eliminadas: {len(cols_eliminar_existentes)}
Features finales: {X_limpio.shape[1]}

SIGUIENTE PASO:
1. Renombrar dataset:
   mv output_fusion/dataset_ml_sin_leakage.csv output_fusion/dataset_ml_clean.csv

2. Re-ejecutar preprocesamiento:
   python 01_preprocesamiento.py

3. Re-entrenar modelo:
   python 02_modelo_baseline.py
""")
else:
    print(f"\n   No se encontraron columnas problemáticas evidentes")
    print(f"\n  El problema puede ser más sutil. Revisa:")
    print(f"    - ¿Cómo se calculó 'exito_ingresos'?")
    print(f"    - ¿Hay duplicados entre train y test?")
    print(f"    - ¿Las features contienen información del futuro?")

print("="*80)

DETECCION DE DATA LEAKAGE

[1/4] Cargando dataset...
  Dimensiones: (68702, 196)
  Variable objetivo: exito_ingresos
  Features: 195

[2/4] Detectando correlación perfecta con target...

  ⚠️ COLUMNAS CON CORRELACION PERFECTA (>95%):
    ingresos_totales_declarados_log                   : 0.9952

[3/4] Detectando columnas sospechosas de leakage...

  ⚠️ COLUMNAS SOSPECHOSAS (contienen keywords):
    - exito_compuesto
      Valores únicos: 2
      Distribución: {0: 51601, 1: 17101}
    - indice_exito
      Valores únicos: 79
      Distribución: {45.0: 26373, 52.5: 16603, 40.0: 9577}

[4/4] Eliminando columnas problemáticas...

  Eliminando 12 columnas:
    - exito_compuesto
    - emprendedor_oportunidad
    - formalidad_fiscal
    - indice_competitividad
    - id_micronegocio
    - formalidad_laboral
    - ingresos_subsidios
    - ingresos_totales_declarados_log
    - indice_exito
    - emprendedor_necesidad
    ... y 2 más

  Features antes: 195
  Features después: 183

  ✓ Guardado: d

In [29]:
#!/usr/bin/env python3
"""
MODELO FINAL: PREDICCION DE FORMALIZACION
Variable objetivo: Formalidad laboral (informal vs formal)
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (classification_report, confusion_matrix, 
                             roc_auc_score, roc_curve, accuracy_score)
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.utils import resample
import lightgbm as lgb
import pickle
import os
import warnings
warnings.filterwarnings('ignore')

print("="*80)
print("MODELO FINAL: PREDICCION DE FORMALIZACION")
print("="*80)

OUTPUT_DIR = 'output_ml_final'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# =============================================================================
# PASO 1: CARGAR Y CREAR VARIABLE OBJETIVO
# =============================================================================

print("\n[1/5] Cargando datos y creando variable objetivo...")

df = pd.read_csv('FUSION EMICRON 2024 + GEIH 2023/dataset_ml_sin_indices.csv')
print(f"  Dataset: {df.shape}")







#======================================



# --- BLOQUE DE CORRECCIÓN: ELIMINACIÓN DE DATA LEAKAGE ---
# Eliminamos variables que son consecuencia de ser formal o que contienen la respuesta
# Esto evita el 1.00 de accuracy falso.
columnas_leakage = [
    # Registros formales (Si tiene RUT o Registro Mercantil, ya es formal por definición)
    'registro_mercantil', 'registro_rut', 'otro_registro', 'tiene_registro_formal',
    'formalidad_fiscal', 'formalidad_juridica',
    # Variables de crédito que delatan formalidad
    'num_creditos_formales', 'tiene_credito_formal', 'credito_bancario', 
    'credito_microfinanzas', 'credito_cooperativa', 'acceso_credito',
    # Otras variables endógenas
    'exito_ingresos' 
]

# Filtrar columnas existentes para evitar errores
leakage_to_drop = [c for c in columnas_leakage if c in df.columns]
df_limpio = df.drop(columns=leakage_to_drop)

print(f"  ✓ Se eliminaron {len(leakage_to_drop)} variables de filtración (Data Leakage).")
# ---------------------------------------------------------

# Verificamos formalidad_laboral (que es nuestra etiqueta real)
if 'formalidad_laboral' not in df.columns:
    print("\n  ✗ ERROR: No se encontró 'formalidad_laboral'")
    exit(1)

# CREAR VARIABLE OBJETIVO BINARIA
y = (df['formalidad_laboral'] >= 1).astype(int)

# Definimos X usando el dataframe limpio
X = df_limpio.drop(['formalidad_laboral'], axis=1)

# Eliminamos también identificadores si existen
cols_id = ['id_micronegocio', 'id_persona', 'id_encuesta', 'nombre_departamento', 'DIRECTORIO']
X = X.drop(columns=[c for c in cols_id if c in X.columns], errors='ignore')

print(f"  Dataset Final: {X.shape}")
print(f"  Target: exito_formalizacion")



#==========================================================








# Verificar que existe formalidad_laboral
if 'formalidad_laboral' not in df.columns:
    print("\n  ✗ ERROR: No se encontró 'formalidad_laboral'")
    exit(1)

# Ver distribución original
print(f"\n  Distribución formalidad_laboral original:")
print(df['formalidad_laboral'].value_counts().sort_index())

# CREAR VARIABLE OBJETIVO BINARIA
# 0 = Informal, 1+ = Formal (parcial o completo)
df['exito_formalizacion'] = (df['formalidad_laboral'] >= 1).astype(int)

print(f"\n  Nueva variable objetivo 'exito_formalizacion':")
print(df['exito_formalizacion'].value_counts())
balance = df['exito_formalizacion'].mean() * 100
print(f"  Balance: {balance:.1f}% formal / {100-balance:.1f}% informal")

# Eliminar variables objetivo anteriores y la original
cols_drop = ['exito_ingresos', 'formalidad_laboral']
cols_drop = [c for c in cols_drop if c in df.columns]

y = df['exito_formalizacion']
X = df.drop(['exito_formalizacion'] + cols_drop, axis=1)

print(f"\n  Features: {X.shape[1]}")
print(f"  Target: exito_formalizacion")

# =============================================================================
# PASO 2: PREPROCESAMIENTO
# =============================================================================

print("\n[2/5] Preprocesando datos...")

# Manejar missing
num_cols = X.select_dtypes(include=[np.number]).columns
for col in num_cols:
    if X[col].isnull().sum() > 0:
        X[col] = X[col].fillna(X[col].median())

cat_cols = X.select_dtypes(include=['object', 'category']).columns
for col in cat_cols:
    if X[col].isnull().sum() > 0:
        X[col] = X[col].fillna('Desconocido')
    X[col] = X[col].astype('category').cat.codes

print(f"  ✓ Missing manejados")
print(f"  ✓ Categóricas codificadas: {len(cat_cols)}")

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\n  Train: {X_train.shape}")
print(f"  Test: {X_test.shape}")
print(f"  Balance train: {y_train.mean()*100:.1f}% / {(1-y_train.mean())*100:.1f}%")

# =============================================================================
# PASO 3: BALANCEO CON SMOTE MANUAL
# =============================================================================

print("\n[3/5] Balanceando clases...")

print(f"  Antes:")
print(f"    Clase 0 (informal): {(y_train==0).sum():,}")
print(f"    Clase 1 (formal): {(y_train==1).sum():,}")

# Sobremuestreo de clase minoritaria
X_train_0 = X_train[y_train == 0]
X_train_1 = X_train[y_train == 1]
y_train_0 = y_train[y_train == 0]
y_train_1 = y_train[y_train == 1]

X_train_1_up = resample(X_train_1, replace=True, n_samples=len(X_train_0), random_state=42)
y_train_1_up = resample(y_train_1, replace=True, n_samples=len(y_train_0), random_state=42)

X_train_bal = pd.concat([X_train_0, X_train_1_up])
y_train_bal = pd.concat([y_train_0, y_train_1_up])

print(f"  Después:")
print(f"    Clase 0: {(y_train_bal==0).sum():,}")
print(f"    Clase 1: {(y_train_bal==1).sum():,}")

# =============================================================================
# PASO 4: ENTRENAR MODELOS
# =============================================================================

print("\n[4/5] Entrenando modelos...")

# Modelo A: Sin balanceo
print("\n  [A] Modelo baseline (sin balanceo)...")

model_baseline = lgb.LGBMClassifier(
    n_estimators=100,
    learning_rate=0.05,
    max_depth=5,
    num_leaves=31,
    random_state=42,
    verbose=-1
)

model_baseline.fit(X_train, y_train)
y_pred_base = model_baseline.predict(X_test)
y_proba_base = model_baseline.predict_proba(X_test)[:, 1]
print(f"      ✓ Entrenado")

# Modelo B: Con balanceo
print("\n  [B] Modelo balanceado...")

model_balanced = lgb.LGBMClassifier(
    n_estimators=200,
    learning_rate=0.03,
    max_depth=6,
    num_leaves=31,
    min_child_samples=50,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1,
    reg_lambda=1,
    random_state=42,
    verbose=-1
)

model_balanced.fit(X_train_bal, y_train_bal)
y_pred_bal = model_balanced.predict(X_test)
y_proba_bal = model_balanced.predict_proba(X_test)[:, 1]
print(f"      ✓ Entrenado")

# =============================================================================
# PASO 5: EVALUACION
# =============================================================================

print("\n[5/5] Evaluando modelos...")

# Baseline
print("\n" + "="*80)
print("MODELO BASELINE (Sin balanceo)")
print("="*80)
print(classification_report(y_test, y_pred_base, 
                           target_names=['Informal', 'Formal'], digits=4))
acc_base = accuracy_score(y_test, y_pred_base)
auc_base = roc_auc_score(y_test, y_proba_base)
print(f"Accuracy: {acc_base:.4f}")
print(f"ROC-AUC:  {auc_base:.4f}")

# Balanceado
print("\n" + "="*80)
print("MODELO BALANCEADO ⭐")
print("="*80)
print(classification_report(y_test, y_pred_bal,
                           target_names=['Informal', 'Formal'], digits=4))
acc_bal = accuracy_score(y_test, y_pred_bal)
auc_bal = roc_auc_score(y_test, y_proba_bal)
print(f"Accuracy: {acc_bal:.4f}")
print(f"ROC-AUC:  {auc_bal:.4f}")

# Cross-validation
print("\n" + "="*80)
print("VALIDACION CRUZADA (5-fold)")
print("="*80)
cv_scores = cross_val_score(model_balanced, X_train, y_train, cv=5, scoring='roc_auc')
print(f"  ROC-AUC por fold: {cv_scores}")
print(f"  Media: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

# =============================================================================
# VISUALIZACIONES
# =============================================================================

print("\n" + "="*80)
print("GENERANDO VISUALIZACIONES")
print("="*80)

plt.style.use('seaborn-v0_8-whitegrid')

# 1. Confusion Matrices
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

cm_base = confusion_matrix(y_test, y_pred_base)
cm_bal = confusion_matrix(y_test, y_pred_bal)

sns.heatmap(cm_base, annot=True, fmt='d', cmap='Blues', ax=axes[0],
            xticklabels=['Informal', 'Formal'], yticklabels=['Informal', 'Formal'])
axes[0].set_title(f'Baseline\nAccuracy: {acc_base:.3f}', fontsize=13, fontweight='bold')
axes[0].set_xlabel('Predicho', fontweight='bold')
axes[0].set_ylabel('Real', fontweight='bold')

sns.heatmap(cm_bal, annot=True, fmt='d', cmap='Greens', ax=axes[1],
            xticklabels=['Informal', 'Formal'], yticklabels=['Informal', 'Formal'])
axes[1].set_title(f'Balanceado ⭐\nAccuracy: {acc_bal:.3f}', fontsize=13, fontweight='bold')
axes[1].set_xlabel('Predicho', fontweight='bold')
axes[1].set_ylabel('Real', fontweight='bold')

plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/01_confusion_matrices.png', dpi=300, bbox_inches='tight')
plt.close()
print(f"  ✓ 01_confusion_matrices.png")

# 2. ROC Curves
fig, ax = plt.subplots(1, 1, figsize=(10, 8))

fpr_base, tpr_base, _ = roc_curve(y_test, y_proba_base)
fpr_bal, tpr_bal, _ = roc_curve(y_test, y_proba_bal)

ax.plot(fpr_base, tpr_base, label=f'Baseline (AUC={auc_base:.3f})', linewidth=2.5, color='#3498db')
ax.plot(fpr_bal, tpr_bal, label=f'Balanceado (AUC={auc_bal:.3f})', linewidth=2.5, color='#2ecc71')
ax.plot([0, 1], [0, 1], 'k--', linewidth=1.5, alpha=0.5)

ax.set_xlabel('False Positive Rate', fontsize=12, fontweight='bold')
ax.set_ylabel('True Positive Rate', fontsize=12, fontweight='bold')
ax.set_title('ROC Curves - Predicción de Formalización', fontsize=14, fontweight='bold')
ax.legend(fontsize=12)
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/02_roc_curves.png', dpi=300, bbox_inches='tight')
plt.close()
print(f"  ✓ 02_roc_curves.png")

# 3. Feature Importance
fig, ax = plt.subplots(1, 1, figsize=(10, 12))

importance = model_balanced.feature_importances_
indices = np.argsort(importance)[-20:]

colors = plt.cm.viridis(np.linspace(0.3, 0.9, len(indices)))

ax.barh(range(len(indices)), importance[indices], color=colors)
ax.set_yticks(range(len(indices)))
ax.set_yticklabels([X.columns[i] for i in indices], fontsize=10)
ax.set_xlabel('Importancia', fontsize=12, fontweight='bold')
ax.set_title('Top 20 Features - Predicción de Formalización', fontsize=14, fontweight='bold')
ax.grid(axis='x', alpha=0.3)

plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/03_feature_importance.png', dpi=300, bbox_inches='tight')
plt.close()
print(f"  ✓ 03_feature_importance.png")

# =============================================================================
# GUARDAR RESULTADOS
# =============================================================================

print("\n" + "="*80)
print("GUARDANDO RESULTADOS")
print("="*80)

# Modelos
with open(f'{OUTPUT_DIR}/model_baseline.pkl', 'wb') as f:
    pickle.dump(model_baseline, f)
with open(f'{OUTPUT_DIR}/model_balanceado.pkl', 'wb') as f:
    pickle.dump(model_balanced, f)
print(f"  ✓ Modelos guardados")

# Métricas
metricas = {
    'baseline': {'accuracy': acc_base, 'roc_auc': auc_base, 'cm': cm_base.tolist()},
    'balanceado': {'accuracy': acc_bal, 'roc_auc': auc_bal, 'cm': cm_bal.tolist()},
    'cv_scores': cv_scores.tolist()
}

with open(f'{OUTPUT_DIR}/metricas.pkl', 'wb') as f:
    pickle.dump(metricas, f)
print(f"  ✓ Métricas guardadas")

# Feature importance
importance_df = pd.DataFrame({
    'feature': X.columns,
    'importance': model_balanced.feature_importances_
}).sort_values('importance', ascending=False)

importance_df.to_csv(f'{OUTPUT_DIR}/feature_importance.csv', index=False)
print(f"   feature_importance.csv")

# =============================================================================
# RESUMEN FINAL
# =============================================================================

print("\n" + "="*80)
print("RESUMEN FINAL")
print("="*80)

print(f"""
PREGUNTA DE INVESTIGACION:
  ¿Qué factores predicen la formalización de micronegocios?

VARIABLE OBJETIVO:
  exito_formalizacion (0=Informal, 1=Formal)
  Balance: {balance:.1f}% formal / {100-balance:.1f}% informal

MODELO BASELINE (Sin balanceo):
  Accuracy:  {acc_base:.4f}
  ROC-AUC:   {auc_base:.4f}
  
MODELO BALANCEADO :
  Accuracy:  {acc_bal:.4f}
  ROC-AUC:   {auc_bal:.4f}
  CV ROC-AUC: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})

Top 5 Predictores de Formalización:
""")

for idx, row in importance_df.head(5).iterrows():
    print(f"  {row['feature']:45s}: {row['importance']:.4f}")

print(f"""
INTERPRETACION:
  - Métricas realistas (no 100%)
  - Modelo aprende patrones reales
  - Sin data leakage
  - Resultados aplicables a política pública

ARCHIVOS EN '{OUTPUT_DIR}/':
  - model_balanceado.pkl 
  - metricas.pkl
  - feature_importance.csv
  - 01_confusion_matrices.png
  - 02_roc_curves.png
  - 03_feature_importance.png
MODELO FINAL COMPLETADO
""")

print("="*80)
print("SIGUIENTE PASO: Análisis SHAP (explicabilidad)")
print("="*80)

MODELO FINAL: PREDICCION DE FORMALIZACION

[1/5] Cargando datos y creando variable objetivo...
  Dataset: (68702, 86)
  ✓ Se eliminaron 8 variables de filtración (Data Leakage).
  Dataset Final: (68702, 76)
  Target: exito_formalizacion

  Distribución formalidad_laboral original:
formalidad_laboral
0    60749
1     5995
2     1958
Name: count, dtype: int64

  Nueva variable objetivo 'exito_formalizacion':
exito_formalizacion
0    60749
1     7953
Name: count, dtype: int64
  Balance: 11.6% formal / 88.4% informal

  Features: 84
  Target: exito_formalizacion

[2/5] Preprocesando datos...
  ✓ Missing manejados
  ✓ Categóricas codificadas: 1

  Train: (54961, 84)
  Test: (13741, 84)
  Balance train: 11.6% / 88.4%

[3/5] Balanceando clases...
  Antes:
    Clase 0 (informal): 48,599
    Clase 1 (formal): 6,362
  Después:
    Clase 0: 48,599
    Clase 1: 48,599

[4/5] Entrenando modelos...

  [A] Modelo baseline (sin balanceo)...
      ✓ Entrenado

  [B] Modelo balanceado...
      ✓ Entrena

In [31]:
# =============================================================================
# VISUALIZACIONES ADICIONALES: ENTRENAMIENTO Y EVALUACION
# =============================================================================

print("\n" + "="*80)
print("GENERANDO GRAFICAS DE ENTRENAMIENTO Y EVALUACION")
print("="*80)

# 4. Curvas de Aprendizaje (Learning Curves)
from sklearn.model_selection import learning_curve

fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Calcular curvas de aprendizaje
train_sizes, train_scores, val_scores = learning_curve(
    model_balanced, X_train, y_train, 
    cv=5, 
    scoring='roc_auc',
    n_jobs=-1,
    train_sizes=np.linspace(0.1, 1.0, 10),
    random_state=42
)

# Calcular media y desviación estándar
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)
val_mean = np.mean(val_scores, axis=1)
val_std = np.std(val_scores, axis=1)

# Gráfica izquierda: Curvas de aprendizaje
axes[0].plot(train_sizes, train_mean, 'o-', color='#3498db', linewidth=2.5, label='Train')
axes[0].fill_between(train_sizes, train_mean - train_std, train_mean + train_std, 
                      alpha=0.2, color='#3498db')
axes[0].plot(train_sizes, val_mean, 'o-', color='#e74c3c', linewidth=2.5, label='Validation')
axes[0].fill_between(train_sizes, val_mean - val_std, val_mean + val_std, 
                      alpha=0.2, color='#e74c3c')

axes[0].set_xlabel('Tamaño del conjunto de entrenamiento', fontsize=11, fontweight='bold')
axes[0].set_ylabel('ROC-AUC Score', fontsize=11, fontweight='bold')
axes[0].set_title('Curvas de Aprendizaje\n(Modelo Balanceado)', fontsize=13, fontweight='bold')
axes[0].legend(fontsize=11, loc='lower right')
axes[0].grid(True, alpha=0.3)
axes[0].set_ylim([0.5, 1.0])

# Gráfica derecha: Comparación de métricas
metrics_names = ['Accuracy', 'ROC-AUC', 'Precision\n(Formal)', 'Recall\n(Formal)']

# Calcular precision y recall para clase formal (1)
from sklearn.metrics import precision_score, recall_score

precision_base = precision_score(y_test, y_pred_base, pos_label=1)
recall_base = recall_score(y_test, y_pred_base, pos_label=1)
precision_bal = precision_score(y_test, y_pred_bal, pos_label=1)
recall_bal = recall_score(y_test, y_pred_bal, pos_label=1)

baseline_scores = [acc_base, auc_base, precision_base, recall_base]
balanced_scores = [acc_bal, auc_bal, precision_bal, recall_bal]

x = np.arange(len(metrics_names))
width = 0.35

bars1 = axes[1].bar(x - width/2, baseline_scores, width, label='Baseline', 
                    color='#3498db', alpha=0.8)
bars2 = axes[1].bar(x + width/2, balanced_scores, width, label='Balanceado', 
                    color='#2ecc71', alpha=0.8)

axes[1].set_xlabel('Métrica', fontsize=11, fontweight='bold')
axes[1].set_ylabel('Score', fontsize=11, fontweight='bold')
axes[1].set_title('Comparación de Métricas\n(Test Set)', fontsize=13, fontweight='bold')
axes[1].set_xticks(x)
axes[1].set_xticklabels(metrics_names, fontsize=10)
axes[1].legend(fontsize=11)
axes[1].grid(True, axis='y', alpha=0.3)
axes[1].set_ylim([0, 1.0])

# Agregar valores en las barras
for bars in [bars1, bars2]:
    for bar in bars:
        height = bar.get_height()
        axes[1].text(bar.get_x() + bar.get_width()/2., height,
                    f'{height:.3f}', ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/04_entrenamiento_evaluacion.png', dpi=300, bbox_inches='tight')
plt.close()
print(f" 04_entrenamiento_evaluacion.png")

# 5. Matriz de Métricas por Umbral (Threshold Analysis)
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Calcular métricas para diferentes umbrales
thresholds = np.linspace(0, 1, 100)
precisions = []
recalls = []
f1_scores = []

for threshold in thresholds:
    y_pred_threshold = (y_proba_bal >= threshold).astype(int)
    
    if len(np.unique(y_pred_threshold)) > 1:  # Verificar que hay ambas clases
        precision = precision_score(y_test, y_pred_threshold, pos_label=1, zero_division=0)
        recall = recall_score(y_test, y_pred_threshold, pos_label=1, zero_division=0)
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    else:
        precision = 0
        recall = 0
        f1 = 0
    
    precisions.append(precision)
    recalls.append(recall)
    f1_scores.append(f1)

# Gráfica izquierda: Precision-Recall vs Threshold
axes[0].plot(thresholds, precisions, label='Precision', linewidth=2.5, color='#e74c3c')
axes[0].plot(thresholds, recalls, label='Recall', linewidth=2.5, color='#3498db')
axes[0].plot(thresholds, f1_scores, label='F1-Score', linewidth=2.5, color='#2ecc71', linestyle='--')

axes[0].axvline(x=0.5, color='gray', linestyle=':', linewidth=1.5, alpha=0.7, label='Umbral 0.5')
axes[0].set_xlabel('Umbral de Clasificación', fontsize=11, fontweight='bold')
axes[0].set_ylabel('Score', fontsize=11, fontweight='bold')
axes[0].set_title('Métricas vs Umbral de Decisión', fontsize=13, fontweight='bold')
axes[0].legend(fontsize=11)
axes[0].grid(True, alpha=0.3)
axes[0].set_ylim([0, 1.0])

# Gráfica derecha: Precision-Recall Curve
from sklearn.metrics import precision_recall_curve

precision_curve, recall_curve, _ = precision_recall_curve(y_test, y_proba_bal)

axes[1].plot(recall_curve, precision_curve, linewidth=2.5, color='#9b59b6')
axes[1].fill_between(recall_curve, precision_curve, alpha=0.2, color='#9b59b6')

axes[1].set_xlabel('Recall (Sensibilidad)', fontsize=11, fontweight='bold')
axes[1].set_ylabel('Precision', fontsize=11, fontweight='bold')
axes[1].set_title('Curva Precision-Recall\n(Modelo Balanceado)', fontsize=13, fontweight='bold')
axes[1].grid(True, alpha=0.3)
axes[1].set_xlim([0, 1.0])
axes[1].set_ylim([0, 1.0])

# Agregar línea de referencia (no-skill)
no_skill = (y_test == 1).sum() / len(y_test)
axes[1].axhline(y=no_skill, color='gray', linestyle='--', linewidth=1.5, 
                label=f'No-skill ({no_skill:.3f})')
axes[1].legend(fontsize=11)

plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/05_analisis_umbral.png', dpi=300, bbox_inches='tight')
plt.close()
print(f"  ✓ 05_analisis_umbral.png")

# 6. Distribución de Probabilidades Predichas
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Gráfica izquierda: Histograma de probabilidades
axes[0].hist(y_proba_bal[y_test == 0], bins=50, alpha=0.6, label='Informal (Real)', 
             color='#e74c3c', density=True)
axes[0].hist(y_proba_bal[y_test == 1], bins=50, alpha=0.6, label='Formal (Real)', 
             color='#2ecc71', density=True)

axes[0].axvline(x=0.5, color='gray', linestyle='--', linewidth=2, label='Umbral 0.5')
axes[0].set_xlabel('Probabilidad Predicha (Formal)', fontsize=11, fontweight='bold')
axes[0].set_ylabel('Densidad', fontsize=11, fontweight='bold')
axes[0].set_title('Distribución de Probabilidades Predichas\npor Clase Real', 
                  fontsize=13, fontweight='bold')
axes[0].legend(fontsize=11)
axes[0].grid(True, alpha=0.3)

# Gráfica derecha: Boxplot de probabilidades
data_to_plot = [y_proba_bal[y_test == 0], y_proba_bal[y_test == 1]]
bp = axes[1].boxplot(data_to_plot, labels=['Informal\n(Real)', 'Formal\n(Real)'],
                     patch_artist=True, widths=0.6)

colors = ['#e74c3c', '#2ecc71']
for patch, color in zip(bp['boxes'], colors):
    patch.set_facecolor(color)
    patch.set_alpha(0.6)

axes[1].axhline(y=0.5, color='gray', linestyle='--', linewidth=1.5, label='Umbral 0.5')
axes[1].set_ylabel('Probabilidad Predicha (Formal)', fontsize=11, fontweight='bold')
axes[1].set_title('Distribución de Scores por Clase Real', fontsize=13, fontweight='bold')
axes[1].legend(fontsize=11)
axes[1].grid(True, axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig(f'{OUTPUT_DIR}/06_distribucion_probabilidades.png', dpi=300, bbox_inches='tight')
plt.close()
print(f"  06_distribucion_probabilidades.png")


GENERANDO GRAFICAS DE ENTRENAMIENTO Y EVALUACION
 04_entrenamiento_evaluacion.png
  ✓ 05_analisis_umbral.png
  06_distribucion_probabilidades.png
