# In [None]:
"""
PREPROCESAMIENTO PARA ML
Manejo de missing, encoding, split train/test
"""

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

# Opening banner for the whole preprocessing run.
print("\n".join(["=" * 80, "PREPROCESAMIENTO PARA MODELADO ML", "=" * 80]))

# =============================================================================
# STEP 1: LOAD DATASET
# =============================================================================

print("\n[1/7] Cargando dataset limpio...")

# The cleaned dataset is read from a fixed path relative to the working directory.
df = pd.read_csv('output_fusion/dataset_ml_clean.csv')

# Report the size of what was loaded: one row per record, one column per field.
n_filas, n_columnas = df.shape
print(f"  Dimensiones: {df.shape}")
print(f"  Micronegocios: {n_filas:,}")
print(f"  Features: {n_columnas}")

# =============================================================================
# STEP 2: IDENTIFY THE TARGET VARIABLE
# =============================================================================

print("\n[2/7] Identificando variable objetivo...")

# Candidate targets: every column whose name contains "exito" (case-insensitive).
vars_objetivo = [c for c in df.columns if 'exito' in c.lower()]

# Columns the target is computed from inside this script. They are removed from
# the feature set in step 3: keeping them would let a model rebuild the label
# directly from its inputs (target leakage).
cols_fuente_objetivo = []

if not vars_objetivo:
    print("  No se encontró variable objetivo, creando una...")

    if 'ingresos_totales_declarados' not in df.columns:
        print("  ✗ ERROR: No hay columna de ingresos para crear objetivo")
        # SystemExit is always available; the bare `exit` helper is only
        # injected by the `site` module and is not meant for programs.
        raise SystemExit(1)

    # Build the target: success means income above a reference median.
    ingresos = df['ingresos_totales_declarados']
    if 'sector_economico' in df.columns:
        # Reference is the median income of the row's own sector.
        df['mediana_sector'] = df.groupby('sector_economico')['ingresos_totales_declarados'].transform('median')
        umbral = df['mediana_sector']
        criterio = "mediana del sector"
    else:
        # No sector information: fall back to the overall median.
        mediana = ingresos.median()
        umbral = pd.Series(mediana, index=df.index)
        criterio = "mediana general"

    df['exito'] = (ingresos > umbral).astype(int)
    print(f"  ✓ Variable 'exito' creada (ingresos > {criterio})")

    # A comparison against NaN is False, so rows with missing income (or with
    # no reference median, e.g. a missing sector) would be silently labelled
    # as class 0. Those rows cannot be labelled at all, so they are dropped.
    etiquetable = ingresos.notna() & umbral.notna()
    if not etiquetable.all():
        print(f"  Aviso: se descartan {int((~etiquetable).sum()):,} filas sin ingresos o sin mediana de referencia (no se pueden etiquetar)")
        df = df.loc[etiquetable].copy()

    target_col = 'exito'
    cols_fuente_objetivo.append('ingresos_totales_declarados')
else:
    # Use the first candidate found as the target.
    target_col = vars_objetivo[0]
    print(f"  Variable objetivo: {target_col}")
    if len(vars_objetivo) > 1:
        # The remaining candidates are NOT excluded below; flag them so the
        # user can decide whether they leak the target.
        print(f"  Aviso: otras columnas con 'exito' en el nombre quedan como features: {', '.join(vars_objetivo[1:])}")

# Class distribution of the target (the percentage assumes a 0/1 encoding).
print(f"\n  Distribución de {target_col}:")
print(df[target_col].value_counts())
balance = df[target_col].mean() * 100
print(f"  Balance: {balance:.1f}% clase 1 / {100-balance:.1f}% clase 0")

# =============================================================================
# STEP 3: SEPARATE FEATURES AND TARGET
# =============================================================================

print("\n[3/7] Separando features y target...")

# Columns that must not be used as features: the target itself, the row
# identifier, the helper median column, and any column the target was
# derived from in step 2.
cols_excluir = [target_col, 'id_micronegocio']
if 'mediana_sector' in df.columns:
    cols_excluir.append('mediana_sector')
if cols_fuente_objetivo:
    cols_excluir.extend(cols_fuente_objetivo)
    print(f"  Excluyendo de features (fuga de objetivo): {', '.join(cols_fuente_objetivo)}")

# Everything else is a feature.
feature_cols = [c for c in df.columns if c not in cols_excluir]

X = df[feature_cols].copy()
y = df[target_col].copy()

print(f"  Features: {X.shape[1]}")
print(f"  Target: {y.shape[0]:,} valores")

# =============================================================================
# STEP 4: ANALYSE AND HANDLE MISSING DATA
# =============================================================================

print("\n[4/7] Manejando datos faltantes...")

# Missing-value report: count and percentage per column, worst columns first,
# keeping only the columns that actually have gaps.
missing = X.isnull().sum()
missing_pct = (missing / len(X)) * 100
missing_df = pd.DataFrame({
    'columna': missing.index,
    'missing': missing.values,
    'porcentaje': missing_pct.values
})
missing_df = missing_df.sort_values('porcentaje', ascending=False)
missing_df = missing_df[missing_df['missing'] > 0]

if missing_df.empty:
    print("  No hay valores faltantes")
else:
    print(f"\n  Columnas con missing: {len(missing_df)}")
    print(f"\n  Top 10:")
    peores = missing_df.head(10)
    for nombre, n_nulos, pct_nulos in zip(peores['columna'], peores['missing'], peores['porcentaje']):
        print(f"    {nombre:40s}: {n_nulos:8.0f} ({pct_nulos:5.1f}%)")

    # IMPUTATION STRATEGY
    # NOTE(review): the medians/modes below are computed over every row of X,
    # i.e. before the train/test split that happens later in this script.
    print(f"\n  Aplicando imputación...")

    # 1. Numeric columns: fill gaps with the column median.
    num_cols = X.select_dtypes(include=[np.number]).columns
    for col in num_cols:
        if not X[col].isnull().any():
            continue
        X[col] = X[col].fillna(X[col].median())

    # 2. Categorical columns: fill gaps with the most frequent value, or with
    #    the literal 'Desconocido' when the column has no mode at all.
    cat_cols = X.select_dtypes(include=['object', 'category']).columns
    for col in cat_cols:
        if not X[col].isnull().any():
            continue
        modas = X[col].mode()
        relleno = 'Desconocido' if modas.empty else modas[0]
        X[col] = X[col].fillna(relleno)

    # Before/after totals so the effect of the imputation is visible.
    missing_after = X.isnull().sum().sum()
    print(f"  Missing antes: {missing.sum():,}")
    print(f"  Missing después: {missing_after:,}")

# =============================================================================
# STEP 5: ENCODE CATEGORICAL VARIABLES
# =============================================================================

print("\n[5/7] Codificando variables categóricas...")

# Categorical features are the columns with object or category dtype.
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

if not cat_cols:
    print("   No hay variables categóricas")
else:
    print(f"  Variables categóricas: {len(cat_cols)}")

    # The columns are cast to pandas 'category' dtype rather than one-hot or
    # label encoded; the intent stated by the original author is to let
    # LightGBM consume them natively.
    for col in cat_cols:
        serie = X[col]
        if serie.dtype == 'object':
            serie = serie.astype('category')
            X[col] = serie

        # Cardinality of the column, for a quick sanity check.
        print(f"    {col:40s}: {serie.nunique():4d} categorías")

    print(f"  Variables convertidas a tipo 'category'")

# =============================================================================
# STEP 6: DETECT (AND REPORT) OUTLIERS
# =============================================================================

print("\n[6/7] Detectando outliers extremos...")

# Every numeric feature is scanned; nothing is modified, this step only reports.
num_cols = X.select_dtypes(include=[np.number]).columns

outliers_detectados = []
for col in num_cols:
    Q1 = X[col].quantile(0.25)
    Q3 = X[col].quantile(0.75)
    IQR = Q3 - Q1

    # A zero (or NaN) IQR makes both fences collapse onto a single value, so
    # every observation that differs from it would be counted as "extreme".
    # That is the normal shape of binary flags and zero-inflated columns, not
    # an outlier signal, so the IQR rule is skipped for those columns.
    if not IQR > 0:
        continue

    # Extreme outliers: beyond 3 * IQR from the quartiles.
    lower_bound = Q1 - 3 * IQR
    upper_bound = Q3 + 3 * IQR

    outliers = ((X[col] < lower_bound) | (X[col] > upper_bound)).sum()

    if outliers > 0:
        pct = outliers / len(X) * 100
        if pct > 1:  # Only report columns where more than 1% of rows are extreme
            outliers_detectados.append((col, outliers, pct))

if outliers_detectados:
    # Show the ten columns with the highest share of extreme values.
    print(f"  Columnas con outliers extremos (>1%):")
    for col, count, pct in sorted(outliers_detectados, key=lambda x: x[2], reverse=True)[:10]:
        print(f"    {col:40s}: {count:6,} ({pct:5.1f}%)")

    print(f"\n  Nota: LightGBM es robusto ante outliers, no se eliminan")
else:
    print("  No se detectaron outliers extremos significativos")

# =============================================================================
# STEP 7: TRAIN/TEST SPLIT
# =============================================================================

print("\n[7/7] Creando train/test split...")


def _mostrar_balance(nombre, etiquetas):
    """Print the count and percentage of class 0 and class 1 in `etiquetas`."""
    print(f"\n  Balance en {nombre}:")
    for clase in (0, 1):
        n_clase = (etiquetas == clase).sum()
        print(f"    Clase {clase}: {n_clase:,} ({n_clase/len(etiquetas)*100:.1f}%)")


# 80/20 split with a fixed seed; stratifying on y keeps the class proportions
# the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Partition sizes, absolute and relative to the full feature matrix.
n_total = len(X)
n_train = X_train.shape[0]
n_test = X_test.shape[0]
print(f"  Train: {n_train:,} muestras ({n_train/n_total*100:.1f}%)")
print(f"  Test:  {n_test:,} muestras ({n_test/n_total*100:.1f}%)")

# Class balance inside each partition.
_mostrar_balance("Train", y_train)
_mostrar_balance("Test", y_test)

# =============================================================================
# SAVE PROCESSED DATASETS
# =============================================================================

print("\n" + "=" * 80, "GUARDANDO DATASETS PROCESADOS", "=" * 80, sep="\n")

# Train partition: features plus the target appended as the last column.
train_df = X_train.assign(**{target_col: y_train})
train_df.to_csv('output_fusion/train.csv', index=False)
print(f"  train.csv: {train_df.shape}")

# Test partition, same layout.
test_df = X_test.assign(**{target_col: y_test})
test_df.to_csv('output_fusion/test.csv', index=False)
print(f"  test.csv: {test_df.shape}")

# Per-feature metadata: dtype, remaining missing percentage and cardinality,
# all computed on the full feature matrix X.
feature_info = pd.DataFrame({
    'feature': X.columns,
    'dtype': X.dtypes.values,
    'missing_pct': (X.isnull().sum() / len(X) * 100).values,
    'nunique': X.nunique().tolist()
})
feature_info.to_csv('output_fusion/feature_info.csv', index=False)
print(f"  feature_info.csv")

# =============================================================================
# FINAL SUMMARY
# =============================================================================

print("\n" + "="*80)
print("RESUMEN FINAL")
print("="*80)

# Count what is really left instead of asserting success: imputation can leave
# gaps behind (e.g. a numeric column that is entirely NaN has a NaN median).
missing_total = int(X.isnull().sum().sum())
if missing_total == 0:
    estado_missing = "todos imputados ✓"
else:
    estado_missing = "quedan valores sin imputar ✗"

# One-screen recap: input size, feature matrix, split sizes, feature types,
# remaining missing values and class balance (class 1 / class 0) per partition.
print(f"""
Dataset original:
  Filas: {df.shape[0]:,}
  Columnas: {df.shape[1]}

Dataset procesado:
  Features: {X.shape[1]}
  Target: {target_col}
  
Train/Test split:
  Train: {X_train.shape[0]:,} ({X_train.shape[0]/len(X)*100:.1f}%)
  Test:  {X_test.shape[0]:,} ({X_test.shape[0]/len(X)*100:.1f}%)
  
Tipos de features:
  Numéricas: {len(X.select_dtypes(include=[np.number]).columns)}
  Categóricas: {len(X.select_dtypes(include=['category']).columns)}
  
Missing values: {missing_total} ({estado_missing})

Balance de clases:
  Train: {(y_train==1).sum()/len(y_train)*100:.1f}% / {(y_train==0).sum()/len(y_train)*100:.1f}%
  Test:  {(y_test==1).sum()/len(y_test)*100:.1f}% / {(y_test==0).sum()/len(y_test)*100:.1f}%
""")

# Closing banner: list the generated files and point to the next script.
print("="*80)
print("PREPROCESAMIENTO COMPLETADO")
print("="*80)
print("\nArchivos generados:")
print("  - train.csv (para entrenar)")
print("  - test.csv (para evaluar)")
print("  - feature_info.csv (metadata)")
print("\nSiguiente paso:")
print("  → python 03_modelo_baseline.py")
print("="*80)