In [3]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.ensemble import HistGradientBoostingRegressor

from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# ============================
# Try to import XGBoost (optional dependency)
# ============================
# The broad `except Exception` is deliberate: loading the package can fail for
# reasons other than the module being absent (e.g. a native library that does
# not load), and such errors may not be ImportError subclasses.
try:
    import xgboost as xgb
except Exception as exc:
    XGBOOST_OK = False
    print("⚠️ XGBoost no disponible (Mac sin OpenMP). Uso modelo sklearn.")
    # The message above assumes one specific cause; report the real one so a
    # plain "package not installed" is not mistaken for an OpenMP problem.
    print(f"   Motivo: {type(exc).__name__}: {exc}")
else:
    # Only the import itself is guarded, so a problem on the success path is
    # never misreported as "XGBoost unavailable".
    XGBOOST_OK = True
    print("✅ XGBoost disponible")

# ============================
# 1. Load data
# ============================
ruta_train = "../data/train.csv"
df = pd.read_csv(ruta_train)

# ============================
# 2. Feature engineering
# ============================
# Map the rating labels to integer ranks (1..6 in the order listed below).
# Labels that are missing or not present in the mapping become NaN.
rating_order = {'1C': 1, '2A': 2, '2B': 3, '3': 4, '4': 5, '5': 6}
df['peorcalificacionBCU_encoded'] = df['peorcalificacionBCU'].map(rating_order)

# Fill the gaps with the most frequent rank. The result is assigned back
# instead of calling `fillna(inplace=True)` on the column selection: that
# chained in-place form acts on a temporary object and leaves `df` untouched
# when pandas Copy-on-Write is active.
rating_modes = df['peorcalificacionBCU_encoded'].mode()
if not rating_modes.empty:
    # `mode()` is empty when no label could be mapped at all; in that case the
    # column is left as NaN instead of failing on `[0]`.
    df['peorcalificacionBCU_encoded'] = (
        df['peorcalificacionBCU_encoded'].fillna(rating_modes.iloc[0])
    )
df = df.drop(columns='peorcalificacionBCU')

# Separate the features from the target column 'ingreso' and remember which
# feature columns are numeric (used by the imputation/outlier steps).
X = df.drop('ingreso', axis=1)
y = df['ingreso']
num_cols = X.select_dtypes(include=[np.number]).columns.tolist()

# ============================
# 3. Imputation
# ============================
# NOTE(review): the imputer is fitted on every row before the train/validation
# split further down, so validation rows influence the imputation model.
# Consider fitting it on the training part only — confirm intent.
imputer = IterativeImputer(max_iter=10, random_state=42)
X[num_cols] = imputer.fit_transform(X[num_cols])

# ============================
# 4. Outlier handling
# ============================
z_scores = np.abs(stats.zscore(X[num_cols]))

# Flag every value with |z| >= 3 and keep only the rows without any flag.
# This is written as "no flag" rather than "(z < 3) for all columns" because a
# constant column has zero standard deviation and therefore a NaN z-score;
# `NaN < 3` is False, which would discard every single row. `NaN >= 3` is also
# False, so such columns simply never flag anything.
is_outlier = np.asarray(z_scores) >= 3
mask = ~is_outlier.any(axis=1)

X = X[mask]
# Keep the target aligned with the surviving feature rows.
y = y.loc[X.index]

# ============================
# 5. Train / validation split
# ============================
# Hold out 30% of the rows for validation; the fixed seed makes the split
# reproducible between runs.
split_parts = train_test_split(X, y, random_state=42, test_size=0.3)
X_train, X_val, y_train, y_val = split_parts

# ============================
# 6. Model
# ============================
# Hyper-parameter grid explored by the grid search when XGBoost is usable.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[3, 6, 10],
    learning_rate=[0.01, 0.1],
    subsample=[0.8, 1],
    colsample_bytree=[0.8, 1],
)

if not XGBOOST_OK:
    # Fallback path: one scikit-learn gradient-boosting model with fixed
    # hyper-parameters (no search is performed here).
    best_model = HistGradientBoostingRegressor(
        random_state=42,
        max_iter=300,
        max_depth=6,
        learning_rate=0.05,
    )
    best_model.fit(X_train, y_train)
    print("Modelo fallback entrenado (sklearn)")

else:
    # Preferred path: 5-fold grid search over `param_grid`, scored by
    # negated MAPE, keeping the best estimator found.
    xgb_model = xgb.XGBRegressor(random_state=42)

    search_options = {
        'cv': 5,
        'scoring': 'neg_mean_absolute_percentage_error',
        'n_jobs': 1,
        'verbose': 1,
    }
    grid_search = GridSearchCV(
        estimator=xgb_model,
        param_grid=param_grid,
        **search_options,
    )

    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    print("Mejor modelo XGBoost:", grid_search.best_params_)

# ============================
# 7. Evaluation
# ============================
# Score the chosen model on the held-out validation rows.
y_val_pred = best_model.predict(X_val)

# The metric is returned as a fraction; scale it to a percentage for display.
mape_fraction = mean_absolute_percentage_error(y_val, y_val_pred)
mape = mape_fraction * 100

print(f"\nMAPE en validación: {mape:.2f}%")

# ============================
# 8. Final prediction
# ============================
ruta_test = "../data/test.csv"
df_test = pd.read_csv(ruta_test)

if 'peorcalificacionBCU' in df_test.columns:
    # Encode the rating with the same mapping used for the training data.
    df_test['peorcalificacionBCU_encoded'] = df_test['peorcalificacionBCU'].map(rating_order)

    # Fill gaps with the most frequent rank of the TRAINING data, not of the
    # test set: preprocessing statistics must come from the data the model was
    # fitted on, and the test-set mode does not even exist when none of its
    # labels map. The value is assigned back rather than using
    # `fillna(inplace=True)` on a column selection, which is a no-op under
    # pandas Copy-on-Write.
    train_rating_modes = df['peorcalificacionBCU_encoded'].mode()
    if not train_rating_modes.empty:
        df_test['peorcalificacionBCU_encoded'] = (
            df_test['peorcalificacionBCU_encoded'].fillna(train_rating_modes.iloc[0])
        )
    df_test = df_test.drop(columns='peorcalificacionBCU')

# Align the test columns with the training feature matrix: columns unknown to
# the model are dropped and missing ones are created as NaN, which the fitted
# imputer then fills for the numeric columns.
df_test = df_test.reindex(columns=X.columns, fill_value=np.nan)
df_test[num_cols] = imputer.transform(df_test[num_cols])

# Predictions are rounded to the nearest integer before export.
predictions = best_model.predict(df_test)
predictions = np.round(predictions).astype(int)

# NOTE(review): ids are generated sequentially starting at 1 instead of being
# read from the test file — confirm this matches the expected submission ids.
output = pd.DataFrame({
    'id': range(1, len(df_test) + 1),
    'ingreso': predictions
})

ruta_salida = "../outputs/predicciones.csv"
output.to_csv(ruta_salida, index=False)

print("\nPredicciones guardadas en:", ruta_salida)



⚠️ XGBoost no disponible (Mac sin OpenMP). Uso modelo sklearn.
Modelo fallback entrenado (sklearn)

MAPE en validación: 12.56%

Predicciones guardadas en: ../outputs/predicciones.csv
