In [3]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from scipy import stats
# enable_iterative_imputer must be imported before IterativeImputer.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer
from sklearn.model_selection import train_test_split, GridSearchCV
warnings.filterwarnings('ignore')

# ============================
# 1. Load data
# ============================
ruta_train = "../data/train.csv"
df = pd.read_csv(ruta_train)

# Ordinal encoding of 'peorcalificacionBCU': each rating label is replaced by
# the integer given in rating_order. Labels missing from the mapping (and
# missing values) come out of .map() as NaN.
rating_order = {'1C': 1, '2A': 2, '2B': 3, '3': 4, '4': 5, '5': 6}
encoded_rating = df['peorcalificacionBCU'].map(rating_order)

# Fill gaps with the most frequent code. The result is assigned back instead of
# calling fillna(inplace=True) on a column selection: that chained form is
# deprecated and does not modify `df` under pandas Copy-on-Write.
rating_modes = encoded_rating.mode()
if not rating_modes.empty:
    # mode() is empty when no rating could be mapped; in that case the NaNs are
    # left in place rather than failing on an empty lookup.
    encoded_rating = encoded_rating.fillna(rating_modes.iloc[0])

df['peorcalificacionBCU_encoded'] = encoded_rating
df = df.drop(columns='peorcalificacionBCU')

# ============================
# 2. Define predictors and target
# ============================
# Target is the 'ingreso' column; every other column is a predictor.
y = df['ingreso']
X = df.drop(columns=['ingreso'])

# Names of the numeric predictor columns (used by imputation and outlier steps).
num_cols_features = list(X.select_dtypes(include=[np.number]).columns)

# ============================
# 3. Missing-value imputation
# ============================
# Imputation strategy: 'iterative' or 'knn'.
imputation_method = 'iterative'  # switch to 'knn' to use KNNImputer instead

if imputation_method == 'iterative':
    imputer = IterativeImputer(max_iter=10, random_state=42)
elif imputation_method == 'knn':
    imputer = KNNImputer(n_neighbors=5)
else:
    # Fail loudly: otherwise `imputer` is either undefined (NameError below) or,
    # on a live kernel, silently reused from a previous run.
    raise ValueError(
        "imputation_method must be 'iterative' or 'knn', got {!r}".format(imputation_method)
    )

# NOTE(review): the imputer is fitted on every row of X, i.e. before the
# train/validation split done in section 5, so validation rows influence the
# imputation model. Consider fitting on the training rows only.
X[num_cols_features] = imputer.fit_transform(X[num_cols_features])

# ============================
# 4. Outlier handling
# ============================
# Drop every row in which at least one numeric feature has |z-score| >= threshold.
threshold = 3
z_scores = np.abs(np.asarray(stats.zscore(X[num_cols_features]), dtype=float))

# Rows are flagged with `>=` instead of being kept with `<`: a zero-variance
# column produces NaN z-scores, and every comparison with NaN is False, so a
# `<`-based keep mask would discard ALL rows as soon as one feature is constant.
# With `>=`, NaN simply counts as "not an outlier".
is_outlier_row = (z_scores >= threshold).any(axis=1)

X = X.loc[~is_outlier_row]
# Keep the target aligned with the surviving rows (label-based lookup).
y = y.loc[X.index]

# ============================
# 5. Train/validation split
# ============================
# Hold out 30% of the rows for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42, test_size=0.3)

# ============================
# 6. XGBoost model and hyperparameter search
# ============================
xgb_model = xgb.XGBRegressor(random_state=42)

# Exhaustive grid: 3 values for each of 5 parameters = 243 candidates,
# i.e. 1215 fits with 5-fold cross-validation.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 6, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1],
    'colsample_bytree': [0.8, 0.9, 1],
}

# Cross-validated grid search scored on negated MAPE, run on all available
# cores. fit() returns the search object itself, so `grid_search` is the
# fitted instance.
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    scoring='neg_mean_absolute_percentage_error',
    cv=5,
    verbose=1,
    n_jobs=-1,
).fit(X_train, y_train)

# ============================
# 7. Model evaluation
# ============================
# Score the best configuration found by the grid search on the hold-out rows.
best_xgb = grid_search.best_estimator_
y_val_pred = best_xgb.predict(X_val)

# Mean absolute percentage error, expressed in percent.
# NOTE(review): a zero in y_val makes the ratio infinite — confirm the target
# is strictly non-zero.
relative_error = (y_val - y_val_pred) / y_val
nuevo_mape = np.mean(np.abs(relative_error)) * 100
print("\nNuevo MAPE en validación: {:.2f}%".format(nuevo_mape))

# ============================
# 8. Predict on the test set and save to Excel
# ============================
ruta_test = "../data/test.csv"
df_test = pd.read_csv(ruta_test)

if 'peorcalificacionBCU' in df_test.columns:
    # Encode with the same mapping used for the training data.
    test_rating = df_test['peorcalificacionBCU'].map(rating_order)

    # Fill gaps with the TRAINING mode, not the test-set mode, so test rows get
    # the same preprocessing the model was trained with. This also avoids a
    # failure when no test rating can be mapped. The value is assigned back
    # rather than using fillna(inplace=True) on a column selection, which does
    # not modify the frame under pandas Copy-on-Write.
    train_rating_modes = df['peorcalificacionBCU_encoded'].mode()
    if not train_rating_modes.empty:
        test_rating = test_rating.fillna(train_rating_modes.iloc[0])

    df_test['peorcalificacionBCU_encoded'] = test_rating
    df_test = df_test.drop(columns='peorcalificacionBCU')

# Align the test columns with the training features: extra columns are dropped,
# missing ones are created as NaN and then filled by the fitted imputer.
df_test = df_test.reindex(columns=X.columns, fill_value=np.nan)
df_test[num_cols_features] = imputer.transform(df_test[num_cols_features])

# Predictions are rounded to whole numbers for the output file.
test_predictions = best_xgb.predict(df_test)
test_predictions = np.round(test_predictions).astype(int)

# Row ids are generated sequentially starting at 1, in test-file order.
output = pd.DataFrame({
    'id': range(1, len(df_test) + 1),
    'ingreso': test_predictions
})

ruta_salida_excel = "../outputs/prediccionesOUTMejXB.xlsx"
# Create the output folder if needed so the (long) run does not fail at the
# very last step.
os.makedirs(os.path.dirname(ruta_salida_excel), exist_ok=True)
output.to_excel(ruta_salida_excel, index=False, engine='openpyxl')

print("\nArchivo de predicciones guardado en:", ruta_salida_excel)


XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ["dlopen(/Users/denisejones/Desktop/prediccion-ingresos/.venv/lib/python3.9/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: @rpath/libomp.dylib\n  Referenced from: <89AD948E-E564-3266-867D-7AF89D6488F0> /Users/denisejones/Desktop/prediccion-ingresos/.venv/lib/python3.9/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: tried: '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/opt/homebrew/opt/libomp/lib/libomp.dylib' (no such file)"]
