In [3]:
# -*- coding: utf-8 -*-
# Entrenamiento supervisado con PyCaret (XGBoost) + chequeo anti-leakage (PyCaret 3.x)

import os
import warnings
import pandas as pd
from pycaret.classification import (
    setup, compare_models, create_model, tune_model,
    plot_model, predict_model, finalize_model, save_model, pull
)

# Silence every warning so the notebook output stays readable.
warnings.filterwarnings("ignore")

# ---------------------------------------------------------
# 1) Configuration
# ---------------------------------------------------------
# Input CSV. The RUTA_CSV environment variable, when set to a non-empty value,
# overrides the machine-specific default below so the script can be run on
# another machine without editing the source.
RUTA_CSV = os.environ.get("RUTA_CSV") or (
    r"C:\Users\FABIO\Downloads\dataset_hoteles_tunja_50000_segmentado.csv"
)
TARGET = "segmento"  # name of the label column
SEED = 42            # random seed shared by every experiment
FOLDS = 5            # number of cross-validation folds
# Output folder, resolved against the current working directory and created
# up front so later writes cannot fail on a missing directory.
RESULTS_DIR = os.path.abspath("./resultados_segmento")
os.makedirs(RESULTS_DIR, exist_ok=True)

# ---------------------------------------------------------
# 2) Cargar datos y detectar columnas a ignorar
# ---------------------------------------------------------
# Load the dataset; "utf-8-sig" also accepts files that start with a BOM.
df = pd.read_csv(RUTA_CSV, encoding="utf-8-sig")
if TARGET not in df.columns:
    raise ValueError(f"No se encontró la columna objetivo '{TARGET}'. ¿Usas el CSV segmentado?")

# Column names treated as derived features (possible target leakage).
# Only the ones actually present in the CSV are kept, in file order.
DERIVADAS_CANDIDATAS = {
    'score_premium','amenities_count','cal_norm','amenities_norm',
    'reputacion_score','reputacion_norm','limpieza_score','limpieza_norm',
    'nivel_servicios','indice_valor'
}
derivadas_presentes = [c for c in df.columns if c in DERIVADAS_CANDIDATAS]

# Object-dtype feature columns whose mean string length exceeds 60 characters
# are flagged as long free text so they can be left out of the experiments.
texto_largo = []
for c in df.columns:
    if c == TARGET or df[c].dtype != object:
        continue
    try:
        # Missing values count as length 0 in the mean.
        longitud_media = df[c].str.len().fillna(0).mean()
    except (AttributeError, TypeError) as e:
        # The .str accessor rejects object columns that do not hold strings.
        # Report the column instead of hiding the failure, then keep going.
        print(f"Aviso: no se pudo medir la longitud de texto de '{c}' ->", e)
        continue
    if longitud_media > 60:
        texto_largo.append(c)

print("Columnas derivadas (posible leakage):", derivadas_presentes)
print("Columnas de texto largo ignoradas:", texto_largo)

# ---------------------------------------------------------
# 3) Función de experimento
# ---------------------------------------------------------
def correr_experimento(nombre, ignore_features=None):
    """Run one PyCaret classification experiment and persist its artifacts.

    Steps, in order: ``setup`` on the module-level ``df``; ``compare_models``
    ranked by F1; create and tune an XGBoost model; save diagnostic plots;
    score the tuned model with ``predict_model``; finalize and save the model.

    CSV tables and the saved model are written to ``RESULTS_DIR``. Plots are
    written to ``./Plots/<nombre>`` so that consecutive experiments do not
    overwrite each other's images.

    Args:
        nombre: Experiment label. Used as a suffix in every output file name
            and as the name of the plot sub-folder.
        ignore_features: Column names forwarded to
            ``setup(ignore_features=...)``. ``None`` is forwarded as-is.

    Returns:
        The estimator returned by ``finalize_model``.
    """
    print(f"\n=== Experimento: {nombre} ===")

    def _csv_path(prefijo):
        # Every table produced by this run shares the "<prefix>_<nombre>.csv" pattern.
        return os.path.join(RESULTS_DIR, f"{prefijo}_{nombre}.csv")

    # With save=True PyCaret writes fixed file names into the current working
    # directory, so a second experiment would overwrite the first one's plots.
    # Passing a directory path keeps one folder per experiment under ./Plots.
    plots_dir = os.path.join(os.path.abspath("Plots"), nombre)
    os.makedirs(plots_dir, exist_ok=True)

    setup(
        data=df,
        target=TARGET,
        session_id=SEED,
        fold=FOLDS,
        verbose=False,
        ignore_features=ignore_features,
    )

    # Model comparison, kept for documentation purposes.
    compare_models(sort='F1')
    pull().to_csv(_csv_path("compare_models"), index=False, encoding="utf-8-sig")

    # XGBoost: create, tune and evaluate. The CV tables keep their index
    # because it carries the fold labels (0..k-1, "Mean", "Std"); without it
    # the summary rows cannot be told apart from the per-fold rows.
    xgb = create_model('xgboost')
    pull().to_csv(_csv_path("xgb_cv"), index=True, index_label="Fold",
                  encoding="utf-8-sig")

    # NOTE: tune_model may return the original model when tuning does not
    # improve it; the pulled table still shows the tuned candidate's metrics.
    xgb_tuned = tune_model(xgb, optimize='F1')
    pull().to_csv(_csv_path("xgb_tuned_cv"), index=True, index_label="Fold",
                  encoding="utf-8-sig")

    # Plots (saved under ./Plots/<nombre>)
    plot_model(xgb_tuned, plot='confusion_matrix', save=plots_dir)
    plot_model(xgb_tuned, plot='feature', save=plots_dir)
    try:
        plot_model(xgb_tuned, plot='class_report', save=plots_dir)
    except Exception as e:
        # Best effort: this plot is optional, so report and continue.
        print("Aviso: 'class_report' no disponible en esta versión ->", e)

    # Interpretability (optional, requires shap)
    try:
        from pycaret.classification import interpret_model
        interpret_model(xgb_tuned, plot='summary', save=plots_dir)
    except Exception as e:
        # Best effort: a missing shap install must not abort the experiment.
        print("Aviso: SHAP no disponible / no se pudo generar interpretabilidad ->", e)

    # Hold-out predictions and metrics. pull() is read after the predictions
    # are written; writing a CSV does not change what pull() returns.
    pred_holdout = predict_model(xgb_tuned)
    pred_holdout.to_csv(_csv_path("holdout_preds"), index=False, encoding="utf-8-sig")
    pull().to_csv(_csv_path("holdout_metrics"), index=False, encoding="utf-8-sig")

    # Final model
    final_xgb = finalize_model(xgb_tuned)
    save_model(final_xgb, os.path.join(RESULTS_DIR, f"modelo_xgb_{nombre}"))
    return final_xgb

# ---------------------------------------------------------
# 4) Ejecutar escenarios
# ---------------------------------------------------------
# A) All features (only very long text columns are ignored)
IGNORE_A = list(texto_largo)
correr_experimento("A_todo", ignore_features=IGNORE_A)

# B) Anti-leakage (ignores the derived columns as well as very long text)
IGNORE_B = list(set([*texto_largo, *derivadas_presentes]))
correr_experimento("B_antileak", ignore_features=IGNORE_B)

print(
    "\nListo. Revisa la carpeta:",
    RESULTS_DIR,
    "y la subcarpeta ./Plots",
)


Columnas derivadas (posible leakage): ['reputacion_score', 'limpieza_score', 'cal_norm', 'amenities_norm', 'reputacion_norm', 'limpieza_norm', 'amenities_count', 'nivel_servicios', 'indice_valor', 'score_premium']
Columnas de texto largo ignoradas: []

=== Experimento: A_todo ===


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
dt,Decision Tree Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.544
rf,Random Forest Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.674
ada,Ada Boost Classifier,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.972
gbc,Gradient Boosting Classifier,1.0,0.0,1.0,1.0,1.0,1.0,1.0,3.22
xgboost,Extreme Gradient Boosting,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.636
lightgbm,Light Gradient Boosting Machine,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.61
catboost,CatBoost Classifier,1.0,1.0,1.0,1.0,1.0,1.0,1.0,5.626
et,Extra Trees Classifier,0.9994,1.0,0.9994,0.9994,0.9994,0.9991,0.9991,0.81
lr,Logistic Regression,0.9993,0.0,0.9993,0.9993,0.9993,0.999,0.999,4.708
lda,Linear Discriminant Analysis,0.9531,0.0,0.9531,0.955,0.9535,0.9296,0.9302,0.506


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Mean,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Std,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Mean,1.0,1.0,1.0,1.0,1.0,1.0,1.0
Std,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Transformation Pipeline and Model Successfully Saved

=== Experimento: B_antileak ===


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,1.0,0.0,1.0,1.0,1.0,1.0,1.0,2.652
catboost,CatBoost Classifier,0.999,1.0,0.999,0.999,0.999,0.9985,0.9985,5.288
xgboost,Extreme Gradient Boosting,0.9938,0.9998,0.9938,0.9938,0.9938,0.9906,0.9907,0.87
lightgbm,Light Gradient Boosting Machine,0.9933,0.9998,0.9933,0.9933,0.9933,0.9899,0.9899,0.886
gbc,Gradient Boosting Classifier,0.9683,0.0,0.9683,0.9683,0.9682,0.9524,0.9525,4.178
et,Extra Trees Classifier,0.9673,0.9975,0.9673,0.9673,0.9673,0.9509,0.9509,1.028
dt,Decision Tree Classifier,0.9632,0.9725,0.9632,0.9632,0.9632,0.9447,0.9447,0.336
rf,Random Forest Classifier,0.9601,0.9966,0.9601,0.9601,0.96,0.9401,0.9401,0.82
lda,Linear Discriminant Analysis,0.9558,0.0,0.9558,0.9572,0.9561,0.9337,0.9341,0.402
svm,SVM - Linear Kernel,0.9133,0.0,0.9133,0.9228,0.9124,0.8699,0.8751,0.984


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9937,0.9998,0.9937,0.9937,0.9937,0.9906,0.9906
1,0.993,0.9999,0.993,0.993,0.993,0.9895,0.9895
2,0.9941,0.9998,0.9941,0.9941,0.9941,0.9912,0.9912
3,0.9941,0.9998,0.9941,0.9941,0.9941,0.9912,0.9912
4,0.9939,0.9998,0.9939,0.9939,0.9939,0.9908,0.9908
Mean,0.9938,0.9998,0.9938,0.9938,0.9938,0.9906,0.9907
Std,0.0004,0.0,0.0004,0.0004,0.0004,0.0006,0.0006


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9943,0.9997,0.9943,0.9943,0.9943,0.9914,0.9914
1,0.9916,0.9998,0.9916,0.9916,0.9916,0.9873,0.9873
2,0.9927,0.9998,0.9927,0.9927,0.9927,0.9891,0.9891
3,0.9916,0.9996,0.9916,0.9916,0.9916,0.9873,0.9874
4,0.9936,0.9999,0.9936,0.9936,0.9936,0.9903,0.9903
Mean,0.9927,0.9998,0.9927,0.9927,0.9927,0.9891,0.9891
Std,0.0011,0.0001,0.0011,0.0011,0.0011,0.0016,0.0016


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extreme Gradient Boosting,0.9959,0.9999,0.9959,0.9959,0.9959,0.9939,0.9939


Transformation Pipeline and Model Successfully Saved

Listo. Revisa la carpeta: c:\Users\FABIO\Downloads\IAhotelera\resultados_segmento y la subcarpeta ./Plots
