### Imports + carga de dataset + funciones utiles para imprimir metricas y guardar datos en disco 

In [1]:
# ===========================
# Utils: m√©tricas + leaderboard
# ===========================
import os, time, json, joblib
import numpy as np
import pandas as pd
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    roc_auc_score, average_precision_score, brier_score_loss
)

def print_metrics(y_true, y_pred, y_proba=None, title=None):
    """Imprime m√©tricas al estilo RF que mostraste; si pas√°s y_proba agrega ROC-AUC, PR-AUC y Brier."""
    if title:
        print(f"\n=== {title} ===")
    acc = accuracy_score(y_true, y_pred)
    print(f"\nüéØ Accuracy: {acc:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, digits=4))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_true, y_pred))
    if y_proba is not None:
        print("\nProb-metrics:")
        print(f"ROC-AUC : {roc_auc_score(y_true, y_proba):.4f}")
        print(f"PR-AUC  : {average_precision_score(y_true, y_proba):.4f}")
        print(f"Brier   : {brier_score_loss(y_true, y_proba):.4f}")

def save_experiment(exp_dir, model_tag, best_estimator, y_true, y_pred, y_proba, metrics_extra=None):
    """Guarda artefactos est√°ndar del experimento y devuelve dict con m√©tricas."""
    os.makedirs(exp_dir, exist_ok=True)
    # M√©tricas base
    mets = {
        "model": model_tag,
        "accuracy": float(accuracy_score(y_true, y_pred)),
        "roc_auc": float(roc_auc_score(y_true, y_proba)) if y_proba is not None else None,
        "pr_auc": float(average_precision_score(y_true, y_proba)) if y_proba is not None else None,
        "f1": float(f1_score(y_true, y_pred)),
        "brier": float(brier_score_loss(y_true, y_proba)) if y_proba is not None else None,
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
        "exp_dir": exp_dir,
    }
    if metrics_extra:
        mets.update(metrics_extra)

    # Guardados
    pd.DataFrame({"y_true": y_true, "y_pred": y_pred, "y_proba": y_proba}).to_csv(
        os.path.join(exp_dir, "test_predictions.csv"), index=False
    )
    with open(os.path.join(exp_dir, "metrics.json"), "w") as f:
        json.dump({model_tag: mets}, f, indent=2)
    joblib.dump(best_estimator, os.path.join(exp_dir, f"{model_tag}.pkl"))

    # Leaderboard (append)
    lb_path = "experiments/leaderboard.csv"
    row = {k: mets[k] for k in ["timestamp","model","accuracy","roc_auc","pr_auc","brier","exp_dir"]}
    df = pd.DataFrame([row])
    os.makedirs("experiments", exist_ok=True)
    if os.path.exists(lb_path):
        df.to_csv(lb_path, mode="a", header=False, index=False)
    else:
        df.to_csv(lb_path, index=False)
    return mets

DATA_PATH = "../datasets/final/ico_dataset_final_v2_clean_enriquecido_feature_engineering_preico_v1.csv"
STAMP = time.strftime("%Y%m%d_%H%M%S")

### XGBoost (grid + m√©tricas + save)

In [6]:
%%time
import os, time, json, joblib
import numpy as np, pandas as pd

from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss
from sklearn.utils import check_random_state
from joblib import Memory

# -------- data
DATA_PATH = "../datasets/final/ico_dataset_final_v2_clean_enriquecido_feature_engineering_preico_v1.csv"
df = pd.read_csv(DATA_PATH)
target = "ico_successful"
df[target] = df[target].astype(int)
df = df.dropna(subset=[target])

cat_cols = [c for c in ["platform","category","location"] if c in df.columns]
bin_cols = [c for c in ["mvp","has_twitter","has_facebook","is_tax_regulated","has_github",
                        "has_reddit","has_website","has_whitepaper","kyc",
                        "accepts_BTC","accepts_ETH","has_contract_address"] if c in df.columns]
num_cols = [c for c in df.columns if c not in cat_cols + bin_cols + [target]]

# -------- pre: sin scaler (√°rboles no lo precisan), OHE densa->sparse para acelerar
num_pipe = Pipeline([("imputer", SimpleImputer(strategy="median"))])  # <-- sin StandardScaler
cat_pipe = Pipeline([("imputer", SimpleImputer(strategy="most_frequent")),
                     ("ohe", OneHotEncoder(handle_unknown="ignore", drop="first"))])
bin_pipe = Pipeline([("imputer", SimpleImputer(strategy="constant", fill_value=0))])

pre = ColumnTransformer(
    [("num", num_pipe, num_cols), ("cat", cat_pipe, cat_cols), ("bin", bin_pipe, bin_cols)],
    remainder="drop", sparse_threshold=0.5  # fuerza salida sparse si es posible
)

# -------- split
X = df.drop(columns=[target]); y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, stratify=y, random_state=42)

# -------- cache del pipeline para no re-OHE en cada fit
CACHE_DIR = "experiments/_sk_cache"
os.makedirs(CACHE_DIR, exist_ok=True)
mem = Memory(location=CACHE_DIR, verbose=0)

# -------- classifier: sin nested threads
xgb = XGBClassifier(
    objective="binary:logistic",
    eval_metric="logloss",
    tree_method="hist",
    random_state=42,
    n_jobs=1  # 1 para evitar anidado, se usa n_jobs en RandomizedSearch
)

pipe = Pipeline([("pre", pre), ("clf", xgb)], memory=mem)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rng = check_random_state(42)

# -------- espacio de b√∫squeda m√°s acotado
param_dist = {
    # "clf__n_estimators": rng.randint(300, 1200, 10),
    "clf__n_estimators": [300, 600, 900],
    "clf__learning_rate": [0.02, 0.03, 0.05, 0.1],
    "clf__max_depth": [3, 4, 5, 6],
    "clf__min_child_weight": [1, 3, 5, 10],
    "clf__subsample": [0.7, 0.85, 1.0],
    "clf__colsample_bytree": [0.6, 0.8, 1.0],
    "clf__reg_lambda": [1.0, 2.0, 4.0],
    "clf__scale_pos_weight": [1.0, 1.3, 1.6, 2.0],
}

# 48 iteraciones 
rs = RandomizedSearchCV(
    pipe, param_distributions=param_dist, n_iter=48,
    cv=cv, scoring="average_precision",
    n_jobs=4, 
    random_state=42, refit=True, verbose=1
)
rs.fit(X_train, y_train)

best_xgb = rs.best_estimator_
y_proba = best_xgb.predict_proba(X_test)[:, 1]
y_pred  = (y_proba >= 0.5).astype(int)

print(f"[XGB] Best params: {best_xgb}")
print_metrics(y_test, y_pred, y_proba, title="XGBoost (raw)")

# -------- Prueba con Calibraci√≥n isot√≥nica
xgb_cal = CalibratedClassifierCV(estimator=best_xgb, method="isotonic", cv=5)
xgb_cal.fit(X_train, y_train)
y_proba_cal = xgb_cal.predict_proba(X_test)[:, 1]
y_pred_cal  = (y_proba_cal >= 0.5).astype(int)
print_metrics(y_test, y_pred_cal, y_proba_cal, title="XGBoost (calibrated)")

# -------- Grabado de experimento
EXP_DIR = f"experiments/xgboost_{time.strftime('%Y%m%d_%H%M%S')}"
_ = save_experiment(EXP_DIR, "xgboost_isotonic_calibrated", xgb_cal, y_test.values, y_pred_cal, y_proba_cal,
                    metrics_extra={"best_params": gs.best_params_})

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[XGB] Best params: Pipeline(memory=Memory(location=experiments/_sk_cache\joblib),
         steps=[('pre',
                 ColumnTransformer(sparse_threshold=0.5,
                                   transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median'))]),
                                                  ['total_tokens', 'ico_score',
                                                   'start_year', 'start_qtr',
                                                   'soft_cap', 'hard_cap',
                                                   'token_price',
                                                   'tokens_for_sale',
                                                   'min_purchase',
                                                   'max_purchase',
                                 

### LightGBM (grid + m√©tricas + save)

In [None]:
%%time

# ===========================
# LightGBM (grid + m√©tricas + save)
# ===========================
import os, time
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.calibration import CalibratedClassifierCV

df = pd.read_csv(DATA_PATH)
target = "ico_successful"
df[target] = df[target].astype(int)
df = df.dropna(subset=[target])

cat_cols = [c for c in ["platform","category","location","caps_unit"] if c in df.columns]
bin_cols = [c for c in ["mvp","has_twitter","has_facebook","is_tax_regulated","has_github",
                        "has_reddit","has_website","has_whitepaper","kyc",
                        "accepts_BTC","accepts_ETH","has_contract_address"] if c in df.columns]
num_cols = [c for c in df.columns if c not in cat_cols + bin_cols + [target]]

num_pipe = Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())])
cat_pipe = Pipeline([("imputer", SimpleImputer(strategy="most_frequent")), ("ohe", OneHotEncoder(handle_unknown="ignore", drop="first"))])
bin_pipe = Pipeline([("imputer", SimpleImputer(strategy="constant", fill_value=0))])
pre = ColumnTransformer([("num", num_pipe, num_cols), ("cat", cat_pipe, cat_cols), ("bin", bin_pipe, bin_cols)],
                        remainder="drop", sparse_threshold=0.3)

X = df.drop(columns=[target]); y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, stratify=y, random_state=42)
print(f"Variables finales: {X.shape[1]}  |  Filas usadas: {X.shape[0]}")

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lgb = LGBMClassifier(random_state=42, n_jobs=-1)
lgb_pipe = Pipeline([("pre", pre), ("clf", lgb)])
lgb_grid = {
    "clf__n_estimators": [400, 600, 800],
    "clf__num_leaves": [31, 63, 127],
    "clf__max_depth": [-1, 10, 20],
    "clf__learning_rate": [0.03, 0.1],
    "clf__subsample": [0.7, 1.0],
    "clf__colsample_bytree": [0.6, 1.0]
}
gs = GridSearchCV(lgb_pipe, lgb_grid, cv=cv, scoring="average_precision", n_jobs=-1, refit=True)
gs.fit(X_train, y_train)

best_lgb = gs.best_estimator_
y_proba = best_lgb.predict_proba(X_test)[:, 1]
y_pred  = (y_proba >= 0.5).astype(int)
print(f"[LGBM] Best params: {gs.best_params_}")
print_metrics(y_test, y_pred, y_proba, title="LightGBM (raw)")

# -------- Prueba con Calibraci√≥n isot√≥nica
cal = CalibratedClassifierCV(estimator=best_lgb.named_steps["clf"], method="isotonic", cv=5)
lgb_cal = Pipeline([("pre", pre), ("cal", cal)])
lgb_cal.fit(X_train, y_train)
y_proba_cal = lgb_cal.predict_proba(X_test)[:, 1]
y_pred_cal  = (y_proba_cal >= 0.5).astype(int)
print_metrics(y_test, y_pred_cal, y_proba_cal, title="LightGBM (calibrated)")

EXP_DIR = f"experiments/lightgbm_{time.strftime('%Y%m%d_%H%M%S')}"
_ = save_experiment(EXP_DIR, "lightgbm_isotonic_calibrated", lgb_cal, y_test.values, y_pred_cal, y_proba_cal,
                    metrics_extra={"best_params": gs.best_params_})


### CatBoost (grid + m√©tricas + save)

In [None]:
%%time

# ===========================
# CatBoost (grid + m√©tricas + save)
# ===========================
import os, time
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.calibration import CalibratedClassifierCV

df = pd.read_csv(DATA_PATH)
target = "ico_successful"
df[target] = df[target].astype(int)
df = df.dropna(subset=[target])

cat_cols = [c for c in ["platform","category","location","caps_unit"] if c in df.columns]
bin_cols = [c for c in ["mvp","has_twitter","has_facebook","is_tax_regulated","has_github",
                        "has_reddit","has_website","has_whitepaper","kyc",
                        "accepts_BTC","accepts_ETH","has_contract_address"] if c in df.columns]
num_cols = [c for c in df.columns if c not in cat_cols + bin_cols + [target]]

num_pipe = Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())])
cat_pipe = Pipeline([("imputer", SimpleImputer(strategy="most_frequent")), ("ohe", OneHotEncoder(handle_unknown="ignore", drop="first"))])
bin_pipe = Pipeline([("imputer", SimpleImputer(strategy="constant", fill_value=0))])
pre = ColumnTransformer([("num", num_pipe, num_cols), ("cat", cat_pipe, cat_cols), ("bin", bin_pipe, bin_cols)],
                        remainder="drop", sparse_threshold=0.3)

X = df.drop(columns=[target]); y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, stratify=y, random_state=42)
print(f"Variables finales: {X.shape[1]}  |  Filas usadas: {X.shape[0]}")

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cat = CatBoostClassifier(random_state=42, loss_function="Logloss", verbose=False, allow_writing_files=False)
cat_pipe = Pipeline([("pre", pre), ("clf", cat)])
cat_grid = {
    "clf__iterations": [400, 600, 800],
    "clf__depth": [4, 6, 8],
    "clf__learning_rate": [0.03, 0.1],
    "clf__l2_leaf_reg": [3.0, 5.0, 7.0],
}
gs = GridSearchCV(cat_pipe, cat_grid, cv=cv, scoring="average_precision", n_jobs=-1, refit=True)
gs.fit(X_train, y_train)

best_cat = gs.best_estimator_
y_proba = best_cat.predict_proba(X_test)[:, 1]
y_pred  = (y_proba >= 0.5).astype(int)
print(f"[CAT] Best params: {gs.best_params_}")
print_metrics(y_test, y_pred, y_proba, title="CatBoost (raw)")

cal = CalibratedClassifierCV(estimator=best_cat.named_steps["clf"], method="isotonic", cv=5)
cat_cal = Pipeline([("pre", pre), ("cal", cal)])
cat_cal.fit(X_train, y_train)
y_proba_cal = cat_cal.predict_proba(X_test)[:, 1]
y_pred_cal  = (y_proba_cal >= 0.5).astype(int)
print_metrics(y_test, y_pred_cal, y_proba_cal, title="CatBoost (calibrated)")

EXP_DIR = f"experiments/catboost_{time.strftime('%Y%m%d_%H%M%S')}"
_ = save_experiment(EXP_DIR, "catboost_isotonic_calibrated", cat_cal, y_test.values, y_pred_cal, y_proba_cal,
                    metrics_extra={"best_params": gs.best_params_})


In [None]:
# ===========================
# Leaderboard (lee el CSV acumulado)
# ===========================
import pandas as pd

lb_path = "experiments/leaderboard.csv"
lb = pd.read_csv(lb_path)
# Orden√° por PR-AUC y, de empate, por ROC-AUC (desc)
lb_sorted = lb.sort_values(by=["pr_auc","roc_auc"], ascending=False).reset_index(drop=True)
lb_sorted
