In [None]:
import os
import random

import numpy as np
import optuna
import pandas as pd
from joblib import dump
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, make_scorer, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Single seed shared by every stochastic step in this notebook.
RANDOM_SEED = 42

# NOTE(review): the interpreter reads PYTHONHASHSEED at start-up, so assigning
# it here presumably only influences child processes — confirm it is needed.
os.environ["PYTHONHASHSEED"] = f"{RANDOM_SEED}"

# Seed the global NumPy and standard-library random generators.
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

In [None]:
# ---- Input paths (Kaggle competition dataset directory) ----
# Of the input paths below, only TRAIN_METADATA_PROCESSED_CSV,
# TEST_METADATA_PROCESSED_CSV and TRAIN_METADATA_AUGMENTED_CSV are read by the
# cells visible in this notebook section (see load_metadata_dataset); the raw
# metadata and HDF5 image paths are not referenced here.
TRAIN_METADATA_CSV = "/kaggle/input/kaggleisic-challenge/new-train-metadata.csv"
TEST_METADATA_CSV = "/kaggle/input/kaggleisic-challenge/students-test-metadata.csv"
TRAIN_METADATA_PROCESSED_CSV = (
    "/kaggle/input/kaggleisic-challenge/train-metadata-processed.csv"
)
TEST_METADATA_PROCESSED_CSV = (
    "/kaggle/input/kaggleisic-challenge/test-metadata-processed.csv"
)
TRAIN_HDF5 = "/kaggle/input/kaggleisic-challenge/train-image.hdf5"
TEST_HDF5 = "/kaggle/input/kaggleisic-challenge/test-image.hdf5"

# Augmented variants of the training metadata / images.
TRAIN_METADATA_AUGMENTED_CSV = (
    "/kaggle/input/kaggleisic-challenge/train-metadata-augmented.csv"
)
TRAIN_AUGMENTED_HDF5 = "/kaggle/input/kaggleisic-challenge/train-image-augmented.hdf5"

# ---- Output paths (Kaggle working directory) ----
# The *_MODEL_* files are written with joblib.dump and the *_SUBMISSION_* files
# with DataFrame.to_csv in the final prediction cells.
# NOTE(review): ".pth" is usually associated with PyTorch checkpoints, whereas
# these files are joblib dumps — confirm the extension is intentional.
OUTPUT_FINAL_MODEL_LGBM = "/kaggle/working/lgbm_model.pth"
OUTPUT_FINAL_SUBMISSION_LGBM = "/kaggle/working/lgbm_submission.csv"

OUTPUT_FINAL_MODEL_XGB = "/kaggle/working/xgb_model.pth"
OUTPUT_FINAL_SUBMISSION_XGB = "/kaggle/working/xgb_submission.csv"

# Metadata column names intended to be dropped.
# NOTE(review): this list is not referenced by any cell visible in this
# section — presumably the processed CSVs already exclude these columns; verify.
DROP_COLUMNS = [
    "image_type",
    "patient_id",
    "copyright_license",
    "attribution",
    "anatom_site_general",
    "tbp_lv_location_simple",
]

In [None]:
def load_metadata_dataset(
    train_frac=0.8, seed=42, is_subsampled=False, is_augmented=False
) -> tuple:
    """Load the metadata CSVs and build train / validation / test frames.

    Args:
        train_frac: Share of the training CSV assigned to the training split
            (forwarded to ``train_test_split`` as ``train_size``).
        seed: Random state used for the stratified split and, when
            ``is_subsampled`` is set, for the balanced subsampling as well.
        is_subsampled: If True, the training and validation splits are each
            replaced by the output of ``create_balanced_subset``.
        is_augmented: If True, read ``TRAIN_METADATA_AUGMENTED_CSV`` instead of
            ``TRAIN_METADATA_PROCESSED_CSV``.

    Returns:
        ``(train_dataset, valid_dataset, test_dataset)`` DataFrames, each with
        a fresh ``0..n-1`` index.
    """
    train_file = (
        TRAIN_METADATA_AUGMENTED_CSV if is_augmented else TRAIN_METADATA_PROCESSED_CSV
    )

    # Load the metadata CSV files
    train_df = pd.read_csv(train_file)
    test_df = pd.read_csv(TEST_METADATA_PROCESSED_CSV)

    # Stratify on the target so both splits keep the class distribution
    train_dataset, valid_dataset = train_test_split(
        train_df, train_size=train_frac, stratify=train_df["target"], random_state=seed
    )

    # Discard the shuffled original row labels
    train_dataset = train_dataset.reset_index(drop=True)
    valid_dataset = valid_dataset.reset_index(drop=True)
    test_dataset = test_df.reset_index(drop=True)

    # Optionally create a balanced subset. The caller's seed is forwarded so
    # that a non-default ``seed`` also controls which negatives are kept
    # (previously the helper silently fell back to its own default seed).
    if is_subsampled:
        train_dataset = create_balanced_subset(train_dataset, seed=seed)
        valid_dataset = create_balanced_subset(valid_dataset, seed=seed)

    print(f"train_dataset shape: {train_dataset.shape}")
    print(f"valid_dataset shape: {valid_dataset.shape}")
    print(f"test_dataset shape:  {test_dataset.shape}")

    return train_dataset, valid_dataset, test_dataset


def create_balanced_subset(
    df: pd.DataFrame, target_col="target", seed=42, neg_ratio=2
) -> pd.DataFrame:
    """Keep every positive row and subsample the negative rows.

    Args:
        df: Frame containing a binary label column.
        target_col: Name of the label column; rows equal to 1 are treated as
            positives and rows equal to 0 as negatives.
        seed: Random state for both the negative subsampling and the final
            shuffle.
        neg_ratio: Number of negatives to keep per positive (default 2, i.e.
            the original 2:1 ratio). If fewer negatives are available, all of
            them are kept.

    Returns:
        A shuffled DataFrame with a fresh ``0..n-1`` index.

    Raises:
        ValueError: If ``neg_ratio`` is negative.
    """
    if neg_ratio < 0:
        raise ValueError("neg_ratio must be non-negative")

    positives = df[df[target_col] == 1]
    # Filter the negatives once and reuse the result for both the cap and the draw.
    all_negatives = df[df[target_col] == 0]

    # Never request more negatives than exist.
    n_negatives = min(int(len(positives) * neg_ratio), len(all_negatives))
    negatives = all_negatives.sample(n=n_negatives, random_state=seed)

    balanced_df = (
        pd.concat([positives, negatives])
        .sample(frac=1, random_state=seed)  # shuffle so classes are interleaved
        .reset_index(drop=True)
    )
    return balanced_df

In [None]:
# Load the non-augmented metadata and subsample both splits to the balanced ratio.
(train_meta_df, valid_meta_df, test_meta_df) = load_metadata_dataset(
    is_augmented=False,
    is_subsampled=True,
)

In [None]:
# Cross-validation splitters: 3 folds while tuning, 5 folds for the final score.
inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_SEED)
outer = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

# Scorer objects; the cross_val_score calls in this cell use the built-in
# "roc_auc" scoring string rather than these.
AUC_scorer = make_scorer(roc_auc_score)
accuracy_scorer = make_scorer(accuracy_score)

# Result accumulators keyed by model name.
inner_AUC = {}
outer_results = {}

# Train and validation splits are merged: cross-validation does the splitting.
full_meta_df = pd.concat([train_meta_df, valid_meta_df], axis=0)
X_train = full_meta_df.drop(columns=["target"])
Y_train = full_meta_df["target"]
X_test = test_meta_df.drop(columns=["target"])
Y_test = test_meta_df["target"]


def lgbm_objective(trial):
    """Optuna objective: mean inner-CV ROC AUC of a LightGBM candidate.

    The hyperparameters are suggested in a fixed order (n_estimators,
    max_depth, learning_rate, subsample, colsample_bytree) and the candidate is
    scored with ``cross_val_score`` on ``X_train`` / ``Y_train`` using the
    ``inner`` splitter.
    """
    n_estimators = trial.suggest_int("n_estimators", 50, 750)
    max_depth = trial.suggest_int("max_depth", 3, 20)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.3)
    subsample = trial.suggest_float("subsample", 0.5, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.5, 1.0)

    candidate = LGBMClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        random_state=RANDOM_SEED,
        verbosity=-1,
        class_weight="balanced",
    )

    fold_scores = cross_val_score(
        candidate, X_train, Y_train, cv=inner, scoring="roc_auc"
    )
    return fold_scores.mean()


# Seeded TPE search over 30 trials, maximizing the objective.
sampler = optuna.samplers.TPESampler(seed=RANDOM_SEED)
lgbm_study = optuna.create_study(direction="maximize", sampler=sampler)
lgbm_study.optimize(lgbm_objective, n_trials=30)

# The study only records the suggested values, so the fixed settings used in
# the objective are added back before the model is rebuilt.
best_params_lgbm = {
    **lgbm_study.best_params,
    "random_state": RANDOM_SEED,
    "verbosity": -1,
    "class_weight": "balanced",
}

print("Best params:", best_params_lgbm)

# NOTE(review): the hyperparameters were tuned on the same X_train that the
# outer splitter scores here, so this is not a nested-CV estimate and may be
# optimistic.
lgbm_model = LGBMClassifier(**best_params_lgbm)
outer_fold_scores = cross_val_score(
    lgbm_model, X_train, Y_train, cv=outer, scoring="roc_auc"
)
outer_auc = outer_fold_scores.mean()

inner_AUC["LightGBM"] = lgbm_study.best_value
outer_results["LightGBM"] = outer_auc

In [None]:
# Cross-validation splitters: 3 folds while tuning, 5 folds for the final score.
inner = StratifiedKFold(n_splits=3, shuffle=True, random_state=RANDOM_SEED)
outer = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_SEED)

# Scorer objects; the cross_val_score calls in this cell use the built-in
# "roc_auc" scoring string rather than these.
AUC_scorer = make_scorer(roc_auc_score)
accuracy_scorer = make_scorer(accuracy_score)

# Reuse the result accumulators if an earlier cell already created them.
# Re-assigning empty dicts here would discard the scores stored for other
# models, leaving the summary prints and results table with XGBoost only.
if "inner_AUC" not in globals():
    inner_AUC = {}
if "outer_results" not in globals():
    outer_results = {}

# Train and validation splits are merged: cross-validation does the splitting.
full_meta_df = pd.concat([train_meta_df, valid_meta_df], axis=0)
X_train = full_meta_df.drop(columns=["target"])
Y_train = full_meta_df["target"]
X_test = test_meta_df.drop(columns=["target"])
Y_test = test_meta_df["target"]


def xgb_objective(trial):
    """Optuna objective: mean inner-CV ROC AUC of an XGBoost candidate.

    Suggests n_estimators, max_depth, learning_rate, subsample and
    colsample_bytree, then scores the candidate with ``cross_val_score`` on
    ``X_train`` / ``Y_train`` using the ``inner`` splitter.
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 750),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "random_state": RANDOM_SEED,
    }

    model = XGBClassifier(**params)

    return cross_val_score(model, X_train, Y_train, cv=inner, scoring="roc_auc").mean()


# Seeded TPE search over 30 trials, maximizing the objective.
sampler = optuna.samplers.TPESampler(seed=RANDOM_SEED)
xgb_study = optuna.create_study(direction="maximize", sampler=sampler)
xgb_study.optimize(xgb_objective, n_trials=30)

# The study only records the suggested values; add the fixed seed back.
best_params_xgb = xgb_study.best_params
best_params_xgb["random_state"] = RANDOM_SEED

print("Best params:", best_params_xgb)

# NOTE(review): the hyperparameters were tuned on the same X_train that the
# outer splitter scores here, so this is not a nested-CV estimate and may be
# optimistic.
xgb_model = XGBClassifier(**best_params_xgb)
outer_auc = cross_val_score(
    xgb_model, X_train, Y_train, cv=outer, scoring="roc_auc"
).mean()

inner_AUC["XGBoost"] = xgb_study.best_value
outer_results["XGBoost"] = outer_auc

In [None]:
# Report the per-model AUC dictionaries collected by the tuning cells.
for scores_label, scores_by_model in (
    ("Inner AUC scores:", inner_AUC),
    ("Outer AUC scores:", outer_results),
):
    print(scores_label, scores_by_model)

In [None]:
# Train both tuned models on the full training set
lgbm_model.fit(X_train, Y_train)
xgb_model.fit(X_train, Y_train)

# Select the test features in the training column order, as the final
# prediction cell does, so a column-order mismatch between the train and test
# frames cannot reach the models.
X_test_aligned = X_test[X_train.columns]

# Predict class labels on the test set
lgbm_preds = lgbm_model.predict(X_test_aligned)
xgb_preds = xgb_model.predict(X_test_aligned)

# Accuracy uses the predicted labels; AUC uses the probability of class 1
lgbm_test_accuracy = accuracy_score(Y_test, lgbm_preds)
lgbm_test_auc = roc_auc_score(Y_test, lgbm_model.predict_proba(X_test_aligned)[:, 1])

xgb_test_accuracy = accuracy_score(Y_test, xgb_preds)
xgb_test_auc = roc_auc_score(Y_test, xgb_model.predict_proba(X_test_aligned)[:, 1])

In [None]:
# Report the test-set accuracy and AUC of both models.
for metric_label, metric_value in (
    ("LightGBM Test Accuracy:", lgbm_test_accuracy),
    ("LightGBM Test AUC:", lgbm_test_auc),
    ("XGBoost Test Accuracy:", xgb_test_accuracy),
    ("XGBoost Test AUC:", xgb_test_auc),
):
    print(metric_label, metric_value)

In [None]:
# Summary table: one row per model with the AUC stored in inner_AUC
results_table = pd.DataFrame(
    {
        "Model": list(inner_AUC),
        "AUC": list(inner_AUC.values()),
    }
)

# Centered, bordered cells on a white background; the best AUC is highlighted.
cell_properties = {
    "text-align": "center",
    "border": "1px solid black",
    "background-color": "white",
}
results_table = results_table.style.set_properties(**cell_properties).highlight_max(
    subset=["AUC"], axis=0, color="lightgreen"
)

# Render the styled table
display(results_table)

In [None]:
# ---- FINAL PREDICTION WITH LGBM ----
# 1. Prepare the feature frames
train_full = full_meta_df.drop(columns=["target"]).copy()
test_full = test_meta_df.drop(columns=["target"]).copy()

# Enforce the training column order on the test features
test_full = test_full[train_full.columns]

# 2. Encode the target
label_encoder = LabelEncoder()
y_train_full = label_encoder.fit_transform(full_meta_df["target"])

# 3. Instantiate the model with the best hyperparameters
lgbm_final = LGBMClassifier(**best_params_lgbm)

# 4. Train on all available training data
lgbm_final.fit(train_full, y_train_full)

# 5. Predict the probability of the second encoded class on the test set
lgbm_test_probs = lgbm_final.predict_proba(test_full)[:, 1]

# 6. Save to CSV. The probability column is created as "target" directly
#    instead of being renamed in place afterwards.
lgbm_submission = pd.DataFrame(
    {"isic_id": test_meta_df["isic_id"], "target": lgbm_test_probs}
)
lgbm_submission.to_csv(OUTPUT_FINAL_SUBMISSION_LGBM, index=False)

# Report the path actually written (the old message named a different file).
print(f"✅ Predicciones LGBM guardadas en '{OUTPUT_FINAL_SUBMISSION_LGBM}'")
display(lgbm_submission.head())

# 7. Save the fitted model with joblib
dump(lgbm_final, OUTPUT_FINAL_MODEL_LGBM)
print(f"✅ Modelo LGBM guardado en '{OUTPUT_FINAL_MODEL_LGBM}'")

In [None]:
# ---- FINAL PREDICTION WITH XGBOOST ----
# 1. Data preparation: this cell uses train_full, test_full and y_train_full,
#    which must already exist in the kernel namespace.

# 2. Instantiate the model with the best hyperparameters
xgb_final = XGBClassifier(**best_params_xgb)

# 3. Train on all available training data
xgb_final.fit(train_full, y_train_full)

# 4. Predict the probability of the second encoded class on the test set
xgb_test_probs = xgb_final.predict_proba(test_full)[:, 1]

# 5. Save to CSV. The probability column is created as "target" directly
#    instead of being renamed in place afterwards.
xgb_submission = pd.DataFrame(
    {"isic_id": test_meta_df["isic_id"], "target": xgb_test_probs}
)
xgb_submission.to_csv(OUTPUT_FINAL_SUBMISSION_XGB, index=False)

# Report the path actually written (the old message named a different file).
print(f"✅ Predicciones XGBoost guardadas en '{OUTPUT_FINAL_SUBMISSION_XGB}'")
display(xgb_submission.head())

# 6. Save the fitted model with joblib
dump(xgb_final, OUTPUT_FINAL_MODEL_XGB)
print(f"✅ Modelo XGBoost guardado en '{OUTPUT_FINAL_MODEL_XGB}'")