In [0]:
%python
%pip install optuna xgboost==2.0.3

In [0]:
%python
# ============================================================
# XGBoost optimisé (scikit-learn API + Optuna)
# ============================================================

# (0) XGBoost and Optuna are installed by the first notebook cell
#     (%pip install optuna xgboost==2.0.3); no additional install is needed here.

import optuna
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

# (1) Load & clean
# Read the training CSV through Spark (header row present, column types inferred).
train_df_spark = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/Volumes/ngow_lakehouse/ml_sandbox/data/train.csv")
)

# These columns are not used as features. Spark's DataFrame.drop is a no-op for
# names that are absent from the schema, so no membership check is required.
train_df_spark = train_df_spark.drop("PassengerId", "Cabin", "Name")

target_col = "HomePlanet"
train_pd = train_df_spark.toPandas()
# Rows without a target cannot be used for supervised training.
train_pd = train_pd.dropna(subset=[target_col])

# NOTE(review): Spark nulls in string columns typically reach pandas as None.
# SimpleImputer's default missing_values=np.nan does not match None in object
# columns, so normalise every missing marker to NaN before preprocessing.
train_pd = train_pd.fillna(value=np.nan)

X = train_pd.drop(columns=[target_col])

# Boolean feature columns are neither "number" nor "object" and would be silently
# dropped by the ColumnTransformer; cast them to 0/1 so they count as numeric.
bool_cols = X.select_dtypes(include=["bool"]).columns
X = X.astype({name: "int8" for name in bool_cols})

# XGBoost's scikit-learn wrapper requires integer class labels 0..n_classes-1
# (raw string labels raise ValueError), so encode the target.
# class_names[i] is the original label for integer code i.
y_codes, class_names = pd.factorize(train_pd[target_col], sort=True)
y = pd.Series(y_codes, index=train_pd.index, name=target_col)
print("Encodage des classes :", dict(enumerate(class_names)))

# "number" covers every integer/float width (e.g. int32 produced by Spark's
# IntegerType), not only int64/float64.
numeric_cols = X.select_dtypes(include=["number"]).columns.tolist()
categorical_cols = X.select_dtypes(include=["object", "category"]).columns.tolist()

# (2) Preprocessors
# Numeric branch: replace missing values with the column median.
numeric_steps = [("imputer", SimpleImputer(strategy="median"))]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: replace missing values with the most frequent value, then
# one-hot encode into a dense array (handle_unknown="ignore" is set so categories
# not seen during fit do not raise at transform time).
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its branch; output order is numeric columns first,
# then the one-hot encoded categorical columns.
column_branches = [
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_branches)

# (3) Holdout split + Optuna objective
# The holdout is carved out BEFORE tuning so that the test rows never influence
# the hyper-parameter search; tuning on all of X/y and then evaluating on a
# subset of it would make the holdout accuracy optimistically biased.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


def objective(trial):
    """Score one XGBoost configuration for Optuna.

    Samples a set of XGBoost hyper-parameters from ``trial``, wraps the model
    in a pipeline with the shared ``preprocessor``, and evaluates it with
    5-fold stratified cross-validation on the training split only
    (``X_train`` / ``y_train``).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyper-parameters.

    Returns
    -------
    float
        Mean cross-validated accuracy (the study maximises this value).
    """
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 1200),
        "max_depth": trial.suggest_int("max_depth", 3, 12),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1e-1, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 1.0, log=True),
        "objective": "multi:softmax",
        "eval_metric": "mlogloss",
        "random_state": 42,
        "n_jobs": -1,
        "tree_method": "hist",  # histogram-based tree construction
    }

    clf = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("model", XGBClassifier(**params)),
    ])

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    # Folds run sequentially: XGBoost already uses every core (n_jobs=-1), and
    # parallelising the folds as well would oversubscribe the CPU.
    scores = cross_val_score(
        clf, X_train, y_train, cv=cv, scoring="accuracy", n_jobs=1
    )
    return scores.mean()


# (4) Optimisation
# Seed the (default) TPE sampler so the search is reproducible across runs.
study_xgb = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study_xgb.optimize(objective, n_trials=40, show_progress_bar=True)

print("Meilleurs hyperparamètres XGB :", study_xgb.best_params)
print("Meilleure accuracy CV XGB     :", round(study_xgb.best_value, 4))

# (5) Final training + holdout evaluation
# Refit the best configuration on the full training split, then score it on the
# holdout rows that were kept out of the search.
final_params = {
    **study_xgb.best_params,
    "random_state": 42,
    "n_jobs": -1,
    "tree_method": "hist",
}
best_xgb = XGBClassifier(**final_params)
clf_xgb = Pipeline(steps=[("preprocessor", preprocessor), ("model", best_xgb)])

clf_xgb.fit(X_train, y_train)
y_pred = clf_xgb.predict(X_test)

print("Accuracy XGB holdout :", round(accuracy_score(y_test, y_pred), 4))
print(classification_report(y_test, y_pred))

# (6) Feature importance (gain-based)
# Rebuild the post-preprocessing column names: numeric columns first, followed
# by the one-hot encoded names taken from the fitted encoder.
fitted_preprocessor = clf_xgb.named_steps["preprocessor"]
ohe = fitted_preprocessor.named_transformers_["cat"].named_steps["onehot"]
ohe_feature_names = ohe.get_feature_names_out(categorical_cols)
feature_names = np.concatenate([numeric_cols, ohe_feature_names])

# Gain importances straight from the underlying booster.
booster = clf_xgb.named_steps["model"].get_booster()
score_dict = booster.get_score(importance_type="gain")

# Booster keys are expected to look like "f123", where the number is the column
# position in the preprocessor output. Features missing from score_dict keep an
# importance of 0; indices beyond the known feature names are skipped.
importances = np.zeros(len(feature_names))
for key, gain in score_dict.items():
    position = int(key[1:])
    if position >= len(importances):
        continue
    importances[position] = gain

fi_xgb = pd.DataFrame({"feature": feature_names, "importance_gain": importances})
fi_xgb = fi_xgb.sort_values("importance_gain", ascending=False)
print(fi_xgb.head(20))