In [0]:
%python
%pip install optuna

In [0]:
%python
       
# ============================================================
# 1. Imports
# ============================================================

import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# ============================================================
# 2. Load the data as a pandas DataFrame
# ============================================================

# Read the CSV with Spark (header row present, column types inferred by
# Spark), then materialise it on the driver as a pandas DataFrame.
train_df_spark = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/Volumes/ngow_lakehouse/ml_sandbox/data/train.csv")
)

# These columns are excluded from the features (original note: avoid data
# leakage).
train_df_spark = train_df_spark.drop("PassengerId", "Cabin", "Name")
train_pd = train_df_spark.toPandas()

target_col = "HomePlanet"
# Rows without a label cannot be used for supervised training.
train_pd = train_pd.dropna(subset=[target_col])
X = train_pd.drop(columns=[target_col])
y = train_pd[target_col]

# ============================================================
# 3. Identify numeric and categorical columns
# ============================================================

# Spark's schema inference can yield dtypes other than int64/float64/object
# (e.g. int32 or bool). A column matched by neither selector is not listed in
# the ColumnTransformer built in objective(), so it would silently be left out
# of the model. Select by dtype family instead of by exact dtype.
#
# Boolean columns are cast to object so that the categorical branch always
# receives object-typed data, whatever mix of columns it contains.
bool_cols = X.select_dtypes(include=["bool"]).columns
X = X.astype({col: object for col in bool_cols})

# NOTE(review): nulls in string columns may surface as None rather than NaN
# after toPandas() -- confirm the imputers treat them as missing.
numeric_cols = X.select_dtypes(include=["number"]).columns
categorical_cols = X.select_dtypes(include=["object", "category"]).columns

# ============================================================
# 4. Optuna objective function
# ============================================================

def objective(trial):
    """Score one Optuna trial with a random-forest pipeline.

    Samples four random-forest hyperparameters from ``trial``, fits a
    preprocessing + model pipeline on 80% of the module-level ``X`` / ``y``
    and returns the accuracy measured on the remaining 20%.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Accuracy of the fitted pipeline on the held-out split.
    """
    # Search space; the dict literal is evaluated top to bottom, so the
    # parameters are always sampled in this order.
    forest_params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 30),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
    }

    # Fixed seed: every trial is fitted and scored on the same 80/20 split.
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Numeric columns: fill gaps with the column median.
    median_fill = Pipeline([("imputer", SimpleImputer(strategy="median"))])

    # Categorical columns: fill gaps with the most frequent value, then
    # one-hot encode; categories unseen during fit are ignored at predict time.
    encode_categories = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ])

    features = ColumnTransformer([
        ("num", median_fill, numeric_cols),
        ("cat", encode_categories, categorical_cols),
    ])

    # Full pipeline: preprocessing followed by the seeded random forest.
    estimator = Pipeline([
        ("preprocessor", features),
        ("model", RandomForestClassifier(random_state=42, **forest_params)),
    ])

    estimator.fit(X_fit, y_fit)
    return accuracy_score(y_holdout, estimator.predict(X_holdout))

# ============================================================
# 5. Run the Optuna optimisation
# ============================================================

# The model and the train/test split are pinned to random_state=42; seed the
# sampler as well so that the hyperparameter search itself is reproducible
# from one run to the next. TPESampler is Optuna's default sampler, so only
# the seed changes.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)  # can be raised (e.g. to 100) if needed

print("Meilleurs hyperparamètres :", study.best_params)
print("Meilleure accuracy :", study.best_value)