In [0]:
%python
# ============================================================
# 1. Imports
# ============================================================

import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier

# Random-forest hyperparameters selected by the Optuna search; they are
# logged to MLflow and used to configure the final classifier.
best_params = {
    "n_estimators": 440,
    "max_depth": 30,
    "min_samples_split": 2,
    "min_samples_leaf": 6,
}

# ============================================================
# 2. Prepare the data (Spark → Pandas)
# ============================================================

# Read the training CSV, using the first row as header and letting Spark
# infer the column types.
train_df_spark = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/Volumes/ngow_lakehouse/ml_sandbox/data/train.csv")
)

# Remove the columns that are excluded from the feature set, then collect
# the result into a pandas DataFrame for scikit-learn.
train_df_spark = train_df_spark.drop("PassengerId", "Cabin", "Name")
train_pd = train_df_spark.toPandas()

# Rows without a label cannot be used for supervised training.
target_col = "HomePlanet"
train_pd = train_pd.dropna(subset=[target_col])
X = train_pd.drop(columns=[target_col])
y = train_pd[target_col]

# Partition the features into two complementary groups so that every column
# is routed to exactly one transformer. Matching on exact dtypes
# (int64/float64/object) leaves out bool, int32, float32 or category columns,
# and a ColumnTransformer drops unlisted columns by default, so such features
# would silently disappear from the model.
# NOTE(review): Spark-inferred schemas converted with toPandas() can yield
# these dtypes (e.g. a boolean column without nulls) — confirm against the
# actual schema of train.csv.
numeric_cols = X.select_dtypes(include="number").columns
categorical_cols = X.select_dtypes(exclude="number").columns

# ============================================================
# 3. Build the final pipeline with the best hyperparameters
# ============================================================

# Numeric features: replace missing values with the column median.
numeric_steps = [("imputer", SimpleImputer(strategy="median"))]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical features: replace missing values with the most frequent value,
# then one-hot encode; handle_unknown="ignore" keeps transform() from failing
# on categories that were not present when the encoder was fitted.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its own transformer.
column_routes = [
    ("num", numeric_transformer, numeric_cols),
    ("cat", categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Optimized model (Optuna): pick the tuned values out of best_params by name
# and fix the seed so training is reproducible.
forest_param_names = (
    "n_estimators",
    "max_depth",
    "min_samples_split",
    "min_samples_leaf",
)
forest_kwargs = {name: best_params[name] for name in forest_param_names}
best_model = RandomForestClassifier(random_state=42, **forest_kwargs)

# Full estimator: preprocessing followed by the classifier.
final_steps = [
    ("preprocessor", preprocessor),
    ("model", best_model),
]
clf = Pipeline(steps=final_steps)

# ============================================================
# 4. Split the data
# ============================================================

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible from one execution to the next.
TEST_FRACTION = 0.2
SPLIT_SEED = 42

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

# ============================================================
# 5. Training + MLflow tracking
# ============================================================

# Experiment that will receive the run (your MLflow folder).
experiment_path = "/Users/wilson.ngo@he-arc.ch/Spaceship-Titanic"
mlflow.set_experiment(experiment_path)

with mlflow.start_run():
    # Record the hyperparameters used for this run.
    mlflow.log_params(best_params)

    # Fit the whole pipeline (preprocessing + model) on the training split.
    clf.fit(X_train, y_train)

    # Score the held-out split and log its accuracy as a run metric.
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    mlflow.log_metric("accuracy", accuracy)
    print("Accuracy loguée dans MLflow :", accuracy)

    # Log the fitted pipeline with a signature inferred from the training
    # features and the predictions, and register it as "SpaceshipRF".
    signature = mlflow.models.infer_signature(X_train, y_pred)
    logging_options = {
        "signature": signature,
        "registered_model_name": "SpaceshipRF",
    }
    mlflow.sklearn.log_model(clf, "model", **logging_options)

# ============================================================
# End: the run will appear in the MLflow UI
# ============================================================