In [0]:
%python
       
# 1. Load the data (Spark -> pandas)
# Read the CSV with its header row and let Spark infer the column types.
train_df = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/Volumes/ngow_lakehouse/ml_sandbox/data/train.csv")
)

# Convert the Spark DataFrame to a pandas DataFrame for scikit-learn.
train_pd = train_df.toPandas()
train_pd.head()

# 2. Separate the features and the target
train_df.printSchema()

# Target column to predict
target_col = "HomePlanet"

# Rows without a label cannot be used for supervised training, so drop them.
train_pd = train_pd.dropna(subset=[target_col])
X = train_pd.drop(columns=[target_col])
y = train_pd[target_col]

# Identify the column types that drive the preprocessing branches.
# "number" selects every numeric dtype (int32, float32, ...), not only
# int64/float64, so narrower numeric columns are not silently left out of
# the numeric feature list.
# NOTE(review): boolean columns match neither selection below - confirm
# whether any exist in X and how they should be handled.
numeric_cols = X.select_dtypes(include="number").columns
categorical_cols = X.select_dtypes(include=["object"]).columns

# A bare expression is only echoed when it is the last statement of a
# notebook cell, so print the column lists explicitly.
print(numeric_cols, categorical_cols)

# Créer un pipeline de prétraitement + modèle
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Numeric columns: replace missing values with the column median.
numeric_transformer = Pipeline([("imputer", SimpleImputer(strategy="median"))])

# Categorical columns: replace missing values with the most frequent
# category, then one-hot encode; handle_unknown="ignore" keeps categories
# unseen during fit from raising at transform time.
categorical_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group through its own preprocessing branch.
preprocessor = ColumnTransformer(
    [
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

# Final estimator: preprocessing followed by a 200-tree random forest with a
# fixed seed.
clf = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("model", RandomForestClassifier(n_estimators=200, random_state=42)),
    ]
)

# 3. Train/test split and training
# Hold out 20% of the rows for testing; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the preprocessing steps and the model on the training rows only.
clf.fit(X_train, y_train)

# 4. Evaluation
# The pipeline was already fitted on the training split in step 3, so it is
# not split or fitted again here; this step scores the model on the held-out
# test rows instead.
from sklearn.metrics import accuracy_score, classification_report

y_pred = clf.predict(X_test)

# Overall share of correctly predicted labels on the test set.
print(f"Test accuracy: {accuracy_score(y_test, y_pred):.4f}")

# Per-class precision, recall and F1.
print(classification_report(y_test, y_pred))
