# 06 - Classification Random Forest

Entraînement d'un modèle MLlib pour classifier les phases de vol.

## Configuration

In [None]:
from pyspark.sql.functions import col
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, IndexToString
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from config import get_s3_path, create_spark_session

# Location of the ML-ready "flights_ml" dataset in the "silver" layer,
# as resolved by the project's config helper.
SILVER_ML_PATH = get_s3_path("silver", "flights_ml")

# Spark session used by every cell below.
spark = create_spark_session("RandomForestClassification")

print("âœ… Input:", SILVER_ML_PATH)

## Lecture des données

In [None]:
# Numeric columns that are assembled into the model's feature vector.
feature_columns = [
    "altitude_meters",
    "velocity_kmh",
    "altitude_change",
    "velocity_change",
    "rolling_avg_altitude",
    "rolling_std_altitude",
    "rolling_avg_velocity",
]

# Load the Delta table, keep only rows that carry a label, then replace
# missing feature values with 0.
df = spark.read.format("delta").load(SILVER_ML_PATH)
labelled_rows = df.where(col("flight_phase").isNotNull())
df_clean = labelled_rows.na.fill(0, subset=feature_columns)

row_count = df_clean.count()
print(f"ðŸ“Š {row_count:,} lignes")

# Class distribution, most frequent phase first.
phase_counts = df_clean.groupBy("flight_phase").count()
phase_counts.orderBy(col("count").desc()).show()

## Pipeline MLlib

In [None]:
label_indexer = StringIndexer(inputCol="flight_phase", outputCol="label", handleInvalid="skip")
vector_assembler = VectorAssembler(inputCols=feature_columns, outputCol="features_raw", handleInvalid="skip")
scaler = StandardScaler(inputCol="features_raw", outputCol="features", withStd=True, withMean=False)
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=100, maxDepth=10, seed=42)
label_converter = IndexToString(inputCol="prediction", outputCol="predicted_label", labels=label_indexer.fit(df_clean).labels)

pipeline = Pipeline(stages=[label_indexer, vector_assembler, scaler, rf, label_converter])

print("âœ… Pipeline crÃ©Ã©")

## Entraînement

In [None]:
# 80/20 train/test split with a fixed seed.
splits = df_clean.randomSplit([0.8, 0.2], seed=42)
train_df, test_df = splits

train_rows = train_df.count()
test_rows = test_df.count()
print(f"ðŸ“Š Train: {train_rows:,} | Test: {test_rows:,}")
print("ðŸš€ EntraÃ®nement...")

# Fit every pipeline stage on the training split only.
model = pipeline.fit(train_df)

print("âœ… ModÃ¨le entraÃ®nÃ©")

## Évaluation

In [None]:
# Score the held-out split with the fitted pipeline.
predictions = model.transform(test_df)
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

# Evaluate each metric by overriding metricName per call, so the evaluator
# object itself is left unchanged.
metric_values = {
    metric: evaluator.evaluate(predictions, {evaluator.metricName: metric})
    for metric in ("accuracy", "f1")
}
accuracy = metric_values["accuracy"]
f1 = metric_values["f1"]

print(f"ðŸŽ¯ Accuracy: {accuracy:.4f}")
print(f"ðŸŽ¯ F1 Score: {f1:.4f}")

## Matrice de confusion

In [None]:
# Confusion matrix in long form: one row per (actual phase, predicted phase)
# pair with its count; show up to 50 rows.
confusion_counts = predictions.groupBy("flight_phase", "predicted_label").count()
confusion_counts.orderBy("flight_phase", "predicted_label").show(n=50)

## Importance des features

In [None]:
import pandas as pd

# Index 3 is the random forest's position in the pipeline's stage list.
rf_model = model.stages[3]
importances = rf_model.featureImportances.toArray()

# Pair each importance with its feature name (the vector follows the order of
# feature_columns) and display the table, most important feature first.
importance_table = pd.DataFrame(dict(feature=feature_columns, importance=importances))
importance_table.sort_values(by="importance", ascending=False)