# 06 - Classification Random Forest

Entraînement d'un modèle MLlib pour classifier les phases de vol.

## Configuration

In [25]:
from pyspark.sql.functions import col, sum as _sum, when
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, IndexToString
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from config import get_s3_path, create_spark_session

# Storage locations used by this notebook: the silver table is read as the
# training input; the two gold paths receive the fitted pipeline and the
# scored rows in the final cell.
SILVER_ML_PATH = get_s3_path("silver", "flights_ml")

GOLD_MODEL_PATH = get_s3_path("gold", "models", "rf_flight_phase")
GOLD_PREDICTIONS_PATH = get_s3_path("gold", "predictions", "flight_phase")

# Session is created through the project helper, named after this job.
spark = create_spark_session("RandomForestClassification")

# Echo the input location so the run log shows which table was used.
print(f"âœ… Input: {SILVER_ML_PATH}")

âœ… Spark Session 'RandomForestClassification' configurÃ©e
âœ… Input: s3a://datalake/silver/flights_ml


## Lecture des données

In [26]:
# Numeric columns assembled into the model's feature vector, in this order.
feature_columns = [
    "altitude_meters",
    "velocity_kmh",
    "altitude_change",
    "velocity_change",
    "rolling_avg_altitude",
    "rolling_std_altitude",
    "rolling_avg_velocity",
]

df = spark.read.format("delta").load(SILVER_ML_PATH)

# Keep only labelled rows, then replace missing feature values with 0.
has_label = col("flight_phase").isNotNull()
df_clean = df.where(has_label).na.fill(0, subset=feature_columns)

print(f"ðŸ“Š {df_clean.count():,} lignes")

# Class distribution, most frequent phase first.
phase_counts = df_clean.groupBy("flight_phase").count()
phase_counts.orderBy("count", ascending=False).show()

ðŸ“Š 88,432 lignes
+------------+-----+
|flight_phase|count|
+------------+-----+
|      CRUISE|37190|
|  TRANSITION|29374|
|     DESCENT|10699|
|       CLIMB| 8877|
|     TAKEOFF| 2035|
|      GROUND|  257|
+------------+-----+



## Pipeline MLlib

In [27]:
label_indexer = StringIndexer(inputCol="flight_phase", outputCol="label", handleInvalid="skip")
vector_assembler = VectorAssembler(inputCols=feature_columns, outputCol="features_raw", handleInvalid="skip")
scaler = StandardScaler(inputCol="features_raw", outputCol="features", withStd=True, withMean=False)
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=100, maxDepth=10, seed=42)
label_converter = IndexToString(inputCol="prediction", outputCol="predicted_label", labels=label_indexer.fit(df_clean).labels)

pipeline = Pipeline(stages=[label_indexer, vector_assembler, scaler, rf, label_converter])

print("âœ… Pipeline crÃ©Ã©")

                                                                                

âœ… Pipeline crÃ©Ã©


## Entraînement

In [28]:
# Seeded 80/20 split of the cleaned rows into training and hold-out sets.
train_df, test_df = df_clean.randomSplit(weights=[0.8, 0.2], seed=42)

print(f"ðŸ“Š Train: {train_df.count():,} | Test: {test_df.count():,}")
print("ðŸš€ EntraÃ®nement...")

# Fit every pipeline stage on the training split only.
model = pipeline.fit(train_df)

print("âœ… ModÃ¨le entraÃ®nÃ©")

ðŸ“Š Train: 70,885 | Test: 17,547
ðŸš€ EntraÃ®nement...


26/01/23 14:02:57 WARN DAGScheduler: Broadcasting large task binary with size 1067.6 KiB
26/01/23 14:03:01 WARN DAGScheduler: Broadcasting large task binary with size 1776.5 KiB
26/01/23 14:03:04 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
26/01/23 14:03:07 WARN DAGScheduler: Broadcasting large task binary with size 4.4 MiB
26/01/23 14:03:10 WARN DAGScheduler: Broadcasting large task binary with size 6.5 MiB
26/01/23 14:03:13 WARN DAGScheduler: Broadcasting large task binary with size 1015.9 KiB
                                                                                

âœ… ModÃ¨le entraÃ®nÃ©


## Évaluation

In [29]:
# Score the hold-out split with the fitted pipeline.
predictions = model.transform(test_df)

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

# Evaluate each metric by overriding metricName per call, leaving the
# evaluator's own configuration untouched.
metric_values = {
    metric: evaluator.evaluate(predictions, {evaluator.metricName: metric})
    for metric in ("accuracy", "f1")
}
accuracy = metric_values["accuracy"]
f1 = metric_values["f1"]

print(f"ðŸŽ¯ Accuracy: {accuracy:.4f}")
print(f"ðŸŽ¯ F1 Score: {f1:.4f}")

26/01/23 14:03:15 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
26/01/23 14:03:17 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
[Stage 395:>                                                        (0 + 3) / 3]

ðŸŽ¯ Accuracy: 0.9422
ðŸŽ¯ F1 Score: 0.9414


                                                                                

## Matrice de confusion

In [30]:
# Confusion matrix in long form: one row per (actual, predicted) pair that
# occurs in the scored rows, with its count.
confusion_counts = predictions.groupBy("flight_phase", "predicted_label").count()
confusion_counts.orderBy("flight_phase", "predicted_label").show(50)

26/01/23 14:03:18 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
26/01/23 14:03:20 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB


+------------+---------------+-----+
|flight_phase|predicted_label|count|
+------------+---------------+-----+
|       CLIMB|          CLIMB| 1731|
|       CLIMB|        TAKEOFF|   69|
|      CRUISE|         CRUISE| 7237|
|      CRUISE|        DESCENT|   36|
|      CRUISE|     TRANSITION|   39|
|     DESCENT|         CRUISE|    5|
|     DESCENT|        DESCENT| 2113|
|     DESCENT|     TRANSITION|    7|
|      GROUND|         GROUND|   22|
|      GROUND|     TRANSITION|   31|
|     TAKEOFF|          CLIMB|    4|
|     TAKEOFF|        TAKEOFF|  382|
|  TRANSITION|         CRUISE|  750|
|  TRANSITION|        DESCENT|   65|
|  TRANSITION|         GROUND|    8|
|  TRANSITION|     TRANSITION| 5048|
+------------+---------------+-----+



                                                                                

## Importance des features

In [31]:
import pandas as pd

# The forest is the 4th pipeline stage (index 3), after the label indexer,
# the vector assembler and the scaler.
rf_model = model.stages[3]
importances = rf_model.featureImportances.toArray()

# Pair each importance with its feature name; the assembler was given the
# columns in `feature_columns` order, so positions line up.
importance_table = pd.DataFrame({"feature": feature_columns, "importance": importances})
importance_table.sort_values(by="importance", ascending=False)

Unnamed: 0,feature,importance
2,altitude_change,0.383648
0,altitude_meters,0.194921
4,rolling_avg_altitude,0.158527
5,rolling_std_altitude,0.138574
1,velocity_kmh,0.065699
3,velocity_change,0.029685
6,rolling_avg_velocity,0.028947


## Sauvegarde

In [32]:
# Persist the fitted pipeline, replacing any previous version at that path.
model.write().overwrite().save(GOLD_MODEL_PATH)
print(f"âœ… ModÃ¨le: {GOLD_MODEL_PATH}")

# Persist the scored rows: identifying columns, the actual phase, the
# predicted phase and the class-probability vector.
output_columns = [
    "event_timestamp",
    "icao24",
    "callsign",
    "flight_phase",
    "predicted_label",
    "probability",
]
predictions_out = predictions.select(*output_columns)
predictions_out.write.format("delta").mode("overwrite").save(GOLD_PREDICTIONS_PATH)

print(f"âœ… PrÃ©dictions: {GOLD_PREDICTIONS_PATH}")

âœ… ModÃ¨le: s3a://datalake/gold/models/rf_flight_phase


26/01/23 14:03:25 WARN DAGScheduler: Broadcasting large task binary with size 4.5 MiB
                                                                                

âœ… PrÃ©dictions: s3a://datalake/gold/predictions/flight_phase
