# Classification Random Forest - Silver_ML to Gold

Ce notebook implémente un pipeline MLlib complet pour classifier les phases de vol :
- Lecture des données Silver_ML
- Pipeline de préparation et classification
- Random Forest Classifier
- Évaluation et métriques
- Sauvegarde dans la couche Gold

In [None]:
# Cellule 1 : Configuration
%pip install python-dotenv

import os
from dotenv import load_dotenv
from pyspark.sql import SparkSession

# Load the settings stored in the .env file into the process environment.
load_dotenv()

# Object-store connection settings. The endpoint and bucket name fall back to
# defaults; the two credentials have no default and are validated below.
GARAGE_ENDPOINT = os.environ.get("GARAGE_ENDPOINT", "http://garage:3900")
ACCESS_KEY = os.environ.get("ACCESS_KEY")
SECRET_KEY = os.environ.get("SECRET_KEY")
BUCKET_NAME = os.environ.get("BUCKET_NAME", "datalake")

# Fail fast when either credential is missing or empty.
if not (ACCESS_KEY and SECRET_KEY):
    raise ValueError("‚ùå ACCESS_KEY et SECRET_KEY doivent √™tre d√©finis dans le fichier .env")

# Storage locations: Silver_ML input, Gold model output, Gold predictions output.
SILVER_ML_PATH = f"s3a://{BUCKET_NAME}/silver/flights_ml"
GOLD_MODEL_PATH = f"s3a://{BUCKET_NAME}/gold/models/rf_flight_phase"
GOLD_PREDICTIONS_PATH = f"s3a://{BUCKET_NAME}/gold/predictions/flight_phase"

# Echo the resolved configuration (credentials are deliberately not printed).
print("‚úÖ Configuration charg√©e depuis .env")
print(f"üìÇ Silver ML Path: {SILVER_ML_PATH}")
print(f"üìÇ Gold Model Path: {GOLD_MODEL_PATH}")
print(f"üìÇ Gold Predictions Path: {GOLD_PREDICTIONS_PATH}")

In [None]:
# Spark initialisation with the S3 / Delta configuration

# 1. Packages resolved by Spark at session start-up
packages = [
    "org.apache.hadoop:hadoop-aws:3.3.4",
    "com.amazonaws:aws-java-sdk-bundle:1.12.262",
    "org.apache.spark:spark-hadoop-cloud_2.12:3.5.3",
    "io.delta:delta-spark_2.12:3.0.0"
]

# 2. Spark configuration, declared as data and applied in declaration order
_spark_conf = {
    "spark.jars.packages": ",".join(packages),
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    "spark.hadoop.fs.s3a.endpoint": GARAGE_ENDPOINT,
    "spark.hadoop.fs.s3a.access.key": ACCESS_KEY,
    "spark.hadoop.fs.s3a.secret.key": SECRET_KEY,
    "spark.hadoop.fs.s3a.path.style.access": "true",
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "spark.hadoop.fs.s3a.connection.ssl.enabled": "false",
    "spark.hadoop.fs.s3a.endpoint.region": "garage",
    "spark.hadoop.fs.s3a.committer.name": "filesystem",
    "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version": "2",
    "spark.hadoop.fs.s3a.multiobjectdelete.enable": "false",
    "spark.sql.shuffle.partitions": "10",
}

_builder = SparkSession.builder.appName("RandomForestClassification_Gold")
for _conf_key, _conf_value in _spark_conf.items():
    _builder = _builder.config(_conf_key, _conf_value)
spark = _builder.getOrCreate()

# Only warnings and errors are logged from here on.
spark.sparkContext.setLogLevel("WARN")
print("‚úÖ Spark Session configur√©e pour Random Forest Classification")

In [None]:
# Cell 2: Load and prepare the data
#
# Reads the Silver_ML Delta table, drops rows without a label, fills missing
# feature values with 0, and reports how many rows were removed.

from pyspark.sql.functions import col

print(f"üìñ Lecture des donn√©es Silver_ML depuis {SILVER_ML_PATH}...")
df_ml = spark.read.format("delta").load(SILVER_ML_PATH)

# Show the schema
print("\nüìã Sch√©ma des donn√©es :")
df_ml.printSchema()

# Row count before filtering (baseline for the removal statistics below)
count_before = df_ml.count()
print(f"\nüìä Nombre de lignes avant filtrage : {count_before:,}")

# Columns assembled into the feature vector by the MLlib pipeline
feature_columns = [
    "altitude_meters",
    "velocity_kmh",
    "altitude_change",
    "velocity_change",
    "rolling_avg_altitude",
    "rolling_std_altitude",
    "rolling_avg_velocity"
]

# Keep only the rows that HAVE a label: rows whose flight_phase is NULL are
# discarded because they cannot be used for supervised training/evaluation.
print("\nüßπ Filtrage des lignes avec label NULL...")
df_clean = df_ml.filter(col("flight_phase").isNotNull())

# Replace NULL feature values with 0 instead of dropping the rows
print("üîß Remplacement des NULL dans les features par 0...")
df_clean = df_clean.fillna(0, subset=feature_columns)

count_after = df_clean.count()
removed = count_before - count_after
# Compute the percentage outside the f-string so the zero-row guard is real
# code (an empty input table would otherwise divide by zero).
removed_pct = 100 * removed / count_before if count_before > 0 else 0.0
print(f"Lignes apr√®s nettoyage : {count_after:,}")
print(f"Lignes supprim√©es : {removed:,} ({removed_pct:.2f}%)")

# Show the label distribution, most frequent class first
print("\nüìä Distribution du label 'flight_phase' :")
df_clean.groupBy("flight_phase").count().orderBy("count", ascending=False).show()

print("\n‚úÖ Donn√©es pr√™tes pour l'entra√Ænement")

In [None]:
# Cell 3: Build the MLlib pipeline
#
# Stages: label indexing -> feature assembly -> scaling -> random forest ->
# conversion of the numeric prediction back to a readable label.

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler, IndexToString
from pyspark.ml.classification import RandomForestClassifier

print("üî® Construction du Pipeline MLlib...")

# Stage 1: StringIndexer - convert flight_phase into a numeric label
print("\n1Ô∏è‚É£ StringIndexer : flight_phase -> label")
label_indexer = StringIndexer(
    inputCol="flight_phase",
    outputCol="label",
    handleInvalid="skip"
)
# Fit the indexer exactly once and reuse the fitted model both as the pipeline
# stage and as the label source for IndexToString. If the pipeline refitted the
# indexer on the training split, its frequency-based index order could differ
# from the labels handed to IndexToString, and predictions would be decoded to
# the wrong class names.
label_indexer_model = label_indexer.fit(df_clean)

# Stage 2: VectorAssembler - assemble the numeric features into one vector
print("2Ô∏è‚É£ VectorAssembler : features num√©riques -> features_raw")
print(f"   Features utilis√©es : {', '.join(feature_columns)}")
vector_assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features_raw",
    handleInvalid="skip"
)

# Stage 3: StandardScaler - scale features to unit standard deviation
# (withMean=False: values are not centred)
print("3Ô∏è‚É£ StandardScaler : features_raw -> features")
scaler = StandardScaler(
    inputCol="features_raw",
    outputCol="features",
    withStd=True,
    withMean=False
)

# Stage 4: RandomForestClassifier (fixed seed for reproducible training)
print("4Ô∏è‚É£ RandomForestClassifier : 100 arbres, maxDepth=10")
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    predictionCol="prediction",
    probabilityCol="probability",
    numTrees=100,
    maxDepth=10,
    seed=42
)

# Stage 5: IndexToString - turn numeric predictions back into readable labels,
# using the very same index -> label mapping as stage 1
print("5Ô∏è‚É£ IndexToString : prediction -> predicted_label")
label_converter = IndexToString(
    inputCol="prediction",
    outputCol="predicted_label",
    labels=label_indexer_model.labels
)

# Create the pipeline; the random forest stays at stage index 3
pipeline = Pipeline(stages=[
    label_indexer_model,
    vector_assembler,
    scaler,
    rf,
    label_converter
])

print("\n‚úÖ Pipeline cr√©√© avec 5 stages")

In [None]:
# Cell 4: Train/test split and model training

print("üìä Split des donn√©es en train/test (80/20)...")
# Seeded 80/20 random split of the cleaned data
_splits = df_clean.randomSplit([0.8, 0.2], seed=42)
train_df = _splits[0]
test_df = _splits[1]

# Actual split sizes (randomSplit only approximates the requested weights)
train_count, test_count = train_df.count(), test_df.count()
print(f"üìä Train : {train_count:,} lignes ({100 * train_count / (train_count + test_count):.1f}%)")
print(f"üìä Test  : {test_count:,} lignes ({100 * test_count / (train_count + test_count):.1f}%)")

# Label distribution of the training set, most frequent class first
print("\nüìä Distribution du label dans le set d'entra√Ænement :")
_train_label_counts = train_df.groupBy("flight_phase").count()
_train_label_counts.orderBy("count", ascending=False).show()

# Fit every pipeline stage on the training split only
print("\nüöÄ Entra√Ænement du mod√®le Random Forest...")
print("‚è≥ Cela peut prendre quelques minutes...")
model = pipeline.fit(train_df)

print("\n‚úÖ Mod√®le entra√Æn√© avec succ√®s !")

In [None]:
# Cell 5: Predictions and evaluation

from pyspark.ml.evaluation import MulticlassClassificationEvaluator


def _build_evaluator(metric_name):
    """Return an evaluator comparing `label` with `prediction` for one metric."""
    return MulticlassClassificationEvaluator(
        labelCol="label",
        predictionCol="prediction",
        metricName=metric_name
    )


print("üîÆ G√©n√©ration des pr√©dictions sur le set de test...")
predictions = model.transform(test_df)

# Preview a few predictions next to the true label
print("\nüîç Aper√ßu des pr√©dictions :")
_preview_columns = [
    "icao24", "event_timestamp", "altitude_meters", "velocity_kmh",
    "flight_phase", "predicted_label", "probability"
]
predictions.select(*_preview_columns).show(10, truncate=False)

# Compute the metrics
print("\nüìä Calcul des m√©triques d'√©valuation...")

# One evaluator per metric; all read the same label/prediction columns
evaluator_acc = _build_evaluator("accuracy")
evaluator_f1 = _build_evaluator("f1")
evaluator_precision = _build_evaluator("weightedPrecision")
evaluator_recall = _build_evaluator("weightedRecall")

accuracy = evaluator_acc.evaluate(predictions)
f1 = evaluator_f1.evaluate(predictions)
precision = evaluator_precision.evaluate(predictions)
recall = evaluator_recall.evaluate(predictions)

# Summary banner
_banner = "="*50
print("\n" + _banner)
print("           M√âTRIQUES D'√âVALUATION")
print(_banner)
print(f"üéØ Accuracy  : {accuracy:.4f} ({100*accuracy:.2f}%)")
print(f"üéØ F1 Score  : {f1:.4f}")
print(f"üéØ Precision : {precision:.4f}")
print(f"üéØ Recall    : {recall:.4f}")
print(_banner)

In [None]:
# Cell 6: Confusion matrix and per-class accuracy

from pyspark.sql.functions import sum as _sum, col, when

print("üìä Matrice de confusion :")
print("\nLignes = Label r√©el | Colonnes = Label pr√©dit\n")

# Long-format confusion matrix: one row per (true label, predicted label) pair
_pair_counts = predictions.groupBy("flight_phase", "predicted_label").count()
confusion_matrix = _pair_counts.orderBy("flight_phase", "predicted_label")

confusion_matrix.show(100, truncate=False)

# Per-class accuracy
print("\nüìä Accuracy par classe :")

# 1/0 indicators summed per true class
_hit = when(col("flight_phase") == col("predicted_label"), 1).otherwise(0)
_miss = when(col("flight_phase") != col("predicted_label"), 1).otherwise(0)

_per_class_counts = predictions.groupBy("flight_phase").agg(
    _sum(_hit).alias("correct"),
    _sum(_miss).alias("incorrect")
)

# Share of correctly predicted rows within each true class, best class first
_accuracy_ratio = col("correct") / (col("correct") + col("incorrect"))
class_accuracy = _per_class_counts \
    .withColumn("accuracy", _accuracy_ratio) \
    .orderBy("accuracy", ascending=False)

class_accuracy.show(truncate=False)

In [None]:
# Cell 7: Feature importance

import pandas as pd

print("üìä Analyse de l'importance des features...")

# Fetch the fitted random forest from the pipeline model (0-based index 3).
# Stages: 0=StringIndexer, 1=VectorAssembler, 2=StandardScaler, 3=RandomForest, 4=IndexToString
rf_model = model.stages[3]

# Importances as a dense array, in the same order as feature_columns
importances = rf_model.featureImportances.toArray()

# Build a pandas table for a readable display, most important feature first
_importance_table = pd.DataFrame({
    "feature": feature_columns,
    "importance": importances,
})
_importance_table["importance_pct"] = [f"{100*imp:.2f}%" for imp in importances]
feature_importance_df = _importance_table.sort_values("importance", ascending=False)

_rule = "="*60
print("\n" + _rule)
print("           IMPORTANCE DES FEATURES")
print(_rule)
print(feature_importance_df.to_string(index=False))
print(_rule)

# Extra statistics about the model.
# NOTE: in PySpark `getNumTrees` is a property (no call parentheses), whereas
# `getMaxDepth()` is a regular getter method.
print(f"\nüå≥ Nombre d'arbres : {rf_model.getNumTrees}")
print(f"üå≥ Profondeur max  : {rf_model.getMaxDepth()}")
print(f"üå≥ Nombre de features : {len(feature_columns)}")

In [None]:
# Cell 8: Persist the model and the predictions to the Gold layer

print("üíæ Sauvegarde du mod√®le et des pr√©dictions...")

# Save the fitted pipeline model, replacing any previous version at that path
print(f"\nüì¶ Sauvegarde du mod√®le dans {GOLD_MODEL_PATH}...")
_model_writer = model.write().overwrite()
_model_writer.save(GOLD_MODEL_PATH)
print(f"‚úÖ Mod√®le sauvegard√© dans {GOLD_MODEL_PATH}")

# Save the predictions: identifying columns, two raw measurements, the true
# label, the predicted label and the class-probability vector
print(f"\nüì¶ Sauvegarde des pr√©dictions dans {GOLD_PREDICTIONS_PATH}...")
_gold_columns = [
    "event_timestamp",
    "icao24",
    "callsign",
    "origin_country",
    "altitude_meters",
    "velocity_kmh",
    "flight_phase",
    "predicted_label",
    "probability",
]
predictions_to_save = predictions.select(*_gold_columns)

_predictions_writer = predictions_to_save.write.format("delta").mode("overwrite")
_predictions_writer.save(GOLD_PREDICTIONS_PATH)
print(f"‚úÖ Pr√©dictions sauvegard√©es dans {GOLD_PREDICTIONS_PATH}")
print(f"üìä Nombre de pr√©dictions sauvegard√©es : {predictions_to_save.count():,}")

# Verification: read the Delta table back and show a sample
print("\nüîç V√©rification des donn√©es sauvegard√©es :")
saved_predictions = spark.read.format("delta").load(GOLD_PREDICTIONS_PATH)
print(f"‚úÖ {saved_predictions.count():,} lignes lues depuis {GOLD_PREDICTIONS_PATH}")
saved_predictions.show(5, truncate=False)

# Final recap
_rule = "="*60
print("\n" + _rule)
print("   ‚úÖ PIPELINE COMPLET TERMIN√â AVEC SUCC√àS !")
print(_rule)
print(f"üìÇ Mod√®le      : {GOLD_MODEL_PATH}")
print(f"üìÇ Pr√©dictions : {GOLD_PREDICTIONS_PATH}")
print(f"üéØ Accuracy    : {accuracy:.4f}")
print(f"üéØ F1 Score    : {f1:.4f}")
print(_rule)