# 05 - Feature Engineering

Création des features pour le Machine Learning (Silver → Silver_ML).

## Configuration

In [None]:
from pyspark.sql.functions import col, lag, avg, stddev, row_number, when, sqrt, pow, lit, min as spark_min, broadcast
from pyspark.sql.window import Window
from config import get_s3_path, create_spark_session

# Storage locations: the Silver table read as input and the ML-ready Silver
# table this notebook writes.
SILVER_PATH = get_s3_path("silver", "flights")
SILVER_ML_PATH = get_s3_path("silver", "flights_ml")
# Airport reference file consumed by the airport join cell.
AIRPORTS_CSV = "./data/airports.csv"

# Single Spark session shared by every cell below.
spark = create_spark_session("FeatureEngineering")

# Echo the resolved paths so a misconfigured environment is visible right away.
print(f"âœ… Input:  {SILVER_PATH}\nâœ… Output: {SILVER_ML_PATH}")

## Lecture et nettoyage

In [None]:
# Load the Silver flights table (Delta format).
df = spark.read.format("delta").load(SILVER_PATH)

# A row survives only if it has an aircraft identifier and both altitude and
# velocity fall inside the accepted ranges. `between` is inclusive and yields
# NULL for NULL inputs, so rows missing altitude or velocity are removed too.
has_aircraft_id = col("icao24").isNotNull()
altitude_in_range = col("altitude_meters").between(-500, 15000)
velocity_in_range = col("velocity_kmh").between(0, 1200)

df_clean = df.filter(has_aircraft_id & altitude_in_range & velocity_in_range)

# NOTE: count() is an action and triggers a full read of the input table.
print(f"ðŸ“Š {df_clean.count():,} lignes aprÃ¨s nettoyage")

## Features temporelles (Window Functions)

In [None]:
# One timeline per aircraft, ordered by observation time.
window_aircraft = Window.partitionBy("icao24").orderBy("event_timestamp")

# Value of the immediately preceding observation of the same aircraft
# (NULL for the first observation of each aircraft).
previous_altitude = lag("altitude_meters", 1).over(window_aircraft)
previous_velocity = lag("velocity_kmh", 1).over(window_aircraft)

df_temporal = (
    df_clean
    .withColumn("prev_altitude", previous_altitude)
    .withColumn("prev_velocity", previous_velocity)
    # Deltas against the previous observation; NULL when there is none.
    .withColumn("altitude_change", col("altitude_meters") - col("prev_altitude"))
    .withColumn("velocity_change", col("velocity_kmh") - col("prev_velocity"))
    # 1-based position of the observation within its aircraft's timeline.
    .withColumn("observation_rank", row_number().over(window_aircraft))
)

print("âœ… Features temporelles crÃ©Ã©es")

## Jointure avec les aéroports

In [None]:
# Reference airports: large/medium airports with usable coordinates only.
# The `type` filter runs BEFORE the projection so it does not depend on Spark
# resolving a column that the select has already dropped.
df_airports = (
    spark.read.option("header", "true").csv(AIRPORTS_CSV)
    .filter(col("type").isin("large_airport", "medium_airport"))
    .select(
        col("ident").alias("airport_icao"),
        col("name").alias("airport_name"),
        col("iso_country").alias("airport_country"),
        col("latitude_deg").cast("double").alias("airport_lat"),
        col("longitude_deg").cast("double").alias("airport_lon"),
    )
    # A coordinate that fails the double cast becomes NULL; such an airport
    # can never be "nearest", so drop it before the cross join.
    .filter(col("airport_lat").isNotNull() & col("airport_lon").isNotNull())
)

# Only observations that are on the ground AND carry a position can be matched
# to an airport. when/otherwise turns a NULL condition into False so that every
# input row lands in exactly one of the two branches below (rows with a NULL
# `on_ground` or a NULL position are kept, without airport, instead of being
# silently dropped).
can_match_airport = when(
    (col("on_ground") == True)
    & col("latitude").isNotNull()
    & col("longitude").isNotNull(),
    True,
).otherwise(False)

df_on_ground = df_temporal.filter(can_match_airport)
df_unmatched = df_temporal.filter(~can_match_airport)

# Pair every matchable observation with every airport and compute a distance.
# NOTE(review): this is a Euclidean distance in degree space, not a geodesic
# distance; it is only used to rank airports, but it distorts east-west
# distances away from the equator. Consider a haversine distance if precision
# matters.
df_with_airports = df_on_ground.crossJoin(broadcast(df_airports)).withColumn(
    "dist",
    sqrt(
        pow(col("latitude") - col("airport_lat"), 2)
        + pow(col("longitude") - col("airport_lon"), 2)
    ),
)

# Keep exactly one airport per observation. (icao24, observation_rank)
# identifies a single row of df_temporal, and ranking with a deterministic
# tie-break on airport_icao avoids duplicating an observation when two
# airports are equidistant.
nearest_airport_window = Window.partitionBy("icao24", "observation_rank").orderBy(
    col("dist").asc_nulls_last(), col("airport_icao")
)
df_closest = (
    df_with_airports
    .withColumn("airport_rank", row_number().over(nearest_airport_window))
    .filter(col("airport_rank") == 1)
    .drop("dist", "airport_rank", "airport_lat", "airport_lon")
)

# Rows without an airport get typed NULL placeholders so both sides of the
# union share the same schema (CSV columns are read as strings).
df_enriched = df_closest.unionByName(
    df_unmatched
    .withColumn("airport_icao", lit(None).cast("string"))
    .withColumn("airport_name", lit(None).cast("string"))
    .withColumn("airport_country", lit(None).cast("string")),
    allowMissingColumns=True,
)

print("âœ… Jointure aÃ©roports rÃ©alisÃ©e")
print("\nðŸ“Š Top 10 des aÃ©roports avec le plus d'avions au sol (dÃ©collage/atterrissage) :")
df_closest.groupBy("airport_icao", "airport_name", "airport_country") \
    .count() \
    .orderBy("count", ascending=False) \
    .show(10, truncate=False)

## Features rolling window

In [None]:
# Trailing window per aircraft: the current observation plus the 5 preceding
# ones (up to 6 rows in total), ordered by observation time.
rolling_window = (
    Window.partitionBy("icao24")
    .orderBy("event_timestamp")
    .rowsBetween(-5, 0)
)

df_rolling = (
    df_enriched
    .withColumn("rolling_avg_altitude", avg("altitude_meters").over(rolling_window))
    # stddev is the sample standard deviation, hence NULL on a one-row window.
    .withColumn("rolling_std_altitude", stddev("altitude_meters").over(rolling_window))
    .withColumn("rolling_avg_velocity", avg("velocity_kmh").over(rolling_window))
)

print("âœ… Features rolling crÃ©Ã©es")
print("\nðŸ“Š AperÃ§u des features rolling (5 premiers avions) :")
# Preview of the raw altitude next to the derived rolling columns.
df_rolling.select(
    "icao24",
    "event_timestamp",
    "altitude_meters",
    "rolling_avg_altitude",
    "rolling_std_altitude",
    "rolling_avg_velocity",
).show(10, truncate=False)

## Label flight_phase

In [None]:
# Rule-based label. The rules are evaluated in order and the first match wins,
# so the order of the `when` clauses below is significant. A NULL
# altitude_change (first observation of an aircraft) matches none of the
# altitude rules and falls through to "TRANSITION" unless the row is on ground.
altitude_delta = col("altitude_change")
altitude = col("altitude_meters")

is_grounded = col("on_ground") == True
is_takeoff = (altitude_delta > 50) & (altitude < 3000)
is_climbing = altitude_delta > 20
is_cruising = altitude_delta.between(-20, 20) & (altitude > 8000)
is_descending = altitude_delta < -20

df_ml = df_rolling.withColumn(
    "flight_phase",
    when(is_grounded, "GROUND")
    .when(is_takeoff, "TAKEOFF")
    .when(is_climbing, "CLIMB")
    .when(is_cruising, "CRUISE")
    .when(is_descending, "DESCENT")
    .otherwise("TRANSITION"),
)

print("ðŸ“Š Distribution :")
df_ml.groupBy("flight_phase").count().orderBy("count", ascending=False).show()

## Sauvegarde

In [None]:
# Persist the ML-ready table, replacing any previous version at this path.
df_ml.write.format("delta").mode("overwrite").save(SILVER_ML_PATH)

# Count the rows from the table that was just written rather than from df_ml:
# df_ml.count() would re-execute the whole lineage (including the airport
# cross join), whereas this reports what is actually stored.
saved_rows = spark.read.format("delta").load(SILVER_ML_PATH).count()
print(f"âœ… {saved_rows:,} lignes sauvegardÃ©es")