# 05 - Feature Engineering

Création des features pour le Machine Learning (Silver → Silver_ML).

## Configuration

In [1]:
from pyspark.sql.functions import col, lag, avg, stddev, row_number, when, sqrt, pow, lit, min as spark_min, broadcast
from pyspark.sql.window import Window
from config import get_s3_path, create_spark_session

# Delta table locations, resolved by the project's `get_s3_path` helper from a
# layer name and a table name.
SILVER_PATH = get_s3_path("silver", "flights")
SILVER_ML_PATH = get_s3_path("silver", "flights_ml")
# Airport reference file (relative path) used for the nearest-airport join.
AIRPORTS_CSV = "./data/airports.csv"

# Spark session used by every cell below; the argument is the session name.
spark = create_spark_session("FeatureEngineering")

# Echo the resolved paths so the run log shows what is read and what is written.
print(f"âœ… Input:  {SILVER_PATH}")
print(f"âœ… Output: {SILVER_ML_PATH}")

âœ… Configuration chargÃ©e depuis .env
:: loading settings :: url = jar:file:/opt/conda/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
org.apache.spark#spark-hadoop-cloud_2.12 added as a dependency
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-17b83040-b284-473d-b13e-d3ce89c58e28;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
	found org.apache.spark#spark-hadoop-cloud_2.12;3.5.3 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found org.xerial.snappy#snappy-java;1.1.10.5 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.c

âœ… Spark Session 'FeatureEngineering' configurÃ©e
âœ… Input:  s3a://datalake/silver/flights
âœ… Output: s3a://datalake/silver/flights_ml


## Lecture et nettoyage

In [2]:
# Load the Silver flights Delta table.
df = spark.read.format("delta").load(SILVER_PATH)

# Row-level sanity predicates. `between` is inclusive on both bounds, and a
# predicate that evaluates to NULL (missing altitude or velocity) rejects the
# row just like a failed check does.
# NOTE(review): rows sharing the same icao24 and event_timestamp are not
# de-duplicated here - confirm whether that is intended.
has_aircraft_id = col("icao24").isNotNull()
plausible_altitude = col("altitude_meters").between(-500, 15000)
plausible_velocity = col("velocity_kmh").between(0, 1200)

df_clean = df.filter(has_aircraft_id & plausible_altitude & plausible_velocity)

print(f"ðŸ“Š {df_clean.count():,} lignes aprÃ¨s nettoyage")

26/01/23 13:44:39 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
26/01/23 13:44:43 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

ðŸ“Š 88,432 lignes aprÃ¨s nettoyage


## Features temporelles (Window Functions)

In [3]:
# One partition per aircraft, observations ordered chronologically.
window_aircraft = Window.partitionBy("icao24").orderBy("event_timestamp")

# Values of the immediately preceding observation of the same aircraft
# (NULL for the aircraft's first observation).
previous_altitude = lag("altitude_meters", 1).over(window_aircraft)
previous_velocity = lag("velocity_kmh", 1).over(window_aircraft)

df_temporal = (
    df_clean
    .withColumn("prev_altitude", previous_altitude)
    .withColumn("prev_velocity", previous_velocity)
    # Difference between the current observation and the previous one.
    .withColumn("altitude_change", col("altitude_meters") - col("prev_altitude"))
    .withColumn("velocity_change", col("velocity_kmh") - col("prev_velocity"))
    # 1-based position of the observation within its aircraft's timeline.
    .withColumn("observation_rank", row_number().over(window_aircraft))
)

print("âœ… Features temporelles crÃ©Ã©es")

âœ… Features temporelles crÃ©Ã©es


## Jointure avec les aéroports

In [4]:
# Airport reference data, restricted to large and medium airports.
# The `type` filter runs BEFORE the projection because `type` is not one of the
# selected columns; filtering afterwards only works through Spark's implicit
# resolution of missing references and breaks as soon as the chain is reordered.
df_airports = (
    spark.read.option("header", "true").csv(AIRPORTS_CSV)
    .filter(col("type").isin("large_airport", "medium_airport"))
    .select(
        col("ident").alias("airport_icao"),
        col("name").alias("airport_name"),
        col("iso_country").alias("airport_country"),
        col("latitude_deg").cast("double").alias("airport_lat"),
        col("longitude_deg").cast("double").alias("airport_lon"),
    )
    # An airport without usable coordinates can never be the nearest one.
    .filter(col("airport_lat").isNotNull() & col("airport_lon").isNotNull())
)

# Null-safe split of the observations. The predicate is never NULL, so the two
# filters below are exact complements and no row is lost:
#   - df_on_ground: on the ground AND with a known position (can be matched);
#   - df_in_flight: everything else (airborne, unknown `on_ground`, or on the
#     ground without a position) - these rows get NULL airport columns.
is_locatable_on_ground = (
    col("on_ground").eqNullSafe(True)
    & col("latitude").isNotNull()
    & col("longitude").isNotNull()
)
df_on_ground = df_temporal.filter(is_locatable_on_ground)
df_in_flight = df_temporal.filter(~is_locatable_on_ground)

# Candidate (observation, airport) pairs with their planar distance in degrees.
# NOTE(review): this is not a geographic distance (a degree of longitude shrinks
# with latitude); it is only used to rank candidates - consider haversine.
df_with_airports = df_on_ground.crossJoin(broadcast(df_airports)).withColumn(
    "dist",
    sqrt(
        pow(col("latitude") - col("airport_lat"), 2)
        + pow(col("longitude") - col("airport_lon"), 2)
    ),
)

# Keep exactly one airport per observation.
# (icao24, observation_rank) identifies a single row of df_temporal, whereas
# (icao24, event_timestamp) does not when a timestamp is repeated. Ranking with
# row_number and a deterministic tie-break on the airport code guarantees one
# output row per observation even when two airports are equidistant.
w = Window.partitionBy("icao24", "observation_rank").orderBy(
    col("dist").asc_nulls_last(), col("airport_icao").asc_nulls_last()
)
df_closest = (
    df_with_airports
    .withColumn("airport_rank", row_number().over(w))
    .filter(col("airport_rank") == 1)
    .drop("dist", "airport_rank", "airport_lat", "airport_lon")
)

# Re-assemble the full dataset; unmatched rows carry typed NULL airport columns
# so the union schema does not depend on NullType coercion.
df_enriched = df_closest.unionByName(
    df_in_flight
    .withColumn("airport_icao", lit(None).cast("string"))
    .withColumn("airport_name", lit(None).cast("string"))
    .withColumn("airport_country", lit(None).cast("string")),
    allowMissingColumns=True,
)

print("âœ… Jointure aÃ©roports rÃ©alisÃ©e")
print("\nðŸ“Š Top 10 des aÃ©roports avec le plus d'avions au sol (dÃ©collage/atterrissage) :")
df_closest.groupBy("airport_icao", "airport_name", "airport_country") \
    .count() \
    .orderBy("count", ascending=False) \
    .show(10, truncate=False)

âœ… Jointure aÃ©roports rÃ©alisÃ©e

ðŸ“Š Top 10 des aÃ©roports avec le plus d'avions au sol (dÃ©collage/atterrissage) :


[Stage 13:=====>                                                   (1 + 8) / 10]

+------------+-------------------------------------------------+---------------+-----+
|airport_icao|airport_name                                     |airport_country|count|
+------------+-------------------------------------------------+---------------+-----+
|CYYZ        |Toronto Pearson International Airport            |CA             |92   |
|LSZH        |ZÃ¼rich Airport                                   |CH             |73   |
|VECC        |Netaji Subhash Chandra Bose International Airport|IN             |36   |
|EDDS        |Stuttgart Airport                                |DE             |21   |
|LILN        |Varese-Venegono Airport                          |IT             |10   |
|PANC        |Ted Stevens Anchorage International Airport      |US             |9    |
|EYVI        |Vilnius International Airport                    |LT             |6    |
|OTHH        |Hamad International Airport                      |QA             |4    |
|LFBO        |Toulouse-Blagnac Airport    

                                                                                

## Features rolling window

In [5]:
# Trailing frame per aircraft: the current observation plus the five preceding
# ones (six rows at most), in chronological order.
rolling_window = (
    Window.partitionBy("icao24")
    .orderBy("event_timestamp")
    .rowsBetween(-5, Window.currentRow)
)

# Aggregates evaluated over the trailing frame. The standard deviation is NULL
# while the frame holds a single observation.
mean_altitude = avg("altitude_meters").over(rolling_window)
std_altitude = stddev("altitude_meters").over(rolling_window)
mean_velocity = avg("velocity_kmh").over(rolling_window)

df_rolling = (
    df_enriched
    .withColumn("rolling_avg_altitude", mean_altitude)
    .withColumn("rolling_std_altitude", std_altitude)
    .withColumn("rolling_avg_velocity", mean_velocity)
)

print("âœ… Features rolling crÃ©Ã©es")
print("\nðŸ“Š AperÃ§u des features rolling (5 premiers avions) :")

# Preview of the raw altitude next to the rolling features (10 rows).
preview_columns = [
    "icao24", "event_timestamp", "altitude_meters",
    "rolling_avg_altitude", "rolling_std_altitude", "rolling_avg_velocity",
]
df_rolling.select(*preview_columns).show(10, truncate=False)

âœ… Features rolling crÃ©Ã©es

ðŸ“Š AperÃ§u des features rolling (5 premiers avions) :




+------+-------------------+---------------+--------------------+--------------------+--------------------+
|icao24|event_timestamp    |altitude_meters|rolling_avg_altitude|rolling_std_altitude|rolling_avg_velocity|
+------+-------------------+---------------+--------------------+--------------------+--------------------+
|008081|2026-01-23 13:35:51|10431.78       |10431.7802734375    |NULL                |716.04              |
|008081|2026-01-23 13:36:01|10500.36       |10466.0703125       |48.49343829649071   |714.3299999999999   |
|008081|2026-01-23 13:36:01|10500.36       |10477.500325520834  |39.59472656651432   |713.7599999999999   |
|008081|2026-01-23 13:36:13|10591.8        |10506.0751953125    |65.66014251959966   |712.4399999999999   |
|008081|2026-01-23 13:36:26|10660.38       |10536.9361328125    |89.41715749041195   |711.986             |
|008081|2026-01-23 13:36:30|10683.24       |10561.320149739584  |99.81895677642521   |711.6833333333333   |
|008081|2026-01-23 13:36:44|

                                                                                

## Label flight_phase

In [6]:
# Inputs of the labelling rules.
altitude_delta = col("altitude_change")
altitude = col("altitude_meters")

# Rule predicates. They are evaluated in the order used below and the first
# one that holds wins, so the order is part of the label definition.
is_ground = col("on_ground") == True
is_takeoff = (altitude_delta > 50) & (altitude < 3000)
is_climb = altitude_delta > 20
is_cruise = altitude_delta.between(-20, 20) & (altitude > 8000)
is_descent = altitude_delta < -20

# Rows matching no rule - including rows whose altitude_change is NULL - fall
# through to "TRANSITION".
flight_phase = (
    when(is_ground, "GROUND")
    .when(is_takeoff, "TAKEOFF")
    .when(is_climb, "CLIMB")
    .when(is_cruise, "CRUISE")
    .when(is_descent, "DESCENT")
    .otherwise("TRANSITION")
)

df_ml = df_rolling.withColumn("flight_phase", flight_phase)

print("ðŸ“Š Distribution :")
# Label counts, most frequent phase first.
phase_counts = df_ml.groupBy("flight_phase").count()
phase_counts.orderBy(col("count").desc()).show()

ðŸ“Š Distribution :




+------------+-----+
|flight_phase|count|
+------------+-----+
|      CRUISE|37190|
|  TRANSITION|29374|
|     DESCENT|10699|
|       CLIMB| 8877|
|     TAKEOFF| 2035|
|      GROUND|  257|
+------------+-----+



                                                                                

## Sauvegarde

In [7]:
# Persist the feature table, replacing any previous content at the target path.
df_ml.write.format("delta").mode("overwrite").save(SILVER_ML_PATH)

# Count the rows of the table that was just written rather than calling
# df_ml.count(): the latter re-executes the whole lineage (Delta read, window
# functions, airport cross join) and reports the in-memory plan, not what was
# actually saved.
saved_rows = spark.read.format("delta").load(SILVER_ML_PATH).count()
print(f"âœ… {saved_rows:,} lignes sauvegardÃ©es")

                                                                                

âœ… 88,432 lignes sauvegardÃ©es
