# 04 - Exploration SQL

Analyse exploratoire des données Silver avec SparkSQL.

## Configuration

In [None]:
from config import get_s3_path, create_spark_session

# Location of the "flights" table in the silver layer, as resolved by the
# project's config helper.
SILVER_PATH = get_s3_path("silver", "flights")

# Spark session used by every cell in this notebook.
spark = create_spark_session("ExplorationSQL")

# Read the Delta table and register it as the temp view `flights`, which is
# the name the SQL cells below query.
df = spark.read.load(SILVER_PATH, format="delta")
df.createOrReplaceTempView("flights")

# count() is an action, so this line actually reads the table.
loaded_rows = df.count()
print(f"✅ {loaded_rows:,} lignes chargées")

## Schéma des données

In [None]:
# Print the schema of the loaded DataFrame so the column names used in the
# SQL cells below can be checked against it.
df.printSchema()

## Statistiques par pays

In [None]:
# Top 10 origin countries by number of observations, with the average of
# altitude_meters and velocity_kmh per country (rounded to 2 decimals).
# Rows with a NULL origin_country are excluded.
#
# origin_country is used as a secondary sort key: without it, countries tied
# on nb_observations at the LIMIT boundary could be kept or dropped
# differently from one run to the next.
top_countries_sql = """
    SELECT
        origin_country,
        COUNT(*) AS nb_observations,
        ROUND(AVG(altitude_meters), 2) AS avg_altitude,
        ROUND(AVG(velocity_kmh), 2) AS avg_velocity
    FROM flights
    WHERE origin_country IS NOT NULL
    GROUP BY origin_country
    ORDER BY nb_observations DESC, origin_country
    LIMIT 10
"""
spark.sql(top_countries_sql).show(truncate=False)

## Avions au sol vs en vol

In [None]:
# Number of observations and average velocity_kmh for each value of the
# on_ground flag (a NULL flag, if present, forms its own group).
#
# GROUP BY alone gives no guarantee on row order, so the result is sorted
# explicitly to keep the displayed table identical across runs.
on_ground_sql = """
    SELECT
        on_ground,
        COUNT(*) AS count,
        ROUND(AVG(velocity_kmh), 2) AS avg_velocity
    FROM flights
    GROUP BY on_ground
    ORDER BY on_ground
"""
spark.sql(on_ground_sql).show()

## Distribution des altitudes

In [None]:
# Bucket every observation by altitude_meters and count rows per bucket,
# largest bucket first.
#
# The CASE branches are evaluated in order, so each bucket's lower bound is
# implied by the branch before it. Note that any value below 1000 -- negative
# altitudes included -- lands in the '0-1000m' bucket, and NULL altitudes are
# reported under the literal label 'NULL'.
altitude_distribution_sql = """
    SELECT 
        CASE 
            WHEN altitude_meters IS NULL THEN 'NULL'
            WHEN altitude_meters < 1000 THEN '0-1000m'
            WHEN altitude_meters < 5000 THEN '1000-5000m'
            WHEN altitude_meters < 10000 THEN '5000-10000m'
            ELSE '10000m+'
        END AS altitude_range,
        COUNT(*) AS count
    FROM flights
    GROUP BY 1
    ORDER BY count DESC
"""
altitude_distribution = spark.sql(altitude_distribution_sql)
altitude_distribution.show()

## Requête Batch avec Window Function

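Analyse de l'évolution de chaque avion : altitude précédente, variation d'altitude, moyenne glissante de l'altitude (observation courante et 5 précédentes), et rang de chaque observation par vitesse décroissante au sein de son pays d'origine.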

In [None]:
# Per-observation view built with window functions. Columns added:
#   prev_altitude            - altitude_meters of the previous row for the same
#                              icao24, ordered by event_timestamp (NULL for the
#                              first row of each icao24).
#   altitude_change          - current altitude minus prev_altitude.
#   rolling_avg_altitude     - mean altitude over the current row and up to
#                              5 preceding rows of the same icao24.
#   velocity_rank_in_country - rank of the row by descending velocity_kmh
#                              among rows sharing its origin_country.
#
# The WHERE filter is applied before the window functions are evaluated, so
# rows with a NULL icao24 or NULL altitude are invisible to LAG/AVG/RANK.
# Only the first 20 rows (by icao24, then event_timestamp) are displayed.
aircraft_evolution_sql = """
    SELECT
        icao24,
        callsign,
        origin_country,
        event_timestamp,
        altitude_meters,
        LAG(altitude_meters) OVER (PARTITION BY icao24 ORDER BY event_timestamp) AS prev_altitude,
        altitude_meters - LAG(altitude_meters) OVER (PARTITION BY icao24 ORDER BY event_timestamp) AS altitude_change,
        ROUND(AVG(altitude_meters) OVER (PARTITION BY icao24 ORDER BY event_timestamp ROWS BETWEEN 5 PRECEDING AND CURRENT ROW), 2) AS rolling_avg_altitude,
        RANK() OVER (PARTITION BY origin_country ORDER BY velocity_kmh DESC) AS velocity_rank_in_country
    FROM flights
    WHERE icao24 IS NOT NULL AND altitude_meters IS NOT NULL
    ORDER BY icao24, event_timestamp
    LIMIT 20
"""
aircraft_evolution = spark.sql(aircraft_evolution_sql)
aircraft_evolution.show(truncate=False)

## Aperçu des données

In [None]:
# Show 10 rows of the loaded DataFrame, with column values printed in full
# (truncate=False) rather than cut to the default display width.
df.show(10, truncate=False)