# 07 - Streaming Gold

Agrégations temps réel avec fenêtres temporelles sur les données Silver.

**Requêtes implémentées:**
1. **Statistiques par pays d'origine** - Tumbling window 1 minute
2. **Alertes anomalies par pays d'origine** - Sliding window 5 minutes (slide 1 min)

## Configuration

In [None]:
from pyspark.sql.functions import (
    col, window, count, avg, stddev, max as spark_max, min as spark_min,
    when, lit, current_timestamp
)
from config import get_s3_path, create_spark_session

# Storage locations used by this notebook, all resolved through the project
# helper get_s3_path (imported from config): the Silver input, the two Gold
# outputs, and a dedicated checkpoint location for each streaming query.
SILVER_PATH = get_s3_path("silver", "flights")
GOLD_COUNTRY_STATS_PATH = get_s3_path("gold", "country_stats")
GOLD_COUNTRY_ANOMALIES_PATH = get_s3_path("gold", "country_anomalies")
CHECKPOINT_COUNTRY_STATS = get_s3_path("checkpoints", "gold_country_stats")
CHECKPOINT_COUNTRY_ANOMALIES = get_s3_path("checkpoints", "gold_country_anomalies")

# Session shared by every cell below.
spark = create_spark_session("StreamingGold")

# Echo the resolved locations, one per line.
print(
    f"Input:  {SILVER_PATH}",
    f"Output Country Stats:    {GOLD_COUNTRY_STATS_PATH}",
    f"Output Country Anomalies: {GOLD_COUNTRY_ANOMALIES_PATH}",
    sep="\n",
)

## Lecture du stream Silver

In [None]:
# Open the Silver Delta table as a streaming source; this DataFrame feeds the
# country-statistics query (Stream 1).
df_silver_stream = (
    spark.readStream
    .format("delta")
    .load(SILVER_PATH)
)

print("Stream Silver initialisé")
print(f"Colonnes: {df_silver_stream.columns}")

## Stream 1 : Statistiques par pays (Tumbling Window 1 min)

Agrégation temps réel des statistiques de vol par pays d'origine avec une fenêtre tumbling de 1 minute.

In [None]:
# Stream 1: per-country flight statistics over 1-minute tumbling windows.

# Non-overlapping 1-minute buckets keyed on the event time.
one_minute_window = window(col("event_timestamp"), "1 minute")

# Aggregates computed for each (window, origin_country) group.
# count(when(cond, 1)) counts only the rows where cond is true, because
# when() without otherwise() yields NULL for the other rows.
country_stats_aggregates = [
    count("*").alias("flight_count"),
    avg("altitude_meters").alias("avg_altitude"),
    avg("velocity_kmh").alias("avg_velocity"),
    count(when(col("on_ground") == lit(True), 1)).alias("ground_count"),
    count(when(col("on_ground") == lit(False), 1)).alias("airborne_count"),
]

df_country_stats = (
    df_silver_stream
    # Events are accepted up to 2 minutes behind the latest event time seen.
    .withWatermark("event_timestamp", "2 minutes")
    .groupBy(one_minute_window, col("origin_country"))
    .agg(*country_stats_aggregates)
    # Flatten the window struct into explicit start/end columns.
    .select(
        col("window.start").alias("window_start"),
        col("window.end").alias("window_end"),
        "origin_country",
        "flight_count",
        "avg_altitude",
        "avg_velocity",
        "ground_count",
        "airborne_count",
    )
)

print("Stream 1: Statistiques par pays (Tumbling Window 1 min)")

In [None]:
# Start Stream 1: write the windowed country statistics to the Gold Delta
# table in append mode, tracking progress in its own checkpoint location.
query_country_stats = (
    df_country_stats.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", CHECKPOINT_COUNTRY_STATS)
    .start(GOLD_COUNTRY_STATS_PATH)
)

print(f"Stream 1 démarré -> {GOLD_COUNTRY_STATS_PATH}")

## Stream 2 : Alertes anomalies par pays (Sliding Window 5 min, slide 1 min)

Détection de vitesses et altitudes anormales par pays d'origine avec une fenêtre glissante de 5 minutes.

In [None]:
# Removed - merged into the next cell

In [None]:
# Anomaly thresholds: a reading strictly outside [MIN, MAX] is flagged by the
# anomaly pipeline defined further down in this cell.
ALTITUDE_MIN_THRESHOLD = -100    # metres (below sea level)
ALTITUDE_MAX_THRESHOLD = 12_000  # metres
VELOCITY_MIN_THRESHOLD = 0       # km/h
VELOCITY_MAX_THRESHOLD = 1_000   # km/h

print("Seuils d'anomalie:")
print(f"  Altitude: {ALTITUDE_MIN_THRESHOLD}m - {ALTITUDE_MAX_THRESHOLD}m")
print(f"  Vitesse:  {VELOCITY_MIN_THRESHOLD} - {VELOCITY_MAX_THRESHOLD} km/h")

# Stream 2: per-country anomaly summary over 5-minute windows sliding every
# minute. The Silver table is read a second time so this pipeline has its own
# streaming source.
df_silver_stream_2 = (
    spark.readStream
    .format("delta")
    .load(SILVER_PATH)
)

# A reading is anomalous when it lies strictly outside its [MIN, MAX] range.
altitude_out_of_range = (
    (col("altitude_meters") > ALTITUDE_MAX_THRESHOLD)
    | (col("altitude_meters") < ALTITUDE_MIN_THRESHOLD)
)
velocity_out_of_range = (
    (col("velocity_kmh") > VELOCITY_MAX_THRESHOLD)
    | (col("velocity_kmh") < VELOCITY_MIN_THRESHOLD)
)

# Overlapping 5-minute windows, a new one starting every minute.
sliding_window = window(col("event_timestamp"), "5 minutes", "1 minute")

df_anomalies = (
    df_silver_stream_2
    # 0/1 flags per row, counted per window in the aggregation below.
    .withColumn("is_altitude_anomaly", when(altitude_out_of_range, 1).otherwise(0))
    .withColumn("is_velocity_anomaly", when(velocity_out_of_range, 1).otherwise(0))
    # Events are accepted up to 6 minutes behind the latest event time seen.
    .withWatermark("event_timestamp", "6 minutes")
    .groupBy(sliding_window, col("origin_country"))
    .agg(
        count("*").alias("total_observations"),
        count(when(col("is_altitude_anomaly") == 1, 1)).alias("altitude_anomalies"),
        count(when(col("is_velocity_anomaly") == 1, 1)).alias("velocity_anomalies"),
        spark_max("altitude_meters").alias("max_altitude"),
        spark_min("altitude_meters").alias("min_altitude"),
        spark_max("velocity_kmh").alias("max_velocity"),
        avg("altitude_meters").alias("avg_altitude"),
        avg("velocity_kmh").alias("avg_velocity"),
        stddev("altitude_meters").alias("stddev_altitude"),
        stddev("velocity_kmh").alias("stddev_velocity"),
    )
    # NOTE(review): a row flagged on both altitude and velocity contributes
    # twice to the numerator, so this ratio can exceed 1 - confirm that this
    # is the intended definition of the rate.
    .withColumn(
        "anomaly_rate",
        (col("altitude_anomalies") + col("velocity_anomalies")) / col("total_observations"),
    )
    # Flatten the window struct and fix the output column order.
    .select(
        col("window.start").alias("window_start"),
        col("window.end").alias("window_end"),
        "origin_country",
        "total_observations",
        "altitude_anomalies",
        "velocity_anomalies",
        "anomaly_rate",
        "max_altitude",
        "min_altitude",
        "max_velocity",
        "avg_altitude",
        "avg_velocity",
        "stddev_altitude",
        "stddev_velocity",
    )
)

print("Stream 2: Alertes anomalies par pays (Sliding Window 5 min)")

In [None]:
# Start Stream 2: write the windowed anomaly summary to the Gold Delta table
# in append mode, tracking progress in its own checkpoint location.
query_country_anomalies = (
    df_anomalies.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", CHECKPOINT_COUNTRY_ANOMALIES)
    .start(GOLD_COUNTRY_ANOMALIES_PATH)
)

print(f"Stream 2 démarré -> {GOLD_COUNTRY_ANOMALIES_PATH}")

## Monitoring des streams

In [None]:
import time

print("Monitoring des streams Gold (Ctrl+C pour arrêter)")
print("=" * 60)

# (line prefix, query) pairs; the prefixes keep the status columns aligned.
monitored_streams = (
    ("  Country Stats:    ", query_country_stats),
    ("  Country Anomalies: ", query_country_anomalies),
)

# Poll both queries every 30 seconds until the user interrupts the cell or
# no query is running any more. Without the liveness check the loop would
# keep polling dead queries forever and a failure would go unnoticed.
try:
    while True:
        print(f"\n{time.strftime('%H:%M:%S')}")
        for prefix, stream_query in monitored_streams:
            print(f"{prefix}{stream_query.status}")
            # A terminated query exposes the error that stopped it, if any.
            failure = None if stream_query.isActive else stream_query.exception()
            if failure is not None:
                print(f"    -> Stream en échec: {failure}")

        if not any(stream_query.isActive for _, stream_query in monitored_streams):
            print("\nTous les streams sont terminés, fin du monitoring")
            break

        time.sleep(30)
except KeyboardInterrupt:
    # Ctrl+C (or the notebook "interrupt" button) ends monitoring only; the
    # queries themselves are stopped in the next cell.
    print("\nArrêt demandé...")

## Arrêt des streams

In [None]:
# Stop both Gold streaming queries, in the order they were started.
for gold_query in (query_country_stats, query_country_anomalies):
    gold_query.stop()
print("Tous les streams Gold arrêtés")

## Vérification des données Gold

In [None]:
print("Statistiques Gold:")

# Country statistics: row count, then the ten most recent windows.
# Any failure while reading or displaying is reported instead of raised.
try:
    df_stats = spark.read.format("delta").load(GOLD_COUNTRY_STATS_PATH)
    print(f"  Country Stats: {df_stats.count():,} lignes")
    print("\n  Dernières statistiques par pays:")
    latest_stats = df_stats.orderBy(col("window_start").desc()).limit(10)
    latest_stats.show(truncate=False)
except Exception as exc:
    print(f"  Country Stats: Table non disponible ({exc})")

# Country anomalies: row count, then the ten windows with the highest
# non-zero anomaly rate. Failures are reported the same way.
try:
    df_anom = spark.read.format("delta").load(GOLD_COUNTRY_ANOMALIES_PATH)
    print(f"\n  Country Anomalies: {df_anom.count():,} lignes")
    print("\n  Pays avec le plus d'anomalies:")
    with_anomalies = df_anom.filter(col("anomaly_rate") > 0)
    worst_windows = with_anomalies.orderBy(col("anomaly_rate").desc()).limit(10)
    worst_windows.show(truncate=False)
except Exception as exc:
    print(f"  Country Anomalies: Table non disponible ({exc})")