In [None]:
# Global variables - loaded from the .env file

import os
import threading
import time
import json
import requests
from kafka import KafkaProducer
from datetime import datetime
from dotenv import load_dotenv

# Pull the settings stored in the .env file into the process environment
load_dotenv()

# --- Kafka ---
KAFKA_BOOTSTRAP = os.environ.get("KAFKA_BOOTSTRAP", "kafka1:9092")
TOPIC_NAME = os.environ.get("TOPIC_NAME", "opensky-data")

# --- Garage object store ---
GARAGE_ENDPOINT = os.environ.get("GARAGE_ENDPOINT", "http://garage:3900")
GARAGE_HOST = os.environ.get("GARAGE_HOST", "garage")
GARAGE_PORT = os.environ.get("GARAGE_PORT", "3900")
# The two credentials have no default: they must come from the environment
ACCESS_KEY = os.environ.get("ACCESS_KEY")
SECRET_KEY = os.environ.get("SECRET_KEY")
BUCKET_NAME = os.environ.get("BUCKET_NAME", "datalake")

# --- OpenSky REST API ---
API_BASE_URL = os.environ.get("API_BASE_URL", "https://opensky-network.org/api/states/all")

# Fail fast when either credential is missing or empty
if not (ACCESS_KEY and SECRET_KEY):
    raise ValueError("‚ùå ACCESS_KEY et SECRET_KEY doivent √™tre d√©finis dans le fichier .env")

print("‚úÖ Configuration charg√©e depuis .env")

In [None]:
# Spark initialisation (with an explicit S3 region)
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType, BooleanType, LongType

# 1. Maven coordinates handed to Spark through spark.jars.packages
packages = [
    "org.apache.hadoop:hadoop-aws:3.3.4",
    "com.amazonaws:aws-java-sdk-bundle:1.12.262",
    "org.apache.spark:spark-hadoop-cloud_2.12:3.5.3",
    "io.delta:delta-spark_2.12:3.0.0",
    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.3"
]

# 2. Session settings (Delta extensions + S3A access to the Garage endpoint),
#    kept in one table and applied to the builder one by one.
_spark_settings = {
    "spark.jars.packages": ",".join(packages),
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    "spark.hadoop.fs.s3a.endpoint": GARAGE_ENDPOINT,
    "spark.hadoop.fs.s3a.access.key": ACCESS_KEY,
    "spark.hadoop.fs.s3a.secret.key": SECRET_KEY,
    "spark.hadoop.fs.s3a.path.style.access": "true",
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "spark.hadoop.fs.s3a.connection.ssl.enabled": "false",
    "spark.hadoop.fs.s3a.endpoint.region": "garage",
    "spark.hadoop.fs.s3a.committer.name": "filesystem",
    "spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version": "2",
    "spark.hadoop.fs.s3a.multiobjectdelete.enable": "false",
    "spark.sql.shuffle.partitions": "10",
}

_builder = SparkSession.builder.appName("OpenSkyFinal")
for _setting_name, _setting_value in _spark_settings.items():
    _builder = _builder.config(_setting_name, _setting_value)

# getOrCreate() returns the already-active session if one exists
spark = _builder.getOrCreate()

spark.sparkContext.setLogLevel("WARN")
print("‚úÖ Spark Session configur√©e (R√©gion 'garage' forc√©e).")

In [None]:
# Check that the bucket exists
from minio import Minio

# Plain-HTTP client pointed at the Garage host/port from the configuration
garage_address = f"{GARAGE_HOST}:{GARAGE_PORT}"
client = Minio(
    garage_address,
    access_key=ACCESS_KEY,
    secret_key=SECRET_KEY,
    secure=False,
    region="garage",
)

# Create the bucket only when it is missing
if client.bucket_exists(BUCKET_NAME):
    print(f"‚úÖ Le bucket '{BUCKET_NAME}' existe d√©j√†.")
else:
    print(f"‚ö†Ô∏è Le bucket '{BUCKET_NAME}' n'existait pas, cr√©ation en cours...")
    client.make_bucket(BUCKET_NAME)

In [None]:
# Producer thread that publishes data from the OpenSky API to Kafka.
# Stop flag: run_producer keeps looping while this is False; the "stop" cell
# further down sets it to True to ask the thread to finish.
stop_producer = False

def _state_to_record(state, timestamp):
    """Turn one positional OpenSky state vector into a flat, named record.

    Args:
        state: One entry of the API's ``states`` list (indexable sequence).
        timestamp: The ``time`` value of the API response, copied into the record.

    Returns:
        dict: The record that is serialised to JSON and sent to Kafka.
    """
    callsign = state[1]
    return {
        "time": timestamp,
        "icao24": state[0],
        "callsign": callsign.strip() if callsign else None,
        "origin_country": state[2],
        "time_position": state[3],
        "last_contact": state[4],
        "longitude": state[5],
        "latitude": state[6],
        "baro_altitude": state[7],
        "on_ground": state[8],
        "velocity": state[9],
        "true_track": state[10],
        "vertical_rate": state[11],
        # index 12 is deliberately not exported (same as the original mapping)
        "geo_altitude": state[13],
        "squawk": state[14],
        "spi": state[15],
        "position_source": state[16],
        # the 18th element is only read when the vector actually contains it
        "category": state[17] if len(state) > 17 else None,
    }


def run_producer(poll_interval=15, request_timeout=10):
    """Poll the OpenSky API and publish every state vector to Kafka.

    Runs until the module-level ``stop_producer`` flag becomes true. Intended to
    be used as a thread target; called without arguments it polls every 15 s.

    Args:
        poll_interval: Seconds to wait between two API calls.
        request_timeout: Seconds after which a stalled HTTP request is abandoned
            (without it a hung connection would block the thread forever).

    Returns:
        None. Connection failures and per-iteration errors are printed, not raised.
    """
    print(f"üöÄ [THREAD PRODUCER] D√©marrage vers {KAFKA_BOOTSTRAP}...")

    try:
        producer = KafkaProducer(
            bootstrap_servers=KAFKA_BOOTSTRAP,
            value_serializer=lambda v: json.dumps(v).encode('utf-8')
        )
    except Exception as e:
        print(f"‚ùå [THREAD PRODUCER] Erreur connexion Kafka: {e}")
        return

    try:
        while not stop_producer:
            try:
                # Use the configured URL instead of a hard-coded copy of it
                response = requests.get(API_BASE_URL, timeout=request_timeout)
                if response.status_code == 200:
                    data = response.json()
                    # "states" can be null in the payload: treat it as an empty list
                    states = data.get('states') or []
                    timestamp = data['time']

                    for s in states:
                        producer.send(TOPIC_NAME, _state_to_record(s, timestamp))

                    producer.flush()
                    print(f"üì° [THREAD PRODUCER] {len(states)} vols envoy√©s √† {datetime.now().strftime('%H:%M:%S')}")

                else:
                    print(f"‚ö†Ô∏è [THREAD PRODUCER] API Status: {response.status_code}")

            except Exception as e:
                # Best effort: log and retry on the next cycle
                print(f"‚ö†Ô∏è [THREAD PRODUCER] Erreur: {e}")

            # Pause between API calls, sliced into <=1 s naps so that a stop
            # request is honoured quickly instead of after the full pause.
            deadline = time.monotonic() + poll_interval
            while not stop_producer:
                remaining = deadline - time.monotonic()
                if remaining <= 0:
                    break
                time.sleep(min(1.0, remaining))
    finally:
        # Release the Kafka connection whatever the reason for leaving the loop
        producer.close()

    print("üõë [THREAD PRODUCER] Arr√™t√©.")

# Start the producer thread, unless one launched by an earlier run of this
# cell is still alive: a second thread would publish every flight twice.
_previous_producer = globals().get("producer_thread")
if _previous_producer is not None and _previous_producer.is_alive():
    print("Producer existant encore actif : pas de nouveau lancement.")
else:
    # daemon=True so the thread never keeps the interpreter alive on shutdown
    producer_thread = threading.Thread(target=run_producer, daemon=True)
    producer_thread.start()
    print("‚úÖ Le Producer tourne en arri√®re-plan ! Passez √† la suite.")

In [None]:
# Run this cell to stop the producer thread.
# It only flips the flag tested by run_producer's `while not stop_producer` loop;
# the thread ends the next time that loop re-checks the flag, not immediately.
# NOTE(review): with "Restart & Run All" this cell runs right after the producer
# is started and therefore stops it almost at once — confirm this cell is meant
# to be run manually only.
stop_producer = True
print("Demande d'arr√™t du producer envoy√©e.")

In [None]:
# 1. Schema of the JSON records read from Kafka.
#    One (column name, Spark type) pair per field; every column is nullable.
_FLIGHT_COLUMNS = [
    ("time", LongType()),
    ("icao24", StringType()),
    ("callsign", StringType()),
    ("origin_country", StringType()),
    ("time_position", LongType()),
    ("last_contact", LongType()),
    ("longitude", FloatType()),
    ("latitude", FloatType()),
    ("baro_altitude", FloatType()),
    ("on_ground", BooleanType()),
    ("velocity", FloatType()),
    ("true_track", FloatType()),
    ("vertical_rate", FloatType()),
    ("geo_altitude", FloatType()),
    ("squawk", StringType()),
    ("spi", BooleanType()),
    ("position_source", IntegerType()),
    ("category", IntegerType()),
]

schema = StructType(
    [StructField(column_name, column_type, True) for column_name, column_type in _FLIGHT_COLUMNS]
)

# 2. Streaming read of the Kafka topic, starting from the latest offsets
kafka_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP)
    .option("subscribe", TOPIC_NAME)
    .option("startingOffsets", "latest")
    .load()
)

# 3. Parsing: decode the message value as a string, parse it as JSON with the
#    schema, then flatten the resulting struct into top-level columns
_json_payload = from_json(col("value").cast("string"), schema).alias("data")
parsed_df = kafka_df.select(_json_payload).select("data.*")

# 4. Write to S3 (Bronze layer) as a Delta table
checkpoint_path = f"s3a://{BUCKET_NAME}/checkpoints/bronze_flights"
output_path = f"s3a://{BUCKET_NAME}/bronze/flights"

print(f"üöÄ D√©marrage du Stream Spark vers {output_path}...")

query = (
    parsed_df.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", checkpoint_path)
    .start(output_path)
)

# Blocks this cell until the streaming query terminates
query.awaitTermination()

In [None]:
from pyspark.sql.functions import col, from_unixtime, to_timestamp, when, round

# Local import: converts epoch seconds straight to a timestamp column
from pyspark.sql.functions import timestamp_seconds

INPUT_BRONZE_PATH = f"s3a://{BUCKET_NAME}/bronze/flights"
OUTPUT_SILVER_PATH = f"s3a://{BUCKET_NAME}/silver/flights"
CHECKPOINT_SILVER = f"s3a://{BUCKET_NAME}/checkpoints/silver_flights"

# 1. Streaming read of the Bronze Delta table: rows appended to Bronze are
#    picked up by this stream.
print("üìñ Lecture du flux Bronze...")
df_bronze = spark.readStream \
    .format("delta") \
    .load(INPUT_BRONZE_PATH)

# 2. Transformations (cleaning & enrichment)
#    - drop rows without an aircraft id or without a position
#    - event_timestamp: epoch seconds -> timestamp. Converted directly rather
#      than through from_unixtime()/to_timestamp(), which formats and re-parses
#      a local-time string and is ambiguous during a DST fall-back hour of the
#      session time zone.
#    - velocity_kmh: velocity * 3.6, rounded to 2 decimals. `round` is
#      pyspark.sql.functions.round (imported at the top of this cell), not the builtin.
#    - altitude_meters: plain copy of baro_altitude under a new name
df_silver = df_bronze \
    .filter(col("icao24").isNotNull()) \
    .filter(col("latitude").isNotNull() & col("longitude").isNotNull()) \
    .withColumn("event_timestamp", timestamp_seconds(col("time"))) \
    .withColumn("velocity_kmh", round(col("velocity") * 3.6, 2)) \
    .withColumn("altitude_meters", col("baro_altitude")) \
    .select(
        "event_timestamp",
        "icao24",
        "callsign",
        "origin_country",
        "longitude",
        "latitude",
        "velocity_kmh",
        "altitude_meters",
        "on_ground",
        "category"
    )

# 3. Streaming write to the Silver Delta table
print(f"üöÄ D√©marrage du Stream vers Silver : {OUTPUT_SILVER_PATH}")
query_silver = df_silver.writeStream \
    .format("delta") \
    .outputMode("append") \
    .option("checkpointLocation", CHECKPOINT_SILVER) \
    .option("mergeSchema", "true") \
    .start(OUTPUT_SILVER_PATH)

# Blocks this cell until the streaming query terminates
query_silver.awaitTermination()

In [None]:
# Batch-read the Silver Delta table and display its first 5 rows
print("üîç Aper√ßu de la table Silver :")
silver_preview_df = spark.read.format("delta").load(OUTPUT_SILVER_PATH)
silver_preview_df.show(5)