# 02 - Streaming Bronze

Ingestion des données brutes depuis Kafka vers la couche Bronze (Delta Lake).

## Configuration

In [4]:
import os
from dotenv import load_dotenv
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType, BooleanType, LongType
from config import get_s3_path, create_spark_session

# Load variables from a local .env file (if any) into the process environment
# before anything below reads them.
load_dotenv()

# Kafka connection settings: the environment value wins, otherwise the default
# given here is used.
KAFKA_BOOTSTRAP = os.environ.get("KAFKA_BOOTSTRAP", "kafka1:9092")
TOPIC_NAME = os.environ.get("TOPIC_NAME", "opensky-data")

# Destination of the Bronze table and of the streaming checkpoint, both
# resolved by the project helper `get_s3_path`.
BRONZE_PATH = get_s3_path("bronze", "flights")
CHECKPOINT_PATH = get_s3_path("checkpoints", "bronze_flights")

# The Kafka source connector is requested as an extra package.
# NOTE(review): the connector coordinates pin Scala 2.12 / Spark 3.5.3 --
# confirm they match the Spark build used by `create_spark_session`.
spark = create_spark_session(
    "StreamingBronze",
    extra_packages=["org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.3"],
)

print(f"âœ… Output: {BRONZE_PATH}")

âœ… Spark Session 'StreamingBronze' configurÃ©e
âœ… Output: s3a://datalake/bronze/flights


                                                                                

## Schéma des données

In [5]:
# Layout of one JSON message, declared as (column name, Spark type) pairs and
# expanded into nullable StructFields in the order listed.
# NOTE(review): FloatType is single precision; longitude/latitude may lose
# decimals compared with the JSON source. DoubleType would preserve them, but
# switching changes the schema written to the Bronze table -- confirm before
# changing.
schema = StructType([
    StructField(column, spark_type, True)
    for column, spark_type in (
        ("time", LongType()),
        ("icao24", StringType()),
        ("callsign", StringType()),
        ("origin_country", StringType()),
        ("time_position", LongType()),
        ("last_contact", LongType()),
        ("longitude", FloatType()),
        ("latitude", FloatType()),
        ("baro_altitude", FloatType()),
        ("on_ground", BooleanType()),
        ("velocity", FloatType()),
        ("true_track", FloatType()),
        ("vertical_rate", FloatType()),
        ("geo_altitude", FloatType()),
        ("squawk", StringType()),
        ("spi", BooleanType()),
        ("position_source", IntegerType()),
        ("category", IntegerType()),
    )
])

print("âœ… SchÃ©ma dÃ©fini")

âœ… SchÃ©ma dÃ©fini


## Streaming Kafka → Bronze

In [None]:
# Subscribe to the Kafka topic as a streaming source.
# - startingOffsets="earliest": a query without an existing checkpoint starts
#   from the oldest retained offsets; with a checkpoint it resumes from there.
# - failOnDataLoss="false": the query keeps running when offsets are no longer
#   available (e.g. aged out of the topic) instead of failing.
kafka_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP)
    .option("subscribe", TOPIC_NAME)
    .option("startingOffsets", "earliest")
    .option("failOnDataLoss", "false")
    .load()
)

# The Kafka `value` column is binary: cast it to a string, parse it as JSON
# against `schema`, then flatten the resulting struct into top-level columns.
# Messages that from_json cannot parse come out as rows of nulls rather than
# raising an error.
parsed_df = kafka_df.select(
    from_json(col("value").cast("string"), schema).alias("data")
).select("data.*")

print(f"ðŸš€ Streaming vers {BRONZE_PATH}...")

# Append every micro-batch to the Delta table; progress is tracked in the
# checkpoint location so a restarted query resumes where it left off.
query = (
    parsed_df.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", CHECKPOINT_PATH)
    .start(BRONZE_PATH)
)

# awaitTermination() blocks this cell until the query ends. If the cell is
# interrupted (kernel interrupt) or the wait raises, stop the query explicitly;
# otherwise it keeps running in the background of the Spark session and is only
# stopped when the next run starts a query on the same checkpoint.
try:
    query.awaitTermination()
finally:
    query.stop()

ðŸš€ Streaming vers s3a://datalake/bronze/flights...


26/01/23 15:37:51 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
26/01/23 15:37:51 WARN StreamingQueryManager: Stopping existing streaming query [id=00581bd0-e6af-4af7-b49c-64568b446ba4, runId=f325fa16-14e6-41b9-b49e-b7bcec74108f], as a new run is being started.
26/01/23 15:37:51 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
26/01/23 15:38:14 ERROR NonFateSharingFuture: Failed to get result from future  
scala.runtime.NonLocalReturnControl
26/01/23 15:39:43 ERROR NonFateSharingFuture: Failed to get result from future  
scala.runtime.NonLocalReturnControl
26/01/23 15:41:19 ERROR NonFateSharingFuture: Failed to get result from future  
scala.runtime.NonLocalReturnControl
26/01/23 15:42:49 ERROR NonFateSharingFuture: Failed to get result from future  
scala.runtime.NonLocalReturnC