# Spark Structured Streaming + Kafka
Real-time data processing with Spark and Kafka

In [None]:
import os
from spark_config import get_spark_session

# Note: Spark Streaming requires local or cluster mode, not Spark Connect,
# which is why the session is requested with local_mode=True.
# The spark-sql-kafka-0-10 package is added through spark.jars.packages.
spark = get_spark_session(
    extra_configs={"spark.jars.packages": "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.7"},
    local_mode=True,
    app_name="SparkStreamingDemo",
)

In [None]:
# Kafka connection settings: the broker address can be overridden through the
# KAFKA_BOOTSTRAP_SERVERS environment variable; topic names are fixed.
KAFKA_BOOTSTRAP_SERVERS = os.getenv('KAFKA_BOOTSTRAP_SERVERS', 'kafka:9092')
INPUT_TOPIC, OUTPUT_TOPIC = 'events', 'processed-events'

## Read from Kafka

In [None]:
# Open a streaming source on INPUT_TOPIC, with startingOffsets set to "latest"
kafka_df = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": KAFKA_BOOTSTRAP_SERVERS,
        "subscribe": INPUT_TOPIC,
        "startingOffsets": "latest",
    })
    .load()
)

# Show the columns exposed by the Kafka source
kafka_df.printSchema()

In [None]:
# Parse JSON events
from pyspark.sql.functions import from_json, col, window, count, avg
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType

# Schema of the JSON payload in each Kafka record; every field is nullable
event_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("event_id", StringType()),
        ("event_type", StringType()),
        ("user_id", StringType()),
        ("value", DoubleType()),
        ("timestamp", TimestampType()),
    )
])

# Cast the Kafka value to a string, parse it with event_schema, then flatten
# the resulting struct into top-level columns
events_df = (
    kafka_df
    .select(from_json(col("value").cast("string"), event_schema).alias("event"))
    .select("event.*")
)

## Windowed Aggregations

In [None]:
# Sliding-window aggregation: for each event_type, count the events and average
# "value" over 1-minute windows that advance every 30 seconds, using a
# 5-minute watermark on the event timestamp
windowed_counts = (
    events_df
    .withWatermark("timestamp", "5 minutes")
    .groupBy(
        window(col("timestamp"), windowDuration="1 minute", slideDuration="30 seconds"),
        col("event_type"),
    )
    .agg(
        count("*").alias("event_count"),
        avg("value").alias("avg_value"),
    )
)

## Write to Console (for debugging)

In [None]:
# Write to console for debugging
# query = windowed_counts \
#     .writeStream \
#     .outputMode("update") \
#     .format("console") \
#     .option("truncate", "false") \
#     .start()
# 
# query.awaitTermination(60)  # Run for 60 seconds
# query.stop()

## Write to Kafka

In [None]:
from pyspark.sql.functions import to_json, struct

# Shape each aggregate row into a Kafka record: event_type becomes the key and
# a JSON document with the window bounds and metrics becomes the value
output_df = windowed_counts.select(
    col("event_type").alias("key"),
    to_json(
        struct(
            col("window.start").alias("window_start"),
            col("window.end").alias("window_end"),
            "event_type",
            "event_count",
            "avg_value",
        )
    ).alias("value"),
)

# Write to Kafka
# query = output_df \
#     .writeStream \
#     .format("kafka") \
#     .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVERS) \
#     .option("topic", OUTPUT_TOPIC) \
#     .option("checkpointLocation", "s3a://checkpoints/streaming/windowed-counts") \
#     .outputMode("update") \
#     .start()

## Write to S3 (Parquet)

In [None]:
# Write raw events to S3 as Parquet
# query = events_df \
#     .writeStream \
#     .format("parquet") \
#     .option("path", "s3a://streaming-output/events") \
#     .option("checkpointLocation", "s3a://checkpoints/streaming/events") \
#     .partitionBy("event_type") \
#     .outputMode("append") \
#     .trigger(processingTime="1 minute") \
#     .start()

## Generate Test Data

In [None]:
# Helper function to produce test events to Kafka
def produce_test_events(bootstrap_servers, topic, n_events=100):
    """Produce randomly generated test events to a Kafka topic.

    Each event is sent as a UTF-8 encoded JSON object with the fields
    ``event_id``, ``event_type``, ``user_id``, ``value`` and ``timestamp``.

    Args:
        bootstrap_servers: Kafka bootstrap server address(es), passed
            unchanged to ``KafkaProducer``.
        topic: Name of the topic the events are sent to.
        n_events: Number of events to produce.
    """
    from kafka import KafkaProducer
    import json
    import random
    from datetime import datetime, timezone
    import uuid

    producer = KafkaProducer(
        bootstrap_servers=bootstrap_servers,
        value_serializer=lambda v: json.dumps(v).encode('utf-8'),
    )

    event_types = ['click', 'view', 'purchase', 'signup']

    try:
        for _ in range(n_events):
            event = {
                'event_id': str(uuid.uuid4()),
                'event_type': random.choice(event_types),
                'user_id': f'user_{random.randint(1, 100)}',
                'value': random.uniform(1, 1000),
                # Timezone-aware UTC: the ISO string carries an explicit
                # "+00:00" offset, so a consumer cannot misread it as local
                # time (datetime.utcnow() is naive and deprecated).
                'timestamp': datetime.now(timezone.utc).isoformat(),
            }
            producer.send(topic, value=event)

        # send() is asynchronous; wait until everything is delivered
        producer.flush()
    finally:
        # Release the producer's connections even if sending failed
        producer.close()

    print(f"Produced {n_events} events to {topic}")

# Uncomment to produce test events
# produce_test_events(KAFKA_BOOTSTRAP_SERVERS, INPUT_TOPIC, 100)

In [None]:
# Stop the Spark session created at the top of the notebook
spark.stop()