# Spark Structured Streaming Example

Purpose: Reads a stream of messages from a Kafka topic and writes a stream of aggregations over sliding event-time window to memory.

References: https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html

Author:  Gary A. Stafford

Date: 2022-12-16

In [1]:
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StructField,
    StructType,
    IntegerType,
    StringType,
    FloatType,
    TimestampType,
    BooleanType,
)

In [2]:
WINDOW_WATERMARK = "10 seconds"
WINDOW_DURATION = "20 seconds"
WINDOW_SLIDE = "10 seconds"
PROCESSING_TIME = "20 seconds"
SHOW_REFRESH = 20

BOOTSTRAP_SERVERS = "kafka:29092"
TOPIC_PURCHASES = "demo.purchases"

KAFKA_OPTIONS = {
    "kafka.bootstrap.servers": BOOTSTRAP_SERVERS,
    "subscribe": TOPIC_PURCHASES,
    "startingOffsets": "latest",
}

In [30]:
spark = SparkSession.builder.remote("sc://127.0.0.1:15002").appName("kafka-streaming-query").getOrCreate()
df_sales = spark.readStream.format("kafka").options(**KAFKA_OPTIONS).load()

schema = StructType(
    [
        StructField("transaction_time", TimestampType(), False),
        StructField("transaction_id", StringType(), False),
        StructField("product_id", StringType(), False),
        StructField("price", FloatType(), False),
        StructField("quantity", IntegerType(), False),
        StructField("is_member", BooleanType(), True),
        StructField("member_discount", FloatType(), True),
        StructField("add_supplements", BooleanType(), True),
        StructField("supplement_price", FloatType(), True),
    ]
)

ds_sales = (
    df_sales.selectExpr("CAST(value AS STRING)")
    .select(F.from_json("value", schema=schema).alias("data"))
    .select("data.*")
        .withColumn("sales", F.col("price")*F.col("quantity"))
    #.withWatermark("transaction_time", WINDOW_WATERMARK)
    .groupBy("product_id", F.window("transaction_time", WINDOW_DURATION, WINDOW_SLIDE))
    .agg(F.sum("sales"), F.sum("quantity"))
    .select(
        "product_id",
        F.format_number("sum(sales)", 2).alias("total_sales"),
        F.format_number("sum(quantity)", 0).alias("total_items"),
        "window.start",
        "window.end",
    )
    #.orderBy(F.col("window").desc(), F.col("product_id").desc())
    .coalesce(1)
    .writeStream.queryName("streaming_to_console")
    .trigger(processingTime=PROCESSING_TIME)
    .outputMode("update")
    .format("memory")
    .start()
)

In [31]:
# data display auto-refresh reference: https://stackoverflow.com/a/61922073/580268

from IPython.display import display, clear_output
from time import sleep

while True:
    clear_output(wait=True)
    display(ds_sales.status)
    display(spark.sql("SELECT * FROM streaming_to_console").show(truncate=False))
    sleep(SHOW_REFRESH)  # refresh every SHOW_REFRESH seconds

{'message': 'Processing new data',
 'isDataAvailable': True,
 'isTriggerActive': True}

+----------+-----------+-----------+-------------------+-------------------+
|product_id|total_sales|total_items|start              |end                |
+----------+-----------+-----------+-------------------+-------------------+
|CS05      |9.98       |2          |2025-10-02 10:39:00|2025-10-02 10:39:20|
|CS05      |9.98       |2          |2025-10-02 10:38:50|2025-10-02 10:39:10|
|CS05      |14.97      |3          |2025-10-02 10:39:00|2025-10-02 10:39:20|
|SF01      |11.98      |2          |2025-10-02 10:39:00|2025-10-02 10:39:20|
|CS09      |4.99       |1          |2025-10-02 10:39:10|2025-10-02 10:39:30|
|CS11      |4.99       |1          |2025-10-02 10:39:10|2025-10-02 10:39:30|
|CS11      |4.99       |1          |2025-10-02 10:39:00|2025-10-02 10:39:20|
|SC04      |5.99       |1          |2025-10-02 10:39:00|2025-10-02 10:39:20|
|SF04      |11.98      |2          |2025-10-02 10:39:00|2025-10-02 10:39:20|
|SF02      |5.99       |1          |2025-10-02 10:39:00|2025-10-02 10:39:20|

None

KeyboardInterrupt: 

In [32]:
ds_sales.stop()