# Spark Structured Streaming Example

Purpose: Reads a stream of messages from a Kafka topic and writes a stream of aggregations, computed over a sliding event-time window, to an in-memory table.

References: https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html

Author:  Gary A. Stafford

Date: 2022-12-16

In [1]:
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StructField,
    StructType,
    IntegerType,
    StringType,
    FloatType,
    TimestampType,
    BooleanType,
)

In [2]:
# Create (or reuse) a session against the remote endpoint at
# sc://127.0.0.1:15002, registered under the app name "kafka-streaming-query".
spark = (
    SparkSession.builder
    .remote("sc://127.0.0.1:15002")
    .appName("kafka-streaming-query")
    .getOrCreate()
)

In [3]:
# Kafka broker address, passed to the reader as "kafka.bootstrap.servers".
BOOTSTRAP_SERVERS = "kafka:29092"
# Kafka topic the streaming reader subscribes to.
TOPIC_PURCHASES = "demo.purchases"

In [4]:
# Settings for the Kafka source: which broker to contact, which topic to
# subscribe to, and where in the topic to begin reading ("earliest").
options = {
    "kafka.bootstrap.servers": BOOTSTRAP_SERVERS,
    "subscribe": TOPIC_PURCHASES,
    "startingOffsets": "earliest",
}

# Streaming DataFrame backed by the Kafka topic configured above.
df_sales = (
    spark.readStream
    .format("kafka")
    .options(**options)
    .load()
)

In [5]:
# Schema used to parse the JSON purchase messages. The first five fields are
# declared non-nullable; the membership and supplement fields are nullable.
schema = (
    StructType()
    .add("transaction_time", TimestampType(), nullable=False)
    .add("transaction_id", StringType(), nullable=False)
    .add("product_id", StringType(), nullable=False)
    .add("price", FloatType(), nullable=False)
    .add("quantity", IntegerType(), nullable=False)
    .add("is_member", BooleanType(), nullable=True)
    .add("member_discount", FloatType(), nullable=True)
    .add("add_supplements", BooleanType(), nullable=True)
    .add("supplement_price", FloatType(), nullable=True)
)

# Streaming query: parse each Kafka message, total sales and quantity per
# product over 2-minute windows that slide every minute, and publish the
# sorted result to the in-memory table "streaming_to_console".
# Note that `ds_sales` holds the started streaming query handle, not a DataFrame.
ds_sales = (
    df_sales
    # Cast the message value to a string and parse it as JSON using `schema`.
    .select(F.from_json(F.col("value").cast("string"), schema).alias("data"))
    .select("data.*")
    # Line total for each purchase.
    .withColumn("sales", F.col("price") * F.col("quantity"))
    .withWatermark("transaction_time", "2 minutes")
    .groupBy(
        "product_id",
        F.window(
            "transaction_time",
            windowDuration="2 minutes",
            slideDuration="1 minutes",
        ),
    )
    .agg(
        F.sum("sales").alias("sales_sum"),
        F.sum("quantity").alias("quantity_sum"),
    )
    # Sort on the numeric aggregates before they are formatted as strings:
    # newest window first, then highest sales first.
    .orderBy(F.desc("window"), F.desc("sales_sum"))
    .select(
        "product_id",
        F.format_number("sales_sum", 2).alias("total_sales"),
        F.format_number("quantity_sum", 0).alias("total_items"),
        "window.start",
        "window.end",
    )
    .coalesce(1)
    .writeStream
    .format("memory")
    .outputMode("complete")
    .queryName("streaming_to_console")
    .trigger(processingTime="1 minute")
    .start()
)

In [6]:
# data display auto-refresh reference: https://stackoverflow.com/a/61922073/580268

from IPython.display import display, clear_output
from time import sleep

In [None]:
# Poll the running query indefinitely: every 3 seconds clear the cell output,
# then show the query status followed by the current contents of the
# in-memory table "streaming_to_console". Interrupt the kernel to stop.
while True:
    clear_output(wait=True)
    display(ds_sales.status)
    # DataFrame.show() prints the table itself and returns None, so it is
    # called directly; wrapping it in display() rendered a stray "None"
    # beneath the table on every refresh.
    spark.sql("SELECT * FROM streaming_to_console").show()
    sleep(3)  # refresh every 3 seconds

{'message': 'Waiting for next trigger',
 'isDataAvailable': False,
 'isTriggerActive': False}

+----------+-----------+-----------+-------------------+-------------------+
|product_id|total_sales|total_items|              start|                end|
+----------+-----------+-----------+-------------------+-------------------+
|      IS02|      21.96|          4|2025-09-30 11:07:00|2025-09-30 11:09:00|
|      CS03|      19.96|          4|2025-09-30 11:07:00|2025-09-30 11:09:00|
|      CS06|      19.96|          4|2025-09-30 11:07:00|2025-09-30 11:09:00|
|      SF05|      17.97|          3|2025-09-30 11:07:00|2025-09-30 11:09:00|
|      CS10|      14.97|          3|2025-09-30 11:07:00|2025-09-30 11:09:00|
|      SF01|      11.98|          2|2025-09-30 11:07:00|2025-09-30 11:09:00|
|      SF07|      11.98|          2|2025-09-30 11:07:00|2025-09-30 11:09:00|
|      SF03|      11.98|          2|2025-09-30 11:07:00|2025-09-30 11:09:00|
|      CS01|       9.98|          2|2025-09-30 11:07:00|2025-09-30 11:09:00|
|      CS04|       9.98|          2|2025-09-30 11:07:00|2025-09-30 11:09:00|

None