# Spark Structured Streaming Example

Purpose: Reads a stream of messages from a Kafka topic and writes a stream of aggregations over a sliding event-time window to JSON files.

References: https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html

Author:  Gary A. Stafford

Date: 2022-12-16

Updated by Vasilios Anagnostopoulos 30/09/2025

In [1]:
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StructField,
    StructType,
    IntegerType,
    StringType,
    FloatType,
    TimestampType,
    BooleanType,
)

In [2]:
# --- Kafka source ---------------------------------------------------------
BOOTSTRAP_SERVERS = "kafka:29092"
TOPIC_PURCHASES = "demo.purchases"

# Options passed to the Kafka stream reader (see `.options(**KAFKA_OPTIONS)`).
KAFKA_OPTIONS = dict(
    [
        ("kafka.bootstrap.servers", BOOTSTRAP_SERVERS),
        ("subscribe", TOPIC_PURCHASES),
        ("startingOffsets", "latest"),
    ]
)

# --- Windowing and trigger timing -----------------------------------------
# Duration and slide of the event-time window used in the aggregation.
WINDOW_DURATION = "20 seconds"
WINDOW_SLIDE = "10 seconds"
# Watermark delay applied to the `transaction_time` column.
WINDOW_WATERMARK = "10 seconds"
# Processing-time trigger interval of the streaming write.
PROCESSING_TIME = "20 seconds"
# NOTE(review): not referenced in the visible cells; presumably a display
# refresh interval in seconds — confirm before removing.
SHOW_REFRESH = 20

In [3]:
# Obtain (or reuse) a session against the Spark Connect endpoint.
spark = (
    SparkSession.builder
    .remote("sc://127.0.0.1:15002")
    .appName("kafka-streaming-query")
    .getOrCreate()
)

# Streaming DataFrame over the Kafka topic configured in KAFKA_OPTIONS.
df_sales = (
    spark.readStream
    .format("kafka")
    .options(**KAFKA_OPTIONS)
    .load()
)

In [4]:
# Layout used to parse the JSON carried in each Kafka message value.
# Arguments to `add` are: field name, data type, nullable flag.
schema = (
    StructType()
    .add("transaction_time", TimestampType(), False)
    .add("transaction_id", StringType(), False)
    .add("product_id", StringType(), False)
    .add("price", FloatType(), False)
    .add("quantity", IntegerType(), False)
    .add("is_member", BooleanType(), True)
    .add("member_discount", FloatType(), True)
    .add("add_supplements", BooleanType(), True)
    .add("supplement_price", FloatType(), True)
)

# Parse the Kafka message values as JSON, aggregate sales per product over a
# sliding event-time window, and stream the formatted rows to a JSON file sink.
# Note: despite its name, `ds_sales` ends up bound to the object returned by
# `.start()` (the running streaming query handle), not to a DataFrame.
ds_sales = (
    df_sales.selectExpr("CAST(value AS STRING)")
    .select(F.from_json("value", schema=schema).alias("data"))
    .select("data.*")
    # Per-row sale amount: unit price times quantity.
    .withColumn("sales", F.col("price") * F.col("quantity"))
    .withWatermark("transaction_time", WINDOW_WATERMARK)
    .groupBy("product_id", F.window("transaction_time", WINDOW_DURATION, WINDOW_SLIDE))
    # Alias the aggregates explicitly instead of relying on Spark's
    # auto-generated column names ("sum(sales)", "sum(quantity)").
    .agg(
        F.sum("sales").alias("sales_sum"),
        F.sum("quantity").alias("quantity_sum"),
    )
    .select(
        "product_id",
        F.format_number("sales_sum", 2).alias("total_sales"),
        F.format_number("quantity_sum", 0).alias("total_items"),
        # Total sales divided by total items, i.e. the quantity-weighted
        # mean unit price within the window.
        F.format_number(F.col("sales_sum") / F.col("quantity_sum"), 2).alias("mean_price"),
        "window.start",
        "window.end",
    )
    # Collapse to a single partition before writing — presumably to limit the
    # number of output files per trigger; confirm this is still wanted.
    .coalesce(1)
    .writeStream
    .trigger(processingTime=PROCESSING_TIME)
    .outputMode("append")
    .format("json")
    .option("path", "sliding_windows/someoutput")
    .option("checkpointLocation", "sliding_windows/checkpoint")
    .start()
)

# Blocks this cell until the query terminates (or the cell is interrupted);
# later cells cannot run while this call is waiting.
ds_sales.awaitTermination()

RetriesExceeded: [RETRIES_EXCEEDED] The maximum number of retries has been exceeded.

In [None]:
# Stop the streaming query held in `ds_sales` (the handle returned by `.start()`).
ds_sales.stop()