# Spark Structured Streaming Example

Purpose: Reads a stream of messages from a Kafka topic and writes a stream of aggregations over sliding event-time window to memory.

References: https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html

Author:  Gary A. Stafford

Date: 2022-12-16

In [1]:
import os

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StructField,
    StructType,
    IntegerType,
    StringType,
    FloatType,
    TimestampType,
    BooleanType,
)
from pyspark.sql.window import Window

In [2]:
spark = SparkSession.builder.appName("kafka-streaming-query").getOrCreate()

In [3]:
BOOTSTRAP_SERVERS = "kafka:29092"
TOPIC_PURCHASES = "demo.purchases"

In [4]:
options = {
    "kafka.bootstrap.servers": BOOTSTRAP_SERVERS,
    "subscribe": TOPIC_PURCHASES,
    "startingOffsets": "earliest",
}

df_sales = spark.readStream.format("kafka").options(**options).load()

In [5]:
schema = StructType(
    [
        StructField("transaction_time", TimestampType(), False),
        StructField("transaction_id", IntegerType(), False),
        StructField("product_id", StringType(), False),
        StructField("price", FloatType(), False),
        StructField("quantity", IntegerType(), False),
        StructField("is_member", BooleanType(), True),
        StructField("member_discount", FloatType(), True),
        StructField("add_supplements", BooleanType(), True),
        StructField("supplement_price", FloatType(), True),
        StructField("total_purchase", FloatType(), False),
    ]
)

ds_sales = (
    df_sales.selectExpr("CAST(value AS STRING)")
    .select(F.from_json("value", schema=schema).alias("data"))
    .select("data.*")
    .withWatermark("transaction_time", "10 minutes")
    .groupBy("product_id", F.window("transaction_time", "10 minutes", "5 minutes"))
    .agg(F.sum("total_purchase"), F.sum("quantity"))
    .orderBy(F.col("window").desc(), F.col("sum(total_purchase)").desc())
    .select(
        "product_id",
        F.format_number("sum(total_purchase)", 2).alias("sales"),
        F.format_number("sum(quantity)", 0).alias("drinks"),
        "window.start",
        "window.end",
    )
    .coalesce(1)
    .writeStream.queryName("streaming_to_console")
    .trigger(processingTime="1 minute")
    .outputMode("complete")
    .format("memory")
    .start()
)

In [6]:
# data display auto-refresh reference: https://stackoverflow.com/a/61922073/580268

from IPython.display import display, clear_output
from time import sleep

In [7]:
while True:
    clear_output(wait=True)
    display(ds_sales.status)
    display(spark.sql("SELECT * FROM streaming_to_console").show())
    sleep(3)  # refresh every 3 seconds

{'message': 'Waiting for next trigger',
 'isDataAvailable': True,
 'isTriggerActive': False}

+----------+-----+------+-------------------+-------------------+
|product_id|sales|drinks|              start|                end|
+----------+-----+------+-------------------+-------------------+
|      SC01|21.95|     3|2022-12-27 14:30:00|2022-12-27 14:40:00|
|      CS04|18.95|     3|2022-12-27 14:30:00|2022-12-27 14:40:00|
|      CS11|14.97|     3|2022-12-27 14:30:00|2022-12-27 14:40:00|
|      SC05|11.98|     2|2022-12-27 14:30:00|2022-12-27 14:40:00|
|      SF03|11.38|     2|2022-12-27 14:30:00|2022-12-27 14:40:00|
|      SF02|11.38|     2|2022-12-27 14:30:00|2022-12-27 14:40:00|
|      IS02|10.98|     2|2022-12-27 14:30:00|2022-12-27 14:40:00|
|      SC02| 7.98|     1|2022-12-27 14:30:00|2022-12-27 14:40:00|
|      IS03| 5.49|     1|2022-12-27 14:30:00|2022-12-27 14:40:00|
|      SC03| 5.39|     1|2022-12-27 14:30:00|2022-12-27 14:40:00|
|      SC04| 5.39|     1|2022-12-27 14:30:00|2022-12-27 14:40:00|
|      SF04| 5.39|     1|2022-12-27 14:30:00|2022-12-27 14:40:00|
|      CS0

None

KeyboardInterrupt: 