# Spark Structured Streaming Example

Purpose: Reads a stream of messages from a Kafka topic and writes a stream of aggregations over sliding event-time window to memory.

References: https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html

Author:  Gary A. Stafford

Date: 2022-12-16

In [1]:
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StructField,
    StructType,
    IntegerType,
    StringType,
    FloatType,
    TimestampType,
    BooleanType,
)

In [2]:
WINDOW_WATERMARK = "10 seconds"
WINDOW_DURATION = "20 seconds"
WINDOW_SLIDE = "10 seconds"
PROCESSING_TIME = "20 seconds"
SHOW_REFRESH = 20

BOOTSTRAP_SERVERS = "kafka:29092"
TOPIC_PURCHASES = "demo.purchases"

KAFKA_OPTIONS = {
    "kafka.bootstrap.servers": BOOTSTRAP_SERVERS,
    "subscribe": TOPIC_PURCHASES,
    "startingOffsets": "latest",
}

In [3]:
spark = SparkSession.builder.remote("sc://127.0.0.1:15002").appName("kafka-streaming-query").getOrCreate()
df_sales = spark.readStream.format("kafka").options(**KAFKA_OPTIONS).load()

In [5]:
schema = StructType(
    [
        StructField("transaction_time", TimestampType(), False),
        StructField("transaction_id", StringType(), False),
        StructField("product_id", StringType(), False),
        StructField("price", FloatType(), False),
        StructField("quantity", IntegerType(), False),
        StructField("is_member", BooleanType(), True),
        StructField("member_discount", FloatType(), True),
        StructField("add_supplements", BooleanType(), True),
        StructField("supplement_price", FloatType(), True),
    ]
)

ds_sales = (
    df_sales.selectExpr("CAST(value AS STRING)")
    .select(F.from_json("value", schema=schema).alias("data"))
    .select("data.*")
    .withColumn("sales", F.col("price")*F.col("quantity"))
    .where(F.col('product_id') == 'SC04')
    .groupBy("product_id")
    .agg(F.sum("sales"), F.sum("quantity"), F.max("transaction_time"))
    .orderBy(F.col("product_id").desc())
    .select(
        "product_id",
        F.col('max(transaction_time)'),
        F.format_number("sum(sales)", 2).alias("total_sales"),
        F.format_number("sum(quantity)", 0).alias("total_items"),
    )
    .coalesce(1)
    .writeStream
    .trigger(processingTime=PROCESSING_TIME)
    .outputMode("complete")
    #.outputMode("update")
    .format("json")
    .option("path", "running_totals/someoutput")
    .start()
)

AnalysisException: [STREAMING_OUTPUT_MODE.UNSUPPORTED_DATASOURCE] Invalid streaming output mode: update. This output mode is not supported in Data Source json. SQLSTATE: 42KDE

JVM stacktrace:
org.apache.spark.sql.AnalysisException
	at org.apache.spark.sql.errors.QueryCompilationErrors$.dataSourceOutputModeUnsupportedError(QueryCompilationErrors.scala:1679)
	at org.apache.spark.sql.execution.datasources.DataSource.createSink(DataSource.scala:335)
	at org.apache.spark.sql.classic.DataStreamWriter.createV1Sink(DataStreamWriter.scala:335)
	at org.apache.spark.sql.classic.DataStreamWriter.startInternal(DataStreamWriter.scala:288)
	at org.apache.spark.sql.classic.DataStreamWriter.start(DataStreamWriter.scala:136)
	at org.apache.spark.sql.connect.planner.SparkConnectPlanner.handleWriteStreamOperationStart(SparkConnectPlanner.scala:3137)
	at org.apache.spark.sql.connect.planner.SparkConnectPlanner.process(SparkConnectPlanner.scala:2501)
	at org.apache.spark.sql.connect.execution.ExecuteThreadRunner.handleCommand(ExecuteThreadRunner.scala:322)
	at org.apache.spark.sql.connect.execution.ExecuteThreadRunner.$anonfun$executeInternal$1(ExecuteThreadRunner.scala:224)
	at org.apache.spark.sql.connect.execution.ExecuteThreadRunner.$anonfun$executeInternal$1$adapted(ExecuteThreadRunner.scala:196)
	at org.apache.spark.sql.connect.service.SessionHolder.$anonfun$withSession$2(SessionHolder.scala:341)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:804)
	at org.apache.spark.sql.connect.service.SessionHolder.$anonfun$withSession$1(SessionHolder.scala:341)
	at org.apache.spark.JobArtifactSet$.withActiveJobArtifactState(JobArtifactSet.scala:94)
	at org.apache.spark.sql.artifact.ArtifactManager.$anonfun$withResources$1(ArtifactManager.scala:112)
	at org.apache.spark.util.Utils$.withContextClassLoader(Utils.scala:186)
	at org.apache.spark.sql.artifact.ArtifactManager.withClassLoaderIfNeeded(ArtifactManager.scala:102)
	at org.apache.spark.sql.artifact.ArtifactManager.withResources(ArtifactManager.scala:111)
	at org.apache.spark.sql.connect.service.SessionHolder.withSession(SessionHolder.scala:340)
	at org.apache.spark.sql.connect.execution.ExecuteThreadRunner.executeInternal(ExecuteThreadRunner.scala:196)
	at org.apache.spark.sql.connect.execution.ExecuteThreadRunner.org$apache$spark$sql$connect$execution$ExecuteThreadRunner$$execute(ExecuteThreadRunner.scala:125)
	at org.apache.spark.sql.connect.execution.ExecuteThreadRunner$ExecutionThread.run(ExecuteThreadRunner.scala:347)

In [None]:
ds_sales.stop()