In [1]:
from pyspark.sql import SparkSession

# Local Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("Handle error and exceptions")
    .config("spark.streaming.stopGracefullyOnShutdown", True)
    # NOTE(review): the connector version (3.3.0) is expected to match the
    # installed Spark/PySpark version — confirm against the runtime image.
    .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0')
    # The shuffle-partition setting lives under the `spark.sql.` namespace.
    # The previous key, `spark.shuffle.partitions`, is not a Spark setting and
    # was silently ignored, so the default partition count stayed in effect.
    .config("spark.sql.shuffle.partitions", 4)
    .master("local[*]")
    .getOrCreate()
)

spark

In [2]:
# Streaming Kafka source: subscribes to the "watermark-data" topic on the
# broker at kafka:19092, starting from the earliest available offsets.
kafka_df = (
    spark.readStream.format("kafka")
    .options(
        **{
            "kafka.bootstrap.servers": "kafka:19092",
            "subscribe": "watermark-data",
            "startingOffsets": "earliest",
        }
    )
    .load()
)

In [10]:
from pyspark.sql.functions import expr, from_json, col

# DDL schema of the JSON payload carried in the Kafka `value` column.
json_schema = "event_time string, data string"

# Decode the raw value to text, parse it as JSON, then keep only the two
# payload fields as top-level columns.
kafka_df_flatten = (
    kafka_df
    .select(expr("cast(value as string)").alias("message_as_string"))
    .select(
        from_json(col("message_as_string"), json_schema).alias("message_value_as_json")
    )
    .select("message_value_as_json.event_time", "message_value_as_json.data")
)

kafka_df_flatten.printSchema()

root
 |-- event_time: string (nullable = true)
 |-- data: string (nullable = true)



In [14]:
from pyspark.sql.functions import split, explode, cast

# One output row per space-separated word in `data`, paired with the event
# time cast from string to timestamp.
per_word_df = kafka_df_flatten.select(
    col("event_time").cast("timestamp").alias("event_time_as_time"),
    explode(split("data", " ")).alias("by_word"),
)

per_word_df.printSchema()

root
 |-- event_time_as_time: timestamp (nullable = true)
 |-- by_word: string (nullable = false)



In [22]:
# Aggregate by watermark
'''
    Watermark time: how late an event is allowed to arrive.
                    An event is still accepted when its event time is > max(event_time_received) - watermark time.
                    Events whose event time is earlier than (max(event_time_received) - watermark time) are too late and will be ignored.

    window("event_time_as_time", "10 minutes"): the arguments are the event-time column, the size of the window and, optionally, the slide interval of the window (the window does not slide if that parameter is not provided).
'''

from pyspark.sql.functions import window, count, lit


# Step 1: declare a 10-minute watermark on the event-time column.
watermarked_words = per_word_df.withWatermark("event_time_as_time", "10 minutes")

# Step 2: count rows per (10-minute window, word).
word_counts_by_window = (
    watermarked_words
    .groupBy(window("event_time_as_time", "10 minutes"), "by_word")
    .agg(count(lit(1)).alias("cnt"))
)

# Step 3: expose the window bounds as plain `start` / `end` columns.
df_agg = word_counts_by_window.select("window.start", "window.end", "by_word", "cnt")
df_agg.printSchema()

root
 |-- start: timestamp (nullable = true)
 |-- end: timestamp (nullable = true)
 |-- by_word: string (nullable = false)
 |-- cnt: long (nullable = false)



In [24]:
'''


    writeStream.outputMode("complete"): does not work with watermarks
                                  complete mode still counts an event in its window even if the event should be discarded based on the watermark time
                                  Spark keeps all the aggregation state (the whole result) in memory in order to output the complete result
    writeStream.outputMode("update"): update mode takes the watermark time into account and works correctly
                                in update mode Spark removes from its memory the windows that are no longer allowed to be updated based on the watermark time

'''

def start_console_query(df, output_mode, checkpoint_dir):
    """Start a console-sink streaming query and return its handle.

    Parameters
    ----------
    df : streaming DataFrame to write out.
    output_mode : value passed to ``outputMode`` (e.g. "complete", "update").
    checkpoint_dir : path used as the query's ``checkpointLocation``.

    Returns
    -------
    The object returned by ``start()`` (a StreamingQuery), so the caller can
    inspect or stop the query later.
    """
    return (
        df.writeStream
        .format("console")
        .outputMode(output_mode)
        .trigger(processingTime="10 seconds")
        .option("checkpointLocation", checkpoint_dir)
        .start()
    )


# Same aggregation written twice, once per output mode, each with its own
# checkpoint directory. Both handles are kept so either query can be stopped
# (e.g. `complete_query.stop()`) instead of being left running unreferenced.
complete_query = start_console_query(
    df_agg, "complete", "/home/jovyan/data/checkpoint_kafka/windows_complete"
)
update_query = start_console_query(
    df_agg, "update", "/home/jovyan/data/checkpoint_kafka/windows_update"
)

# Cell output: the handle of the update-mode query.
update_query

<pyspark.sql.streaming.query.StreamingQuery at 0x7f3a092ce210>