In [0]:
"""
WARNING
    > Remove unnecessary code from notebooks that would return results, such as display and count.
    > Do not run Structured Streaming workloads on interactive clusters; always schedule streams as jobs.
    > To help streaming jobs recover automatically, configure jobs with infinite retries.
    > Do not use auto-scaling for workloads with Structured Streaming.
"""

In [0]:
import pyspark.sql.functions as f
import pyspark.sql.utils
from pyspark.sql.types import *
from pyspark.sql.window import Window
import pandas as pd
from datetime import timedelta, datetime

In [0]:
# Both ends of the window are "yesterday" in the driver's local time.
# Read the clock once so the two bounds cannot disagree if this cell
# happens to run across midnight.
yesterday = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")
startdate = yesterday
enddate = yesterday
dates = [startdate, enddate]  # [start, end]; unpacked into between(*dates) by later cells
dates

['2024-08-20', '2024-08-20']

In [0]:
# Read from Delta Lake as a stream.
file_path = "s3://zalando-datalake-binary/event-types/data/shop.tracking.outfit-card.click"

# NOTE: "checkpointLocation" is an option of the streaming *query* (it is set on
# writeStream), not of the source. Passing it to readStream has no effect, so it is
# not set on the reader below. The path is kept for whichever query consumes this stream.
checkpoint_for_reading = "/team-tracking/checkpoints/read"

raw_stream_df = (
    spark.readStream
    .format("delta")
    .load(file_path)
    .filter(f.col("dt").between(*dates))  # keep rows whose dt lies within [startdate, enddate]
    .limit(5)                             # keep at most 5 rows
)

In [0]:
# Calling display() on a streaming DataFrame launches a streaming query.
# The query is given an explicit name so it can be looked up again
# through spark.streams (e.g. to stop it).
display(raw_stream_df, streamName="read_stream")

In [0]:
# Stop the running "read_stream" query (the name given to the display() stream).
# spark.streams.active already yields StreamingQuery handles, so each match is
# stopped directly; re-fetching it with spark.streams.get(stream.id) is redundant
# and may not return a usable handle if the query ended in the meantime.
for stream in spark.streams.active:
    if stream.name == "read_stream":
        stream.stop()

In [0]:
# Write the stream to Delta Lake.
#
# Trigger choice:
#   * trigger(availableNow=True)            -> process everything currently available
#                                              (possibly in several batches), then terminate.
#   * trigger(processingTime="60 seconds")  -> alternative: check for new data every minute.
#
# "checkpointLocation" must be unique per streaming writer; it tracks which records
# have been processed together with any state information.
# (Alternative sink: finish the chain with .toTable(<table name>) instead of a path.)

target_path = "/team-tracking/test_stream"
checkpoint_for_writing = "/team-tracking/checkpoints/write"

# Configure the writer first ...
delta_writer = (
    raw_stream_df.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", checkpoint_for_writing)
    .trigger(availableNow=True)
)

# ... then start the query, writing to the target path.
write_stream = delta_writer.start(target_path)

In [0]:
# Wait up to 5 seconds for the write query to finish.
# The call returns True if the query terminated within the timeout,
# and False if it is still running when the timeout expires.
write_stream.awaitTermination(timeout=5)

True

In [0]:
# Explicitly stop the write query so it does not remain active.
# NOTE(review): presumably a no-op when the availableNow run has already finished — confirm.
write_stream.stop()

In [0]:
# Spot-check the sink: read the target Delta location back as a static (batch)
# DataFrame, restrict it to the same dt window and show a handful of rows.
file_path = "/team-tracking/test_stream"
in_date_window = f.col("dt").between(*dates)

target_df = (
    spark.read.format("delta")
    .load(file_path)
    .where(in_date_window)
    .limit(4)
)

display(target_df)