In [0]:
"""
WARNING
    > Remove unnecessary code from notebooks that would return results, such as display and count.
    > Do not run Structured Streaming workloads on interactive clusters; always schedule streams as jobs.
    > To help streaming jobs recover automatically, configure jobs with infinite retries.
    > Do not use auto-scaling for workloads with Structured Streaming.
"""

In [0]:
import pyspark.sql.functions as f
import pyspark.sql.utils
import pandas as pd
from datetime import timedelta, datetime

In [0]:
# Processing window: a single day (yesterday), as "YYYY-MM-DD" strings.
# The clock is read exactly once so that startdate and enddate cannot land on
# different days when this cell happens to run across midnight.
yesterday = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")
startdate = yesterday
enddate = yesterday
# [start, end] pair; later cells unpack it into Column.between(), which is inclusive on both ends.
dates = [startdate, enddate]
dates

['2024-08-21', '2024-08-21']

In [0]:
# Peek at the source S3 directory of the Delta table.
# The [-2:-1] slice yields a list with at most one element: the second-to-last entry of the listing.
# NOTE(review): presumably the very last entry is skipped on purpose (e.g. a partition
# that is still being filled) — confirm.
source_dir_entries = dbutils.fs.ls("s3://zalando-datalake-binary/event-types/data/shop.tracking.outfit-card.click")
source_dir_entries[-2:-1]

[FileInfo(path='s3://zalando-datalake-binary/event-types/data/shop.tracking.outfit-card.click/dt=2024-08-21/', name='dt=2024-08-21/', size=0, modificationTime=1724327520246)]

In [0]:
# Open the source Delta table as a streaming DataFrame, keep only rows whose `dt`
# lies within `dates` (between() is inclusive on both ends), and cap the result at 5 rows.
source_path = "s3://zalando-datalake-binary/event-types/data/shop.tracking.outfit-card.click"

source_stream = spark.readStream.format("delta").load(source_path)
in_date_window = f.col("dt").between(*dates)
source_query = source_stream.filter(in_date_window).limit(5)

In [0]:
# Preview the source stream.
# Note: display() on a streaming DataFrame starts a streaming job; it is registered
# under the name "read_stream" so that it can be found and stopped afterwards.
display(source_query, streamName="read_stream")

In [0]:
# Stop the running read stream, identified by its query name.
# Each item of spark.streams.active is already a StreamingQuery handle, so it is
# stopped directly. Re-fetching it through spark.streams.get(stream.id) is redundant
# and adds a lookup that can fail if the query has terminated in the meantime.
for stream in spark.streams.active:
    if stream.name == "read_stream":
        stream.stop()

In [0]:
# Write the stream to Delta Lake.
# Trigger options:
#   - trigger(availableNow=True): processes all available data in multiple batches,
#     then terminates the query (used here).
#   - trigger(processingTime='60 seconds'): checks every 1 minute (alternative, not used).
# Alternative sink (not used): .toTable(table_name)
# The "checkpointLocation" path should be unique for each streaming writer.

target_path = "dbfs:/team-tracking/test_stream"
checkpoint_path = "dbfs:/team-tracking/checkpoints/test_stream"

# Configure the writer first, then start it as a separate step.
stream_writer = (
    source_query.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", checkpoint_path)
    .option("path", target_path)
    .trigger(availableNow=True)
)
target_query = stream_writer.start()

In [0]:
# Wait up to `wait_seconds` for the write stream to finish.
# The call returns False if the query is still running when the timeout expires.
wait_seconds = 10
target_query.awaitTermination(timeout=wait_seconds)

True

In [0]:
# Stop the target (write) stream if needed.
# target_query is the query handle returned by .start() in the write-stream cell.
target_query.stop()

In [0]:
# Read the target Delta table back as a batch (non-streaming) DataFrame and show up to
# 4 rows from the same date window to check what the stream wrote.
file_path = "dbfs:/team-tracking/test_stream"

written_rows = spark.read.format("delta").load(file_path)
target_df = written_rows.filter(f.col("dt").between(*dates)).limit(4)

target_df.display()

In [0]:
# Inspect the contents of the write stream's checkpoint directory.
# (The `%fs ls` notebook magic can be used for the same listing.)
checkpoint_dir = "dbfs:/team-tracking/checkpoints/test_stream/"
dbutils.fs.ls(checkpoint_dir)

[FileInfo(path='dbfs:/team-tracking/checkpoints/test_stream/commits/', name='commits/', size=0, modificationTime=1724333943685),
 FileInfo(path='dbfs:/team-tracking/checkpoints/test_stream/metadata', name='metadata', size=45, modificationTime=1724333867000),
 FileInfo(path='dbfs:/team-tracking/checkpoints/test_stream/offsets/', name='offsets/', size=0, modificationTime=1724333943685),
 FileInfo(path='dbfs:/team-tracking/checkpoints/test_stream/state/', name='state/', size=0, modificationTime=1724333943685)]

In [0]:
# List the files of the target Delta table written by the stream.
target_dir = "dbfs:/team-tracking/test_stream/"
dbutils.fs.ls(target_dir)

[FileInfo(path='dbfs:/team-tracking/test_stream/_delta_log/', name='_delta_log/', size=0, modificationTime=1724334014397),
 FileInfo(path='dbfs:/team-tracking/test_stream/part-00000-48de2635-c752-42b7-8cc6-8bd7a3e30936.c000.snappy.parquet', name='part-00000-48de2635-c752-42b7-8cc6-8bd7a3e30936.c000.snappy.parquet', size=25108, modificationTime=1724333878000),
 FileInfo(path='dbfs:/team-tracking/test_stream/part-00000-9e49af46-a2f1-420b-ad78-cfd0eeec1461.c000.snappy.parquet', name='part-00000-9e49af46-a2f1-420b-ad78-cfd0eeec1461.c000.snappy.parquet', size=19207, modificationTime=1724285225000)]