In [None]:
"""
WARNING
    > Remove unnecessary code from notebooks that would return results, such as display and count.
    > Do not run Structured Streaming workloads on interactive clusters; always schedule streams as jobs.
    > To help streaming jobs recover automatically, configure jobs with infinite retries.
    > Do not use auto-scaling for workloads with Structured Streaming.
"""

In [None]:
import pyspark.sql.functions as f
import pyspark.sql.utils
import pandas as pd
from datetime import timedelta, datetime

In [None]:
# Both bounds of the partition filter are "yesterday". The clock is read once so
# the two values cannot disagree if this cell happens to run across midnight.
yesterday = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d")
startdate = yesterday
enddate = yesterday
dates = [startdate, enddate]
dates

['2024-08-21', '2024-08-21']

#### Streaming with Delta format 

In [None]:
# List the source Delta directory and show a recent partition folder.
# The [-2:-1] slice keeps only the second-to-last entry of the listing
# (presumably to skip the newest, possibly still-filling partition - confirm).
source_listing = dbutils.fs.ls(
    "s3://zalando-datalake-binary/event-types/data/shop.tracking.outfit-card.click"
)
source_listing[-2:-1]

[FileInfo(path='s3://zalando-datalake-binary/event-types/data/shop.tracking.outfit-card.click/dt=2024-08-21/', name='dt=2024-08-21/', size=0, modificationTime=1724327520246)]

In [None]:
# Open the source Delta table as a streaming DataFrame.
source_path = "s3://zalando-datalake-binary/event-types/data/shop.tracking.outfit-card.click"

# Streaming reader over the Delta table at source_path.
delta_stream = spark.readStream.format("delta").load(source_path)

# Keep only rows whose "dt" value lies inside the configured date window,
# and cap the result at 5 rows to keep the demo small.
in_date_window = f.col("dt").between(*dates)
source_query = delta_stream.filter(in_date_window).limit(5)

In [None]:
# Calling display() on a streaming DataFrame starts a streaming job;
# streamName gives the query a name it can later be looked up by.
display(source_query, streamName="read_stream")

In [None]:
# Stop the "read_stream" query if it is currently running.
# The entries of spark.streams.active are already streaming-query handles, so the
# query is stopped directly instead of being fetched again by its id.
for stream in spark.streams.active:
    if stream.name == "read_stream":
        stream.stop()

In [None]:
# Write the stream to Delta Lake.
#   * trigger(availableNow=True) processes all available data in multiple batches,
#     then terminates the query.
#   * trigger(processingTime='60 seconds') would instead check every 1 min.
#   * The "checkpointLocation" path should be unique for each streaming writer.

target_path = "dbfs:/team-tracking/test_stream"
checkpoint_path = "dbfs:/team-tracking/checkpoints/test_stream"

# Configure the Delta sink first, then pick the trigger and start the query.
delta_writer = (
    source_query.writeStream
    .format("delta")
    .outputMode("append")
    .option("path", target_path)
    .option("checkpointLocation", checkpoint_path)
)

# Alternatives left disabled on purpose:
#   .trigger(processingTime='60 seconds')
#   .toTable(table_name)
target_query = delta_writer.trigger(availableNow=True).start()

In [None]:
# Wait up to 10 seconds for the write stream to finish.
# The call returns False when the query is still running after the timeout.
target_query.awaitTermination(10)

True

In [None]:
# Stop the write stream held in target_query; only needed if it is still active.
target_query.stop()

In [None]:
# Trigger reading and writing of the stream in a single cell.
source_query = (
    spark.readStream
    .format("delta")
    .load(source_path)
    .filter(f.col("dt").between(*dates))
    .limit(5)
)

target_query = (
    source_query.writeStream
    .format("delta")
    .option("checkpointLocation", checkpoint_path)
    .option("path", target_path)
    .outputMode("append")
    # .trigger(processingTime='60 seconds')
    .trigger(availableNow=True)
    .start()
)

# start() returns immediately. Block until the availableNow run has finished so
# that the following cell reads the target only after the write has completed.
# NOTE: if the processingTime trigger above is enabled instead, the query never
# terminates on its own and this call must be given a timeout or removed.
target_query.awaitTermination()

In [None]:
# Read the target Delta location back as a batch DataFrame to inspect the result.
file_path = "dbfs:/team-tracking/test_stream"

written_rows = spark.read.format("delta").load(file_path)

# Same date window as the stream filter; show at most 4 rows.
target_df = written_rows.filter(f.col("dt").between(*dates)).limit(4)

target_df.display()

In [None]:
# List the contents of the checkpoint directory used by the Delta write stream.
# (Notebook-magic alternative: %fs ls <path>)
dbutils.fs.ls("dbfs:/team-tracking/checkpoints/test_stream/")

In [None]:
# List the files under the Delta target directory that the stream writes to.
dbutils.fs.ls("dbfs:/team-tracking/test_stream/")

#### Streaming with JSON format 

In [None]:
# Stream-read JSON files with Auto Loader (the "cloudFiles" source).
# NOTE(review): this bucket name differs from the one used by the combined
# read/write cell further down (zalando-saiki-datalake-eu-central-1) - confirm
# which of the two is the intended source.
source_path = "s3a://datalake-eu-central-1/data/eventqueue/shop.tracking.outfit-card.click"
source_checkpoint = "s3://tracking-analytics/checkpoints/source"

json_source = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", source_checkpoint)
    # Load the JSON source declared above. The previous argument, file_path, is
    # the Delta target directory from an earlier cell, not the JSON event data.
    .load(source_path)
    .limit(5)
)

In [None]:
# Calling display() on a streaming DataFrame starts a streaming job;
# streamName gives the query a name it can later be looked up by.
display(json_source, streamName="json_read_stream")

In [None]:
# Stop the "json_read_stream" query if it is currently running.
# The entries of spark.streams.active are already streaming-query handles, so the
# query is stopped directly instead of being fetched again by its id.
for stream in spark.streams.active:
    if stream.name == "json_read_stream":
        stream.stop()

In [None]:
# Read the JSON events with Auto Loader and write them to Delta, all in one cell.
source_path = "s3a://zalando-saiki-datalake-eu-central-1/data/eventqueue/shop.tracking.outfit-card.click"
target_path = "s3://tracking-analytics/stream"
source_checkpoint = "s3://tracking-analytics/checkpoints/source"
target_checkpoint = "s3://tracking-analytics/checkpoints/target"

# Source side: Auto Loader ("cloudFiles") reader over JSON files, capped at 5 rows.
json_reader = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", source_checkpoint)
)
json_source = json_reader.load(source_path).limit(5)

# Sink side: append to the Delta location, tracking progress in target_checkpoint.
json_delta_writer = (
    json_source.writeStream
    .format("delta")
    .outputMode("append")
    .option("path", target_path)
    .option("checkpointLocation", target_checkpoint)
)

# Alternative left disabled on purpose: .trigger(processingTime='60 seconds')
target_query = json_delta_writer.trigger(availableNow=True).start()