In [None]:
"""
WARNING
    > Remove unnecessary code from notebooks that would return results, such as display and count.
    > Do not run Structured Streaming workloads on interactive clusters; always schedule streams as jobs.
    > To help streaming jobs recover automatically, configure jobs with infinite retries.
    > Do not use auto-scaling for workloads with Structured Streaming.
"""

In [None]:
import pyspark.sql.functions as f
import pyspark.sql.utils
import pandas as pd
from datetime import timedelta, datetime

In [None]:
# Take a single clock reading so that every derived value refers to the same
# instant; calling datetime.now() repeatedly could straddle midnight and leave
# startdate / enddate / yesterday / now inconsistent with one another.
now = datetime.now()
yesterday = (now - timedelta(days=1)).strftime("%Y-%m-%d")

# Date window used to filter the source on `dt`; defaults to "yesterday only".
startdate = yesterday
enddate = yesterday
dates = [startdate, enddate]

#### Streaming with Delta format data source

In [None]:
# S3 locations used by the Delta-source streaming example.
# source_path: Delta table that is loaded as a stream.
source_path = "s3://datalake-binary/event-types/data/shop.tracking.outfit-card.click"
# target_path: output location passed to the write stream as its "path" option.
target_path = "s3://tracking-analytics/stream"
# target_checkpoint_path: checkpoint location passed to the write stream.
target_checkpoint_path = "s3://tracking-analytics/checkpoints/target"

In [None]:
# List the source directory and show one entry from near the end of the listing.
# NOTE(review): the slice [-2:-1] selects the second-to-last entry, not the last
# one — confirm this is the intended "latest" file/partition.
dbutils.fs.ls(source_path)[-2:-1]

In [None]:
# Open the source Delta table as a streaming DataFrame.
delta_stream = spark.readStream.format("delta").load(source_path)

# Keep only rows whose `dt` falls inside the configured date window and cap
# the stream at 5 rows.
source_query = delta_stream.where(f.col("dt").between(*dates)).limit(5)

In [None]:
# Render the streaming DataFrame in the notebook under the query name "read_stream".
# display() on a streaming DataFrame starts a streaming job, so stop it by name
# once you are done inspecting the data.
display(source_query, streamName = "read_stream")  

In [None]:
# Stop the display() stream that was started under the name "read_stream".
# spark.streams.active already yields the live StreamingQuery handles, so each
# match is stopped directly rather than being looked up again by id.
for stream in spark.streams.active:
    if stream.name == "read_stream":
        stream.stop()

In [None]:
# Write the stream to Delta Lake.
#   * trigger(availableNow=True) processes all data that is currently available,
#     in multiple batches, and then terminates the query.
#   * trigger(processingTime="60 seconds") would instead check for new data
#     every minute.
#   * The "checkpointLocation" path should be unique for each streaming writer.
#   * toTable(table_name) could be used instead of start() to write to a named table.

# Stamp every row with the timestamp captured for this notebook run.
stamped_stream = source_query.withColumn("etl_date", f.lit(now))

target_query = (
    stamped_stream.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", target_checkpoint_path)
    .option("path", target_path)
    .trigger(availableNow=True)
    .start()
)

In [None]:
# Wait up to 10 seconds for the write stream to finish.
# Evaluates to True if the query terminated within the timeout and to False if
# it is still running.
target_query.awaitTermination(timeout=10)

True

In [None]:
# Manually stop the target write stream if it is still running.
target_query.stop()

In [None]:
# Read and write in a single cell: build the streaming source from the Delta
# table, then start the Delta sink.
source_query = (
    spark.readStream
    .format("delta")
    .load(source_path)
    .filter(f.col("dt").between(*dates))
    .limit(5)
)

target_query = (
    source_query
    # Stamp every row with the timestamp captured for this notebook run.
    .withColumn("etl_date", f.lit(now))
    .writeStream
    .format("delta")
    .option("checkpointLocation", target_checkpoint_path)
    .option("path", target_path)
    .outputMode("append")
    # Alternative: trigger(processingTime="60 seconds") checks for new data every minute.
    .trigger(availableNow=True)
    .start()
)

# start() returns as soon as the query is launched. Block until the
# availableNow run has finished so that the cells that inspect the target do
# not race the writer; this also surfaces any failure of the query here.
target_query.awaitTermination()

In [None]:
# Batch-read the target Delta table and show up to 4 of the rows that were
# stamped with this run's `etl_date`.
target_table = spark.read.format("delta").load(target_path)
written_this_run = f.col("etl_date") == now
target_df = target_table.filter(written_this_run).limit(4)

target_df.display()

In [None]:
# List the objects under the target path to inspect what the write stream produced.
files = dbutils.fs.ls(target_path)
display(files)

In [None]:
# List the contents of the write stream's checkpoint directory.
dbutils.fs.ls(target_checkpoint_path)

#### Streaming with JSON format data source

In [None]:
# Stream-read JSON files with Auto Loader (the "cloudFiles" source).
#
# NOTE: this cell rebinds `source_path` and `target_path`, which the Delta
# section also uses; re-run the Delta path cell before re-running that section.
source_path = "s3a://datalake-eu-central-1/data/eventqueue/shop.tracking.outfit-card.click"
# Passed to Auto Loader as cloudFiles.schemaLocation.
source_checkpoint = "s3://tracking-analytics/checkpoints/source"
# NOTE(review): this is the same output location the Delta-source stream writes
# to — confirm both streams are meant to append to one table.
target_path = "s3://tracking-analytics/stream"
# A checkpoint location must be unique per streaming writer. The Delta-source
# writer already owns ".../checkpoints/target", so the JSON writer gets its own.
target_checkpoint = "s3://tracking-analytics/checkpoints/target_json"

# Read only yesterday's partition directory and cap the stream at 5 rows.
source_query = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", source_checkpoint)
    .load(f"{source_path}/dt={yesterday}")
    .limit(5)
)

In [None]:
# Render the JSON streaming DataFrame in the notebook under the query name
# "json_read_stream". Displaying a streaming DataFrame starts a streaming job,
# so stop it by name once you are done inspecting the data.
display(source_query, streamName = "json_read_stream")  

In [None]:
# Stop the display() stream that was started under the name "json_read_stream".
# spark.streams.active already yields the live StreamingQuery handles, so each
# match is stopped directly rather than being looked up again by id.
for stream in spark.streams.active:
    if stream.name == "json_read_stream":
        stream.stop()

In [None]:
# Read and write in a single cell: build the Auto Loader JSON source for
# yesterday's partition directory, then start the Delta sink.
source_query = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", source_checkpoint)
    .load(f"{source_path}/dt={yesterday}")
    .limit(5)
)

target_query = (
    source_query
    # Add the partition date as a `dt` column on every row.
    .withColumn("dt", f.lit(yesterday))
    .writeStream
    .format("delta")
    .option("checkpointLocation", target_checkpoint)
    .option("path", target_path)
    .outputMode("append")
    # Alternative: trigger(processingTime="60 seconds") checks for new data every minute.
    .trigger(availableNow=True)  # process what is available now, then stop
    .start()
)

# start() returns as soon as the query is launched. Block until the
# availableNow run has finished so that the cells that inspect the target do
# not race the writer; this also surfaces any failure of the query here.
target_query.awaitTermination()

In [None]:
# List the objects under the target path to inspect what the write stream produced.
files = dbutils.fs.ls(target_path)
display(files)

In [None]:
# Batch-read the target Delta table and show up to 5 rows whose `dt` equals
# yesterday's date.
target_table = spark.read.format("delta").load(target_path)
target_df = target_table.filter(f.col("dt") == yesterday).limit(5)

target_df.display()