In [0]:
"""
WARNING
    > Remove unnecessary code from notebooks that would return results, such as display and count.
    > Do not run Structured Streaming workloads on interactive clusters; always schedule streams as jobs.
    > To help streaming jobs recover automatically, configure jobs with infinite retries.
    > Do not use auto-scaling for workloads with Structured Streaming.
"""

In [0]:
import pyspark.sql.functions as f
import pyspark.sql.utils
from pyspark.sql.types import *
from pyspark.sql.window import Window
import pandas as pd
import datetime

In [0]:
# Declare the two text widgets that parameterise this notebook; both default to "".
dbutils.widgets.text("startDate", "")
dbutils.widgets.text("endDate", "")

# Read the current widget values in [start, end] order, then expose them both as a
# list (`dates`) and as individual variables.
dates = [dbutils.widgets.get(widget_name) for widget_name in ("startDate", "endDate")]
startDate, endDate = dates

# Echo the values so the cell output records which range was requested.
print(dates)
print(startDate, endDate)

['2024-08-11', '2024-08-11']
2024-08-11 2024-08-11


In [0]:
# Batch (non-streaming) read of the shop.tracking Delta table.
# Access to the bucket must be granted beforehand (roles, bucket permissions, etc.).
file_path = "s3a://datalake/event-types/data/shop.tracking/"
raw_df = (
    spark.read
    .load(file_path, format="delta")
    .withColumn("data_sink", f.lit("raw"))  # tag every row with the constant "raw"
    .where(f.col("dt").between(*dates))     # keep rows whose dt is within [startDate, endDate]
    .limit(5)                               # preview only: at most five rows
)

raw_df.display()

In [0]:
# Read the Delta table as a stream (incremental source) instead of a one-off batch.
file_path = "s3a://datalake/event-types/data/shop.tracking/"
# Not passed to the reader: "checkpointLocation" is a property of the streaming query
# started by a writer, so setting it on spark.readStream does not checkpoint anything.
# The name is kept defined in case other cells reference it.
checkpoint_read = "/tmp/events/checkpoint_read"

raw_stream_df = (
    spark.readStream
    .format("delta")
    .load(file_path)
    .filter(f.col("dt").between(*dates))  # keep rows whose dt is within [startDate, endDate]
    # NOTE(review): this row cap is part of raw_stream_df, so it also applies to any
    # writeStream that reuses this DataFrame — confirm that is intended.
    .limit(5)
)

raw_stream_df.display() # Calling display() on a streaming DataFrame starts a streaming job.

In [0]:
# Write incremental batch to Delta Lake
# The "availableNow=True" setting for the trigger instructs Structured Streaming to process all previously unprocessed records from the source dataset and then shut down
# The checkpointLocation should be unique for each streaming writer. This provides the unique identity for your stream, tracking all records processed and state information associated with your streaming query.
target_path = "/tmp/ss-tutorial/"
checkpoint_write = "/tmp/events/checkpoint_write"
(
    raw_stream_df.writeStream
    # Explicit sink format, consistent with the other Delta readers/writers in this
    # notebook, so the output does not depend on the session's default data source.
    .format("delta")
    .trigger(availableNow=True)
    .option("checkpointLocation", checkpoint_write)
    .option("path", target_path)
    .outputMode("append")
    .start()  # returns the StreamingQuery handle, which is shown as the cell output
)

<pyspark.sql.streaming.query.StreamingQuery at 0xffff5cebf490>

In [0]:
# Write as a bulk (batch) to Delta Lake, partitioned by `dt`.
# With mode("overwrite"), the replaceWhere option limits the overwrite to rows matching
# the predicate, i.e. only the requested date range is replaced.
save_path = "s3a://datalake/..."

# NOTE(review): `df_content_data` is not defined in the visible part of this notebook —
# confirm it is created before this cell runs.
(
    df_content_data
    .write.format("delta")
    .mode("overwrite") # or append
    .partitionBy("dt")
    # Uses the startDate/endDate widget values read at the top of the notebook.
    .option("replaceWhere", f"dt >= '{startDate}' AND dt <= '{endDate}'")
    .save(save_path)
)