In [0]:
from pyspark.sql import functions as F


# Notebook parameters, registered as text widgets: (widget name, default value).
# Registration order is kept as listed here.
WIDGET_DEFAULTS = (
    ("source_system", "SHAREPOINT_HIST"),
    ("landing_path", "abfss://landing@pvcstor.dfs.core.windows.net/sharepoint/entity=workbook/"),
    ("schema_location", "abfss://lakehouse@pvcstor.dfs.core.windows.net/_schemas/raw_sharepoint_workbook_files"),
    ("checkpoint_location", "abfss://lakehouse@pvcstor.dfs.core.windows.net/_checkpoints/raw_sharepoint_workbook_files"),
)
for widget_name, widget_default in WIDGET_DEFAULTS:
    dbutils.widgets.text(widget_name, widget_default)

# Read the current widget values into the constants used by the cells below.
SOURCE_SYSTEM = dbutils.widgets.get("source_system")
LANDING_PATH = dbutils.widgets.get("landing_path")
SCHEMA_LOC = dbutils.widgets.get("schema_location")
CHECKPOINT_LOC = dbutils.widgets.get("checkpoint_location")

# Echo the effective configuration so it is visible in the run output.
for config_label, config_value in (
    ("SOURCE_SYSTEM", SOURCE_SYSTEM),
    ("LANDING_PATH", LANDING_PATH),
    ("SCHEMA_LOC", SCHEMA_LOC),
    ("CHECKPOINT_LOC", CHECKPOINT_LOC),
):
    print(f"{config_label}:", config_value)



In [0]:

# ---- Auto Loader stream over the landing folder ----
# The cloudFiles source discovers new files incrementally; each file is read as a
# single binaryFile row (path, modificationTime, content, ...).
autoloader_options = {
    "cloudFiles.format": "binaryFile",
    "cloudFiles.schemaLocation": SCHEMA_LOC,
    # optional filter: only pick up Excel workbooks
    "pathGlobFilter": "*.xlsx",
}

landing_files = (
    spark.readStream
      .format("cloudFiles")
      .options(**autoloader_options)
      .load(LANDING_PATH)
)

# load_date is taken from the folder partition in the path: .../load_date=YYYY-MM-DD/...
load_date_col = F.regexp_extract(F.col("path"), r"load_date=(\d{4}-\d{2}-\d{2})", 1).cast("date")

# SHA-256 over the raw file bytes, used to version the file content.
file_hash_col = F.sha2(F.col("content"), 256)

df = landing_files.select(
    F.lit(SOURCE_SYSTEM).alias("source_system"),
    F.col("path").alias("source_path"),
    F.col("modificationTime").alias("source_modified_ts"),
    F.current_timestamp().alias("load_ts"),
    load_date_col.alias("load_date"),
    file_hash_col.alias("file_sha256"),
)

In [0]:

# ---- write to Delta table in Unity Catalog ----
# toTable() starts the streaming query and returns a StreamingQuery handle right
# away; it does NOT wait for the data to be written. Keep the handle so the run
# can be awaited below.
ingest_query = (
    df.writeStream
      .option("checkpointLocation", CHECKPOINT_LOC)
      .trigger(availableNow=True)
      .toTable("tp_finance.raw.sharepoint_workbook_files")
)

# Block until the availableNow run has terminated. awaitTermination() re-raises
# the query's exception if it failed, so the message below is only printed
# after a successful run.
ingest_query.awaitTermination()

# ---- post-run hint (printed only after the stream has terminated) ----
print("RAW ingestion complete. Validate with: SELECT count(*) FROM tp_finance.raw.sharepoint_workbook_files;")