### Extract data from order - Complex JSON file (Having Arrays, Nested JSON and Data Issues)
Note: This structure is compatible with:
1. incremental load
1. fault tolerance
1. schema evolution
1. Reusability
2. Error Handling
3. Logging
4. Schema validation
5. Modularization
6. Maintainability
7. Observability
8. Re-run safety

In [0]:
from pyspark.sql.functions import current_timestamp, col
from pyspark.sql.utils import AnalysisException
import traceback
import logging

In [0]:
# ADF Parameters (Widgets)
# Each entry maps a widget name to the default used when the caller supplies
# no value. Dict order is the order in which the widgets are registered.
_WIDGET_DEFAULTS = {
    "source_path": "",
    "target_table": "",
    "file_format": "",
    "write_mode": "append",
    "job_name": "",
    "enable_enrichment": "true",
    "log_dir": "/Volumes/adf_adb_logs/bronze_logs/operational_logs",
    "schema_Evolution_Mode": "rescue",
    "logger_name": "",
}

# Register every text widget before any value is read.
for _widget_name, _widget_default in _WIDGET_DEFAULTS.items():
    dbutils.widgets.text(_widget_name, _widget_default)


def _widget_value(name):
    """Return the current value of widget *name*, stripped of surrounding whitespace."""
    return dbutils.widgets.get(name).strip()


# Fetch parameters
SOURCE_PATH = _widget_value("source_path")
TARGET_TABLE = _widget_value("target_table")
# Format and write mode are normalized to lower case.
FILE_FORMAT = _widget_value("file_format").lower()
WRITE_MODE = _widget_value("write_mode").lower()
JOB_NAME = _widget_value("job_name")
# Only the literal text "true" (any casing) enables enrichment.
ENABLE_ENRICHMENT = _widget_value("enable_enrichment").lower() == "true"
LOG_DIR = _widget_value("log_dir")
# Kept in its original casing (only whitespace is stripped).
SCHEMA_EVOLUTION_MODE = _widget_value("schema_Evolution_Mode")
LOGGER_NAME = _widget_value("logger_name")


In [0]:
# Logger settings exposed as lower-case notebook globals.
# NOTE(review): the orchestration cell passes LOGGER_NAME / LOG_DIR to
# init_logger directly, so these are presumably read by the %run include —
# confirm before removing them.
logger_name, log_dir = LOGGER_NAME, LOG_DIR
logger_level, log_to_file = logging.INFO, True

##### Implementation for incremental load using Auto Loader - Structured Streaming, compatible with batch-style execution

In [0]:
%run ../01_includes/13_incremental_core_modules

In [0]:
# -----------------------------------------
# Orchestration
# -----------------------------------------
# Runs the Bronze load end to end: incremental read -> optional enrichment ->
# streaming write. Any failure is printed, logged with its traceback, and
# re-raised as RuntimeError (chained to the original exception) so it
# propagates to whatever invoked the notebook.

# Initialize logger
logger = init_logger(name=LOGGER_NAME, level=logging.INFO, log_to_file=True, log_dir=LOG_DIR)
logger.info("Logger initialized successfully.")

try:
    logger.info(f"{JOB_NAME} - Starting Bronze load pipeline")

    # Auto Loader read (returns streaming DataFrame)
    incremental_read_df = read_incremental_data(SOURCE_PATH, FILE_FORMAT)

    # Enrich data with new audit trail fields, but only when the
    # enable_enrichment parameter is "true" (its default). Previously the flag
    # was parsed and then ignored here.
    # NOTE(review): enrich_data comes from the %run include and may also read
    # this flag itself — confirm the two stay consistent.
    if ENABLE_ENRICHMENT:
        enriched_df = enrich_data(incremental_read_df)
    else:
        logger.info(f"{JOB_NAME} - Enrichment disabled; skipping enrich_data")
        enriched_df = incremental_read_df

    # Write to Delta table (Auto Loader with checkpoint + trigger)
    streaming_write_to_table(enriched_df, TARGET_TABLE, WRITE_MODE)

    logger.info(f"{JOB_NAME} - Completed successfully")

except Exception as job_exception:
    # Echo the failure to the notebook output as well as the logger.
    print(f"{JOB_NAME} - Pipeline failed: {job_exception}")
    print(traceback.format_exc())  # full traceback

    # logger.exception records the message at ERROR level together with the
    # active traceback, so no separate traceback log call is needed.
    logger.exception(f"{JOB_NAME} - Pipeline failed: {job_exception}")
    raise RuntimeError(f"{JOB_NAME} - Pipeline execution failed") from job_exception

In [0]:
# Preview the table named by the target_table parameter instead of a
# hard-coded table, so the preview follows whatever table the run was given.
display(spark.table(TARGET_TABLE))