### Extract data from membership image file 
Note: This structure is compatible with
1. incremental load
1. fault tolerance
1. schema evolution
1. Reusability
2. Error Handling
3. Logging
4. Schema validation
5. Modularization
6. Maintainability
7. Observability
8. Re-run safety
9. Audit table loading

In [0]:
from pyspark.sql.functions import current_timestamp, col
from pyspark.sql.utils import AnalysisException
import traceback
import logging

In [0]:
# ADF parameters, exposed as notebook widgets.
# Each entry maps a widget name to the default used when no value is supplied.
_WIDGET_DEFAULTS = {
    "pENV": "dev",
    "pSOURCE_PATH": "/Volumes/adf_adb_landing/landing/operational_data/memberships",
    "pTARGET_TABLE": "adf_adb_bronze.bronze.membership",
    "pFILE_FORMAT": "binaryFile",
    "pWRITE_MODE": "append",
    "pJOB_NAME": "adb_nbk_ext_membership_image_incremental_03",
    "pLOG_DIR": "/Volumes/adf_adb_logs/bronze_logs/operational_logs",
    "pSCHEMA_EVOLUTION_MODE": "none",
    "pQUERY_NAME": "membership_incremental_query",
    "pRETRY_OF_RUN_ID": "",
}

# Register the widgets in declaration order.
for _widget_name, _widget_default in _WIDGET_DEFAULTS.items():
    dbutils.widgets.text(_widget_name, _widget_default)


def _widget_value(name):
    """Return the current value of widget *name* with surrounding whitespace removed."""
    return dbutils.widgets.get(name).strip()


# Resolve the widget values into the constants used by the rest of the notebook.
ENV = _widget_value("pENV").lower()  # environment name is normalised to lower case
SOURCE_PATH = _widget_value("pSOURCE_PATH")
TARGET_TABLE = _widget_value("pTARGET_TABLE")
FILE_FORMAT = _widget_value("pFILE_FORMAT")
WRITE_MODE = _widget_value("pWRITE_MODE")
JOB_NAME = _widget_value("pJOB_NAME")
LOG_DIR = _widget_value("pLOG_DIR")
SCHEMA_EVOLUTION_MODE = _widget_value("pSCHEMA_EVOLUTION_MODE")
QUERY_NAME = _widget_value("pQUERY_NAME")
# An empty retry id means "not a retry" and is represented as None.
RETRY_OF_RUN_ID = _widget_value("pRETRY_OF_RUN_ID") or None

##### Implementation of incremental load using Auto Loader - Structured Streaming, compatible with batch-style execution

In [0]:
%run ../01_includes/13_incremental_core_modules

In [0]:
# Create the job logger first so that every later stage can report through it.
logger = get_logger(JOB_NAME, LOG_DIR)
logger.info("Pipeline starting... (widget-driven)")

# Attach the streaming listener for this job, pointing it at the same log directory.
_listener_args = {"job_name": JOB_NAME, "log_dir": LOG_DIR}
attach_streaming_listener(**_listener_args)

In [0]:
# ---- AUDIT start (per layer derived from pTARGET_TABLE schema) ----
# Open an audit record for this run; the returned mapping carries the run id
# that the later audit_update_count / audit_finalize calls are keyed on.
_a = audit_start(
    ENV=ENV,
    JOB_NAME=JOB_NAME,
    TARGET_TABLE=TARGET_TABLE,
    RETRY_OF_RUN_ID=RETRY_OF_RUN_ID,
    logger=logger,
)
RUN_ID = _a["run_id"]
# f-string so the actual run id is interpolated (previously the literal
# text "{RUN_ID}" was written to the log).
logger.info(f"Audit started for run_id: {RUN_ID}")

#### Orchestration module

In [0]:
# Incremental read via Auto Loader.
# On failure: log the error with its traceback, mark the already-opened audit
# run as FAILED (best effort), close the logger and re-raise so the notebook
# run fails.
try:
    readed_data_df = read_incremental_data(
        path=SOURCE_PATH,
        file_format=FILE_FORMAT,
        schema_evolution_mode=SCHEMA_EVOLUTION_MODE,
        job_name=JOB_NAME,
        logger=logger
    )
    logger.info("Read stage complete.")

except Exception as e:
    logger.error(f"Read stage failed: {e}")
    logger.error(traceback.format_exc())
    # The audit run was started before this stage; without this call it would
    # never receive a final status when the read fails.
    try:
        audit_finalize(RUN_ID=RUN_ID, TARGET_TABLE=TARGET_TABLE, RUN_STATUS="FAILED", logger=logger)
    except Exception as audit_err:
        # Never let an audit problem mask the original read failure.
        logger.error(f"Audit finalize FAILED stage failed as well: {audit_err}")
    close_logger(logger)
    raise


In [0]:
# Enrichment (reuses your existing enricher).
# On failure: log the error with its traceback, mark the already-opened audit
# run as FAILED (best effort), close the logger and re-raise so the notebook
# run fails.
try:
    enriched_df = enrich_data(readed_data_df)
    logger.info("Enrichment complete.")
except Exception as e:
    logger.error(f"Enrichment failed: {e}")
    logger.error(traceback.format_exc())
    # The audit run was started before this stage; without this call it would
    # never receive a final status when enrichment fails.
    try:
        audit_finalize(RUN_ID=RUN_ID, TARGET_TABLE=TARGET_TABLE, RUN_STATUS="FAILED", logger=logger)
    except Exception as audit_err:
        # Never let an audit problem mask the original enrichment failure.
        logger.error(f"Audit finalize FAILED stage failed as well: {audit_err}")
    close_logger(logger)
    raise

In [0]:
# Streaming write to Delta (availableNow) + register table if needed.
# Success path: write, record a row count in the audit table (best effort),
# then finalize the audit run as SUCCESS.
# Failure path: log the root cause first, then finalize the audit run as
# FAILED (best effort) and re-raise. The logger is closed in every case.
try:
    streaming_write_to_table(
        df=enriched_df,
        table_name=TARGET_TABLE,
        write_mode=WRITE_MODE,
        job_name=JOB_NAME,
        source_path=SOURCE_PATH,
        query_name=QUERY_NAME,
        logger=logger
    )
    logger.info("Write stage completed successfully.")

    # Table-wide count; for exact run counts, stamp RUN_ID in rows and filter by it.
    # A failure here is deliberately non-fatal, but the cause is logged so it
    # can be diagnosed instead of being silently dropped.
    try:
        cnt = spark.table(TARGET_TABLE).count()
        audit_update_count(RUN_ID=RUN_ID, TARGET_TABLE=TARGET_TABLE, RECORD_COUNT=cnt, logger=logger)
    except Exception as count_err:
        logger.warning(f"Record count post-write failed; continuing. Cause: {count_err}")

    audit_finalize(RUN_ID=RUN_ID, TARGET_TABLE=TARGET_TABLE, RUN_STATUS="SUCCESS", logger=logger)

except Exception as e:
    # Log the original failure before touching the audit table, so the root
    # cause appears first in the log even if the audit update fails too.
    logger.error(f"Write stage failed: {e}")
    logger.error(traceback.format_exc())
    try:
        audit_finalize(RUN_ID=RUN_ID, TARGET_TABLE=TARGET_TABLE, RUN_STATUS="FAILED", logger=logger)
    except Exception as audit_err:
        logger.error(f"Audit finalize FAILED stage failed as well: {audit_err}")
    raise
finally:
    close_logger(logger)



In [0]:
# Quick checks: show the rows currently stored in the target table.
_preview_sql = f"select * from {TARGET_TABLE}"
_preview_df = spark.sql(_preview_sql)
display(_preview_df)

In [0]:
# Audit table verification: list the audit rows, most recent AUDIT_DATE first.
_audit_sql = "select * from adf_adb_audit.audit.pipeline_audit order by AUDIT_DATE desc "
_audit_df = spark.sql(_audit_sql)
display(_audit_df)

In [0]:
# Checkpoint reset (opt-in).
# NOTE(review): this path is assumed to be the checkpoint location used by the
# streaming write above - confirm against streaming_write_to_table. If it is,
# deleting it discards the stream's progress, so the next run would re-ingest
# every file under SOURCE_PATH and, with an append write, duplicate rows.
# The delete therefore only runs when explicitly requested via the
# pRESET_CHECKPOINT widget (set it to "true" for a deliberate full reload).
dbutils.widgets.text("pRESET_CHECKPOINT", "false")
RESET_CHECKPOINT = dbutils.widgets.get("pRESET_CHECKPOINT").strip().lower() == "true"

if RESET_CHECKPOINT:
    _checkpoint_path = f"{SOURCE_PATH}/_checkpoints/{JOB_NAME}"
    # Second argument requests a recursive delete of the directory.
    _removed = dbutils.fs.rm(_checkpoint_path, True)
    # The job logger has already been closed by the write stage, so use print.
    print(f"Checkpoint reset requested: {_checkpoint_path} (removed={_removed})")
else:
    print("Checkpoint kept (pRESET_CHECKPOINT is not 'true'); incremental state preserved.")