## Extract data from customer Simple JSON files using PySpark
1. Query single JSON file
2. Query multiple JSON files using wildcard characters
3. Query all files in a folder

In [0]:
%run ../01_includes/00_logger_config


In [0]:
from pyspark.sql.functions import current_timestamp, col
from pyspark.sql.utils import AnalysisException

In [0]:
# ADF parameters, exposed as notebook text widgets: widget name -> default value
_WIDGET_DEFAULTS = {
    "source_path": "/Volumes/adf_adb_landing/landing/operational_data/customer/",
    "target_table": "adf_adb_bronze.bronze.customer",
    "file_format": "json",
    "write_mode": "overwrite",
    "job_name": "adb_nbk_ext_customer_json_01",
    "enable_enrichment": "true",
}
for _widget_name, _widget_default in _WIDGET_DEFAULTS.items():
    dbutils.widgets.text(_widget_name, _widget_default)


def _widget_value(name):
    """Return the current value of widget *name* with surrounding whitespace removed."""
    return dbutils.widgets.get(name).strip()


# Resolved parameters; format, mode and the enrichment flag are compared case-insensitively
SOURCE_PATH = _widget_value("source_path")
TARGET_TABLE = _widget_value("target_table")
FILE_FORMAT = _widget_value("file_format").lower()
WRITE_MODE = _widget_value("write_mode").lower()
JOB_NAME = _widget_value("job_name")
# Only the literal string "true" (any casing) turns the flag on
ENABLE_ENRICHMENT = _widget_value("enable_enrichment").lower() == "true"


In [0]:
# -----------------------------------------
# Function to read data with safety checks
# -----------------------------------------
def read_data(path: str, fmt: str):
    """Load the source data at *path* into a DataFrame and log its row count.

    Args:
        path: Location handed to ``DataFrameReader.load`` (a single file,
            a wildcard pattern or a folder).
        fmt: Reader format name handed to ``spark.read.format``, e.g. ``"json"``.

    Returns:
        The loaded DataFrame. An empty source is logged as a warning and the
        (empty) DataFrame is still returned.

    Raises:
        RuntimeError: If loading or counting fails; the original exception
            is chained as the cause.
    """
    try:
        df = spark.read.format(fmt).load(path)
        # Count once and reuse the result for both the emptiness check and the
        # log line. This replaces `df.rdd.isEmpty()` followed by `df.count()`,
        # which ran two jobs over the source and depended on the RDD API.
        row_count = df.count()
        if row_count == 0:
            logger.warning(f"{JOB_NAME} - Source data is empty at: {path}")
        else:
            logger.info(f"{JOB_NAME} - Read successful from path: {path}")
            logger.info(f"{JOB_NAME} - Row count: {row_count:,}")
        return df
    except Exception as e:
        logger.error(f"{JOB_NAME} - Failed to read from path: {path}")
        # Log the underlying error text, as the other pipeline stages do
        logger.error(f"{JOB_NAME} - Exception: {str(e)}")
        raise RuntimeError(f"{JOB_NAME} - Read failed") from e


In [0]:
# -----------------------------------------
# Function to Add Audit fields
# -----------------------------------------

def enrich_customer_data(df):
    """Append the audit columns ``Created_Timestamp`` and ``FileName`` to *df*.

    ``Created_Timestamp`` is set from ``current_timestamp()`` and ``FileName``
    from the ``_metadata.file_path`` column of the input.

    Args:
        df: DataFrame to enrich.

    Returns:
        A new DataFrame with the two audit columns added.

    Raises:
        RuntimeError: If adding the columns fails; the original exception
            is chained as the cause.
    """
    try:
        stamped_df = df.withColumn("Created_Timestamp", current_timestamp())
        audited_df = stamped_df.withColumn("FileName", col("_metadata.file_path"))
        logger.info(f"{JOB_NAME} - Enrichment completed with new Audit columns: Created_Timestamp, FileName")
        return audited_df
    except Exception as err:
        logger.error(f"{JOB_NAME} - Failed during enrichment stage")
        logger.error(f"{JOB_NAME} - Exception: {str(err)}")
        raise RuntimeError(f"{JOB_NAME} - Enrichment failed") from err


In [0]:
# -----------------------------------------
# Function to write data
# -----------------------------------------
def write_to_table(df, table_name, mode):
    """Save *df* as a Delta table called *table_name* using save mode *mode*.

    Nothing is written when *df* has no rows; a warning is logged and the
    function returns early.

    Args:
        df: DataFrame to persist.
        table_name: Name of the target table passed to ``saveAsTable``.
        mode: Save mode passed to ``DataFrameWriter.mode``.

    Raises:
        RuntimeError: If counting or writing fails. An ``AnalysisException``
            gets its own message; either way the original exception is
            chained as the cause.
    """
    try:
        total_rows = df.count()
        if not total_rows:
            logger.warning(f"{JOB_NAME} - No data to write to table: {table_name}")
            return

        delta_writer = df.write.format("delta").mode(mode)
        delta_writer.saveAsTable(table_name)

        logger.info(f"{JOB_NAME} - Write successful to table: {table_name}")
        logger.info(f"{JOB_NAME} - Rows written: {total_rows:,}")

    except AnalysisException as analysis_err:
        logger.error(f"{JOB_NAME} - Table write failed due to analysis error")
        logger.error(f"{JOB_NAME} - {str(analysis_err)}")
        raise RuntimeError(f"{JOB_NAME} - Analysis failure writing to {table_name}") from analysis_err

    except Exception as err:
        logger.error(f"{JOB_NAME} - General failure during write to table: {table_name}")
        logger.error(f"{JOB_NAME} - Exception: {str(err)}")
        raise RuntimeError(f"{JOB_NAME} - Write failed to {table_name}") from err

In [0]:
# -----------------------------------------
# Orchestration
# -----------------------------------------

# Initialize logger (the read/enrich/write functions above use this module-level name)
logger = init_logger(name="bronze_customer_logger", level=logging.INFO)

# Pipeline: read the source files -> optionally add audit columns -> write the table.
# Any failure is logged here and re-raised so the notebook run fails.
try:
    logger.info(f"{JOB_NAME} - Starting Bronze load pipeline")

    cust_df = read_data(SOURCE_PATH, FILE_FORMAT)

    # Honor the "enable_enrichment" widget: it was read into ENABLE_ENRICHMENT
    # but never consulted, so enrichment always ran regardless of its value.
    if ENABLE_ENRICHMENT:
        cust_df_final = enrich_customer_data(cust_df)
    else:
        logger.info(f"{JOB_NAME} - Enrichment disabled; skipping audit columns")
        cust_df_final = cust_df

    write_to_table(cust_df_final, TARGET_TABLE, WRITE_MODE)

    logger.info(f"{JOB_NAME} - Completed successfully")

except Exception as job_exception:
    logger.error(f"{JOB_NAME} - Pipeline failed: {str(job_exception)}")
    raise
