## Extract data from customer Simple JSON files using PySpark
Note: This structure is compatible with the following features:
1. Reusability
2. Error Handling
3. Logging
4. Schema validation
5. Modularization
6. Maintainability
7. Observability
8. Re-run safety

In [0]:
from pyspark.sql.functions import current_timestamp, col
from pyspark.sql.utils import AnalysisException
import traceback
import logging

In [0]:
# ADF Parameters (Widgets)
# Widget name -> default value. Dict insertion order is the order in which
# the widgets are created.
_WIDGET_DEFAULTS = {
    "source_path": "/Volumes/adf_adb_landing/landing/operational_data/customer/",
    "target_table": "adf_adb_bronze.bronze.customer",
    "file_format": "json",
    "write_mode": "overwrite",
    "job_name": "adb_nbk_ext_customer_json_01",
    "enable_enrichment": "true",
    "log_dir": "/Volumes/adf_adb_logs/bronze_logs/operational_logs",
}

for _widget_name, _widget_default in _WIDGET_DEFAULTS.items():
    dbutils.widgets.text(_widget_name, _widget_default)


def _widget_value(name):
    """Return the current value of widget *name* with surrounding whitespace stripped."""
    return dbutils.widgets.get(name).strip()


# Fetch parameters
# Paths, table name and job name keep their case; format and write mode are
# lower-cased, and the enrichment flag is true only for the text "true"
# (case-insensitive).
SOURCE_PATH = _widget_value("source_path")
TARGET_TABLE = _widget_value("target_table")
FILE_FORMAT = _widget_value("file_format").lower()
WRITE_MODE = _widget_value("write_mode").lower()
JOB_NAME = _widget_value("job_name")
ENABLE_ENRICHMENT = _widget_value("enable_enrichment").lower() == "true"
LOG_DIR = _widget_value("log_dir")


In [0]:
# Logger settings: name, severity threshold, file logging switch and the
# directory for log files (taken from the "log_dir" widget value).
logger_name, logger_level = "bronze_customer_logger", logging.INFO
log_to_file, log_dir = True, LOG_DIR

In [0]:
%run ../01_includes/04_include_core_modules

In [0]:
# -----------------------------------------
# Orchestration
# -----------------------------------------
# Runs the Bronze load end to end: read the source data, optionally enrich it,
# and write it to the target table. Any failure is reported and re-raised so
# it propagates to whatever invoked this notebook.

# Initialize logger from the logger config variables defined earlier in this
# notebook, so the settings live in one place instead of being repeated here
# as literals.
logger = init_logger(
    name=logger_name,
    level=logger_level,
    log_to_file=log_to_file,
    log_dir=log_dir,
)
logger.info("Logger initialized successfully.")

try:
    logger.info(f"{JOB_NAME} - Starting Bronze load pipeline")

    cust_df = read_data(SOURCE_PATH, FILE_FORMAT)

    # Honor the "enable_enrichment" widget: the flag was parsed into
    # ENABLE_ENRICHMENT but never consulted, so enrichment always ran.
    # NOTE(review): confirm enrich_data adds nothing the target table requires
    # before running with enable_enrichment=false.
    if ENABLE_ENRICHMENT:
        cust_df_final = enrich_data(cust_df)
    else:
        logger.info(f"{JOB_NAME} - Enrichment disabled; writing data as read")
        cust_df_final = cust_df

    write_to_table(cust_df_final, TARGET_TABLE, WRITE_MODE)

    logger.info(f"{JOB_NAME} - Completed successfully")

except Exception as job_exception:
    # Echo to stdout as well, so the failure shows up in the cell output.
    print(f"{JOB_NAME} - Pipeline failed: {str(job_exception)}")
    print(traceback.format_exc())  # full traceback

    # logger.exception logs at ERROR level and attaches the active traceback,
    # keeping message and stack trace together in a single log record.
    logger.exception(f"{JOB_NAME} - Pipeline failed: {str(job_exception)}")
    raise


In [0]:
# Show the table that was just written. Use the TARGET_TABLE parameter rather
# than a hard-coded name so the preview follows the "target_table" widget.
display(spark.table(TARGET_TABLE))