In [0]:
%run "/sales_DWH/Includes/Common_function"

In [0]:
## Init access and functions
# Spark SQL column functions and window specs, used by the transformation code
# in the cells below.
from pyspark.sql import functions as F
from pyspark.sql.window import Window
# Not defined in this notebook; presumably supplied by the Common_function
# notebook pulled in via %run above and used to configure data lake access
# for this session -- TODO confirm against that notebook.
get_access_data_lake()

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window


class BronzeCrmCustInfo:
    """
    Processes CRM Customer data from Bronze to Silver layer.

    The pipeline reads parquet files from the Bronze container, keeps only the
    most recent (year, month) load period, cleans and de-duplicates the rows,
    normalizes the coded marital-status and gender values, and hands the
    result to ``incremental_load`` for the Silver table.

    ``normalize_column_value`` and ``incremental_load`` are not defined in this
    class; they are expected to be in scope already (presumably from the
    shared Common_function notebook -- verify).
    """

    def __init__(self, spark):
        """
        Args:
            spark: The active SparkSession used for reading the Bronze data.
        """
        self.spark = spark
        self.base_path = "abfss://bronze@salesdwh.dfs.core.windows.net/crm_cust_info/"

    def get_schema(self) -> str:
        """
        Schema for CRM customer parquet files.

        Returns:
            A DDL-formatted schema string. ``create_date`` is read as STRING,
            and ``year`` / ``month`` are INT columns used to select the load
            period in ``transform_data``.
        """
        return """
            cst_id INT,
            cst_key STRING,
            cst_firstname STRING,
            cst_lastname STRING,
            cst_marital_status STRING,
            cst_gndr STRING,
            create_date STRING,
            year INT,
            month INT
        """

    def read_data(self):
        """
        Reads CRM customer data from Bronze layer.

        Returns:
            A DataFrame over all parquet files under ``self.base_path``, read
            with the explicit schema from ``get_schema``.
        """
        return (
            self.spark.read
            .format("parquet")
            .schema(self.get_schema())
            .load(self.base_path)
        )

    def transform_data(self, df):
        """
        Cleans and standardizes CRM customer data.

        Steps, in order:
          1. Keep only rows from the latest (year, month) period.
          2. Drop exact duplicate rows and rows containing any null.
          3. Keep one row per ``cst_id`` -- the one with the greatest
             ``create_date``.
          4. Normalize marital status and gender codes.

        Args:
            df: DataFrame with the columns listed in ``get_schema``.

        Returns:
            The cleaned DataFrame.
        """

        # Filter latest period. The period must be chosen on (year, month)
        # together: taking max(month) alone would pick December of an older
        # year over January of the newest year once data spans a year boundary.
        # Encoding as year * 100 + month assumes month is in 1..12.
        period = F.col("year") * 100 + F.col("month")
        # Triggers a Spark job; the result is None when there is no row with
        # both year and month populated (including an empty input).
        latest_period = df.select(F.max(period)).first()[0]
        if latest_period is None:
            # Nothing to process: keep the schema but no rows.
            df = df.filter(F.lit(False))
        else:
            latest_year, latest_month = divmod(latest_period, 100)
            # Filter on the plain columns (not the derived expression) so the
            # predicate stays a simple equality on year and month.
            df = df.filter(
                (F.col("year") == latest_year)
                & (F.col("month") == latest_month)
            )

        # Remove duplicates and nulls.
        # NOTE(review): dropna() with default arguments removes a row when ANY
        # column is null, so customers lacking e.g. gender or marital status
        # are discarded entirely -- confirm this is intended. The explicit
        # cst_id check is already implied by dropna() and is kept only as a
        # guard in case the dropna() rule is relaxed later.
        df = (
            df.dropDuplicates()
              .dropna()
              .filter(F.col("cst_id").isNotNull())
        )

        # Keep latest record per customer.
        # NOTE(review): create_date is a STRING, so this ordering is
        # lexicographic; it matches chronological order only for sortable
        # formats such as yyyy-MM-dd -- confirm the source format. Rows tied
        # on create_date are resolved arbitrarily by row_number().
        window_spec = Window.partitionBy("cst_id").orderBy(F.desc("create_date"))
        df = (
            df.withColumn("row_num", F.row_number().over(window_spec))
              .filter(F.col("row_num") == 1)
              .drop("row_num")
        )

        # Normalize marital status
        df = normalize_column_value(
            df,
            "cst_marital_status",
            "S", "Single",
            "M", "Married"
        )

        # Normalize gender
        df = normalize_column_value(
            df,
            "cst_gndr",
            "M", "Male",
            "F", "Female"
        )

        return df

    def write_data(self, df):
        """
        Performs incremental load into Silver layer.

        Args:
            df: The transformed DataFrame to load.

        Returns:
            Whatever ``incremental_load`` returns for the
            ``salesdwh_catalog.silver.crm_cust_info`` target.
        """
        # NOTE(review): the match condition includes cst_marital_status, a
        # mutable attribute. If incremental_load inserts unmatched source rows
        # (typical for a merge -- verify), a customer whose marital status
        # changes would be inserted as a new row instead of updating the
        # existing one. Left unchanged here because the helper's semantics are
        # not visible from this notebook.
        merge_condition = """
            tgt.cst_id = src.cst_id
            AND tgt.cst_key = src.cst_key
            AND tgt.cst_marital_status = src.cst_marital_status
        """

        return incremental_load(
            df,
            catalog_name="salesdwh_catalog",
            schema_name="silver",
            table_name="crm_cust_info",
            merge_condition=merge_condition
        )

    def run(self):
        """
        Executes the Bronze -> Silver CRM customer pipeline.

        Returns:
            The value returned by ``write_data``.
        """
        print("Starting Bronze CRM Customer Transformation...")

        df = self.read_data()
        df_transformed = self.transform_data(df)
        result = self.write_data(df_transformed)

        print("Bronze CRM Customer Transformation completed.")
        return result


In [0]:
# Driver cell: build the job against the notebook's `spark` session, run the
# pipeline and print what it returns. Any failure is reported on stdout and
# then re-raised so the notebook run is still marked as failed.
try:
    job = BronzeCrmCustInfo(spark)
    result = job.run()
    print(result)
except Exception as exc:
    print(f"Bronze CRM Job Failed: {exc!s}")
    raise


Starting  Transformation bronze_crm_cust_info......


' Merge completed: [The affected rows = 13915]-----[The updated rows = 13915]----[The inserted rows =0] '