In [0]:
%run "/sales_DWH/Includes/Common_function"

In [0]:
## Init access and functions
# F and Window are used by the transformation class defined in the next cell.
from pyspark.sql import functions as F
from pyspark.sql.window import Window
# get_access_data_lake is not defined in this notebook; presumably it comes
# from the %run include above and configures access to the data lake.
# NOTE(review): confirm what it sets up before relying on it elsewhere.
get_access_data_lake()

In [0]:
# This class handles the Bronze layer CRM customer data
class bronze_crm_cust_info():
    """Pipeline that moves CRM customer data from Bronze storage to Silver.

    The stages are exposed as separate methods so each one can be run and
    inspected on its own:

    * ``read_data``      - load the Bronze parquet files with an explicit schema
    * ``transform_data`` - keep the latest load, de-duplicate, normalize codes
    * ``write_data``     - merge the result into ``silver.crm_cust_info``
    * ``run``            - execute the three stages in order

    The class relies on names provided by the notebook environment rather
    than on its own imports: ``spark``, ``F``, ``Window``,
    ``normalize_column_value`` and ``icremental_load``.
    """

    def __init__(self):
        # Base path of the Bronze data in ADLS
        self.base_path = "abfss://bronze@salesdwh.dfs.core.windows.net/"

    def _source_path(self):
        """Return the URI of the Bronze ``crm_cust_info`` folder.

        ``base_path`` already ends with a slash, so it is stripped before
        joining to avoid producing ``...net//crm_cust_info/``.
        """
        return f"{self.base_path.rstrip('/')}/crm_cust_info/"

    def init_schema(self):
        """Return the DDL schema string used when reading the parquet files.

        Supplying the schema up front tells Spark the column types before
        loading, instead of leaving them to be inferred from the files.
        """
        schema_str = """ cst_id INT,
                                cst_key STRING,
                                cst_firstname STRING,
                                cst_lastname STRING,
                                cst_marital_status STRING,
                                cst_gndr STRING,
                                create_date STRING,
                                year INT,
                                month INT """
        return schema_str

    def read_data(self):
        """Read the Bronze ``crm_cust_info`` parquet data into a DataFrame.

        Returns:
            A DataFrame with the columns declared by ``init_schema``.
        """
        # No 'header' option here: it is a CSV reader option and has no
        # effect on a parquet read.
        return (
            spark.read.format("parquet")
            .schema(self.init_schema())
            .load(self._source_path())
        )

    def transform_data(self, df):
        """Clean the raw customer rows and keep one record per customer.

        Steps:
            1. Keep only the most recent ``(year, month)`` load.
            2. Drop exact duplicate rows and rows containing any null.
            3. Keep the row with the greatest ``create_date`` per ``cst_id``.
            4. Pass marital status and gender through
               ``normalize_column_value`` to expand the abbreviations.

        Args:
            df: DataFrame as returned by ``read_data``.

        Returns:
            The cleaned DataFrame, ready to be merged into Silver.
        """
        # Pick the latest load by (year, month) together. Using max(month)
        # alone selects the wrong slice once the data spans a year boundary
        # (month 12 of the previous year would beat month 1 of the new one).
        # Structs compare field by field, so the max struct is the latest pair.
        latest = (
            df.agg(F.max(F.struct("year", "month")).alias("latest"))
            .collect()[0]["latest"]
        )
        if latest is None:
            # The aggregate over zero rows is null: nothing to process.
            df_latest_load = df.limit(0)
        else:
            # eqNullSafe keeps the filter working if a partition value is null.
            df_latest_load = df.filter(
                F.col("year").eqNullSafe(latest["year"])
                & F.col("month").eqNullSafe(latest["month"])
            )

        df_deduped = df_latest_load.dropDuplicates().dropna()

        # Keep only the latest record per customer based on create_date.
        # NOTE(review): create_date is read as STRING, so this ordering is
        # lexicographic; it matches chronological order only for ISO-style
        # (yyyy-MM-dd...) values - confirm the source format.
        newest_first = Window.partitionBy("cst_id").orderBy(F.desc("create_date"))
        df_one_per_customer = (
            df_deduped.where(F.col("cst_id").isNotNull())
            .withColumn("rank", F.row_number().over(newest_first))
            .filter(F.col("rank") == 1)
            .drop("rank")
        )

        # Normalize marital status column from abbreviations to full words
        df_normalized = normalize_column_value(
            df_one_per_customer, 'cst_marital_status', 'S', 'Single', "M", "Married"
        )

        # Normalize gender column from abbreviations to full words
        df_normalized = normalize_column_value(
            df_normalized, 'cst_gndr', 'M', 'Male', "F", "Female"
        )
        return df_normalized

    def write_data(self, df):
        """Merge the transformed rows into ``salesdwh_catalog.silver.crm_cust_info``.

        Args:
            df: Cleaned DataFrame produced by ``transform_data``.

        Returns:
            Whatever ``icremental_load`` returns.
        """
        # Define merge condition for incremental load
        # NOTE(review): cst_marital_status is part of the match key, so a
        # customer whose marital status changes will not match the existing
        # row. Depending on how icremental_load handles unmatched rows this
        # may insert a second row for the same cst_id - confirm it is intended.
        merge_condition = "tgt.cst_id=src.cst_id AND tgt.cst_key=src.cst_key AND tgt.cst_marital_status=src.cst_marital_status "

        # Call incremental load function to merge data into Silver table
        return icremental_load(
            df,
            catalog_name="salesdwh_catalog",
            schema_name="silver",
            table_name="crm_cust_info",
            merge_condition=merge_condition,
        )

    def run(self):
        """Run the full read -> transform -> write process.

        Returns:
            The value returned by ``write_data``.
        """
        print("Starting  Transformation bronze_crm_cust_info......")
        raw_df = self.read_data()                    # Step 1: Read data
        clean_df = self.transform_data(raw_df)       # Step 2: Clean & transform
        load_result = self.write_data(clean_df)      # Step 3: Write to Silver layer
        # Print before returning; a statement placed after the return is
        # never executed.
        print('Done...!!!')
        return load_result



In [0]:
## Trigger Transformation
# Build the pipeline object and execute it. run() is left as the cell's last
# expression so the value it returns is displayed as the cell output.
start=bronze_crm_cust_info()
start.run()

Starting  Transformation bronze_crm_cust_info......


' Merge completed: [The affected rows = 13915]-----[The updated rows = 13915]----[The inserted rows =0] '