In [0]:
%run "/sales_DWH/Includes/Common_function"

In [0]:
## Init access and functions
# Spark SQL column functions, used as `F` by the transformation code in this notebook.
from pyspark.sql import functions as F
# Window specification helper (not referenced by the cells visible in this notebook).
from pyspark.sql.window import Window
# Not defined in this notebook; presumably provided by the %run include above and
# expected to configure data lake access for the session — TODO confirm.
get_access_data_lake()

In [0]:
# ETL for the Bronze-layer crm_sales_details data: read, clean, and merge into Silver.
class bronze_crm_sales_details():
    """Read Bronze ``crm_sales_details``, clean it, and merge it into the Silver table.

    The pipeline is ``read_data`` -> ``transform_data`` -> ``write_data``;
    ``run`` executes the three steps in order.

    Relies on names supplied by the notebook environment rather than by this
    class: ``spark``, ``F`` (``pyspark.sql.functions``), ``clean_date_column``
    and ``icremental_load``.
    """

    def __init__(self):
        # Base path of the Bronze data in ADLS
        self.base_path = "abfss://bronze@salesdwh.dfs.core.windows.net/"

    def read_data(self):
        """Load the Bronze ``crm_sales_details`` Parquet data.

        Returns:
            The DataFrame produced by ``spark.read`` for the
            ``crm_sales_details/`` folder under ``self.base_path``.
        """
        # base_path already ends with "/", so strip it before joining to avoid
        # producing ".../net//crm_sales_details/".
        source_path = f"{self.base_path.rstrip('/')}/crm_sales_details/"

        # Parquet files carry their own schema, so no schema/header options are
        # passed: inferSchema, samplingRatio and header are CSV/JSON reader
        # options and have no effect on a Parquet read.
        return spark.read.format("parquet").load(source_path)

    def transform_data(self,df):
        """Clean the most recent ``month`` of data and repair sales/price values.

        Steps, in order:
          1. Keep only rows whose ``month`` equals the maximum ``month`` in ``df``.
          2. Drop exact duplicate rows, then drop rows containing any null.
          3. Pass ``sls_order_dt``, ``sls_ship_dt`` and ``sls_due_dt`` through
             ``clean_date_column`` (defined outside this notebook).
          4. Replace ``sls_sales`` with ``sls_quantity * abs(sls_price)`` when it
             is null, non-positive, or differs from that product.
          5. Replace ``sls_price`` with ``sls_sales / sls_quantity`` (null when the
             quantity is 0) when it is null or non-positive.

        Args:
            df: DataFrame as returned by ``read_data``; must contain ``month``,
                the three date columns, ``sls_sales``, ``sls_quantity`` and
                ``sls_price``.

        Returns:
            The cleaned DataFrame.
        """
        # The aggregation always yields exactly one row; this triggers a Spark job.
        latest_month = df.agg(F.max("month").alias("max_month")).first()["max_month"]
        latest_rows = df.filter(F.col("month") == latest_month)

        # NOTE(review): dropna() with no subset removes every row that has a null
        # in ANY column, so rows with a null sls_sales/sls_price are discarded
        # here and never reach the repair logic below. Confirm whether this
        # should be limited to key columns via dropna(subset=[...]).
        cleaned = latest_rows.dropDuplicates().dropna()

        for date_column in ("sls_order_dt", "sls_ship_dt", "sls_due_dt"):
            cleaned = clean_date_column(cleaned, date_column)

        sales = F.col("sls_sales")
        price = F.col("sls_price")
        expected_sales = F.col("sls_quantity") * F.abs(price)

        sales_is_invalid = sales.isNull() | (sales <= 0) | (sales != expected_sales)
        price_is_invalid = price.isNull() | (price <= 0)

        # Order matters: sls_price is derived from the already-corrected sls_sales
        # produced by the first withColumn.
        return (
            cleaned
            .withColumn(
                "sls_sales",
                F.when(sales_is_invalid, expected_sales).otherwise(sales),
            )
            .withColumn(
                "sls_price",
                F.when(price_is_invalid, F.expr("sls_sales / NULLIF(sls_quantity, 0)"))
                .otherwise(price),
            )
        )

    def write_data(self,df):
        """Merge ``df`` into ``salesdwh_catalog.silver.crm_sales_details``.

        Args:
            df: Cleaned DataFrame as returned by ``transform_data``.

        Returns:
            Whatever ``icremental_load`` returns (defined outside this notebook).
        """
        # Define merge condition for incremental load
        # NOTE(review): sls_sales is a measure, yet it is part of the match key;
        # a row whose sales value changed would presumably no longer match its
        # existing target row. Confirm this is intended.
        merge_condition = "tgt.sls_ord_num=src.sls_ord_num AND tgt.sls_cust_id=src.sls_cust_id AND tgt.sls_prd_key=src.sls_prd_key AND tgt.sls_sales=src.sls_sales  "

        # Call incremental load function to merge data into Silver table
        return icremental_load(
            df,
            catalog_name="salesdwh_catalog",
            schema_name="silver",
            table_name="crm_sales_details",
            merge_condition=merge_condition,
        )

    def run(self):
        """Run the full ETL process for Bronze ``crm_sales_details`` data.

        Returns:
            The value returned by ``write_data``.
        """
        print("Starting  Transformation bronze_crm_sales_details......")
        raw_df = self.read_data()                    # Step 1: Read data
        clean_df = self.transform_data(raw_df)       # Step 2: Clean & transform
        load_result = self.write_data(clean_df)      # Step 3: Write to Silver layer
        # Report completion before returning; a statement placed after the
        # return would never execute.
        print('Done...!!!')
        return load_result



In [0]:
## Trigger Transformation
# Instantiate the ETL class and execute its read -> transform -> write pipeline.
start=bronze_crm_sales_details()
# As the last expression in the cell, the value returned by run() is shown as the cell output.
start.run()

Starting  Transformation bronze_crm_prd_info......


' Merge completed: [The affected rows = 60385]-----[The updated rows = 60385]----[The inserted rows =0] '