In [0]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [0]:
# Build etl.gold.dim_customers from the silver CRM customer table enriched with
# the two silver ERP customer tables.
# The target is dropped first, so the write below always recreates the table.
spark.sql("DROP TABLE IF EXISTS etl.gold.dim_customers")

df_crm_cust_info = spark.read.table("etl.silver.crm_cust_info")
df_erp_cust_az12 = spark.read.table("etl.silver.erp_cust_az12")
df_erp_loc_a101 = spark.read.table("etl.silver.erp_loc_a101")

# Left joins keep every CRM customer even when an ERP table has no row whose
# CID equals the CRM cst_key.
df_customers_joined = df_crm_cust_info.join(
    df_erp_cust_az12, df_crm_cust_info.cst_key == df_erp_cust_az12.CID, "left"
).join(
    df_erp_loc_a101, df_crm_cust_info.cst_key == df_erp_loc_a101.CID, "left"
)

# The CRM gender is used unless it equals "n/a"; then the ERP value (GEN) is
# taken instead. When the left join found no ERP row, GEN is NULL, so fall back
# to "n/a" again rather than turning the value into NULL.
gender = when(
    col("cst_gndr") == "n/a", coalesce(col("GEN"), lit("n/a"))
).otherwise(col("cst_gndr"))

# "id" is a sequential number assigned in cst_id order.
# NOTE: the window has no partitionBy, so the numbering runs over the whole
# data set at once.
df = df_customers_joined.select(
    row_number().over(Window.orderBy(col("cst_id"))).alias("id"),
    col("cst_id").alias("customer_id"),
    col("cst_key").alias("customer_key"),
    col("cst_firstname").alias("firstname"),
    col("cst_lastname").alias("lastname"),
    gender.alias("gender"),
    col("cst_marital_status").alias("marital_status"),
    col("BDATE").alias("birth_date"),
    col("CNTRY").alias("country"),
    col("cst_create_date").alias("create_date"),
)

# Written as a separate statement so that `df` keeps referring to the DataFrame
# (saveAsTable returns None).
df.write.mode("overwrite").format("delta").saveAsTable("etl.gold.dim_customers")

In [0]:
# Build etl.gold.dim_products from the silver CRM product table enriched with
# the silver ERP category table.
# The target is dropped first, so the write below always recreates the table.
spark.sql("DROP TABLE IF EXISTS etl.gold.dim_products")

df_crm_prd_info = spark.read.table("etl.silver.crm_prd_info")
df_erp_px_cat_g1v2 = spark.read.table("etl.silver.erp_px_cat_g1v2")

# Left join keeps products whose cat_id has no matching ERP category ID.
# Only rows without an end date are kept (presumably the current version of
# each product - confirm against the silver layer).
# The filter runs BEFORE the ids are assigned so that discarded rows do not
# consume id values and the resulting sequence has no gaps.
df_current_products = df_crm_prd_info.join(
    df_erp_px_cat_g1v2, df_crm_prd_info.cat_id == df_erp_px_cat_g1v2.ID, "left"
).filter(col("prd_end_dt").isNull())

# "id" is a sequential number assigned in prd_id order.
# NOTE: the window has no partitionBy, so the numbering runs over the whole
# data set at once.
df = df_current_products.select(
    row_number().over(Window.orderBy(col("prd_id"))).alias("id"),
    col("prd_id").alias("product_id"),
    col("prd_key").alias("product_key"),
    col("cat_id").alias("category_id"),
    col("prd_nm").alias("product_name"),
    col("prd_cost").alias("product_cost"),
    col("prd_line").alias("product_line"),
    col("prd_start_dt").alias("product_start_date"),
    col("CAT").alias("category"),
    col("SUBCAT").alias("subcategory"),
    col("MAINTENANCE").alias("maintenance"),
)

df.write.mode("overwrite").format("delta").saveAsTable("etl.gold.dim_products")

In [0]:
# Build etl.gold.fact_sales from the silver CRM sales details, replacing the
# source customer id and product key with the "id" values of the gold
# dimension tables.
# The target is dropped first, so the write below always recreates the table.
spark.sql("DROP TABLE IF EXISTS etl.gold.fact_sales")

df_crm_sales_details = spark.read.table("etl.silver.crm_sales_details")

# Lookup frames: dimension id plus the source column used to match sales rows.
# They get their own names so they do not overwrite the silver DataFrames
# df_crm_cust_info / df_crm_prd_info created by the dimension cells.
df_dim_customers = spark.read.table("etl.gold.dim_customers").select(
    col("id").alias("cust_id"),
    col("customer_id").alias("src_customer_id"),
)
df_dim_products = spark.read.table("etl.gold.dim_products").select(
    col("id").alias("prod_id"),
    col("product_key"),
)

# Left joins keep every sales row; a row without a matching dimension entry
# ends up with a NULL customer_id / product_id.
df_sales_enriched = df_crm_sales_details.join(
    df_dim_customers,
    df_crm_sales_details.sls_cust_id == df_dim_customers.src_customer_id,
    "left",
).join(
    df_dim_products,
    df_crm_sales_details.sls_prd_key == df_dim_products.product_key,
    "left",
)

# No row-number "id" is generated here: the fact table has no id column, so
# computing one would only be thrown away by this projection.
df = df_sales_enriched.select(
    col("sls_ord_num").alias("order_id"),
    col("cust_id").alias("customer_id"),
    col("prod_id").alias("product_id"),
    col("sls_order_dt").alias("order_date"),
    col("sls_ship_dt").alias("ship_date"),
    col("sls_due_dt").alias("due_date"),
    col("sls_sales").alias("sales"),
    col("sls_quantity").alias("quantity"),
    col("sls_price").alias("price"),
)

df.write.mode("overwrite").format("delta").saveAsTable("etl.gold.fact_sales")