SOURCE CRM

In [0]:
# Imports

from pyspark.sql.functions import col, count, when, row_number, desc, trim
from pyspark.sql.functions import substring, regexp_replace, when, length, expr, lead, lit
from pyspark.sql.window import Window

In [0]:
# Customer Dimension
# Builds the customer dimension: the CRM customer master (cust_info) enriched
# with the ERP tables CUST_AZ12 and LOC_A101, plus a surrogate key.

# Silver-layer inputs: header-bearing CSV files read with schema inference.
gold_cust_info = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load("/Volumes/workspace/default/silver/source_crm/cust_info.csv")
gold_CUST_AZ12 = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load("/Volumes/workspace/default/silver/source_erp/CUST_AZ12.csv")
gold_LOC_A101 = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load("/Volumes/workspace/default/silver/source_erp/LOC_A101.csv")

# Left joins keep every CRM customer even when an ERP table has no match.
cust_dimension = gold_cust_info.join(gold_CUST_AZ12, gold_cust_info.cst_key == gold_CUST_AZ12.CID, "left")
# Match LOC_A101 on the CRM key, not on the CID column brought in by the
# previous left join: that CID is NULL for customers without a CUST_AZ12 row,
# which would silently discard their LOC_A101 attributes.
cust_dimension = cust_dimension.join(gold_LOC_A101, gold_cust_info.cst_key == gold_LOC_A101.CID, "left")
# NOTE(review): the result now carries two columns named CID (one per ERP
# table); referencing "CID" by plain name downstream would be ambiguous.

# Data integration in mismatching columns:
# cst_gndr wins unless it holds the placeholder 'na', in which case GEN is
# used instead. When GEN is NULL as well, the result is NULL.
cust_dimension = cust_dimension.withColumn(
    "cst_gndr",
    when(col("cst_gndr") == 'na', col("GEN"))
    .otherwise(col("cst_gndr"))
)

# Adding Surrogate Key (system generated unique identifier)
# Ordering by a constant leaves the row order unspecified, so the generated
# keys could differ from one run to the next. Ordering by the business
# identifiers makes the numbering reproducible.
# NOTE(review): assumes (cst_id, cst_key) identifies a row uniquely -- confirm.
# The window has no partitionBy, so Spark evaluates it in a single partition.
cust_dimension = cust_dimension.withColumn(
    "cst_sk",
    row_number().over(Window.orderBy(col("cst_id"), col("cst_key")))
)


display(cust_dimension)

In [0]:
# Check for duplicates
# display(cust_dimension.groupBy("cst_id").count().filter(col("count") > 1).select("cst_id"))

# Data integration checks on the gender columns.
# This cell only inspects cust_dimension. The 'na' -> GEN fallback is applied
# once, where cust_dimension is built; repeating it here reassigned the frame
# from a validation cell without changing any value.
gender_cols = cust_dimension.select("cst_gndr", "GEN")

# Rows where the integrated gender still disagrees with GEN.
# Rows in which either side is NULL are not listed: the comparison evaluates
# to NULL and filter() drops them.
display(gender_cols.filter(col("cst_gndr") != col("GEN")))

# Rows left without a usable gender after integration (NULL or 'na').
display(gender_cols.filter((col("cst_gndr").isNull()) | (col("cst_gndr") == 'na')))



In [0]:
# Product Dimension
# Current products from the CRM product master, enriched with the ERP
# category table.

# Both inputs are header-bearing CSV files read with schema inference.
csv_options = {"header": "true", "inferSchema": "true"}

# Filtering by the latest product: only rows with no end date are kept.
gold_prd_info = (
    spark.read.format("csv")
    .options(**csv_options)
    .load("/Volumes/workspace/default/silver/source_crm/prd_info.csv")
    .where(col("prd_end_dt").isNull())
)

# NOTE(review): the variable is named ...G1V2 but the file read is
# PX_CAT_G1V.csv -- confirm the file name is the intended one.
gold_PX_CAT_G1V2 = (
    spark.read.format("csv")
    .options(**csv_options)
    .load("/Volumes/workspace/default/silver/source_erp/PX_CAT_G1V.csv")
)

# Merging tables: the left join keeps products that have no category row.
category_match = gold_prd_info.cat_id == gold_PX_CAT_G1V2.ID
prod_dimension = gold_prd_info.join(gold_PX_CAT_G1V2, on=category_match, how="left")


display(prod_dimension)



In [0]:
# Check for duplicates: list every prd_key that occurs more than once in the
# product dimension (an empty result means the key is unique).
prd_key_counts = prod_dimension.groupBy("prd_key").count()
repeated_prd_keys = prd_key_counts.where(col("count") > 1).select("prd_key")
display(repeated_prd_keys)

In [0]:
# Sales Fact
# Sales transactions joined to the customer and product dimensions.
gold_sales_details = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("/Volumes/workspace/default/silver/source_crm/sales_details.csv")
)

# Left joins keep every sales row even when a dimension has no matching entry.
customer_match = gold_sales_details.sls_cust_id == cust_dimension.cst_id
product_match = gold_sales_details.sls_prd_key == prod_dimension.prd_key

sales_with_customer = gold_sales_details.join(cust_dimension, on=customer_match, how="left")
sales_fact = sales_with_customer.join(prod_dimension, on=product_match, how="left")


display(sales_fact)
