# Initialization

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.functions import col, trim
from pyspark.sql.types import StringType
from pyspark.sql.window import Window

# Read Bronze Table


In [0]:
# Load the raw CRM customer records from the Bronze layer (the source of this notebook).
df = spark.table("workspace.bronze.crm_cust_info")

In [0]:
# Preview the Bronze rows before any transformation is applied.
display(df)

# Silver Transformations

In [0]:
# Bronze (source-system) column name -> Silver (business-friendly) column name.
# Columns not listed here keep their original name when the map is applied.
RENAME_MAP = dict(
    cst_id="customer_id",
    cst_key="customer_number",
    cst_firstname="first_name",
    cst_lastname="last_name",
    cst_marital_status="marital_status",
    cst_gndr="gender",
    cst_create_date="created_date",
)

##  Trimming

In [0]:
# Strip leading/trailing whitespace from every string column.
# All columns are rebuilt in ONE select instead of calling withColumn once per
# string column: each withColumn call adds another projection to the query
# plan, which the PySpark docs warn against doing in a loop.
# Non-string columns pass through untouched and column order is preserved.
df = df.select([
    trim(col(field.name)).alias(field.name)
    if isinstance(field.dataType, StringType)
    else col(field.name)
    for field in df.schema.fields
])
df.display()

## Gender and Marital Status Normalization

In [0]:
# Expand the raw gender / marital-status codes into readable labels.
# Matching is case-insensitive and accepts both the one-letter code and the
# spelled-out label ("M" or "Male"), so values that are already expanded --
# including this cell's own output if it is run again -- are kept instead of
# collapsing to "Unknown". NULLs and every unrecognised value become "Unknown".
df = (
    df.withColumn(
        "cst_gndr",
        F.when(F.upper(F.col("cst_gndr")).isin("M", "MALE"), "Male")
        .when(F.upper(F.col("cst_gndr")).isin("F", "FEMALE"), "Female")
        .otherwise("Unknown"),
    )
    .withColumn(
        "cst_marital_status",
        F.when(F.upper(F.col("cst_marital_status")).isin("S", "SINGLE"), "Single")
        .when(F.upper(F.col("cst_marital_status")).isin("M", "MARRIED"), "Married")
        .otherwise("Unknown"),
    )
)
df.display()


## Renaming The Columns

In [0]:
# Apply RENAME_MAP in a single pass: every column keeps its position, and any
# column that has no entry in the map keeps its current name.
df = df.toDF(*[RENAME_MAP.get(name, name) for name in df.columns])
df.display()

## Remove Records with Missing Customer ID

In [0]:
# List the rows whose customer_id is NULL (these are removed in the next step).
df.where(col("customer_id").isNull()).show()

In [0]:
# Discard every row that has no customer_id.
# (An isNotNull() filter on customer_id is the alternative spelling of this step.)
df = df.dropna(subset=["customer_id"])

## Check NULLs

In [0]:
# Should render an empty result now that rows without customer_id were dropped.
display(df.where(col("customer_id").isNull()))

In [0]:
# One-row summary: for each column, the number of rows where it is NULL.
# when() yields a non-null value only for NULL cells, and count() counts only
# non-null values, so the result is the per-column NULL count.
null_stats = df.select(
    [
        F.count(F.when(F.col(name).isNull(), name)).alias(name)
        for name in df.columns
    ]
)

display(null_stats)

## Check Duplicates 

In [0]:
# customer_id values that occur on more than one row, with their row counts.
display(
    df.groupBy("customer_id")
    .count()
    .where(col("count") > 1)
)

In [0]:
%sql
-- Bronze rows whose cst_id appears more than once.
-- The table is referenced by its full catalog.schema.table name, the same one
-- the spark.table(...) read uses, so the query does not depend on the
-- session's current catalog.
SELECT *
FROM (
  SELECT *, count(1) OVER (PARTITION BY cst_id) AS checkduplicates
  FROM workspace.bronze.crm_cust_info
) t
WHERE t.checkduplicates > 1

In [0]:
# Keep a single row per customer_id: rows are ranked newest-first by
# created_date and only rank 1 (the latest row per customer) survives.
# NOTE(review): rows of one customer sharing the same created_date are ranked
# in an arbitrary order -- confirm whether a tie-breaker column is needed.
window = Window.partitionBy("customer_id").orderBy(F.desc("created_date"))
df_dedup = (
    df.withColumn("flag", F.row_number().over(window))
    .where(col("flag") == 1)
    .drop("flag")
)

display(df_dedup)
# Number of rows that still have a NULL customer_id after de-duplication.
df_dedup.where(col("customer_id").isNull()).count()


# Sanity checks

In [0]:
# Per-column NULL counts for the de-duplicated frame (one row, one value per
# column): count() only counts the non-null markers that when() emits for NULL cells.
null_stats = df_dedup.select(
    [
        F.count(F.when(F.col(name).isNull(), name)).alias(name)
        for name in df_dedup.columns
    ]
)

display(null_stats)

In [0]:
# Sanity check: eyeball the first 10 rows of the final frame.
display(df_dedup.limit(10))

# Write Into Silver Table

In [0]:
# Persist the cleaned, de-duplicated customers as a Delta table in the Silver
# layer; "overwrite" replaces whatever the table held before.
(
    df_dedup.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("workspace.silver.crm_customers")
)

In [0]:
%sql
-- Preview the Silver table that was just written.
-- The full catalog.schema.table name matches the saveAsTable(...) target, so
-- the query reads that table regardless of the session's current catalog.
SELECT *
FROM workspace.silver.crm_customers
LIMIT 10
