# Initialization

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, DateType , IntegerType
from pyspark.sql.functions import trim, col

# Read Bronze table

In [0]:
# Load the ERP customer table from the bronze layer as the working DataFrame.
df = spark.read.table("workspace.bronze.erp_cust_az12")

In [0]:
# Preview the bronze rows before any transformation is applied.
display(df)

# Silver Transformations

In [0]:
# Show each column name paired with its Spark data type.
df.dtypes


## Trimming

In [0]:
# Strip leading/trailing whitespace from every string column; other columns
# pass through untouched and column order is preserved.
# A single select() is used instead of calling withColumn() in a loop, because
# each withColumn() call adds another projection to the query plan.
df = df.select([
    trim(col(field.name)).alias(field.name)
    if isinstance(field.dataType, StringType)
    else col(field.name)
    for field in df.schema.fields
])
df.display()

In [0]:
# Count NULLs per column: when() produces a value only for NULL rows and
# count() tallies non-NULL values, so each cell is that column's NULL count.
null_counters = [
    F.count(F.when(F.col(name).isNull(), name)).alias(name)
    for name in df.columns
]
null_stats = df.select(*null_counters)

display(null_stats)


## GEN column normalization

In [0]:
# Row count per distinct GEN value, to see which spellings occur.
display(df.groupBy("GEN").count())



In [0]:
# Collapse GEN to three labels. Matching is case-insensitive via upper();
# every other value, including NULL, falls through to "Unknown".
gen_upper = F.upper(F.col("GEN"))
df = df.withColumn(
    "GEN",
    F.when(gen_upper.isin("F", "FEMALE"), "Female")
     .when(gen_upper.isin("M", "MALE"), "Male")
     .otherwise("Unknown"),
)




In [0]:
# Row count per GEN value on the current frame.
display(df.groupBy("GEN").count())

In [0]:
# Preview the current state of the frame.
display(df)

## Customer ID Cleanup

In [0]:
# Remove a leading "NAS" prefix from CID; values without the prefix (and NULLs)
# are left unchanged. The match is case-sensitive and anchored at the start.
# regexp_replace is used rather than F.substring(col, 4, F.length(col)) because
# F.substring took only int pos/len before PySpark 4.0, so passing a Column as
# the length is rejected on older releases.
df = df.withColumn("CID", F.regexp_replace(col("CID"), "^NAS", ""))


## Birthday Validation

In [0]:
# Null out birth dates later than today; every other value passes through as-is.
is_future_birth_date = F.col("BDATE") > F.current_date()
df = df.withColumn(
    "BDATE",
    F.when(is_future_birth_date, F.lit(None)).otherwise(F.col("BDATE")),
)

In [0]:
# Preview the current state of the frame.
display(df)

In [0]:
# List the rows whose BDATE is NULL.
display(df.where(F.col("BDATE").isNull()))

## Renaming The Columns

In [0]:

# Source column name -> target column name for the silver table.
RENAME_MAP = dict(
    CID="customer_number",
    BDATE="birth_date",
    GEN="gender",
)

# Apply the renames one at a time, in the mapping's insertion order.
for source_name in RENAME_MAP:
    df = df.withColumnRenamed(source_name, RENAME_MAP[source_name])
display(df)

# Sanity checks

In [0]:
# Count NULLs per column on the current frame: when() yields a value only for
# NULL rows, and count() counts those non-NULL markers.
null_stats = df.select(*(
    F.count(F.when(F.col(name).isNull(), name)).alias(name)
    for name in df.columns
))

display(null_stats)

In [0]:
# Sanity check: look at a 10-row sample of the frame.
display(df.limit(10))

# Write Into Silver Table

In [0]:
# Save the frame as a Delta table, overwriting any existing table contents.
(
    df.write
      .format("delta")
      .mode("overwrite")
      .saveAsTable("workspace.silver.erp_customers")
)

In [0]:
%sql
-- Spot-check a few rows of the silver table. The name is fully qualified
-- (catalog.schema.table) so the query does not depend on the session's
-- current catalog.
SELECT * FROM workspace.silver.erp_customers
LIMIT 5