# Init


In [0]:
import pyspark.sql.functions as f
from pyspark.sql.types import StringType
from pyspark.sql.functions import trim, col

# Read from Bronze Layer

In [0]:
# Load the CRM product table from the bronze schema as the starting DataFrame.
df = spark.read.table("workspace.bronze.crm_prd_info")

# Data Transformations


In [0]:
# Inspect the rows exactly as loaded, before any transformation is applied.
display(df)

## Trim columns


In [0]:
# Gather the names of every string-typed column first, then replace each one
# with a copy that has leading and trailing whitespace removed.
string_columns = [
    schema_field.name
    for schema_field in df.schema.fields
    if isinstance(schema_field.dataType, StringType)
]

for column_name in string_columns:
    df = df.withColumn(column_name, trim(col(column_name)))

In [0]:
# Check the string columns after trimming.
display(df)

## Product key parsing


In [0]:
# Split the composite bronze key into a category id and a product key.
# Order matters: cat_id must be derived before prd_key is overwritten below.
# NOTE: this cell is not idempotent -- re-running it on an already-parsed df
# would strip another six characters from prd_key.
raw_prd_key = col("prd_key")

# Characters 1-5 form the category id, with "-" replaced by "_".
df = df.withColumn("cat_id", f.regexp_replace(f.substring(raw_prd_key, 1, 5), "-", "_"))

# Everything from character 7 onward becomes the product key. Column.substr
# accepts Column arguments for both start and length, so this avoids passing a
# Column as the `len` of f.substring (documented as supported only from
# PySpark 4.0 -- verify against the target runtime).
df = df.withColumn("prd_key", raw_prd_key.substr(f.lit(7), f.length(raw_prd_key)))

In [0]:
# Check the derived cat_id column and the shortened prd_key.
display(df)

## Cost cleanup


In [0]:
# Substitute 0 wherever the product cost is NULL; other values are kept as-is.
cost_or_zero = f.coalesce(f.col("prd_cost"), f.lit(0))
df = df.withColumn("prd_cost", cost_or_zero)

In [0]:
# Check prd_cost after the NULL replacement.
display(df)

## Normalization

In [0]:
# Expand the single-letter product-line codes into readable names.
# The match is case-insensitive (the code is upper-cased before comparing).
# Any other value -- including NULL, which never satisfies an equality
# test -- falls through to "N/A".
prd_line_code = f.upper(f.col("prd_line"))

df = df.withColumn(
    "prd_line",
    f.when(prd_line_code == "M", "Mountain")  # fixed typo: was "Mountiain"
     .when(prd_line_code == "R", "Road")
     .when(prd_line_code == "T", "Touring")
     .when(prd_line_code == "S", "Other Sales")
     .otherwise("N/A"),
)

In [0]:
# Check the expanded prd_line values.
display(df)

## Rename columns


In [0]:
# Mapping from the bronze-style column names to the names used in the output.
RENAME_MAP = {
    "prd_id": "product_id",
    "cat_id": "category_id",
    "prd_key": "product_number",
    "prd_nm": "product_name",
    "prd_cost": "product_cost",
    "prd_line": "product_line",
    "prd_start_dt": "start_date",
    "prd_end_dt": "end_date",
}

# Apply the renames one at a time, in the order the mapping lists them.
for source_column in RENAME_MAP:
    df = df.withColumnRenamed(source_column, RENAME_MAP[source_column])

In [0]:
# Show a ten-row sample with the final column names.
display(df.limit(10))

# Write into Silver table

In [0]:
# Overwrite the Silver Delta table with the transformed DataFrame.
# The table name is fully qualified with the `workspace` catalog, matching the
# bronze read and the verification query, so the write does not depend on
# whichever catalog happens to be current in the session.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("workspace.silver.crm_prd_info")
)

In [0]:
%sql
-- Preview ten rows of the Silver product table.
SELECT *
FROM workspace.silver.crm_prd_info
LIMIT 10