# Initialization

In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, DateType
from pyspark.sql.functions import trim, col
from pyspark.sql.window import Window


# Read Bronze table

In [0]:
# Load the bronze CRM product table; every later cell transforms this DataFrame.
df = spark.read.table("bronze.crm_prd_info")

In [0]:
# Preview the bronze rows before any transformation is applied.
df.display()

In [0]:
# List (column name, Spark type) pairs of the bronze DataFrame.
df.dtypes

# Silver Transformations

## Trimming

In [0]:
# Strip leading/trailing whitespace from every string-typed column.
string_column_names = [
    schema_field.name
    for schema_field in df.schema.fields
    if schema_field.dataType == StringType()
]
for column_name in string_column_names:
    df = df.withColumn(column_name, trim(col(column_name)))

df.display()

## Product Key Parsing

In [0]:
# Split the composite prd_key into two parts:
#   * cat_id  - the first five characters, with "-" replaced by "_"
#   * prd_key - everything from the seventh character onward
# (character six is dropped; presumably it is the separator between the two
# parts - confirm against the bronze data).
#
# NOTE: cat_id must be derived BEFORE prd_key is overwritten. This cell is not
# idempotent: re-running it on the already-sliced prd_key slices it again.
raw_product_key = col("prd_key")

df = df.withColumn(
    "cat_id",
    F.regexp_replace(F.substring(raw_product_key, 1, 5), "-", "_"),
)

# Column.substr accepts Column arguments for both position and length, whereas
# F.substring only accepts a Column length on newer PySpark releases. Both
# arguments must be of the same kind, hence F.lit(7).
df = df.withColumn(
    "prd_key",
    raw_product_key.substr(F.lit(7), F.length(raw_product_key)),
)

In [0]:
# Spot-check the first five rows after splitting prd_key into cat_id / prd_key.
df.limit(5).display()

## Cost Cleanup

In [0]:
# Default missing product costs to 0.
# NOTE(review): a numeric fill value is only applied to numeric columns -
# confirm prd_cost is numeric in bronze, otherwise this is a silent no-op.
df = df.fillna(0, subset=["prd_cost"])

df.limit(5).display()

## Renaming the Columns

In [0]:
# Mapping of bronze column names (keys) to the silver column names (values)
# applied by the rename cell that follows.
RENAME_MAP = dict(
    prd_id="product_id",
    cat_id="category_id",
    prd_key="product_number",
    prd_nm="product_name",
    prd_cost="product_cost",
    prd_line="product_line",
    prd_start_dt="start_date",
    prd_end_dt="end_date",
)

In [0]:
# Apply every bronze -> silver column rename declared in RENAME_MAP.
for bronze_name, silver_name in RENAME_MAP.items():
    df = df.withColumnRenamed(bronze_name, silver_name)

df.limit(5).display()

## Product Line Normalization


In [0]:
# Expand the single-letter product line codes into descriptive labels.
# Matching is case-insensitive (the code is upper-cased first); anything that
# is not R/T/M/S, including NULL, falls through to "Unknown".
#
# NOTE: this cell is not idempotent - after one run the column holds the full
# labels, so running it again would map every row to "Unknown".
product_line_code = F.upper(F.col("product_line"))

df = df.withColumn(
    "product_line",
    F.when(product_line_code == "R", "Road")
    .when(product_line_code == "T", "Touring")
    .when(product_line_code == "M", "Mountain")
    .when(product_line_code == "S", "Other Sales")
    .otherwise("Unknown"),  # was misspelled "Uknown"
)

df.display()


## Clean end_date column

In [0]:
# Show the rows whose end_date falls after start_date.
# NOTE(review): if the goal is to surface *invalid* ranges, the comparison
# would need to be reversed - confirm the intent.
df.where(col("start_date") < col("end_date")).display()

In [0]:
# Rebuild end_date from the product's own history: within each product_number,
# ordered by start_date, a row ends one day before the next row starts.
# The last row of each product has no successor, so its end_date is NULL.
window = Window.partitionBy("product_number").orderBy("start_date")
next_start_date = F.lead("start_date", 1).over(window)

df_result = df.withColumn("end_date", F.date_sub(next_start_date, 1))

df_result.display()


In [0]:
# Sanity check: count rows where the recomputed end_date precedes start_date.
# Columns are referenced by name so the predicate is evaluated against
# df_result's own (recalculated) end_date, not against column handles taken
# from the pre-window DataFrame `df`.
df_result.filter(col("end_date") < col("start_date")).count()

## Sanity checks of dataframe

In [0]:
# Preview ten rows of the final DataFrame before it is written out.
df_result.limit(10).display()

In [0]:
# List (column name, Spark type) pairs of the final DataFrame.
df_result.dtypes

# Write Into Silver Table

In [0]:
# Persist the result as a Delta table, replacing both the existing data and
# the existing schema of workspace.silver.crm_products.
(
    df_result.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("workspace.silver.crm_products")
)

In [0]:
# Print the schema of the DataFrame that was just written.
df_result.printSchema()

## Sanity checks of Silver table

In [0]:
%sql
-- Browse the persisted silver table in product_id order.
SELECT *
FROM workspace.silver.crm_products
ORDER BY product_id

In [0]:
%sql
-- List any persisted rows whose end_date is earlier than start_date.
SELECT *
FROM workspace.silver.crm_products
WHERE end_date < start_date