# Bronze to Silver: Data Cleansing and Transformation

In [0]:
from pyspark.sql.types import StructType, StringType, IntegerType, DateType, BooleanType
import pyspark.sql.functions as F

# First component of the three-part table names this notebook reads from
# (<catalog>.bronze.*) and writes to (<catalog>.silver.*).
catalog_name = 'ecommerce'

In [0]:
# Load the bronze order-items table from the configured catalog and preview
# the rows before any cleansing is applied.
df = spark.read.table(f'{catalog_name}.bronze.brz_order_items')
display(df)

In [0]:
# Print the column names and types of the DataFrame as loaded from bronze.
df.printSchema()

In [0]:
# Remove duplicates: keep a single row per (order_id, item_seq).
# NOTE(review): dropDuplicates does not let us choose which of the duplicate
# rows survives, and the keys are compared exactly as they arrive from
# bronze -- confirm that is acceptable for this table.
df = df.dropDuplicates(["order_id", "item_seq"])

# Convert the spelled-out quantity "Two" to 2, then type the column as int.
# The match ignores surrounding whitespace and letter case, and the fallback
# branch is cast to int explicitly so the result does not depend on implicit
# int/string coercion inside when/otherwise.
df = df.withColumn(
    "quantity",
    F.when(F.lower(F.trim(F.col("quantity"))) == "two", 2)
    .otherwise(F.col("quantity").cast("int"))
)

# Remove $ and any other non-numeric symbols from unit_price.
# Everything except digits, '.' and '-' is stripped, so values such as
# "$1,299.00" parse to 1299.0 instead of becoming null.
df = df.withColumn(
    "unit_price",
    F.regexp_replace(F.col("unit_price"), r"[^0-9.\-]", "").cast("double")
)

# Remove % and any other non-numeric symbols from discount_pct
# (same rule as unit_price: keep only digits, '.' and '-').
df = df.withColumn(
    "discount_pct",
    F.regexp_replace(F.col("discount_pct"), r"[^0-9.\-]", "").cast("double")
)

# Coupon code processing: trim surrounding whitespace and lower-case the
# value so the same code always has one spelling.
df = df.withColumn(
    "coupon_code",
    F.lower(F.trim(F.col("coupon_code")))
)

# Expand the channel abbreviations to their full names; any other value is
# passed through unchanged.
df = df.withColumn(
    "channel",
    F.when(F.col("channel") == "web", "Website")
    .when(F.col("channel") == "app", "Mobile")
    .otherwise(F.col("channel"))
)



In [0]:
# Preview the DataFrame after the cleansing steps.
display(df)

### Datatype Conversion

In [0]:
# Cast the remaining string columns to their proper types and stamp each row
# with the processing time. The steps are applied in the order listed.
df = (
    df
    # 1. dt: string -> date, parsed as yyyy-MM-dd.
    .withColumn("dt", F.to_date(F.col("dt"), "yyyy-MM-dd"))
    # 2. order_ts: string -> timestamp. Two layouts are tried in turn and
    #    the first one that parses is kept.
    .withColumn(
        "order_ts",
        F.coalesce(
            F.to_timestamp(F.col("order_ts"), "yyyy-MM-dd HH:mm:ss"),
            F.to_timestamp(F.col("order_ts"), "dd-MM-yyyy HH:mm"),
        ),
    )
    # 3. item_seq: string -> integer.
    .withColumn("item_seq", F.col("item_seq").cast("int"))
    # 4. tax_amount: string -> double, after stripping every character that
    #    is not a digit, '.' or '-'.
    .withColumn(
        "tax_amount",
        F.regexp_replace(F.col("tax_amount"), r"[^0-9.\-]", "").cast("double"),
    )
    # 5. processed_time: timestamp of this processing run.
    .withColumn("processed_time", F.current_timestamp())
)

In [0]:
# Preview the DataFrame after the datatype conversions.
display(df)

In [0]:
# Print the schema again to check the column types after conversion.
df.printSchema()

In [0]:
# Persist the cleansed DataFrame as the silver order-items Delta table.
# The write uses "overwrite" mode with the mergeSchema option enabled.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .saveAsTable(f"{catalog_name}.silver.slv_order_items")
)