# Bronze to Silver: Data Cleaning and Transformation for Dimension Tables

In [0]:
from pyspark.sql.types import StringType, IntegerType, DateType, TimestampType, FloatType
import pyspark.sql.functions as F

# Catalog prefix used as the first part of every three-part table name
# (`<catalog>.<schema>.<table>`) read or written in this notebook.
catalog_name = 'ecommerce'

# Brands

In [0]:
# Load the bronze brands table and print a preview of its rows.
df_bronze = spark.table(f"{catalog_name}.bronze.brz_brands")
df_bronze.show()

In [0]:
# Strip every character that is not an ASCII letter or digit from brand_code.
brand_code_alnum = F.regexp_replace(F.col("brand_code"), r"([^A-Za-z0-9])", "")
df_silver_trim_regex = df_bronze.withColumn("brand_code", brand_code_alnum)

# Remove leading and trailing spaces from brand_name, then preview the result.
brand_name_trimmed = F.trim(F.col("brand_name"))
df_silver = df_silver_trim_regex.withColumn("brand_name", brand_name_trimmed)
df_silver.show()
                                

In [0]:
# List the distinct category_code values currently present in the brands data.
brand_category_codes = df_silver.select("category_code").distinct()
brand_category_codes.show()

In [0]:
# Non-standard category_code values mapped to the code each one is rewritten to.
anomalies = dict(GROCERY="GRCY", BOOKS="BKS", TOYS="TOY")

# Apply the mapping to category_code only; other columns are left untouched.
df_silver = df_silver.replace(anomalies, subset=["category_code"])

# Re-list the distinct codes to confirm the replacements took effect.
corrected_category_codes = df_silver.select("category_code").distinct()
corrected_category_codes.show()


In [0]:
# Save the cleaned brands as a Delta table, overwriting any existing contents.
(
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .option("mergeschema", "true")
    .saveAsTable(f"{catalog_name}.silver.slv_brands")
)

# Category

In [0]:
# Load the bronze category table and print a preview of its rows.
df_bronze = spark.read.table(f"{catalog_name}.bronze.brz_category")
df_bronze.show()

In [0]:
# Duplicate check: count rows per category_code and keep only the codes
# that occur more than once.
rows_per_category_code = df_bronze.groupBy("category_code").count()
df_duplicates = rows_per_category_code.where(F.col("count") > 1)
display(df_duplicates)

In [0]:
# Keep a single row per category_code.
#
# The code is upper-cased BEFORE de-duplicating: dropDuplicates compares raw
# string values, so codes that differ only in letter case (e.g. "toys" and
# "TOYS") would otherwise both survive and turn into real duplicates once the
# column is upper-cased afterwards. Any later upper-casing of category_code is
# a harmless no-op on this frame.
#
# NOTE(review): which of several duplicate rows is kept is not specified by
# dropDuplicates; confirm the other columns agree for duplicated codes.
df_silver = (
    df_bronze
    .withColumn("category_code", F.upper(F.col("category_code")))
    .dropDuplicates(["category_code"])
)
display(df_silver)

In [0]:
# Store category_code in upper case.
category_code_upper = F.upper(F.col("category_code"))
df_silver = df_silver.withColumn("category_code", category_code_upper)
display(df_silver)

In [0]:
# Save the cleaned categories as a Delta table, overwriting any existing contents.
(
    df_silver.write
    .format("delta")
    .mode("overwrite")
    .option("mergeschema", "true")
    .saveAsTable(f"{catalog_name}.silver.slv_category")
)

# Products

In [0]:
# Load the bronze products table.
df_bronze = spark.table(f"{catalog_name}.bronze.brz_products")

# Report the table's shape: count() runs a Spark job, while the column count
# is read from the DataFrame's column list.
row_count = df_bronze.count()
column_count = len(df_bronze.columns)

print(f"Number of rows in bronze Products table: {row_count}")
print(f"Number of columns in bronze Products table: {column_count}")

In [0]:
# Preview five rows of the bronze products table.
products_preview = df_bronze.limit(5)
display(products_preview)

## Check weight_grams (contains 'g')

In [0]:
# Show the first five raw weight_grams values without truncating them.
raw_weights = df_bronze.select("weight_grams")
raw_weights.show(n=5, truncate=False)

In [0]:
# Remove every 'g' character from weight_grams and cast what is left to an
# integer, then preview the converted column.
weight_without_g = F.regexp_replace(F.col("weight_grams"), "g", "")
weight_as_int = weight_without_g.cast(IntegerType())
df_silver = df_bronze.withColumn("weight_grams", weight_as_int)
df_silver.select("weight_grams").show(5, truncate=False)

## Check length_cm (comma instead of dot)