Reading bronze table

In [0]:
# Load the bronze orders table through the notebook's `spark` session and
# render it for a first look at the raw data.
df_bronze = spark.table("data_engineering_project.bronze_orders")
display(df_bronze)

Standardizing column names

In [0]:
from pyspark.sql.functions import col  # not needed here any more, but later cells rely on this import

# Rename every column to its lower-cased form.
# toDF() renames positionally, so it also works for column names that col()
# would try to parse as an expression (for example names containing dots).
df_silver = df_bronze.toDF(*[name.lower() for name in df_bronze.columns])
display(df_silver)

Converting string to date datatype

In [0]:
from pyspark.sql.functions import when

# '29/02/2023' is not a real calendar date (2023 is not a leap year).
# Rows holding exactly that literal are rewritten to 2023-02-28; every other
# order_date value is passed through unchanged.
df_silver = df_silver.withColumn(
    'order_date',
    when(col("order_date") == '29/02/2023', "2023-02-28")
    .otherwise(col('order_date')),
)

In [0]:
from pyspark.sql.functions import to_date

# Parse order_date using the yyyy-MM-dd pattern and overwrite the column with
# the parsed result, then preview the frame.
# NOTE(review): assumes every remaining order_date string follows yyyy-MM-dd;
# confirm against the bronze data.
df_silver = df_silver.withColumn(
    'order_date',
    to_date(col("order_date"), 'yyyy-MM-dd'),
)
df_silver.display()

In [0]:
# Print the schema to check the column types after the order_date conversion.
df_silver.printSchema()

Handling Null values 

Identifying columns that have at least one null

In [0]:
from pyspark.sql.functions import count, when

# Count the NULLs of every column in ONE aggregation pass instead of launching
# a separate Spark job per column.
# count() only counts non-null inputs, and when(...) without otherwise() yields
# NULL when the condition is false, so each expression counts the NULL rows.
null_count_exprs = [
    count(when(col(name).isNull(), 1)).alias(name) for name in df_silver.columns
]
null_counts_row = df_silver.agg(*null_count_exprs).first()

# The aggregated row has one value per column, in the same order as
# df_silver.columns, so pair them positionally.
for c, null_counts in zip(df_silver.columns, null_counts_row):
    if null_counts > 0:
        print(f"Column '{c}' has {null_counts} null values.")

Dropping these columns as they are of no use: user_id, state_id, order_s

In [0]:
# Remove the three columns that are not needed downstream and preview the result.
columns_to_drop = ("user_id", "state_id", "order_s")
df_silver = df_silver.drop(*columns_to_drop)
display(df_silver)

Identifying duplicate values 

In [0]:
# Number of fully distinct rows; shown as the cell output.
(
    df_silver
    .distinct()
    .count()
)

In [0]:
# Print how many distinct values each column holds (one Spark job per column).
# The message previously carried a stray apostrophe after the count and a
# leading space; it now matches the format of the null-count report.
for c in df_silver.columns:
    distinct_count = df_silver.select(col(c)).distinct().count()
    print(f"Column '{c}' has {distinct_count} distinct values.")

id has 4 duplicate values, which it seems should not be present

In [0]:
# Count rows per id and show only the ids that occur more than once.
id_frequencies = df_silver.groupBy(col("id")).count()
display(id_frequencies.filter(col("count") > 1))

In [0]:
# Show every row whose id equals "2021" to inspect one id in detail.
display(df_silver.filter(col("id") == "2021"))

Adding a row number (id_key) so that every record gets a unique id

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

# Generate a gap-free surrogate key (1, 2, 3, ...) in a new "id_key" column.
#
# "id" contains duplicates, so ordering by it alone leaves ties and Spark is
# free to number the tied rows differently on every run. The remaining columns
# are appended as tie-breakers so the generated key is reproducible; only rows
# that are identical in every column can still swap places, which is harmless.
# "id_key" itself is excluded so that re-running this cell gives the same order.
# NOTE(review): a window without partitionBy processes all rows in a single
# partition; acceptable for small tables, revisit for large ones.
# NOTE(review): assumes every column has an orderable type - confirm.
tie_break_columns = [name for name in df_silver.columns if name not in ("id", "id_key")]
window_spec = Window.orderBy(col("id"), *[col(name) for name in tie_break_columns])
df_silver = df_silver.withColumn("id_key", row_number().over(window_spec))


In [0]:
# Re-check the number of distinct values per column now that id_key exists
# (one Spark job per column).
# The message previously carried a stray apostrophe after the count and a
# leading space; both are removed.
for c in df_silver.columns:
    distinct_count = df_silver.select(col(c)).distinct().count()
    print(f"Column '{c}' has {distinct_count} distinct values.")

In [0]:
# Discard the original id column, then let the generated id_key take over the
# name "id", and preview the result.
without_original_id = df_silver.drop("id")
df_silver = without_original_id.withColumnRenamed("id_key", "id")
display(df_silver)

Adding derived columns order_year, order_month and shipping_days

In [0]:
from pyspark.sql.functions import date_format, date_diff, month, year

# Derive three columns from the order and ship dates:
#   order_year    - calendar year of order_date
#   order_month   - calendar month of order_date (1-12)
#   shipping_days - ship_date minus order_date, in days
# year()/month() return integers directly, so the values no longer take a
# detour through formatted strings ("2023", "02") that must be cast afterwards.
df_silver = (
    df_silver
    .withColumn("order_year", year(col("order_date")))
    .withColumn("order_month", month(col("order_date")))
    .withColumn("shipping_days", date_diff(col("ship_date"), col("order_date")))
)
df_silver.display()

In [0]:
# Make sure both derived date-part columns are stored as integers, then preview.
for date_part in ("order_year", "order_month"):
    df_silver = df_silver.withColumn(date_part, col(date_part).cast("int"))
df_silver.display()

Data validation

In [0]:
# Show rows that break a basic sanity rule: negative sales, negative quantity,
# or an order date that falls after the ship date.
negative_sales = col("sales") < 0
negative_quantity = col("quantity") < 0
ordered_after_shipping = col("order_date") > col("ship_date")
display(df_silver.filter(negative_sales | negative_quantity | ordered_after_shipping))