In [0]:
from pyspark.sql.functions import *
from pyspark.sql import functions as F

In [0]:
bronze_path = "/Volumes/workspace/ecommerce/ecommerce_data/bronze/events/"
silver_path = "/Volumes/workspace/ecommerce/ecommerce_data/silver/events"
gold_path ="/Volumes/workspace/ecommerce/ecommerce_data/gold/events"

In [0]:
# medallion architecture

# load the raw data
df_rawoct = spark.read.csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Oct.csv",header=True, inferSchema=True)
df_rawnov = spark.read.csv("/Volumes/workspace/ecommerce/ecommerce_data/2019-Nov.csv",header=True, inferSchema=True)
df_raw=df_rawoct.union(df_rawnov)

df_raw.write.format("delta").mode("overwrite").save(bronze_path)

In [0]:
# load the bronze delta table data and create managed table
df_bronze = spark.read.format("delta").load(bronze_path)

# write as managed table for sql querries
df_bronze.write.format("delta").mode("overwrite").saveAsTable("bronze_events")


In [0]:
%sql
select * from bronze_events limit 5

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
2019-11-21T06:35:28.000Z,view,6500842,2053013554155487563,computers.components.motherboard,gigabyte,63.55,572263216,66b8f723-3ae5-4944-9296-1a2bac971889
2019-11-21T06:35:28.000Z,view,100015909,2152167773222993940,,,414.43,571872422,81de329d-b024-4697-b0de-deacb5f85d34
2019-11-21T06:35:28.000Z,view,1004836,2053013555631882655,electronics.smartphone,samsung,229.88,553168387,fe866663-ccb6-480a-9695-994006133e40
2019-11-21T06:35:28.000Z,view,12719629,2053013553559896355,,kapsen,42.47,513254251,10c7b1be-4844-4f87-a1e0-7b72225d540f
2019-11-21T06:35:28.000Z,view,29400006,2059484387635888645,,daisy,156.74,545687627,0ef1b04b-2e8f-40be-8cee-b8d9aae36eb1


In [0]:
# check for null before cleanign up

null_counts = df_bronze.select([
    sum(col(c).isNull().cast("int")).alias(c)
    for c in df_bronze.columns
])

display(null_counts)

event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,0,0,0,35413780,15331243,0,0,12


#### silver layer : cleaned data

In [0]:
#silver layer cleanup

df_silver = df_bronze.filter(F.col("price") > 0) \
                    .filter(F.col("price") < 10000) \
                    .dropDuplicates(["user_session","event_time"]) \
                    .withColumn("event_date", F.to_date("event_time")) \
                    .withColumn("price_tier",F.when(F.col("price") < 10, "budget")
                        .when(F.col("price") < 50, "mid")
                        .otherwise("premium")) 
                    
df_silver.write.format("delta").mode("overwrite").save(silver_path)



In [0]:
# create silver managed table
df_silver.write.format("delta") \
    .mode("overwrite") \
    .saveAsTable("silver_events")

In [0]:
%sql
select distinct event_type from events_octnov

event_type
purchase
cart
view


#### gold layer : KPI summary. chk each category in terms of event_type and revenue. find the rate of cart to purchase.
#### KPIs: views, carts, purchases, revenue, and cart‑to‑purchase rate = (purchases / carts) × 100.
#### The final KPI table is written to the Gold layer for analytics and reporting.

In [0]:
# gold layer : KPI analysis.

product_perf = df_silver.groupBy("category_id", "category_code") \
    .agg(
        F.countDistinct(F.when(F.col("event_type")=="view", F.col("user_id"))).alias("views"),
        F.countDistinct(F.when(F.col("event_type")=="cart", F.col("user_id"))).alias("carts"),
        F.countDistinct(F.when(F.col("event_type")=="purchase", F.col("user_id"))).alias("purchases"),
        F.sum(F.when(F.col("event_type")=="purchase", F.col("price"))).alias("revenue")
    ).withColumn("cart_to_purchase_rate", 
                 F.when(F.col("carts") > 0,
                     F.col("purchases")/F.col("carts")*100
                ).otherwise(0)
                )
              
product_perf.write.format("delta").mode("overwrite").save(gold_path)

# create managed table
product_perf.write.format("delta") \
    .mode("overwrite") \
    .saveAsTable("gold_category_performance")

In [0]:
%sql
select * from gold_category_performance limit 10

category_id,category_code,views,carts,purchases,revenue,cart_to_purchase_rate
2088750570935419494,construction.tools.painting,28,0,0,,0.0
2106075725441269865,,16077,291,181,7139.079999999997,62.19931271477663
2070005009256284935,,16540,132,40,5659.320000000001,30.303030303030305
2053013565782098913,apparel.shoes,91873,2893,1896,254128.67,65.53750432077429
2053013555413778833,,15028,524,428,43564.22999999999,81.67938931297711
2171876348610478994,apparel.shirt,5893,84,18,927.22,21.428571428571427
2091727629378912491,,6304,24,5,304.1,20.833333333333336
2053013559842964121,furniture.living_room.chair,10797,143,87,21844.58000000002,60.83916083916085
2053013556227473861,construction.tools.saw,23489,1294,834,232664.5699999999,64.45131375579598
2175419524847763461,,471,12,5,582.57,41.66666666666667
