In [0]:
import pyspark.sql.functions as F

In [0]:
# Input parameters (Databricks widgets)
# Each widget is registered with a default and then read back, so the values
# can be supplied either interactively or by a job run.
# NOTE(review): process_date is only echoed below; no layer in this notebook
# filters on it yet.

dbutils.widgets.text("process_date", "2019-11-21")
process_date = dbutils.widgets.get("process_date")

dbutils.widgets.dropdown("layer", "bronze", ["bronze", "silver", "gold"])
layer = dbutils.widgets.get("layer")

print(f"Running layer: {layer} for date: {process_date}")


# Path configuration
# Everything lives under a single volume directory.
base_path = "/Volumes/workspace/ecommerce/ecommerce_data"

# Raw monthly CSV files (October and November 2019)
raw_oct_path = base_path + "/2019-Oct.csv"
raw_nov_path = base_path + "/2019-Nov.csv"

# One Delta location per layer, all following <base>/<layer>/events
bronze_path, silver_path, gold_path = (
    f"{base_path}/{tier}/events" for tier in ("bronze", "silver", "gold")
)

Running layer: bronze for date: 


In [0]:
# Bronze Layer

def run_bronze():
    """Ingest the raw October and November CSV files into the Bronze Delta table.

    Reads both files listed in the path configuration (``raw_oct_path`` and
    ``raw_nov_path``) with a header row and schema inference, then overwrites
    the Delta table at ``bronze_path`` with the combined rows.

    Returns:
        None. The result is the Delta table written to ``bronze_path``.
    """
    print("Starting Bronze Layer...")

    # Load both monthly files in a single read so Spark infers ONE schema
    # across them. Inferring each file separately and combining with a
    # positional union() can silently misalign or type-mismatch columns.
    df_raw = spark.read.csv(
        [raw_oct_path, raw_nov_path],
        header=True,
        inferSchema=True,
    )

    # Write to Bronze Delta. A single overwrite is sufficient; reading the
    # table back and overwriting it with itself only rewrites the same data.
    df_raw.write.format("delta").mode("overwrite").save(bronze_path)
    print("Bronze layer completed.")

# Silver Layer

def run_silver():
    """Clean the Bronze events and persist them as the Silver Delta table.

    Steps, in order:
      1. keep rows whose ``price`` is strictly between 0 and 10000;
      2. drop duplicates on (``user_session``, ``event_time``);
      3. add ``event_date`` (date part of ``event_time``);
      4. add ``price_tier``: "budget" below 10, "mid" below 50, else "premium".

    The result overwrites the Delta table at ``silver_path``.
    """
    print("Starting Silver Layer...")

    bronze_events = spark.read.format("delta").load(bronze_path)

    price = F.col("price")

    # Cleaning: price range check first, then de-duplication.
    valid_price = (price > 0) & (price < 10000)
    cleaned = bronze_events.filter(valid_price).dropDuplicates(
        ["user_session", "event_time"]
    )

    # Enrichment columns derived from existing fields.
    tier = (
        F.when(price < 10, "budget")
         .when(price < 50, "mid")
         .otherwise("premium")
    )
    enriched = cleaned.withColumn("event_date", F.to_date("event_time"))
    enriched = enriched.withColumn("price_tier", tier)

    enriched.write.format("delta").mode("overwrite").save(silver_path)

    print("Silver layer completed.")


# Gold Layer

def run_gold():
    """Aggregate Silver events into per-category KPIs and write the Gold table.

    Grouped by (``category_id``, ``category_code``), the output contains:
      * ``views`` / ``carts`` / ``purchases`` - number of DISTINCT ``user_id``
        values that have at least one event of that ``event_type``
        (these are user counts, not raw event counts);
      * ``revenue`` - sum of ``price`` over "purchase" events;
      * ``cart_to_purchase_rate`` - ``purchases / carts * 100``, or 0 when
        ``carts`` is 0.

    The result overwrites the Delta table at ``gold_path``.
    """
    print("Starting Gold Layer...")

    silver_events = spark.read.format("delta").load(silver_path)

    event_type = F.col("event_type")

    def distinct_users(kind):
        # user_id is only emitted for rows of the requested event type, so the
        # distinct count ignores every other row.
        return F.countDistinct(F.when(event_type == kind, F.col("user_id")))

    purchase_revenue = F.sum(F.when(event_type == "purchase", F.col("price")))

    per_category = silver_events.groupBy("category_id", "category_code").agg(
        distinct_users("view").alias("views"),
        distinct_users("cart").alias("carts"),
        distinct_users("purchase").alias("purchases"),
        purchase_revenue.alias("revenue"),
    )

    # Guard the division so categories without any cart user get a rate of 0.
    conversion_pct = F.when(
        F.col("carts") > 0,
        F.col("purchases") / F.col("carts") * 100,
    ).otherwise(0)

    product_perf = per_category.withColumn("cart_to_purchase_rate", conversion_pct)

    product_perf.write.format("delta").mode("overwrite").save(gold_path)
    print("Gold layer completed.")

In [0]:
# Register each layer's Delta output as a managed table so the dashboard can
# query it with SQL.
#
# The Delta locations come from the path configuration (bronze_path,
# silver_path, gold_path) rather than being repeated here, so a change to
# base_path is picked up automatically.
#
# NOTE: CREATE OR REPLACE TABLE ... AS SELECT copies whatever is stored at the
# Delta path when this cell runs. This cell sits before the cell that executes
# a layer, so re-run it after (re)building a layer to refresh that table.
for table_name, delta_path in (
    ("bronze_events", bronze_path),
    ("silver_events", silver_path),
    ("gold_events", gold_path),
):
    spark.sql(f"""
        CREATE OR REPLACE TABLE {table_name}
        AS SELECT * FROM delta.`{delta_path}`
    """)

DataFrame[num_affected_rows: bigint, num_inserted_rows: bigint]

In [0]:
# select the layer

def run_layer(layer_name):
    """Run the pipeline stage identified by ``layer_name``.

    Args:
        layer_name: one of "bronze", "silver" or "gold".

    Raises:
        ValueError: if ``layer_name`` is not one of the three known layers.
    """
    # Reject unknown names up front so nothing runs for a bad selection.
    if layer_name not in ("bronze", "silver", "gold"):
        raise ValueError("Invalid layer selected.")

    if layer_name == "bronze":
        run_bronze()
        return

    if layer_name == "silver":
        run_silver()
        return

    run_gold()

In [0]:
# Execute the layer selected through the "layer" widget.
# For an ad-hoc run, assign `layer` (e.g. "bronze") before this call.
run_layer(layer)

Starting Gold Layer...
Gold layer completed.
