#### 1. Descriptive statistical analysis = summarizing the data using measures such as mean, median, standard deviation, missing values, outliers, and quantiles.

In [0]:
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [0]:
# desccriptive statistical analysis

# load the cleaned dataset silver_events
spark.sql("USE CATALOG ecom_catalog")
spark.sql("USE SCHEMA ecom_schema")

events = spark.table("silver_events")

+-------+------------------+
|summary|             price|
+-------+------------------+
|  count|         109511637|
|   mean|292.34153514786425|
| stddev| 356.8840028041405|
|    min|              0.77|
|    max|           2574.07|
+-------+------------------+



In [0]:
# data distribution -  mean , median, staddev, min , max for price
events.select("price").summary().show()

+-------+------------------+
|summary|             price|
+-------+------------------+
|  count|         109511637|
|   mean|292.34153514786425|
| stddev| 356.8840028041405|
|    min|              0.77|
|    25%|              68.6|
|    50%|            165.64|
|    75%|            360.34|
|    max|           2574.07|
+-------+------------------+



In [0]:
# Percentiles (25th, 50th, 75th) 
events.approxQuantile("price", [0.25, 0.5, 0.75], 0.01)

[67.99, 164.17, 359.08]

In [0]:
# find price missing values
events.filter(F.col("price").isNull()).count()

0

#### 2. Hypothesis Testing (weekday vs weekend)

In [0]:
# hypothesis testing assumptionss: people buy on weekend
df_hypo = events.withColumn("Is_weekend",F.when(F.dayofweek(F.col("event_date")).isin([1,7]),"Weekend").otherwise("Weekday"))
df_hypo.groupBy("Is_weekend","event_type").count().show()

+----------+----------+--------+
|Is_weekend|event_type|   count|
+----------+----------+--------+
|   Weekend|  purchase|  613075|
|   Weekend|      view|34012230|
|   Weekday|  purchase| 1046615|
|   Weekend|      cart| 1430347|
|   Weekday|      cart| 2371012|
|   Weekday|      view|70038358|
+----------+----------+--------+



In [0]:
from pyspark.sql.functions import count, when
# previously we see all event types are higher in weekday than weekend. 
# now lets focus on conversion rate between weekday and weekend

df_hypo_test_results = df_hypo.groupBy("Is_weekend").agg(
    count(when(F.col("event_type") == "view", 1)).alias("views"),
    count(when(F.col("event_type") == "purchase", 1)).alias("purchases")
).withColumn(
    "conversion_rate", 
    F.round((F.col("purchases") / F.col("views")) * 100,2)
)
display(df_hypo_test_results)

Is_weekend,views,purchases,conversion_rate
Weekday,70038358,1046615,1.49
Weekend,34012230,613075,1.8


Databricks visualization. Run in Databricks to view.

#### 3. Corelation Analysis - a measure of how strongly two variables move together, ranging from â€“1 to +1.

In [0]:
#correlation analysis
# price vs purchase count

df_prod = events.groupBy("product_id").agg(F.avg("price").alias("avg_price"), \
    count(F.when(F.col("event_type") == "purchase",1)).alias("purchase_count"))

display(df_prod)



product_id,avg_price,purchase_count
5701087,52.817026585474274,100
8500290,264.46637801204827,55
1005159,209.88261932264248,3663
17302448,74.86664723032068,1
12400079,78.20499999999997,0
45300001,111.69151696606788,5
13700041,292.9171641791044,3
18001092,1.556379746835443,3
41100005,71.2206221198157,1
7004004,128.6798584905662,23


Databricks visualization. Run in Databricks to view.

In [0]:
# Calculate Correlation
correlation_val = df_prod.stat.corr("avg_price", "purchase_count")
print("Correlation:", correlation_val)

Correlation: 0.015199354567417092


#### 4. Feature Engineering for ML

In [0]:
from pyspark.sql.functions import col, dayofweek, hour, log, unix_timestamp, first, when, count, avg, lit
# Window to track each user's event sequence
user_window = Window.partitionBy("user_id").orderBy("event_time")

# Extract hour of the event (0â€“23), day of week, weekend_flag, log_price
# # Log-transform price to handle skewed price distribution, reduce the influence of outliers,
# and help models learn patterns more effectively

df_features = events \
    .withColumn("hour_of_day", hour("event_time")) \
    .withColumn("day_of_week", dayofweek("event_date")) \
    .withColumn("is_weekend_flag", when(col("day_of_week").isin([1, 7]), 1).otherwise(0)) \
    .withColumn("price_log", log(col("price") + 1)) \
    .withColumn(
        "time_since_first_view",
        unix_timestamp("event_time") - unix_timestamp(first("event_time").over(user_window)) 
    )

# save as managed delta table for ml workflows
df_features.select(
    "event_type","user_id", "product_id", "brand", "category_code",
    "hour_of_day", "day_of_week", "is_weekend_flag", "price_log", "time_since_first_view"
).write.mode("overwrite").saveAsTable("gold_ml_features")

spark.table("gold_ml_features").limit(5).display()

event_time,event_type,user_id,product_id,brand,category_code,hour_of_day,day_of_week,is_weekend_flag,price_log,time_since_first_view,event_date
,view,81255481,16400235,bergner,,7,6,0,4.209902902856373,0,
,view,81255481,16400235,bergner,,14,5,0,4.206779991551889,1146401,
,view,117019800,35000021,zeta,,5,3,0,3.0233474405869645,0,
,view,153489867,1201216,lenovo,electronics.tablet,15,2,0,5.502278098445473,0,
,view,153489867,1201471,samsung,electronics.tablet,15,2,0,6.151370064617605,1106,


#### - These engineered features show meaningful patterns in user behavior, timing, and product interactions.
#### - They turn raw data into an organized, MLâ€‘ready dataset that helps models learn how users engage with products over time.
