In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

spark = SparkSession.builder \
.appName("E-CommerceAnalytics")\
.getOrCreate()

In [13]:
raw_customers = [
("C001","Rahul","29","Bangalore","Electronics,Fashion"),
("C002","Sneha","Thirty Two","Delhi","Fashion"),
("C003","Aman",None,"Mumbai",["Home","Electronics"]),
("C004","Pallavi","27","Pune","Electronics|Beauty"),

("C005","", "35","Chennai",None)
]

In [3]:
raw_sellers = [
("S001","TechWorld","Electronics","2019-06-01"),
("S002","FashionHub","Fashion","01/07/2020"),
("S003","HomeEssentials","Home","2018/09/15"),
("S004","BeautyStore","Beauty","invalid_date")
]

In [4]:
raw_products = [
("P001","Laptop","Electronics","S001","55000"),
("P002","Headphones","Electronics","S001","2500"),
("P003","T-Shirt","Fashion","S002","1200"),
("P004","Sofa","Home","S003","45000"),
("P005","Face Cream","Beauty","S004","800")
]

In [5]:
raw_orders = [
("O001","C001","P001","2024-01-05","Delivered","55000"),
("O002","C002","P003","05/01/2024","Cancelled","0"),
("O003","C003","P004","2024/01/06","Delivered","45000"),
("O004","C004","P005","invalid_date","Delivered","800"),
("O005","C001","P002","2024-01-10","Delivered","2500"),
("O006","C005","P003","2024-01-12","Delivered","1200")
]

In [6]:
raw_activity = [
("C001","search,view,add_to_cart","{'device':'mobile'}",180),
("C002",["search","view"],"device=laptop",90),
("C003","search|view|purchase",None,120),
("C004",None,"{'device':'tablet'}",60),
("C005","search","{'device':'mobile'}",30)
]

#PART A — DATA CLEANING & STRUCTURING

1. Design explicit schemas for all datasets

In [33]:
from pyspark.sql.types import *
from pyspark.sql.functions import *
customer_schema = StructType([
StructField("customer_id", StringType(), True),
StructField("name", StringType(), True),
StructField("age", StringType(), True),
StructField("city", StringType(), True),
StructField("interests", StringType(), True)
])
raw_customers_df = spark.createDataFrame(raw_customers, schema=customer_schema)

In [35]:
seller_schema = StructType([
StructField("seller_id", StringType(), True),
StructField("seller_name", StringType(), True),
StructField("category", StringType(), True),
StructField("start_date", StringType(), True)
])
raw_sellers_df = spark.createDataFrame(raw_sellers, schema=seller_schema)

In [38]:
product_schema = StructType([
StructField("product_id", StringType(), True),
StructField("product", StringType(), True),
StructField("category", StringType(), True),
StructField("seller_id", StringType(), True),
StructField("start_date", StringType(), True)
])
raw_products_df = spark.createDataFrame(raw_products, schema=product_schema)

In [41]:
order_schema = StructType([
StructField("order_id", StringType(), True),
StructField("customer_id", StringType(), True),
StructField("product_id", StringType(), True),
StructField("order_date", StringType(), True),
StructField("status", StringType(), True),
StructField("amount", StringType(), True)
])
raw_orders_df = spark.createDataFrame(raw_orders, schema=order_schema)

In [42]:
activity_schema = StructType([
StructField("customer_id", StringType(), True),
StructField("actions", StringType(), True),
StructField("metadata", StringType(), True),
StructField("duration", IntegerType(), True)
])
raw_activity_df = spark.createDataFrame(raw_activity, schema=activity_schema)

2. Normalize data types (age, price, dates)

In [45]:

from pyspark.sql.functions import regexp_extract, col, when

customers_df = raw_customers_df \
    .withColumn("age", when(regexp_extract("age", "\\d+", 0) == "", None)
                .otherwise(regexp_extract("age", "\\d+", 0)).cast("int")) \
    .withColumn("name", when(col("name") == "", None).otherwise(col("name")))

customers_df.show()


+-----------+-------+----+---------+-------------------+
|customer_id|   name| age|     city|          interests|
+-----------+-------+----+---------+-------------------+
|       C001|  Rahul|  29|Bangalore|Electronics,Fashion|
|       C002|  Sneha|NULL|    Delhi|            Fashion|
|       C003|   Aman|NULL|   Mumbai|[Home, Electronics]|
|       C004|Pallavi|  27|     Pune| Electronics|Beauty|
|       C005|   NULL|  35|  Chennai|               NULL|
+-----------+-------+----+---------+-------------------+



In [47]:
products_df = raw_products_df.withColumnRenamed("start_date", "price").withColumn("price", col("price").cast("int"))
products_df.show()

+----------+----------+-----------+---------+-----+
|product_id|   product|   category|seller_id|price|
+----------+----------+-----------+---------+-----+
|      P001|    Laptop|Electronics|     S001|55000|
|      P002|Headphones|Electronics|     S001| 2500|
|      P003|   T-Shirt|    Fashion|     S002| 1200|
|      P004|      Sofa|       Home|     S003|45000|
|      P005|Face Cream|     Beauty|     S004|  800|
+----------+----------+-----------+---------+-----+



In [48]:
orders_df =raw_orders_df.withColumn("order_date",
                                    coalesce(
                                        to_date(try_to_timestamp(col("order_date"),lit("yyyy-MM-dd"))),
                                        to_date(try_to_timestamp(col("order_date"),lit("dd/MM/yyyy"))),
                                        to_date(try_to_timestamp(col("order_date"),lit("yyyy/MM/dd")))
                                    )
                                   )
orders_df.show()

+--------+-----------+----------+----------+---------+------+
|order_id|customer_id|product_id|order_date|   status|amount|
+--------+-----------+----------+----------+---------+------+
|    O001|       C001|      P001|2024-01-05|Delivered| 55000|
|    O002|       C002|      P003|2024-01-05|Cancelled|     0|
|    O003|       C003|      P004|2024-01-06|Delivered| 45000|
|    O004|       C004|      P005|      NULL|Delivered|   800|
|    O005|       C001|      P002|2024-01-10|Delivered|  2500|
|    O006|       C005|      P003|2024-01-12|Delivered|  1200|
+--------+-----------+----------+----------+---------+------+



3. Convert interests and actions into arrays

In [50]:

from pyspark.sql.functions import split, regexp_replace

customers_df = customers_df.withColumn(
    "interests",
    split(regexp_replace("interests", "[|]", ","), ",")
)


In [53]:
from pyspark.sql.functions import split, regexp_replace

activity_df = raw_activity_df.withColumn(
    "actions",
    split(regexp_replace("actions", "[|]", ","), ",")
)

4. Handle missing and invalid records gracefully

In [61]:
from pyspark.sql.functions import col, to_date, coalesce, split, lit, array_remove, try_to_timestamp

# Make an empty string array: split("", ",") -> [""] then remove "" -> []
empty_string_array = array_remove(split(lit(""), ","), "")

customers_df = customers_df.withColumn(
    "interests",
    coalesce(col("interests"), empty_string_array)
)

orders_df = orders_df.filter(col("order_date").isNotNull())

sellers_df = raw_sellers_df.withColumn(
    "start_date",
    coalesce(
        to_date(try_to_timestamp(col("start_date"), lit("yyyy-MM-dd"))),
        to_date(try_to_timestamp(col("start_date"), lit("dd/MM/yyyy"))),
        to_date(try_to_timestamp(col("start_date"), lit("yyyy/MM/dd")))
    )
)

5. Produce clean DataFrames:
users_df
courses_df
enrollments_df
activity_df

In [60]:
customers_df.printSchema()
customers_df.show()
sellers_df.printSchema()
sellers_df.show()
products_df.printSchema()
products_df.show()
orders_df.printSchema()
orders_df.show()
activity_df.printSchema()
activity_df.show()

root
 |-- customer_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- city: string (nullable = true)
 |-- interests: array (nullable = false)
 |    |-- element: string (containsNull = false)

+-----------+-------+----+---------+--------------------+
|customer_id|   name| age|     city|           interests|
+-----------+-------+----+---------+--------------------+
|       C001|  Rahul|  29|Bangalore|[Electronics, Fas...|
|       C002|  Sneha|NULL|    Delhi|           [Fashion]|
|       C003|   Aman|NULL|   Mumbai|[[Home,  Electron...|
|       C004|Pallavi|  27|     Pune|[Electronics, Bea...|
|       C005|   NULL|  35|  Chennai|                  []|
+-----------+-------+----+---------+--------------------+

root
 |-- seller_id: string (nullable = true)
 |-- seller_name: string (nullable = true)
 |-- category: string (nullable = true)
 |-- start_date: date (nullable = true)

+---------+--------------+-----------+----------+
|seller_id

#PART B — DATA INTEGRATION (JOINS)

6. Join orders with products

In [62]:
orders_products_df  = orders_df.join(products_df, "product_id", "inner")
orders_products_df.show()

+----------+--------+-----------+----------+---------+------+----------+-----------+---------+-----+
|product_id|order_id|customer_id|order_date|   status|amount|   product|   category|seller_id|price|
+----------+--------+-----------+----------+---------+------+----------+-----------+---------+-----+
|      P001|    O001|       C001|2024-01-05|Delivered| 55000|    Laptop|Electronics|     S001|55000|
|      P002|    O005|       C001|2024-01-10|Delivered|  2500|Headphones|Electronics|     S001| 2500|
|      P003|    O002|       C002|2024-01-05|Cancelled|     0|   T-Shirt|    Fashion|     S002| 1200|
|      P003|    O006|       C005|2024-01-12|Delivered|  1200|   T-Shirt|    Fashion|     S002| 1200|
|      P004|    O003|       C003|2024-01-06|Delivered| 45000|      Sofa|       Home|     S003|45000|
+----------+--------+-----------+----------+---------+------+----------+-----------+---------+-----+



7. Join products with sellers

In [63]:
products_seller_df  = products_df.join(broadcast(sellers_df), "seller_id", "inner")
products_seller_df.show()

+---------+----------+----------+-----------+-----+--------------+-----------+----------+
|seller_id|product_id|   product|   category|price|   seller_name|   category|start_date|
+---------+----------+----------+-----------+-----+--------------+-----------+----------+
|     S001|      P001|    Laptop|Electronics|55000|     TechWorld|Electronics|2019-06-01|
|     S001|      P002|Headphones|Electronics| 2500|     TechWorld|Electronics|2019-06-01|
|     S002|      P003|   T-Shirt|    Fashion| 1200|    FashionHub|    Fashion|2020-07-01|
|     S003|      P004|      Sofa|       Home|45000|HomeEssentials|       Home|2018-09-15|
|     S004|      P005|Face Cream|     Beauty|  800|   BeautyStore|     Beauty|      NULL|
+---------+----------+----------+-----------+-----+--------------+-----------+----------+



8. Join orders with customers

In [64]:
orders_customers_df  = orders_df.join(customers_df, "customer_id", "inner")
orders_customers_df.show()

+-----------+--------+----------+----------+---------+------+-----+----+---------+--------------------+
|customer_id|order_id|product_id|order_date|   status|amount| name| age|     city|           interests|
+-----------+--------+----------+----------+---------+------+-----+----+---------+--------------------+
|       C001|    O001|      P001|2024-01-05|Delivered| 55000|Rahul|  29|Bangalore|[Electronics, Fas...|
|       C001|    O005|      P002|2024-01-10|Delivered|  2500|Rahul|  29|Bangalore|[Electronics, Fas...|
|       C002|    O002|      P003|2024-01-05|Cancelled|     0|Sneha|NULL|    Delhi|           [Fashion]|
|       C003|    O003|      P004|2024-01-06|Delivered| 45000| Aman|NULL|   Mumbai|[[Home,  Electron...|
|       C005|    O006|      P003|2024-01-12|Delivered|  1200| NULL|  35|  Chennai|                  []|
+-----------+--------+----------+----------+---------+------+-----+----+---------+--------------------+



9. Decide which table(s) should be broadcast

In [65]:
from pyspark.sql.functions import broadcast

# Join orders_df with broadcasted customers_df
orders_customers_broadcast_df = orders_df.join(broadcast(customers_df), "customer_id", "inner")

print("Physical plan for join with broadcasted customers_df:")
orders_customers_broadcast_df.explain(True)

Physical plan for join with broadcasted customers_df:
== Parsed Logical Plan ==
'Join UsingJoin(Inner, [customer_id])
:- Filter isnotnull(order_date#263)
:  +- Filter isnotnull(order_date#263)
:     +- Filter isnotnull(order_date#263)
:        +- Project [order_id#180, customer_id#181, product_id#182, coalesce(to_date(try_to_timestamp(order_date#183, Some(yyyy-MM-dd), TimestampType, Some(Etc/UTC), false), None, Some(Etc/UTC), true), to_date(try_to_timestamp(order_date#183, Some(dd/MM/yyyy), TimestampType, Some(Etc/UTC), false), None, Some(Etc/UTC), true), to_date(try_to_timestamp(order_date#183, Some(yyyy/MM/dd), TimestampType, Some(Etc/UTC), false), None, Some(Etc/UTC), true)) AS order_date#263, status#184, amount#185]
:           +- LogicalRDD [order_id#180, customer_id#181, product_id#182, order_date#183, status#184, amount#185], false
+- ResolvedHint (strategy=broadcast)
   +- Project [customer_id#165, name#227, age#226, city#168, coalesce(interests#317, array_remove(split(, ,, -1)

10. Prove your decision using explain(True)

In [66]:
from pyspark.sql.functions import broadcast

# Join orders_df with broadcasted customers_df
orders_customers_broadcast_df = orders_df.join(broadcast(customers_df), "customer_id", "inner")

print("Physical plan for join with broadcasted customers_df:")
orders_customers_broadcast_df.explain(True)

Physical plan for join with broadcasted customers_df:
== Parsed Logical Plan ==
'Join UsingJoin(Inner, [customer_id])
:- Filter isnotnull(order_date#263)
:  +- Filter isnotnull(order_date#263)
:     +- Filter isnotnull(order_date#263)
:        +- Project [order_id#180, customer_id#181, product_id#182, coalesce(to_date(try_to_timestamp(order_date#183, Some(yyyy-MM-dd), TimestampType, Some(Etc/UTC), false), None, Some(Etc/UTC), true), to_date(try_to_timestamp(order_date#183, Some(dd/MM/yyyy), TimestampType, Some(Etc/UTC), false), None, Some(Etc/UTC), true), to_date(try_to_timestamp(order_date#183, Some(yyyy/MM/dd), TimestampType, Some(Etc/UTC), false), None, Some(Etc/UTC), true)) AS order_date#263, status#184, amount#185]
:           +- LogicalRDD [order_id#180, customer_id#181, product_id#182, order_date#183, status#184, amount#185], false
+- ResolvedHint (strategy=broadcast)
   +- Project [customer_id#165, name#227, age#226, city#168, coalesce(interests#317, array_remove(split(, ,, -1)

11. Eliminate orphan records

In [67]:
# 1. Eliminate customers without any orders
customers_with_orders_df = customers_df.join(orders_df, "customer_id", "left_semi")
orphan_customers_df = customers_df.join(orders_df, "customer_id", "left_anti")

print("Orphan Customers:")
orphan_customers_df.show()

# Update customers_df to only include customers with orders
customers_df = customers_with_orders_df

# 2. Eliminate products without any orders
products_with_orders_df = products_df.join(orders_df, "product_id", "left_semi")
orphan_products_df = products_df.join(orders_df, "product_id", "left_anti")

print("Orphan Products:")
orphan_products_df.show()

# Update products_df to only include products with orders
products_df = products_with_orders_df

# 3. Eliminate sellers without any products
sellers_with_products_df = sellers_df.join(products_df, "seller_id", "left_semi")
orphan_sellers_df = sellers_df.join(products_df, "seller_id", "left_anti")

print("Orphan Sellers:")
orphan_sellers_df.show()

# Update sellers_df to only include sellers with products
sellers_df = sellers_with_products_df

print("DataFrames after eliminating orphan records:")
customers_df.show()
products_df.show()
sellers_df.show()

Orphan Customers:
+-----------+-------+---+----+--------------------+
|customer_id|   name|age|city|           interests|
+-----------+-------+---+----+--------------------+
|       C004|Pallavi| 27|Pune|[Electronics, Bea...|
+-----------+-------+---+----+--------------------+

Orphan Products:
+----------+----------+--------+---------+-----+
|product_id|   product|category|seller_id|price|
+----------+----------+--------+---------+-----+
|      P005|Face Cream|  Beauty|     S004|  800|
+----------+----------+--------+---------+-----+

Orphan Sellers:
+---------+-----------+--------+----------+
|seller_id|seller_name|category|start_date|
+---------+-----------+--------+----------+
|     S004|BeautyStore|  Beauty|      NULL|
+---------+-----------+--------+----------+

DataFrames after eliminating orphan records:
+-----------+-----+----+---------+--------------------+
|customer_id| name| age|     city|           interests|
+-----------+-----+----+---------+--------------------+
|       

#PART C — ANALYTICS & AGGREGATIONS

12. Total revenue per category

In [68]:
revenue_category_df = orders_products_df.groupBy("category").agg(sum("amount").alias("total_revenue"))
revenue_category_df.show()

+-----------+-------------+
|   category|total_revenue|
+-----------+-------------+
|       Home|      45000.0|
|    Fashion|       1200.0|
|Electronics|      57500.0|
+-----------+-------------+



13. Total revenue per seller

In [69]:
revenue_seller_df = orders_products_df.groupBy("seller_id").agg(sum("amount").alias("total_revenue"))
revenue_seller_df.show()

+---------+-------------+
|seller_id|total_revenue|
+---------+-------------+
|     S001|      57500.0|
|     S002|       1200.0|
|     S003|      45000.0|
+---------+-------------+



14. Total orders per customer

In [70]:
orders_customers_df = orders_df.groupBy("customer_id").agg(count("order_id").alias("total_orders"))
orders_customers_df.show()

+-----------+------------+
|customer_id|total_orders|
+-----------+------------+
|       C003|           1|
|       C001|           2|
|       C002|           1|
|       C005|           1|
+-----------+------------+



15. Average order value per customer

In [71]:
average_order_value_df = orders_df.withColumn("amount", col("amount").cast("double")) \
    .groupBy("customer_id") \
    .agg(avg("amount").alias("average_order_value"))
average_order_value_df.show()

+-----------+-------------------+
|customer_id|average_order_value|
+-----------+-------------------+
|       C003|            45000.0|
|       C001|            28750.0|
|       C002|                0.0|
|       C005|             1200.0|
+-----------+-------------------+



16. Identify sellers with zero delivered orders

In [72]:
zero_delivery_sellers_df = orders_products_df.filter(col("status") == "Delivered")
zero_delivery_sellers_df.show()

+----------+--------+-----------+----------+---------+------+----------+-----------+---------+-----+
|product_id|order_id|customer_id|order_date|   status|amount|   product|   category|seller_id|price|
+----------+--------+-----------+----------+---------+------+----------+-----------+---------+-----+
|      P001|    O001|       C001|2024-01-05|Delivered| 55000|    Laptop|Electronics|     S001|55000|
|      P002|    O005|       C001|2024-01-10|Delivered|  2500|Headphones|Electronics|     S001| 2500|
|      P003|    O006|       C005|2024-01-12|Delivered|  1200|   T-Shirt|    Fashion|     S002| 1200|
|      P004|    O003|       C003|2024-01-06|Delivered| 45000|      Sofa|       Home|     S003|45000|
+----------+--------+-----------+----------+---------+------+----------+-----------+---------+-----+



#PART D — WINDOW FUNCTIONS

17. Rank customers by total spend (overall)

In [73]:
from pyspark.sql.window import Window
from pyspark.sql.functions import sum, col, rank

# Calculate total spend per customer
total_spend_per_customer_df = orders_products_df.groupBy("customer_id") \
    .agg(sum(col("amount").cast("double")).alias("total_spend"))

# Define a window specification to rank customers by total spend
window_spec = Window.orderBy(col("total_spend").desc())

# Apply the rank function
customer_spend_rank_df = total_spend_per_customer_df.withColumn("spend_rank", rank().over(window_spec))

customer_spend_rank_df.show()

+-----------+-----------+----------+
|customer_id|total_spend|spend_rank|
+-----------+-----------+----------+
|       C001|    57500.0|         1|
|       C003|    45000.0|         2|
|       C005|     1200.0|         3|
|       C002|        0.0|         4|
+-----------+-----------+----------+



18. Rank sellers by revenue within each category

In [75]:
from pyspark.sql.window import Window
from pyspark.sql.functions import sum, col, rank

revenue_per_seller_category_df = orders_products_df.groupBy("category", "seller_id") \
    .agg(sum(col("amount").cast("double")).alias("total_revenue"))

window_spec_category = Window.partitionBy("category").orderBy(col("total_revenue").desc())

seller_category_rank_df = revenue_per_seller_category_df.withColumn("category_rank", rank().over(window_spec_category))

seller_category_rank_df.show()

+-----------+---------+-------------+-------------+
|   category|seller_id|total_revenue|category_rank|
+-----------+---------+-------------+-------------+
|Electronics|     S001|      57500.0|            1|
|    Fashion|     S002|       1200.0|            1|
|       Home|     S003|      45000.0|            1|
+-----------+---------+-------------+-------------+



19. Calculate running revenue per day

In [77]:
from pyspark.sql.window import Window
from pyspark.sql.functions import sum, col, asc

daily_revenue_df = orders_products_df.withColumn("amount", col("amount").cast("double")) \
    .groupBy("order_date") \
    .agg(sum("amount").alias("daily_revenue"))


window_spec_daily = Window.orderBy(asc("order_date"))

running_revenue_df = daily_revenue_df.withColumn("running_revenue", sum("daily_revenue").over(window_spec_daily))

running_revenue_df.show()

+----------+-------------+---------------+
|order_date|daily_revenue|running_revenue|
+----------+-------------+---------------+
|2024-01-05|      55000.0|        55000.0|
|2024-01-06|      45000.0|       100000.0|
|2024-01-10|       2500.0|       102500.0|
|2024-01-12|       1200.0|       103700.0|
+----------+-------------+---------------+



20. Identify top 2 products per category by revenue

In [79]:
from pyspark.sql.window import Window
from pyspark.sql.functions import sum, col, rank

product_revenue_per_category_df = orders_products_df.groupBy("category", "product_id", "product") \
    .agg(sum(col("amount").cast("double")).alias("product_revenue"))

window_spec_product_rank = Window.partitionBy("category").orderBy(col("product_revenue").desc())

top_2_products_per_category_df = product_revenue_per_category_df.withColumn("rank", rank().over(window_spec_product_rank)) \
    .filter(col("rank") <= 2)

top_2_products_per_category_df.show()

+-----------+----------+----------+---------------+----+
|   category|product_id|   product|product_revenue|rank|
+-----------+----------+----------+---------------+----+
|Electronics|      P001|    Laptop|        55000.0|   1|
|Electronics|      P002|Headphones|         2500.0|   2|
|    Fashion|      P003|   T-Shirt|         1200.0|   1|
|       Home|      P004|      Sofa|        45000.0|   1|
+-----------+----------+----------+---------------+----+



#PART E — UDF (ONLY IF REQUIRED)

21. Classify customers into spending tiers:
High
Medium
Low

Rules:
Prefer built-in functions
Use UDF only if unavoidable
Justify your choice

In [81]:
from pyspark.sql.functions import col, when
customer_spending_tiers_df = total_spend_per_customer_df.withColumn(
    "spending_tier",
    when(col("total_spend") > 10000, "High")
    .when((col("total_spend") > 1000) & (col("total_spend") <= 10000), "Medium")
    .otherwise("Low")
)

print("Customers classified into spending tiers:")
customer_spending_tiers_df.show()

# Justification for not using a UDF:
# PySpark's `when().otherwise()` provides native, optimized functionality for conditional logic.
# It is executed within the Spark engine, benefiting from Catalyst Optimizer and Tungsten execution engine,
# leading to significantly better performance compared to Python UDFs. UDFs involve serialization/deserialization
# overhead and context switching between JVM and Python, which can be very slow for large datasets.
# Since `when().otherwise()` perfectly handles the tier classification logic, a UDF is unnecessary and less efficient.

Customers classified into spending tiers:
+-----------+-----------+-------------+
|customer_id|total_spend|spending_tier|
+-----------+-----------+-------------+
|       C003|    45000.0|         High|
|       C005|     1200.0|       Medium|
|       C001|    57500.0|         High|
|       C002|        0.0|          Low|
+-----------+-----------+-------------+



#PART F — SORTING & ORDERING

22. Sort categories by total revenue (descending)

In [85]:
from pyspark.sql.functions import desc

sorted_categories_by_revenue_df = revenue_category_df.orderBy(desc("total_revenue"))
sorted_categories_by_revenue_df.show()

+-----------+-------------+
|   category|total_revenue|
+-----------+-------------+
|Electronics|      57500.0|
|       Home|      45000.0|
|    Fashion|       1200.0|
+-----------+-------------+



23. Sort sellers by revenue within category

In [84]:
from pyspark.sql.functions import col
sorted_sellers_by_category_revenue_df = seller_category_rank_df.orderBy(col("category").asc(), col("total_revenue").desc())

print("Sellers sorted by revenue within each category:")
sorted_sellers_by_category_revenue_df.show()

Sellers sorted by revenue within each category:
+-----------+---------+-------------+-------------+
|   category|seller_id|total_revenue|category_rank|
+-----------+---------+-------------+-------------+
|Electronics|     S001|      57500.0|            1|
|    Fashion|     S002|       1200.0|            1|
|       Home|     S003|      45000.0|            1|
+-----------+---------+-------------+-------------+



24. Explain why sorting caused a shuffle

In [86]:
# Sorting a DataFrame in Spark often triggers a 'shuffle' operation.
# A shuffle is the process of redistributing data across partitions (and potentially across machines in a cluster).
# This is necessary because to perform a global sort (or even a sort within groups if data is not pre-partitioned
# or pre-sorted), all data relevant to a specific sort key range might need to be collected on the same partition.
# For example, when sorting categories by total revenue, Spark needs to know the total revenue for all categories
# to correctly order them. If different parts of a category's data reside on different partitions,
# Spark must move this data to ensure a consistent global order. This involves serializing data,
# sending it over the network, and deserializing it on the receiving end, which is a resource-intensive operation.

#PART G — SET OPERATIONS

Create two DataFrames:

Customers who placed orders
Customers who were active (search/view)

In [87]:
ordered_customers_df = orders_df.select("customer_id").distinct()
active_customers_df = activity_df.filter(col("actions").isNotNull()).select("customer_id").distinct()

25. Find customers who were active but never ordered

In [88]:
active_customers_df.subtract(ordered_customers_df).show()

+-----------+
|customer_id|
+-----------+
+-----------+



26. Find customers who ordered and were active

In [89]:
active_customers_df.intersect(ordered_customers_df).show()

+-----------+
|customer_id|
+-----------+
|       C003|
|       C005|
|       C001|
|       C002|
+-----------+



27. Explain why set operations differ from joins

In [90]:
# Differences between Set Operations and Joins:
#
# Set Operations (UNION, INTERSECT, EXCEPT/SUBTRACT):
# - Operate on the *rows* of DataFrames.
# - Require the DataFrames to have a compatible schema (same number of columns, same column names, and compatible data types).
# - Combine or compare rows based on their *entire content*.
# - The result has the same schema as the input DataFrames.
#
# Join Operations (INNER, LEFT, RIGHT, FULL, ANTI, SEMI):
# - Combine *columns* from two DataFrames.
# - Combine data based on a *common key* or a specified condition.
# - Typically result in a wider DataFrame (more columns) by merging information from both DataFrames.
# - The schema of the result is a combination of the schemas of the input DataFrames (excluding duplicate join keys if specified).

print("\n--- Set Operations (Operating on rows) ---")
print("Customers active but never ordered (using subtract):")
active_customers_df.subtract(ordered_customers_df).show()

print("Customers who ordered AND were active (using intersect):")
active_customers_df.intersect(ordered_customers_df).show()

print("\n--- Join Operations (Operating on columns based on keys) ---")
print("Inner Join: Combining customer and order details for matching customer_ids:")
orders_df.join(customers_df, "customer_id", "inner").show()

print("Left Anti Join: Customers from 'customers_df' who are NOT in 'orders_df' (different from subtract, key-based):")
customers_df.join(orders_df, "customer_id", "left_anti").show()


--- Set Operations (Operating on rows) ---
Customers active but never ordered (using subtract):
+-----------+
|customer_id|
+-----------+
+-----------+

Customers who ordered AND were active (using intersect):
+-----------+
|customer_id|
+-----------+
|       C003|
|       C005|
|       C001|
|       C002|
+-----------+


--- Join Operations (Operating on columns based on keys) ---
Inner Join: Combining customer and order details for matching customer_ids:
+-----------+--------+----------+----------+---------+------+-----+----+---------+--------------------+
|customer_id|order_id|product_id|order_date|   status|amount| name| age|     city|           interests|
+-----------+--------+----------+----------+---------+------+-----+----+---------+--------------------+
|       C003|    O003|      P004|2024-01-06|Delivered| 45000| Aman|NULL|   Mumbai|[[Home,  Electron...|
|       C001|    O001|      P001|2024-01-05|Delivered| 55000|Rahul|  29|Bangalore|[Electronics, Fas...|
|       C002|    O

PART H — DAG & PERFORMANCE ANALYSIS

28. Run explain(True) for:
Product → Seller join
Window ranking
Sorting

In [91]:
products_seller_df.explain(True)
customer_spend_rank_df.explain(True)
sorted_categories_by_revenue_df.explain(True)

== Parsed Logical Plan ==
'Join UsingJoin(Inner, [seller_id])
:- Project [product_id#175, product#176, category#177, seller_id#178, cast(price#245 as int) AS price#246]
:  +- Project [product_id#175, product#176, category#177, seller_id#178, start_date#179 AS price#245]
:     +- LogicalRDD [product_id#175, product#176, category#177, seller_id#178, start_date#179], false
+- ResolvedHint (strategy=broadcast)
   +- Project [seller_id#171, seller_name#172, category#173, coalesce(to_date(try_to_timestamp(start_date#174, Some(yyyy-MM-dd), TimestampType, Some(Etc/UTC), false), None, Some(Etc/UTC), true), to_date(try_to_timestamp(start_date#174, Some(dd/MM/yyyy), TimestampType, Some(Etc/UTC), false), None, Some(Etc/UTC), true), to_date(try_to_timestamp(start_date#174, Some(yyyy/MM/dd), TimestampType, Some(Etc/UTC), false), None, Some(Etc/UTC), true)) AS start_date#397]
      +- LogicalRDD [seller_id#171, seller_name#172, category#173, start_date#174], false

== Analyzed Logical Plan ==
seller_

29. Identify:
Shuffles
Broadcast joins
Sort stages

In [None]:
#Shuffles: GroupBy, sort
#Broadcast joins: Seller join
#Sort stages: Window+OrderBy

30. Suggest one performance improvement

In [93]:
# Performance Improvement Suggestion:
# Cache the 'orders_products_df' DataFrame.
# This DataFrame is the result of a join and is used multiple times in subsequent calculations
# (e.g., total revenue per category/seller, running revenue, top products).
# Caching it will prevent Spark from recomputing this DataFrame every time it's accessed.
orders_products_df.cache()

DataFrame[product_id: string, order_id: string, customer_id: string, order_date: date, status: string, amount: string, product: string, category: string, seller_id: string, price: int]