In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
from pyspark.sql.functions import trim, col, when, to_date, sum as spark_sum, avg, desc, rank, lit, coalesce, isnull,try_to_timestamp,regexp_extract,initcap
from pyspark.sql.window import Window

# Create (or reuse) the Spark session that every cell below works through.
spark = (
    SparkSession.builder
    .appName("OnlineMarketExercise")
    .getOrCreate()
)

In [3]:
# Q1. Load the raw orders CSV without failing on malformed values.
# Schema inference is switched off, so every column arrives as a string and
# bad values can be handled explicitly in the cleaning cells.
df_raw = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "false")
    .csv("orders_raw.csv")
)

In [4]:
# Preview the raw rows exactly as read (all columns are still strings).
df_raw.show()

+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+
|   order_id|customer_id|       city|   category|    product| amount|order_date|   status|
+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+
|ORD00000000|    C000000| hyderabad |   grocery |       Oil |invalid|01/01/2024|Cancelled|
|ORD00000001|    C000001|       Pune|    Grocery|      Sugar|  35430|2024-01-02|Completed|
|ORD00000002|    C000002|       Pune|Electronics|     Mobile|  65358|2024-01-03|Completed|
|ORD00000003|    C000003|  Bangalore|Electronics|     Laptop|   5558|2024-01-04|Completed|
|ORD00000004|    C000004|       Pune|       Home|AirPurifier|  33659|2024-01-05|Completed|
|ORD00000005|    C000005|      Delhi|    Fashion|      Jeans|   8521|2024-01-06|Completed|
|ORD00000006|    C000006|      Delhi|    Grocery|      Sugar|  42383|2024-01-07|Completed|
|ORD00000007|    C000007|       Pune|    Grocery|       Rice|  45362|2024-01-08|Completed|

# Q2
If we declare a specific data type and a column contains an arbitrary (non-conforming) value, parsing that value will fail; to protect the data we read every column as StringType.

In [6]:
# Q3. Show the schema of the raw frame and how many rows it contains.
print("Raw Schema:")
df_raw.printSchema()
raw_record_count = df_raw.count()
print(f"Total Raw Records: {raw_record_count}")

Raw Schema:
root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)

Total Raw Records: 300000


In [10]:
# Q4 & Q5. Normalise the free-text columns: strip surrounding whitespace and
# apply initcap casing. Note that initcap also lower-cases the rest of each
# word (e.g. "AirPurifier" becomes "Airpurifier").
cols_to_clean = ["city", "category", "product", "status"]
df_cleaned_str = df_raw.withColumns(
    {name: initcap(trim(col(name))) for name in cols_to_clean}
)

In [11]:
# Preview the rows after trimming and case normalisation.
df_cleaned_str.show()

+-----------+-----------+---------+-----------+-----------+-------+----------+---------+
|   order_id|customer_id|     city|   category|    product| amount|order_date|   status|
+-----------+-----------+---------+-----------+-----------+-------+----------+---------+
|ORD00000000|    C000000|Hyderabad|    Grocery|        Oil|invalid|01/01/2024|Cancelled|
|ORD00000001|    C000001|     Pune|    Grocery|      Sugar|  35430|2024-01-02|Completed|
|ORD00000002|    C000002|     Pune|Electronics|     Mobile|  65358|2024-01-03|Completed|
|ORD00000003|    C000003|Bangalore|Electronics|     Laptop|   5558|2024-01-04|Completed|
|ORD00000004|    C000004|     Pune|       Home|Airpurifier|  33659|2024-01-05|Completed|
|ORD00000005|    C000005|    Delhi|    Fashion|      Jeans|   8521|2024-01-06|Completed|
|ORD00000006|    C000006|    Delhi|    Grocery|      Sugar|  42383|2024-01-07|Completed|
|ORD00000007|    C000007|     Pune|    Grocery|       Rice|  45362|2024-01-08|Completed|
|ORD00000008|    C000

In [14]:
# Q6. Convert `amount` to an integer.
# regexp_extract yields the first run of digits in the value, or an empty
# string when there is none (e.g. "invalid"); those rows are given 0.
# NOTE(review): mapping invalid amounts to 0 keeps them in later sums and
# averages — confirm this is preferred over NULL.
numeric_price_str = regexp_extract(col("amount"), r"(\d+)", 0)
has_no_digits = numeric_price_str.isNull() | (numeric_price_str == "")
amount_as_int = when(has_no_digits, lit(0)).otherwise(numeric_price_str.cast("int"))
df_casted_amount = df_cleaned_str.withColumn("amount", amount_as_int)

In [15]:
# Preview the rows after `amount` has been converted to an integer.
df_casted_amount.show()

+-----------+-----------+---------+-----------+-----------+------+----------+---------+
|   order_id|customer_id|     city|   category|    product|amount|order_date|   status|
+-----------+-----------+---------+-----------+-----------+------+----------+---------+
|ORD00000000|    C000000|Hyderabad|    Grocery|        Oil|     0|01/01/2024|Cancelled|
|ORD00000001|    C000001|     Pune|    Grocery|      Sugar| 35430|2024-01-02|Completed|
|ORD00000002|    C000002|     Pune|Electronics|     Mobile| 65358|2024-01-03|Completed|
|ORD00000003|    C000003|Bangalore|Electronics|     Laptop|  5558|2024-01-04|Completed|
|ORD00000004|    C000004|     Pune|       Home|Airpurifier| 33659|2024-01-05|Completed|
|ORD00000005|    C000005|    Delhi|    Fashion|      Jeans|  8521|2024-01-06|Completed|
|ORD00000006|    C000006|    Delhi|    Grocery|      Sugar| 42383|2024-01-07|Completed|
|ORD00000007|    C000007|     Pune|    Grocery|       Rice| 45362|2024-01-08|Completed|
|ORD00000008|    C000008|Bangalo

In [18]:
# Q7. Parse `order_date`, which arrives in more than one layout.
# Each pattern is tried in order; try_to_timestamp gives NULL instead of an
# error on a mismatch, so coalesce keeps the first pattern that parses.
date_patterns = ["yyyy-MM-dd", "dd/MM/yyyy", "yyyy/MM/dd"]
parsed_candidates = [
    to_date(try_to_timestamp(col("order_date"), lit(pattern)))
    for pattern in date_patterns
]
df_casted_date = df_casted_amount.withColumn(
    "order_date",
    coalesce(*parsed_candidates),
)

In [19]:
# Preview the rows after `order_date` has been parsed into a date.
df_casted_date.show()

+-----------+-----------+---------+-----------+-----------+------+----------+---------+
|   order_id|customer_id|     city|   category|    product|amount|order_date|   status|
+-----------+-----------+---------+-----------+-----------+------+----------+---------+
|ORD00000000|    C000000|Hyderabad|    Grocery|        Oil|     0|2024-01-01|Cancelled|
|ORD00000001|    C000001|     Pune|    Grocery|      Sugar| 35430|2024-01-02|Completed|
|ORD00000002|    C000002|     Pune|Electronics|     Mobile| 65358|2024-01-03|Completed|
|ORD00000003|    C000003|Bangalore|Electronics|     Laptop|  5558|2024-01-04|Completed|
|ORD00000004|    C000004|     Pune|       Home|Airpurifier| 33659|2024-01-05|Completed|
|ORD00000005|    C000005|    Delhi|    Fashion|      Jeans|  8521|2024-01-06|Completed|
|ORD00000006|    C000006|    Delhi|    Grocery|      Sugar| 42383|2024-01-07|Completed|
|ORD00000007|    C000007|     Pune|    Grocery|       Rice| 45362|2024-01-08|Completed|
|ORD00000008|    C000008|Bangalo

In [28]:
# Q8. Keep only rows whose amount and order_date are both non-null.
# NOTE(review): `.drop("order_date")` removes the whole column (not rows), so
# the parsed date is unavailable to every later cell — confirm this is intended.
# NOTE(review): Q6 replaces unparseable amounts with 0, so the `amount` null
# check presumably removes few rows — verify.
df_valid = (
    df_casted_date
    .dropna(subset=["amount", "order_date"])
    .drop("order_date")
)

# Count once and reuse the value: every count() call launches its own Spark job.
# `count_before` is the baseline for the business-rule drop report in Q11.
count_before = df_valid.count()
print(f"Count after cleaning: {count_before}")

Count after cleaning: 297405


In [24]:
# Preview the validated rows (the order_date column has been dropped by now).
df_valid.show()

+-----------+-----------+---------+-----------+-----------+------+---------+
|   order_id|customer_id|     city|   category|    product|amount|   status|
+-----------+-----------+---------+-----------+-----------+------+---------+
|ORD00000000|    C000000|Hyderabad|    Grocery|        Oil|     0|Cancelled|
|ORD00000001|    C000001|     Pune|    Grocery|      Sugar| 35430|Completed|
|ORD00000002|    C000002|     Pune|Electronics|     Mobile| 65358|Completed|
|ORD00000003|    C000003|Bangalore|Electronics|     Laptop|  5558|Completed|
|ORD00000004|    C000004|     Pune|       Home|Airpurifier| 33659|Completed|
|ORD00000005|    C000005|    Delhi|    Fashion|      Jeans|  8521|Completed|
|ORD00000006|    C000006|    Delhi|    Grocery|      Sugar| 42383|Completed|
|ORD00000007|    C000007|     Pune|    Grocery|       Rice| 45362|Completed|
|ORD00000008|    C000008|Bangalore|    Fashion|      Jeans| 10563|Completed|
|ORD00000009|    C000009|  Kolkata|Electronics|     Laptop| 63715|Completed|

In [25]:
# Q9. Collapse repeated order ids so that each order_id appears once.
# Which of the duplicate rows survives is not controlled here.
df_dedup = df_valid.drop_duplicates(["order_id"])

In [27]:
# Q10. Retain only orders whose status is "Completed".
is_completed = col("status") == "Completed"
df_final = df_dedup.where(is_completed)

In [29]:
# Q11. Report how many rows de-duplication plus the status filter removed,
# and how many rows remain.
count_after = df_final.count()
dropped_by_rules = count_before - count_after
print(f"Records Dropped by Business Rules: {dropped_by_rules}")
print(f"Final Dataset Count: {count_after}")

Records Dropped by Business Rules: 14870
Final Dataset Count: 282535


In [47]:
# Q12
# Operations that trigger a shuffle (wide transformations): groupBy, distinct/dropDuplicates, join, repartition, orderBy, window functions.

In [36]:
# Q13. Print the physical plan Spark will use to compute df_final.
df_final.explain()

== Physical Plan ==
AdaptiveSparkPlan isFinalPlan=false
+- Filter (isnotnull(status#705) AND (status#705 = Completed))
   +- SortAggregate(key=[order_id#17], functions=[first(customer_id#18, false), first(city#79, false), first(category#80, false), first(product#81, false), first(amount#166, false), first(status#82, false)])
      +- Sort [order_id#17 ASC NULLS FIRST], false, 0
         +- Exchange hashpartitioning(order_id#17, 200), ENSURE_REQUIREMENTS, [plan_id=787]
            +- SortAggregate(key=[order_id#17], functions=[partial_first(customer_id#18, false), partial_first(city#79, false), partial_first(category#80, false), partial_first(product#81, false), partial_first(amount#166, false), partial_first(status#82, false)])
               +- Sort [order_id#17 ASC NULLS FIRST], false, 0
                  +- Project [order_id#17, customer_id#18, initcap(trim(city#19, None)) AS city#79, initcap(trim(category#20, None)) AS category#80, initcap(trim(product#21, None)) AS product#81, CAS

In [30]:
# Q14. Redistribute the rows so that they are hash-partitioned by city.
df_optimized = df_final.repartition("city")


In [31]:
df_optimized.show()

+-----------+-----------+---------+-----------+-----------+------+---------+
|   order_id|customer_id|     city|   category|    product|amount|   status|
+-----------+-----------+---------+-----------+-----------+------+---------+
|ORD00000008|    C000008|Bangalore|    Fashion|      Jeans| 10563|Completed|
|ORD00000010|    C000010|Bangalore|    Grocery|      Sugar| 66576|Completed|
|ORD00000012|    C000012|Bangalore|    Grocery|      Sugar| 84768|Completed|
|ORD00000017|    C000017|Bangalore|    Grocery|        Oil| 69582|Completed|
|ORD00000024|    C000024|Bangalore|       Home|      Mixer| 18082|Completed|
|ORD00000025|    C000025|Bangalore|       Home|Airpurifier| 58248|Completed|
|ORD00000114|    C000114|Bangalore|       Home|Airpurifier|     0|Completed|
|ORD00000116|    C000116|Bangalore|    Grocery|      Sugar|     0|Completed|
|ORD00000124|    C000124|Bangalore|    Grocery|      Sugar| 54296|Completed|
|ORD00000159|    C000159|Bangalore|Electronics|     Tablet| 89397|Completed|

In [32]:
# Q15. Mark df_optimized for caching; the cells below all read from it.
# cache() is lazy: nothing is stored until an action runs on the frame.

df_optimized.cache()

DataFrame[order_id: string, customer_id: string, city: string, category: string, product: string, amount: int, status: string]

In [35]:
# Q16. Total revenue per city, highest first.
# Uses spark_sum (the alias under which the Spark aggregate is imported in the
# first cell); a bare `sum` resolves to Python's builtin on a fresh kernel and
# raises TypeError when given a column name.
(
    df_optimized
    .groupBy("city")
    .agg(spark_sum("amount").alias("total_revenue"))
    .orderBy(col("total_revenue").desc())
    .show()
)

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|     Pune|   1611302685|
|Hyderabad|   1609260488|
|    Delhi|   1602686184|
|Bangalore|   1595093850|
|  Chennai|   1594968796|
|   Mumbai|   1592819957|
|  Kolkata|   1589960718|
+---------+-------------+



In [37]:
# Q17. Total revenue per category, highest first.
# Uses spark_sum (the alias under which the Spark aggregate is imported in the
# first cell); a bare `sum` resolves to Python's builtin on a fresh kernel and
# raises TypeError when given a column name.
(
    df_optimized
    .groupBy("category")
    .agg(spark_sum("amount").alias("total_revenue"))
    .orderBy(col("total_revenue").desc())
    .show()
)

+-----------+-------------+
|   category|total_revenue|
+-----------+-------------+
|       Home|   2808429137|
|    Grocery|   2806779336|
|Electronics|   2806431494|
|    Fashion|   2774452711|
+-----------+-------------+



In [38]:
# Q18. The five products with the highest total revenue.
# Uses spark_sum (the alias under which the Spark aggregate is imported in the
# first cell); a bare `sum` resolves to Python's builtin on a fresh kernel and
# raises TypeError when given a column name.
(
    df_optimized
    .groupBy("product")
    .agg(spark_sum("amount").alias("total_revenue"))
    .orderBy(col("total_revenue").desc())
    .limit(5)
    .show()
)


+-------+-------------+
|product|total_revenue|
+-------+-------------+
|    Oil|    943593995|
| Laptop|    943181599|
| Tablet|    939989279|
| Vacuum|    939626254|
|  Mixer|    937113183|
+-------+-------------+



In [39]:
# Q19. Average order value per city, highest first.
avg_by_city = df_optimized.groupBy("city").agg(avg("amount").alias("avg_order_value"))
avg_by_city.orderBy(desc("avg_order_value")).show()

+---------+------------------+
|     city|   avg_order_value|
+---------+------------------+
|Bangalore|39910.272224585286|
|     Pune| 39758.74565104745|
|    Delhi|39603.790254027874|
|  Kolkata|39558.149876844225|
|   Mumbai|39540.748131966335|
|Hyderabad| 39533.74165970619|
|  Chennai| 39488.22252481989|
+---------+------------------+



In [40]:
# Q20. Rank cities by total revenue (rank 1 = highest revenue).
# Uses spark_sum (the alias under which the Spark aggregate is imported in the
# first cell); a bare `sum` resolves to Python's builtin on a fresh kernel and
# raises TypeError when given a column name.
# The window has no partitionBy, so ranking runs over the whole frame in a
# single partition; that is fine here because the input has one row per city.
city_window = Window.orderBy(col("total_revenue").desc())
df_city_rev = df_optimized.groupBy("city").agg(spark_sum("amount").alias("total_revenue"))
df_city_rank = df_city_rev.withColumn("rank", rank().over(city_window))
df_city_rank.show()


+---------+-------------+----+
|     city|total_revenue|rank|
+---------+-------------+----+
|     Pune|   1611302685|   1|
|Hyderabad|   1609260488|   2|
|    Delhi|   1602686184|   3|
|Bangalore|   1595093850|   4|
|  Chennai|   1594968796|   5|
|   Mumbai|   1592819957|   6|
|  Kolkata|   1589960718|   7|
+---------+-------------+----+



In [41]:
# Q21. Revenue for every (category, product) pair.
# Uses spark_sum (the alias under which the Spark aggregate is imported in the
# first cell); a bare `sum` resolves to Python's builtin on a fresh kernel and
# raises TypeError when given a column name.
df_prod_cat_rev = (
    df_optimized
    .groupBy("category", "product")
    .agg(spark_sum("amount").alias("revenue"))
)

In [50]:
# Q22. Revenue per product, ordered from highest to lowest.
# Uses spark_sum (the alias under which the Spark aggregate is imported in the
# first cell); a bare `sum` resolves to Python's builtin on a fresh kernel and
# raises TypeError when given a column name.
# The chain is parenthesised so it no longer ends in a dangling backslash
# continuation.
top_products = (
    df_optimized
    .groupBy("product")
    .agg(spark_sum("amount").alias("total_revenue"))
    .orderBy(col("total_revenue").desc())
)


In [51]:
# Display the per-product revenue table, highest revenue first.
top_products.show()

+-----------+-------------+
|    product|total_revenue|
+-----------+-------------+
|        Oil|    943593995|
|     Laptop|    943181599|
|     Tablet|    939989279|
|     Vacuum|    939626254|
|      Mixer|    937113183|
|       Rice|    934345709|
|Airpurifier|    931689700|
|      Jeans|    930922473|
|      Sugar|    928839632|
|      Shoes|    926563627|
|     Mobile|    923260616|
|     Tshirt|    916966611|
+-----------+-------------+



In [43]:
# Q23. Persist the cleaned orders as Parquet, partitioned on disk by city.
# "overwrite" replaces any existing output at the target path.
print("Writing cleaned data to Parquet (partitioned by city)...")
cleaned_writer = df_optimized.write.mode("overwrite").partitionBy("city")
cleaned_writer.parquet("output/orders_cleaned_parquet")

Writing cleaned data to Parquet (partitioned by city)...


In [44]:
# Q24. Save the per-city revenue totals in ORC format, replacing any
# existing output at the target path.
(
    df_city_rev.write
    .mode("overwrite")
    .orc("output/city_revenue_orc")
)

In [45]:
# q 25.

1. No Schema Enforcement: CSVs don't store metadata/types, requiring inference on every read.
2. Poor Compression: Text-based formats take up more space than Snappy/Zlib compressed binary formats (Parquet/ORC).
3. No Columnar Pruning: Reading a CSV requires parsing the entire row, whereas Parquet allows reading only specific columns, drastically reducing I/O.

# 26

Explanation:
If 'amount' is still of StringType (before our cleaning step), comparing it to an Integer (50000) forces Spark to cast the column to Integer implicitly.
- If the column contains non-numeric strings (like 'invalid'), the cast fails.
- In ANSI mode (spark.sql.ansi.enabled=true), this throws a NumberFormatException and stops the job.
- Even without ANSI mode, the cast silently converts those values to NULL, so the affected rows drop out of the comparison and the results are incorrect.

27
1. Spark UI: Check 'Stages' to identify which specific stage is lagging.
2. Skew Analysis: Look at the task duration distribution. If Max Duration >> Median Duration, a single task is stuck on skewed data (e.g., one city has 90% of orders).

28
1. Memory Starvation: Caching takes up storage memory. If full, it eats into execution memory, causing shuffles and sorts to spill to disk, slowing down the job.
2. GC Overhead: Managing large cached objects in Java Heap can lead to frequent Garbage Collection pauses.