<a href="https://colab.research.google.com/github/codingniket/Python-Training/blob/main/24_12_2025/multi-category%20online%20marketplace.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

https://drive.google.com/file/d/1KKg-Ue_-SKaoyJkZCi2tyUwpcQcxS7V0/view

In [47]:
orders_data = [
    ("ORD001","C001","Delhi ","Electronics","Laptop","45000","2024-01-05","Completed"),
    ("ORD002","C002","Mumbai","Electronics","Mobile ","32000","05/01/2024","Completed"),
    ("ORD003","C003","Bangalore","Electronics","Tablet","30000","2024/01/06","Completed"),
    ("ORD004","C004","Delhi","Electronics","Laptop","","2024-01-07","Cancelled"),
    ("ORD005","C005","Chennai","Electronics","Mobile","invalid","2024-01-08","Completed"),
    ("ORD006","C006","Mumbai","Home","Mixer",None,"2024-01-08","Completed"),
    ("ORD007","C001","Delhi","Electronics","Laptop","47000","09-01-2024","Completed"),
    ("ORD008","C007","Bangalore","Home","Vacuum","28000","2024-01-09","Completed"),
    ("ORD009","C002","Mumbai","Electronics","Laptop","55000","2024-01-10","Completed"),
    ("ORD010","C008","Delhi","Home","AirPurifier","38000","2024-01-10","Completed"),
    ("ORD011","C009","Mumbai","Home","Vacuum","29000","2024-01-11","Completed"),
    ("ORD012","C010","Bangalore","Electronics","Mobile","33000","2024-01-11","Completed"),
    ("ORD013","C003","Bangalore","Home","Mixer","21000","2024-01-12","Completed"),
    ("ORD014","C004","Delhi","Electronics","Tablet","26000","2024-01-12","Completed"),
    ("ORD015","C005","Chennai","Electronics","Laptop","62000","2024-01-13","Completed"),
    ("ORD016","C006","Mumbai","Home","AirPurifier","40000","2024-01-13","Completed"),
    ("ORD017","C007","Bangalore","Electronics","Laptop","51000","2024-01-14","Completed"),
    ("ORD018","C008","Delhi","Home","Vacuum","31000","2024-01-14","Completed"),
    ("ORD019","C009","Mumbai","Electronics","Tablet","29000","2024-01-15","Completed"),
    ("ORD020","C010","Bangalore","Electronics","Laptop","54000","2024-01-15","Completed"),
    ("ORD020","C010","Bangalore","Electronics","Laptop","54000","2024-01-15","Completed")
]


In [48]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
from pyspark.sql.functions import trim, col, when, to_date, sum as spark_sum, avg, desc, rank, lit, coalesce, isnull,try_to_timestamp,regexp_extract
from pyspark.sql.window import Window

spark = SparkSession.builder.appName("OnlineMarketExercise").getOrCreate()

In [49]:
schema = StructType([
    StructField("order_id", StringType(), True),
    StructField("customer_id", StringType(), True),
    StructField("city", StringType(), True),
    StructField("category", StringType(), True),
    StructField("product", StringType(), True),
    StructField("amount", StringType(), True),
    StructField("order_date", StringType(), True),
    StructField("status", StringType(), True)
])

In [50]:
df = spark.createDataFrame(orders_data, schema)

original_count = df.count()

In [51]:
original_count

21

In [52]:
df.printSchema()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)



In [53]:
string_columns = ["order_id", "customer_id", "city", "category", "product", "amount", "order_date", "status"]
for column in string_columns:
    df = df.withColumn(column, trim(col(column)))

In [54]:
numeric_price_str = regexp_extract(col("amount"), r"(\d+)", 0)
df = df.withColumn("amount",when((numeric_price_str == "") | numeric_price_str.isNull(), lit(0)).otherwise(numeric_price_str.cast('int')))

In [55]:
df = df.withColumn(
    "order_date",
    coalesce(
        to_date(try_to_timestamp(col("order_date"), lit("yyyy-MM-dd"))),
        to_date(try_to_timestamp(col("order_date"), lit("dd/MM/yyyy"))),
        to_date(try_to_timestamp(col("order_date"), lit("yyyy/MM/dd")))
    )
)

In [56]:
df.show()

+--------+-----------+---------+-----------+-----------+------+----------+---------+
|order_id|customer_id|     city|   category|    product|amount|order_date|   status|
+--------+-----------+---------+-----------+-----------+------+----------+---------+
|  ORD001|       C001|    Delhi|Electronics|     Laptop| 45000|2024-01-05|Completed|
|  ORD002|       C002|   Mumbai|Electronics|     Mobile| 32000|2024-01-05|Completed|
|  ORD003|       C003|Bangalore|Electronics|     Tablet| 30000|2024-01-06|Completed|
|  ORD004|       C004|    Delhi|Electronics|     Laptop|     0|2024-01-07|Cancelled|
|  ORD005|       C005|  Chennai|Electronics|     Mobile|     0|2024-01-08|Completed|
|  ORD006|       C006|   Mumbai|       Home|      Mixer|     0|2024-01-08|Completed|
|  ORD007|       C001|    Delhi|Electronics|     Laptop| 47000|      NULL|Completed|
|  ORD008|       C007|Bangalore|       Home|     Vacuum| 28000|2024-01-09|Completed|
|  ORD009|       C002|   Mumbai|Electronics|     Laptop| 55000|20

In [57]:
df = df.dropDuplicates(["order_id"])

In [58]:
df = df.filter(col("status") == "Completed")

df = df.filter(col("amount").isNotNull())
df = df.filter(col("order_date").isNotNull())

cleaned_df = df
cleaned_count = cleaned_df.count()

In [59]:
# 11
print(f"Records before cleaning: {original_count}")
print(f"Records after cleaning: {cleaned_count}")

# 12
null_counts = cleaned_df.filter(
    col("order_id").isNull() | col("amount").isNull() | col("order_date").isNull()
).count()
print(f"No nulls in key columns: {null_counts == 0}")

# 13
cleaned_df.printSchema()

# 14
cleaned_df.groupBy("city").agg(spark_sum("amount").alias("total_revenue")).show()

# 15
cleaned_df.groupBy("category").agg(spark_sum("amount").alias("total_revenue")).show()

# 16
cleaned_df.groupBy("product").agg(spark_sum("amount").alias("total_revenue")).show()

# 17
cleaned_df.groupBy("city").agg(avg("amount").alias("avg_order_value")).show()

Records before cleaning: 21
Records after cleaning: 18
No nulls in key columns: True
root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- status: string (nullable = true)

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|Bangalore|       217000|
|  Chennai|        62000|
|   Mumbai|       185000|
|    Delhi|       140000|
+---------+-------------+

+-----------+-------------+
|   category|total_revenue|
+-----------+-------------+
|       Home|       187000|
|Electronics|       417000|
+-----------+-------------+

+-----------+-------------+
|    product|total_revenue|
+-----------+-------------+
|     Vacuum|        88000|
|AirPurifier|        78000|
|     Laptop|       267000|
|      Mixer|        21000|
|     Mobile|       

In [60]:
# 18
cleaned_df.groupBy("product").agg(spark_sum("amount").alias("revenue")).orderBy(desc("revenue")).limit(3).show()

# 19
city_revenue = cleaned_df.groupBy("city").agg(spark_sum("amount").alias("total_revenue"))
window_spec_city = Window.orderBy(desc("total_revenue"))
city_revenue.withColumn("rank", rank().over(window_spec_city)).show()

# 20
product_revenue = cleaned_df.groupBy("category", "product").agg(spark_sum("amount").alias("revenue"))
window_spec_product = Window.partitionBy("category").orderBy(desc("revenue"))
product_revenue.withColumn("rank", rank().over(window_spec_product)).show()

# 21
product_revenue.withColumn("rank", rank().over(window_spec_product)).filter(col("rank") == 1).show()

# 22
cleaned_df.cache()

# 23
cleaned_df.groupBy("city").agg(spark_sum("amount").alias("total_revenue")).show()
cleaned_df.groupBy("category").agg(spark_sum("amount").alias("total_revenue")).show()
cleaned_df.groupBy("product").agg(spark_sum("amount").alias("total_revenue")).show()

# 24
agg_df = cleaned_df.groupBy("city").agg(spark_sum("amount").alias("total_revenue"))
agg_df.explain(True)

+-------+-------+
|product|revenue|
+-------+-------+
| Laptop| 267000|
| Vacuum|  88000|
| Tablet|  85000|
+-------+-------+

+---------+-------------+----+
|     city|total_revenue|rank|
+---------+-------------+----+
|Bangalore|       217000|   1|
|   Mumbai|       185000|   2|
|    Delhi|       140000|   3|
|  Chennai|        62000|   4|
+---------+-------------+----+

+-----------+-----------+-------+----+
|   category|    product|revenue|rank|
+-----------+-----------+-------+----+
|Electronics|     Laptop| 267000|   1|
|Electronics|     Tablet|  85000|   2|
|Electronics|     Mobile|  65000|   3|
|       Home|     Vacuum|  88000|   1|
|       Home|AirPurifier|  78000|   2|
|       Home|      Mixer|  21000|   3|
+-----------+-----------+-------+----+

+-----------+-------+-------+----+
|   category|product|revenue|rank|
+-----------+-------+-------+----+
|Electronics| Laptop| 267000|   1|
|       Home| Vacuum|  88000|   1|
+-----------+-------+-------+----+

+---------+-----------

In [61]:
# 25
repartitioned_df = cleaned_df.repartition("city")
# Repartitioning by city groups data by city in partitions, reducing shuffle during city-based operations like groupBy("city")

# 26
cleaned_df.write.mode("overwrite").parquet("cleaned_orders.parquet")

# 27
city_revenue.write.mode("overwrite").orc("analytics.orc")

# 28
parquet_read = spark.read.parquet("cleaned_orders.parquet")
parquet_read.printSchema()
orc_read = spark.read.orc("analytics.orc")
orc_read.printSchema()

# 29
# The line is incorrect because .show() prints the DataFrame but returns None, so assigning it back to df sets df to None, losing the filtered data.

# 30
df = df.filter(df.amount > 30000)
df.show()

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- status: string (nullable = true)

root
 |-- city: string (nullable = true)
 |-- total_revenue: long (nullable = true)

+--------+-----------+---------+-----------+-----------+------+----------+---------+
|order_id|customer_id|     city|   category|    product|amount|order_date|   status|
+--------+-----------+---------+-----------+-----------+------+----------+---------+
|  ORD018|       C008|    Delhi|       Home|     Vacuum| 31000|2024-01-14|Completed|
|  ORD001|       C001|    Delhi|Electronics|     Laptop| 45000|2024-01-05|Completed|
|  ORD002|       C002|   Mumbai|Electronics|     Mobile| 32000|2024-01-05|Completed|
|  ORD017|       C007|Bangalore|Electronics|     Laptop| 51000|2024-01-14|Completed|