In [1]:

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Marketplace-Orders-Ingestion") \
    .getOrCreate()

spark.sparkContext.setLogLevel("WARN")


In [2]:

# Raw orders data provided
orders_data = [
    ("ORD001","C001","Delhi ","Electronics","Laptop","45000","2024-01-05","Completed"),
    ("ORD002","C002","Mumbai","Electronics","Mobile ","32000","05/01/2024","Completed"),
    ("ORD003","C003","Bangalore","Electronics","Tablet","30000","2024/01/06","Completed"),
    ("ORD004","C004","Delhi","Electronics","Laptop","","2024-01-07","Cancelled"),
    ("ORD005","C005","Chennai","Electronics","Mobile","invalid","2024-01-08","Completed"),
    ("ORD006","C006","Mumbai","Home","Mixer",None,"2024-01-08","Completed"),
    ("ORD007","C001","Delhi","Electronics","Laptop","47000","09-01-2024","Completed"),
    ("ORD008","C007","Bangalore","Home","Vacuum","28000","2024-01-09","Completed"),
    ("ORD009","C002","Mumbai","Electronics","Laptop","55000","2024-01-10","Completed"),
    ("ORD010","C008","Delhi","Home","AirPurifier","38000","2024-01-10","Completed"),
    ("ORD011","C009","Mumbai","Home","Vacuum","29000","2024-01-11","Completed"),
    ("ORD012","C010","Bangalore","Electronics","Mobile","33000","2024-01-11","Completed"),
    ("ORD013","C003","Bangalore","Home","Mixer","21000","2024-01-12","Completed"),
    ("ORD014","C004","Delhi","Electronics","Tablet","26000","2024-01-12","Completed"),
    ("ORD015","C005","Chennai","Electronics","Laptop","62000","2024-01-13","Completed"),
    ("ORD016","C006","Mumbai","Home","AirPurifier","40000","2024-01-13","Completed"),
    ("ORD017","C007","Bangalore","Electronics","Laptop","51000","2024-01-14","Completed"),
    ("ORD018","C008","Delhi","Home","Vacuum","31000","2024-01-14","Completed"),
    ("ORD019","C009","Mumbai","Electronics","Tablet","29000","2024-01-15","Completed"),
    ("ORD020","C010","Bangalore","Electronics","Laptop","54000","2024-01-15","Completed"),
    ("ORD020","C010","Bangalore","Electronics","Laptop","54000","2024-01-15","Completed")
]


In [6]:
#Define an explicit schema
from pyspark.sql.types import StructType, StructField, StringType

orders_schema = StructType([
    StructField("order_id",   StringType(), True),
    StructField("customer_id",StringType(), True),
    StructField("city",       StringType(), True),
    StructField("category",   StringType(), True),
    StructField("product",    StringType(), True),
    StructField("amount",     StringType(), True),
    StructField("order_date", StringType(), True),
    StructField("status",     StringType(), True)
])


In [7]:
#Create a DataFrame using the schema
raw_df = spark.createDataFrame(orders_data, schema=orders_schema)

In [8]:
#Print and verify schema
raw_df.printSchema()
# Expected: all StringType

print("Raw record count:", raw_df.count())
raw_df.show(truncate=False)


root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)

Raw record count: 21
+--------+-----------+---------+-----------+-----------+-------+----------+---------+
|order_id|customer_id|city     |category   |product    |amount |order_date|status   |
+--------+-----------+---------+-----------+-----------+-------+----------+---------+
|ORD001  |C001       |Delhi    |Electronics|Laptop     |45000  |2024-01-05|Completed|
|ORD002  |C002       |Mumbai   |Electronics|Mobile     |32000  |05/01/2024|Completed|
|ORD003  |C003       |Bangalore|Electronics|Tablet     |30000  |2024/01/06|Completed|
|ORD004  |C004       |Delhi    |Electronics|Laptop     |       |2024-01-07|Cancelled|
|ORD005  |C005       |Chennai  |Electronics|Mobile     

In [10]:

# PHASE 2 — DATA CLEANING
# -----------------------
from pyspark.sql.types import StringType
from pyspark.sql.functions import (
    col, trim, lower, initcap, regexp_replace, when, to_date, coalesce, length
)
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

df = raw_df


In [12]:
#task4
from pyspark.sql.types import StringType
from pyspark.sql.functions import col, trim

df_q4 = raw_df

string_cols = [f.name for f in df_q4.schema.fields if isinstance(f.dataType, StringType)]
df_q4 = df_q4.select([trim(col(c)).alias(c) if c in string_cols else col(c) for c in df_q4.columns])

df_q4.show(truncate=False)


+--------+-----------+---------+-----------+-----------+-------+----------+---------+
|order_id|customer_id|city     |category   |product    |amount |order_date|status   |
+--------+-----------+---------+-----------+-----------+-------+----------+---------+
|ORD001  |C001       |Delhi    |Electronics|Laptop     |45000  |2024-01-05|Completed|
|ORD002  |C002       |Mumbai   |Electronics|Mobile     |32000  |05/01/2024|Completed|
|ORD003  |C003       |Bangalore|Electronics|Tablet     |30000  |2024/01/06|Completed|
|ORD004  |C004       |Delhi    |Electronics|Laptop     |       |2024-01-07|Cancelled|
|ORD005  |C005       |Chennai  |Electronics|Mobile     |invalid|2024-01-08|Completed|
|ORD006  |C006       |Mumbai   |Home       |Mixer      |NULL   |2024-01-08|Completed|
|ORD007  |C001       |Delhi    |Electronics|Laptop     |47000  |09-01-2024|Completed|
|ORD008  |C007       |Bangalore|Home       |Vacuum     |28000  |2024-01-09|Completed|
|ORD009  |C002       |Mumbai   |Electronics|Laptop    

In [13]:
#5
from pyspark.sql.functions import col, regexp_replace, lower, initcap

df_q5 = df_q4  # use the output from Q4

for c in ["city", "category", "product"]:
    col_norm = regexp_replace(col(c), r"\s+", " ")                       # collapse multiple spaces
    col_norm = regexp_replace(col_norm, r"(?<=[a-z])(?=[A-Z])", " ")     # split camelCase/PascalCase
    col_norm = initcap(lower(col_norm))                                  # Title Case
    df_q5 = df_q5.withColumn(c, col_norm)

df_q5.show(truncate=False)


+--------+-----------+---------+-----------+------------+-------+----------+---------+
|order_id|customer_id|city     |category   |product     |amount |order_date|status   |
+--------+-----------+---------+-----------+------------+-------+----------+---------+
|ORD001  |C001       |Delhi    |Electronics|Laptop      |45000  |2024-01-05|Completed|
|ORD002  |C002       |Mumbai   |Electronics|Mobile      |32000  |05/01/2024|Completed|
|ORD003  |C003       |Bangalore|Electronics|Tablet      |30000  |2024/01/06|Completed|
|ORD004  |C004       |Delhi    |Electronics|Laptop      |       |2024-01-07|Cancelled|
|ORD005  |C005       |Chennai  |Electronics|Mobile      |invalid|2024-01-08|Completed|
|ORD006  |C006       |Mumbai   |Home       |Mixer       |NULL   |2024-01-08|Completed|
|ORD007  |C001       |Delhi    |Electronics|Laptop      |47000  |09-01-2024|Completed|
|ORD008  |C007       |Bangalore|Home       |Vacuum      |28000  |2024-01-09|Completed|
|ORD009  |C002       |Mumbai   |Electronics

In [14]:
#task6

from pyspark.sql.functions import regexp_replace, when

df_q6 = df_q5

# Extract only digits; anything else becomes empty -> invalid
df_q6 = df_q6.withColumn("amount_digits", regexp_replace(col("amount"), r"[^0-9]", ""))

# Cast to int if digits exist, else set to null
df_q6 = df_q6.withColumn(
    "amount",
    when(col("amount_digits").rlike(r"^\d+$"), col("amount_digits").cast("int")).otherwise(None)
).drop("amount_digits")

df_q6.select("order_id", "amount").show()


+--------+------+
|order_id|amount|
+--------+------+
|  ORD001| 45000|
|  ORD002| 32000|
|  ORD003| 30000|
|  ORD004|  NULL|
|  ORD005|  NULL|
|  ORD006|  NULL|
|  ORD007| 47000|
|  ORD008| 28000|
|  ORD009| 55000|
|  ORD010| 38000|
|  ORD011| 29000|
|  ORD012| 33000|
|  ORD013| 21000|
|  ORD014| 26000|
|  ORD015| 62000|
|  ORD016| 40000|
|  ORD017| 51000|
|  ORD018| 31000|
|  ORD019| 29000|
|  ORD020| 54000|
+--------+------+
only showing top 20 rows


In [15]:
#task7

df_q7 = df_q6.filter(col("amount").isNotNull())

df_q7.select("order_id", "amount").show()


+--------+------+
|order_id|amount|
+--------+------+
|  ORD001| 45000|
|  ORD002| 32000|
|  ORD003| 30000|
|  ORD007| 47000|
|  ORD008| 28000|
|  ORD009| 55000|
|  ORD010| 38000|
|  ORD011| 29000|
|  ORD012| 33000|
|  ORD013| 21000|
|  ORD014| 26000|
|  ORD015| 62000|
|  ORD016| 40000|
|  ORD017| 51000|
|  ORD018| 31000|
|  ORD019| 29000|
|  ORD020| 54000|
|  ORD020| 54000|
+--------+------+



In [21]:
#task 8
from pyspark.sql.functions import col, to_date, when

df_q8 = df_q7.withColumn(
    "order_date",
    when(col("order_date").rlike(r"^\d{4}-\d{2}-\d{2}$"), to_date(col("order_date"), "yyyy-MM-dd"))
    .when(col("order_date").rlike(r"^\d{2}/\d{2}/\d{4}$"), to_date(col("order_date"), "dd/MM/yyyy"))
    .when(col("order_date").rlike(r"^\d{4}/\d{2}/\d{2}$"), to_date(col("order_date"), "yyyy/MM/dd"))
    .when(col("order_date").rlike(r"^\d{2}-\d{2}-\d{4}$"), to_date(col("order_date"), "dd-MM-yyyy"))
    .otherwise(None)
).filter(col("order_date").isNotNull())

df_q8.select("order_id", "order_date").show(truncate=False)
df_q8.printSchema()


+--------+----------+
|order_id|order_date|
+--------+----------+
|ORD001  |2024-01-05|
|ORD002  |2024-01-05|
|ORD003  |2024-01-06|
|ORD007  |2024-01-09|
|ORD008  |2024-01-09|
|ORD009  |2024-01-10|
|ORD010  |2024-01-10|
|ORD011  |2024-01-11|
|ORD012  |2024-01-11|
|ORD013  |2024-01-12|
|ORD014  |2024-01-12|
|ORD015  |2024-01-13|
|ORD016  |2024-01-13|
|ORD017  |2024-01-14|
|ORD018  |2024-01-14|
|ORD019  |2024-01-15|
|ORD020  |2024-01-15|
|ORD020  |2024-01-15|
+--------+----------+

root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- status: string (nullable = true)



In [22]:
#task 9
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, col

# Sanity check: order_date must be DateType
df_q8.printSchema()

w = Window.partitionBy("order_id").orderBy(
    col("order_date").desc(),
    col("amount").desc(),
    col("customer_id").desc()
)

df_q9 = df_q8.withColumn("rn", row_number().over(w)) \
             .filter(col("rn") == 1) \
             .drop("rn")

# Verify (should be 1 per order_id)
df_q9.groupBy("order_id").count().orderBy(col("order_id")).show()


root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- status: string (nullable = true)

+--------+-----+
|order_id|count|
+--------+-----+
|  ORD001|    1|
|  ORD002|    1|
|  ORD003|    1|
|  ORD007|    1|
|  ORD008|    1|
|  ORD009|    1|
|  ORD010|    1|
|  ORD011|    1|
|  ORD012|    1|
|  ORD013|    1|
|  ORD014|    1|
|  ORD015|    1|
|  ORD016|    1|
|  ORD017|    1|
|  ORD018|    1|
|  ORD019|    1|
|  ORD020|    1|
+--------+-----+



In [23]:
#task 10
from pyspark.sql.functions import lower, col

# Assume df_q9 is your DataFrame after deduplication
df_q10 = df_q9.withColumn("status_norm", lower(col("status"))) \
              .filter(col("status_norm") == "completed") \
              .drop("status_norm")

# Verify
df_q10.select("order_id", "status").show(truncate=False)
print("Final cleaned record count:", df_q10.count())


+--------+---------+
|order_id|status   |
+--------+---------+
|ORD001  |Completed|
|ORD002  |Completed|
|ORD003  |Completed|
|ORD007  |Completed|
|ORD008  |Completed|
|ORD009  |Completed|
|ORD010  |Completed|
|ORD011  |Completed|
|ORD012  |Completed|
|ORD013  |Completed|
|ORD014  |Completed|
|ORD015  |Completed|
|ORD016  |Completed|
|ORD017  |Completed|
|ORD018  |Completed|
|ORD019  |Completed|
|ORD020  |Completed|
+--------+---------+

Final cleaned record count: 17


In [24]:

# Task 11: Counts before vs after cleaning
before_count = raw_df.count()
after_count = df_q10.count()

print(f"Records BEFORE cleaning: {before_count}")
print(f"Records AFTER cleaning:  {after_count}")
print(f"Rows removed during cleaning: {before_count - after_count}")


Records BEFORE cleaning: 21
Records AFTER cleaning:  17
Rows removed during cleaning: 4


In [25]:

from pyspark.sql.functions import col

# Task 12: Null checks in key columns
null_checks = df_q10.selectExpr(
    "sum(case when order_id  IS NULL then 1 else 0 end) as null_order_id",
    "sum(case when amount    IS NULL then 1 else 0 end) as null_amount",
    "sum(case when order_date IS NULL then 1 else 0 end) as null_order_date"
).collect()[0]

print("Nulls in key columns:")
print(f"- order_id:   {null_checks['null_order_id']}")
print(f"- amount:     {null_checks['null_amount']}")
print(f"- order_date: {null_checks['null_order_date']}")




Nulls in key columns:
- order_id:   0
- amount:     0
- order_date: 0


In [26]:

# Task 13: Print schema and programmatic type checks
df_q10.printSchema()

# Programmatic checks
schema_ok = (
    dict(df_q10.dtypes) == {
        'order_id': 'string',
        'customer_id': 'string',
        'city': 'string',
        'category': 'string',
        'product': 'string',
        'amount': 'int',
        'order_date': 'date',
        'status': 'string'
    }
)
print("Schema matches expected types:", schema_ok)

# More explicit assertions
types = dict(df_q10.dtypes)
assert types['order_id']   == 'string'
assert types['customer_id'] == 'string'
assert types['city']       == 'string'
assert types['category']   == 'string'
assert types['product']    == 'string'
assert types['status']     == 'string'
assert types['amount']     == 'int'
assert types['order_date'] == 'date'


root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- status: string (nullable = true)

Schema matches expected types: True


In [27]:
#task 14
from pyspark.sql.functions import sum as F_sum, col

revenue_by_city = df_q10.groupBy("city").agg(F_sum("amount").alias("total_revenue")) \
                        .orderBy(col("total_revenue").desc())

revenue_by_city.show(truncate=False)


+---------+-------------+
|city     |total_revenue|
+---------+-------------+
|Bangalore|217000       |
|Delhi    |187000       |
|Mumbai   |185000       |
|Chennai  |62000        |
+---------+-------------+



In [28]:
#task15
revenue_by_category = df_q10.groupBy("category").agg(F_sum("amount").alias("total_revenue")) \
                            .orderBy(col("total_revenue").desc())

revenue_by_category.show(truncate=False)


+-----------+-------------+
|category   |total_revenue|
+-----------+-------------+
|Electronics|464000       |
|Home       |187000       |
+-----------+-------------+



In [29]:
#task16
revenue_by_product = df_q10.groupBy("product").agg(F_sum("amount").alias("total_revenue")) \
                           .orderBy(col("total_revenue").desc())

revenue_by_product.show(truncate=False)


+------------+-------------+
|product     |total_revenue|
+------------+-------------+
|Laptop      |314000       |
|Vacuum      |88000        |
|Tablet      |85000        |
|Air Purifier|78000        |
|Mobile      |65000        |
|Mixer       |21000        |
+------------+-------------+



In [30]:
#task17
from pyspark.sql.functions import avg as F_avg, count as F_count

aov_by_city = df_q10.groupBy("city") \
    .agg(
        F_avg("amount").alias("avg_order_value"),
        F_count("*").alias("order_count")
    ) \
    .orderBy(col("avg_order_value").desc())

aov_by_city.show(truncate=False)


+---------+------------------+-----------+
|city     |avg_order_value   |order_count|
+---------+------------------+-----------+
|Chennai  |62000.0           |1          |
|Delhi    |37400.0           |5          |
|Mumbai   |37000.0           |5          |
|Bangalore|36166.666666666664|6          |
+---------+------------------+-----------+



In [31]:
#task18

top3_products = revenue_by_product.limit(3)
top3_products.show(truncate=False)


+-------+-------------+
|product|total_revenue|
+-------+-------------+
|Laptop |314000       |
|Vacuum |88000        |
|Tablet |85000        |
+-------+-------------+



In [32]:
#task19
from pyspark.sql.functions import sum as F_sum, col
from pyspark.sql.window import Window
from pyspark.sql.functions import dense_rank

# Aggregate: total revenue per city
city_revenue = df_q10.groupBy("city").agg(F_sum("amount").alias("total_revenue"))

# Window for ranking across all cities by total_revenue (descending)
w_city = Window.orderBy(col("total_revenue").desc())

# Rank cities
city_ranked = city_revenue.withColumn("rank", dense_rank().over(w_city)) \
                          .orderBy(col("rank"), col("total_revenue").desc(), col("city"))

city_ranked.show(truncate=False)


+---------+-------------+----+
|city     |total_revenue|rank|
+---------+-------------+----+
|Bangalore|217000       |1   |
|Delhi    |187000       |2   |
|Mumbai   |185000       |3   |
|Chennai  |62000        |4   |
+---------+-------------+----+



In [33]:
#task20
from pyspark.sql.functions import sum as F_sum, col
from pyspark.sql.window import Window
from pyspark.sql.functions import dense_rank

# Aggregate: revenue per product within each category
cat_prod_revenue = df_q10.groupBy("category", "product") \
                         .agg(F_sum("amount").alias("total_revenue"))

# Window partitioned by category; order by revenue desc
w_cat_prod = Window.partitionBy("category").orderBy(col("total_revenue").desc())

# Rank products within each category
cat_prod_ranked = cat_prod_revenue.withColumn("rank_within_category", dense_rank().over(w_cat_prod)) \
                                  .orderBy(col("category"), col("rank_within_category"), col("total_revenue").desc())

cat_prod_ranked.show(truncate=False)


+-----------+------------+-------------+--------------------+
|category   |product     |total_revenue|rank_within_category|
+-----------+------------+-------------+--------------------+
|Electronics|Laptop      |314000       |1                   |
|Electronics|Tablet      |85000        |2                   |
|Electronics|Mobile      |65000        |3                   |
|Home       |Vacuum      |88000        |1                   |
|Home       |Air Purifier|78000        |2                   |
|Home       |Mixer       |21000        |3                   |
+-----------+------------+-------------+--------------------+



In [35]:
#task21
top_product_per_category = cat_prod_ranked.filter(col("rank_within_category") == 1) \
                                          .orderBy(col("category"))

top_product_per_category.show(truncate=False)


+-----------+-------+-------------+--------------------+
|category   |product|total_revenue|rank_within_category|
+-----------+-------+-------------+--------------------+
|Electronics|Laptop |314000       |1                   |
|Home       |Vacuum |88000        |1                   |
+-----------+-------+-------------+--------------------+



In [36]:

# Task 22: Cache the cleaned DataFrame
df_cached = df_q10.cache()

print("Caching df_q10 ...")
df_cached.count()

from pyspark import StorageLevel
print("Default storage level:", StorageLevel.MEMORY_ONLY)




Caching df_q10 ...
Default storage level: Memory Serialized 1x Replicated


In [37]:
#task 23

import pyspark.sql.functions as F
from pyspark.sql.functions import col

# Total revenue by city
revenue_by_city = df_cached.groupBy("city").agg(F.sum("amount").alias("total_revenue")) \
                           .orderBy(col("total_revenue").desc())

# Total revenue by category
revenue_by_category = df_cached.groupBy("category").agg(F.sum("amount").alias("total_revenue")) \
                               .orderBy(col("total_revenue").desc())

# AOV by city, with count
aov_by_city = df_cached.groupBy("city") \
    .agg(F.avg("amount").alias("avg_order_value"), F.count("*").alias("order_count")) \
    .orderBy(col("avg_order_value").desc())

# Top products by revenue
revenue_by_product = df_cached.groupBy("product").agg(F.sum("amount").alias("total_revenue")) \
                              .orderBy(col("total_revenue").desc())

print("Revenue by city:")
revenue_by_city.show(truncate=False)

print("Revenue by category:")
revenue_by_category.show(truncate=False)

print("AOV by city:")
aov_by_city.show(truncate=False)

print("Revenue by product (top 10):")
revenue_by_product.limit(10).show(truncate=False)


Revenue by city:
+---------+-------------+
|city     |total_revenue|
+---------+-------------+
|Bangalore|217000       |
|Delhi    |187000       |
|Mumbai   |185000       |
|Chennai  |62000        |
+---------+-------------+

Revenue by category:
+-----------+-------------+
|category   |total_revenue|
+-----------+-------------+
|Electronics|464000       |
|Home       |187000       |
+-----------+-------------+

AOV by city:
+---------+------------------+-----------+
|city     |avg_order_value   |order_count|
+---------+------------------+-----------+
|Chennai  |62000.0           |1          |
|Delhi    |37400.0           |5          |
|Mumbai   |37000.0           |5          |
|Bangalore|36166.666666666664|6          |
+---------+------------------+-----------+

Revenue by product (top 10):
+------------+-------------+
|product     |total_revenue|
+------------+-------------+
|Laptop      |314000       |
|Vacuum      |88000        |
|Tablet      |85000        |
|Air Purifier|78000    

In [38]:
# Task 24: Explain execution plans with details
print("Explain plan — revenue_by_city:")
revenue_by_city.explain(True)

print("Explain plan — revenue_by_category:")
revenue_by_category.explain(True)

print("Explain plan — aov_by_city:")
aov_by_city.explain(True)


Explain plan — revenue_by_city:
== Parsed Logical Plan ==
'Sort ['total_revenue DESC NULLS LAST], true
+- Aggregate [city#86], [city#86, sum(amount#115) AS total_revenue#785L]
   +- Project [order_id#53, customer_id#54, city#86, category#87, product#88, amount#115, order_date#168, status#60]
      +- Filter (status_norm#197 = completed)
         +- Project [order_id#53, customer_id#54, city#86, category#87, product#88, amount#115, order_date#168, status#60, lower(status#60) AS status_norm#197]
            +- Project [order_id#53, customer_id#54, city#86, category#87, product#88, amount#115, order_date#168, status#60]
               +- Filter (rn#176 = 1)
                  +- Project [order_id#53, customer_id#54, city#86, category#87, product#88, amount#115, order_date#168, status#60, rn#176]
                     +- Project [order_id#53, customer_id#54, city#86, category#87, product#88, amount#115, order_date#168, status#60, rn#176, rn#176]
                        +- Window [row_number(

In [39]:

# Task 25: Repartition by 'city' (wide transformation with shuffle)
df_city_part = df_cached.repartition(col("city"))

num_parts = df_city_part.rdd.getNumPartitions()
print("Number of partitions after repartition by city:", num_parts)

revenue_by_city_part = df_city_part.groupBy("city").agg(F.sum("amount").alias("total_revenue")) \
                                   .orderBy(col("total_revenue").desc())

revenue_by_city_part.explain(True)
revenue_by_city_part.show(truncate=False)


Number of partitions after repartition by city: 1
== Parsed Logical Plan ==
'Sort ['total_revenue DESC NULLS LAST], true
+- Aggregate [city#86], [city#86, sum(amount#115) AS total_revenue#2245L]
   +- RepartitionByExpression [city#86]
      +- Project [order_id#53, customer_id#54, city#86, category#87, product#88, amount#115, order_date#168, status#60]
         +- Filter (status_norm#197 = completed)
            +- Project [order_id#53, customer_id#54, city#86, category#87, product#88, amount#115, order_date#168, status#60, lower(status#60) AS status_norm#197]
               +- Project [order_id#53, customer_id#54, city#86, category#87, product#88, amount#115, order_date#168, status#60]
                  +- Filter (rn#176 = 1)
                     +- Project [order_id#53, customer_id#54, city#86, category#87, product#88, amount#115, order_date#168, status#60, rn#176]
                        +- Project [order_id#53, customer_id#54, city#86, category#87, product#88, amount#115, order_dat

In [41]:

# Set ONE of these based on your environment

# Local / standard Spark
base_path = "/tmp/marketplace_out"

# If using Databricks/DBFS, uncomment this:
# base_path = "dbfs:/FileStore/marketplace_out"

orders_parquet_path = f"{base_path}/orders_parquet"
analytics_orc_base  = f"{base_path}/analytics_orc"


In [42]:
#task 26
from pyspark.sql.functions import year, month, col

# Derive partitions for BI-friendly layout
orders_for_write = df_q10 \
    .withColumn("order_year",  year(col("order_date"))) \
    .withColumn("order_month", month(col("order_date")))

# Optional (good default in Spark 3.x; set explicitly if you want):
spark.conf.set("spark.sql.parquet.compression.codec", "snappy")

# Write Parquet partitioned by year, month, and city
orders_for_write.write \
    .mode("overwrite") \
    .partitionBy("order_year", "order_month", "city") \
    .parquet(orders_parquet_path)

print(f" Order-level Parquet written to: {orders_parquet_path}")


 Order-level Parquet written to: /tmp/marketplace_out/orders_parquet


In [43]:
#task27
import pyspark.sql.functions as F
from pyspark.sql.functions import col
from pyspark.sql.window import Window
from pyspark.sql.functions import dense_rank

# Core aggregates
revenue_by_city = df_q10.groupBy("city").agg(F.sum("amount").alias("total_revenue"))
revenue_by_category = df_q10.groupBy("category").agg(F.sum("amount").alias("total_revenue"))
revenue_by_product = df_q10.groupBy("product").agg(F.sum("amount").alias("total_revenue"))
aov_by_city = df_q10.groupBy("city").agg(
    F.avg("amount").alias("avg_order_value"),
    F.count("*").alias("order_count")
)

# Top-3 products by revenue (dense_rank to include ties)
w_prod = Window.orderBy(col("total_revenue").desc())
top_products_ranked = revenue_by_product.withColumn("rank", dense_rank().over(w_prod)) \
                                        .filter(col("rank") <= 3)

# Optional: set ORC compression codec
spark.conf.set("spark.sql.orc.compression.codec", "snappy")

# Paths
orc_city_path     = f"{analytics_orc_base}/revenue_by_city"
orc_cat_path      = f"{analytics_orc_base}/revenue_by_category"
orc_prod_path     = f"{analytics_orc_base}/revenue_by_product"
orc_aov_city_path = f"{analytics_orc_base}/aov_by_city"
orc_top3_path     = f"{analytics_orc_base}/top3_products_by_revenue"

# (Optional) reduce number of small files for demo purposes
revenue_by_city.coalesce(1).write.mode("overwrite").format("orc").save(orc_city_path)
revenue_by_category.coalesce(1).write.mode("overwrite").format("orc").save(orc_cat_path)
revenue_by_product.coalesce(1).write.mode("overwrite").format("orc").save(orc_prod_path)
aov_by_city.coalesce(1).write.mode("overwrite").format("orc").save(orc_aov_city_path)
top_products_ranked.coalesce(1).write.mode("overwrite").format("orc").save(orc_top3_path)

print("✅ Analytics ORC datasets written to:")
print(orc_city_path)
print(orc_cat_path)
print(orc_prod_path)
print(orc_aov_city_path)
print(orc_top3_path)


✅ Analytics ORC datasets written to:
/tmp/marketplace_out/analytics_orc/revenue_by_city
/tmp/marketplace_out/analytics_orc/revenue_by_category
/tmp/marketplace_out/analytics_orc/revenue_by_product
/tmp/marketplace_out/analytics_orc/aov_by_city
/tmp/marketplace_out/analytics_orc/top3_products_by_revenue


In [44]:
#task 28

# --- Read back Parquet (order-level) ---
orders_read = spark.read.parquet(orders_parquet_path)
print(" Orders Parquet schema:")
orders_read.printSchema()
print("Sample rows from orders:")
orders_read.orderBy("order_date", "city").show(truncate=False)

# Schema assertions (adjust if you changed partitioning columns)
expected_order_cols = {
    'order_id': 'string',
    'customer_id': 'string',
    'city': 'string',          # present in data & also a partition column
    'category': 'string',
    'product': 'string',
    'amount': 'int',
    'order_date': 'date',
    'status': 'string',
    'order_year': 'int',
    'order_month': 'int'
}
read_types = dict(orders_read.dtypes)
missing = [c for c in expected_order_cols if c not in read_types]
mismatched = {c:(read_types.get(c), t) for c,t in expected_order_cols.items() if read_types.get(c) != t}
print("Validation — missing columns:", missing)
print("Validation — mismatched types (read vs expected):", mismatched)

# --- Read back ORC analytics ---
rev_city_read = spark.read.format("orc").load(orc_city_path)
rev_cat_read  = spark.read.format("orc").load(orc_cat_path)
rev_prod_read = spark.read.format("orc").load(orc_prod_path)
aov_city_read = spark.read.format("orc").load(orc_aov_city_path)
top3_read     = spark.read.format("orc").load(orc_top3_path)

print("\n ORC schemas:")
print("revenue_by_city:")
rev_city_read.printSchema()
print("revenue_by_category:")
rev_cat_read.printSchema()
print("revenue_by_product:")
rev_prod_read.printSchema()
print("aov_by_city:")
aov_city_read.printSchema()
print("top3_products_by_revenue:")
top3_read.printSchema()

# Quick integrity checks
print("\nRow counts:")
print("revenue_by_city:", rev_city_read.count())
print("revenue_by_category:", rev_cat_read.count())
print("revenue_by_product:", rev_prod_read.count())
print("aov_by_city:", aov_city_read.count())
print("top3_products_by_revenue:", top3_read.count())

# Peek data
rev_city_read.orderBy(F.col("total_revenue").desc()).show(truncate=False)
rev_cat_read.orderBy(F.col("total_revenue").desc()).show(truncate=False)
rev_prod_read.orderBy(F.col("total_revenue").desc()).show(truncate=False)
aov_city_read.orderBy(F.col("avg_order_value").desc()).show(truncate=False)
top3_read.orderBy(F.col("rank"), F.col("total_revenue").desc()).show(truncate=False)


 Orders Parquet schema:
root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: integer (nullable = true)
 |-- order_date: date (nullable = true)
 |-- status: string (nullable = true)
 |-- order_year: integer (nullable = true)
 |-- order_month: integer (nullable = true)
 |-- city: string (nullable = true)

Sample rows from orders:
+--------+-----------+-----------+------------+------+----------+---------+----------+-----------+---------+
|order_id|customer_id|category   |product     |amount|order_date|status   |order_year|order_month|city     |
+--------+-----------+-----------+------------+------+----------+---------+----------+-----------+---------+
|ORD001  |C001       |Electronics|Laptop      |45000 |2024-01-05|Completed|2024      |1          |Delhi    |
|ORD002  |C002       |Electronics|Mobile      |32000 |2024-01-05|Completed|2024      |1          |Mumbai   

In [45]:
#task 29
df = df.filter(df.amount > 30000).show()

{"ts": "2025-12-24 04:40:13.960", "level": "ERROR", "logger": "DataFrameQueryContextLogger", "msg": "[CAST_INVALID_INPUT] The value '' of the type \"STRING\" cannot be cast to \"BIGINT\" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018", "context": {"file": "line 2 in cell [45]", "line": "", "fragment": "__gt__", "errorClass": "CAST_INVALID_INPUT"}, "exception": {"class": "Py4JJavaError", "msg": "An error occurred while calling o930.showString.\n: org.apache.spark.SparkNumberFormatException: [CAST_INVALID_INPUT] The value '' of the type \"STRING\" cannot be cast to \"BIGINT\" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018\n== DataFrame ==\n\"__gt__\" was called from\nline 2 in cell [45]\n\n\tat org.apache.spark.sql.errors.QueryExecutionErro

NumberFormatException: [CAST_INVALID_INPUT] The value '' of the type "STRING" cannot be cast to "BIGINT" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"__gt__" was called from
line 2 in cell [45]


In [52]:

from pyspark.sql.functions import col

# Correct: keep filtered DataFrame in a variable, then call show()
raw_df = raw_df.filter(col("amount") > 30000)
raw_df.show()


{"ts": "2025-12-24 04:43:11.830", "level": "ERROR", "logger": "DataFrameQueryContextLogger", "msg": "[CAST_INVALID_INPUT] The value '' of the type \"STRING\" cannot be cast to \"BIGINT\" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018", "context": {"file": "line 4 in cell [52]", "line": "", "fragment": "__gt__", "errorClass": "CAST_INVALID_INPUT"}, "exception": {"class": "Py4JJavaError", "msg": "An error occurred while calling o1038.showString.\n: org.apache.spark.SparkNumberFormatException: [CAST_INVALID_INPUT] The value '' of the type \"STRING\" cannot be cast to \"BIGINT\" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018\n== DataFrame ==\n\"__gt__\" was called from\nline 4 in cell [52]\n\n\tat org.apache.spark.sql.errors.QueryExecutionErr

NumberFormatException: [CAST_INVALID_INPUT] The value '' of the type "STRING" cannot be cast to "BIGINT" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"__gt__" was called from
line 4 in cell [52]
