PHASE 1

In [8]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("OrderPipeline").getOrCreate()

# Load CSV without schema inference
df = spark.read.option("header", "true").option("inferSchema", "false").csv("orders.csv")

# Print schema
df.printSchema()

# Count records
print("Total Records:", df.count())

# Show sample rows
df.show(5)


root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)

Total Records: 300000
+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+
|   order_id|customer_id|       city|   category|    product| amount|order_date|   status|
+-----------+-----------+-----------+-----------+-----------+-------+----------+---------+
|ORD00000000|    C000000| hyderabad |   grocery |       Oil |invalid|01/01/2024|Cancelled|
|ORD00000001|    C000001|       Pune|    Grocery|      Sugar|  35430|2024-01-02|Completed|
|ORD00000002|    C000002|       Pune|Electronics|     Mobile|  65358|2024-01-03|Completed|
|ORD00000003|    C000003|  Bangalore|Electronics|     Laptop|   5558|2024-01-04|Completed|
|ORD00000004|    C0

In [None]:
from google.colab import drive
drive.mount('/content/drive')

PHASE 2


In [9]:
#phase 2 task 1
from pyspark.sql import functions as F

# Proper case = trim → lower → initcap
def proper_case(col):
    return F.initcap(F.lower(F.trim(F.col(col))))

df2 = (df
       .withColumn("city_std",     proper_case("city"))
       .withColumn("category_std", proper_case("category"))
       .withColumn("product_std",  proper_case("product"))
       .withColumn("status_std",   proper_case("status")))

df2.select("city","city_std","category","category_std","product","product_std","status","status_std").show(10, truncate=False)


+-----------+---------+-----------+------------+-----------+-----------+---------+----------+
|city       |city_std |category   |category_std|product    |product_std|status   |status_std|
+-----------+---------+-----------+------------+-----------+-----------+---------+----------+
| hyderabad |Hyderabad| grocery   |Grocery     |Oil        |Oil        |Cancelled|Cancelled |
|Pune       |Pune     |Grocery    |Grocery     |Sugar      |Sugar      |Completed|Completed |
|Pune       |Pune     |Electronics|Electronics |Mobile     |Mobile     |Completed|Completed |
|Bangalore  |Bangalore|Electronics|Electronics |Laptop     |Laptop     |Completed|Completed |
|Pune       |Pune     |Home       |Home        |AirPurifier|Airpurifier|Completed|Completed |
|Delhi      |Delhi    |Fashion    |Fashion     |Jeans      |Jeans      |Completed|Completed |
|Delhi      |Delhi    |Grocery    |Grocery     |Sugar      |Sugar      |Completed|Completed |
|Pune       |Pune     |Grocery    |Grocery     |Rice       |

In [10]:
#Clean amount column t3
from pyspark.sql.types import IntegerType

amount_no_commas = F.regexp_replace(F.trim(F.col("amount")), ",", "")

df3 = (df2
       .withColumn("amount_clean",
                   F.when(amount_no_commas.rlike(r"^[0-9]+$"),
                          amount_no_commas.cast(IntegerType()))
                    .otherwise(F.lit(None))))

df3.select("amount","amount_clean").show(15, truncate=False)


+-------+------------+
|amount |amount_clean|
+-------+------------+
|invalid|NULL        |
|35430  |35430       |
|65358  |65358       |
|5558   |5558        |
|33659  |33659       |
|8521   |8521        |
|42383  |42383       |
|45362  |45362       |
|10563  |10563       |
|63715  |63715       |
|66576  |66576       |
|50318  |50318       |
|84768  |84768       |
|79121  |79121       |
|79469  |79469       |
+-------+------------+
only showing top 15 rows


In [11]:
 #Clean the order_date column: t4
from pyspark.sql import functions as F, types as T
from datetime import datetime

@F.udf(T.StringType())
def normalize_date_any(s):
    if not s:
        return None
    s = s.strip().replace(".", "/")
    for fmt in ("%Y-%m-%d", "%d/%m/%Y", "%Y/%m/%d"):
        try:
            return datetime.strptime(s, fmt).strftime("%Y-%m-%d")
        except Exception:
            continue
    return None  # couldn't parse

df4 = (df3
       .withColumn("order_date_norm", normalize_date_any("order_date"))
       .withColumn("order_date_clean", F.to_date("order_date_norm", "yyyy-MM-dd"))
       .drop("order_date_norm"))

df4.select("order_id","order_date","order_date_clean").show(15, truncate=False)
print("Rows with invalid date:", df4.filter(F.col("order_date_clean").isNull()).count())


+-----------+----------+----------------+
|order_id   |order_date|order_date_clean|
+-----------+----------+----------------+
|ORD00000000|01/01/2024|2024-01-01      |
|ORD00000001|2024-01-02|2024-01-02      |
|ORD00000002|2024-01-03|2024-01-03      |
|ORD00000003|2024-01-04|2024-01-04      |
|ORD00000004|2024-01-05|2024-01-05      |
|ORD00000005|2024-01-06|2024-01-06      |
|ORD00000006|2024-01-07|2024-01-07      |
|ORD00000007|2024-01-08|2024-01-08      |
|ORD00000008|2024-01-09|2024-01-09      |
|ORD00000009|2024-01-10|2024-01-10      |
|ORD00000010|2024-01-11|2024-01-11      |
|ORD00000011|12/01/2024|2024-01-12      |
|ORD00000012|2024-01-13|2024-01-13      |
|ORD00000013|2024/01/14|2024-01-14      |
|ORD00000014|2024-01-15|2024-01-15      |
+-----------+----------+----------------+
only showing top 15 rows
Rows with invalid date: 2595


In [12]:
#The original columns must remain for auditing. t5

df4.select(
    "order_id","customer_id",
    "city","city_std",
    "category","category_std",
    "product","product_std",
    "amount","amount_clean",
    "order_date","order_date_clean",
    "status","status_std"
).show(10, truncate=False)


+-----------+-----------+-----------+---------+-----------+------------+-----------+-----------+-------+------------+----------+----------------+---------+----------+
|order_id   |customer_id|city       |city_std |category   |category_std|product    |product_std|amount |amount_clean|order_date|order_date_clean|status   |status_std|
+-----------+-----------+-----------+---------+-----------+------------+-----------+-----------+-------+------------+----------+----------------+---------+----------+
|ORD00000000|C000000    | hyderabad |Hyderabad| grocery   |Grocery     |Oil        |Oil        |invalid|NULL        |01/01/2024|2024-01-01      |Cancelled|Cancelled |
|ORD00000001|C000001    |Pune       |Pune     |Grocery    |Grocery     |Sugar      |Sugar      |35430  |35430       |2024-01-02|2024-01-02      |Completed|Completed |
|ORD00000002|C000002    |Pune       |Pune     |Electronics|Electronics |Mobile     |Mobile     |65358  |65358       |2024-01-03|2024-01-03      |Completed|Completed 

PHASE 3

In [13]:
#task1 Count how many records had invalid amounts
from pyspark.sql import functions as F

invalid_amount_count = df4.filter(F.col("amount_clean").isNull()).count()
print("Invalid/empty amount rows:", invalid_amount_count)


Invalid/empty amount rows: 25164


In [14]:
#Task 2: Count how many records had invalid dates
invalid_date_count = df4.filter(F.col("order_date_clean").isNull()).count()
print("Invalid/empty date rows:", invalid_date_count)


Invalid/empty date rows: 2595


In [15]:
#TASK 3:Identify duplicate order_id values
dups = (df4.groupBy("order_id")
           .agg(F.count("*").alias("cnt"))
           .filter(F.col("cnt") > 1))

dup_groups = dups.count()
print("Duplicate order_id groups:", dup_groups)
dups.show(20, truncate=False)  # preview a few if any


Duplicate order_id groups: 0
+--------+---+
|order_id|cnt|
+--------+---+
+--------+---+



In [16]:
#Remove duplicates using order_id. TASK 4

df5_simple_dedup = df4.dropDuplicates(["order_id"])
print("Rows after simple dedup:", df5_simple_dedup.count())



Rows after simple dedup: 300000


In [17]:

from pyspark.sql.window import Window

w = Window.partitionBy("order_id").orderBy(F.col("order_date_clean").desc_nulls_last())

df5_latest = (df4
              .withColumn("rn", F.row_number().over(w))
              .filter(F.col("rn") == 1)
              .drop("rn"))

print("Rows after 'keep latest by date' dedup:", df5_latest.count())



Rows after 'keep latest by date' dedup: 300000


In [18]:
#Task 5: Filter only records with status = "Completed"
df6_completed = df5_latest.filter(F.lower(F.col("status_std")) == "completed")
print("Rows with status='Completed':", df6_completed.count())



Rows with status='Completed': 285000


In [19]:

# Task 6: Record row counts at every stage
raw_count        = df.count()
clean_count      = df4.count()
dedup_count      = df5_latest.count()
completed_count  = df6_completed.count()

print("=== Row Count Checkpoints ===")
print(f"Raw:        {raw_count}")
print(f"Cleaned:    {clean_count}")
print(f"Dedup:      {dedup_count}")
print(f"Completed:  {completed_count}")


=== Row Count Checkpoints ===
Raw:        300000
Cleaned:    300000
Dedup:      300000
Completed:  285000


PHASE 4

In [20]:

# PHASE 4 – Task 1
num_partitions = df6_completed.rdd.getNumPartitions()
print("Current partitions:", num_partitions)


print("spark.sql.shuffle.partitions =", spark.conf.get("spark.sql.shuffle.partitions"))


Current partitions: 2
spark.sql.shuffle.partitions = 200


In [21]:
# PHASE 4 – Task 2
from pyspark.sql import functions as F

rev_by_city = (df6_completed
               .groupBy("city_std")
               .agg(F.sum("amount_clean").alias("total_revenue"))
              )

rev_by_city.show(10, truncate=False)


+---------+-------------+
|city_std |total_revenue|
+---------+-------------+
|Bangalore|1628527093   |
|Chennai  |1629865247   |
|Mumbai   |1625518096   |
|Kolkata  |1624300497   |
|Pune     |1646196535   |
|Delhi    |1639639916   |
|Hyderabad|1642443340   |
+---------+-------------+



In [22]:

# PHASE 4 – Task 3
rev_by_city.explain(True)


== Parsed Logical Plan ==
'Aggregate ['city_std], ['city_std, 'sum('amount_clean) AS total_revenue#445]
+- Filter (lower(status_std#74) = completed)
   +- Project [order_id#17, customer_id#18, city#19, category#20, product#21, amount#22, order_date#23, status#24, city_std#71, category_std#72, product_std#73, status_std#74, amount_clean#104, order_date_clean#115]
      +- Filter (rn#318 = 1)
         +- Project [order_id#17, customer_id#18, city#19, category#20, product#21, amount#22, order_date#23, status#24, city_std#71, category_std#72, product_std#73, status_std#74, amount_clean#104, order_date_clean#115, rn#318]
            +- Project [order_id#17, customer_id#18, city#19, category#20, product#21, amount#22, order_date#23, status#24, city_std#71, category_std#72, product_std#73, status_std#74, amount_clean#104, order_date_clean#115, rn#318, rn#318]
               +- Window [row_number() windowspecdefinition(order_id#17, order_date_clean#115 DESC NULLS LAST, specifiedwindowframe(Row

In [23]:
print("Look for 'Exchange' nodes in the physical plan above — those are the shuffle boundaries.")#TASK 4

Look for 'Exchange' nodes in the physical plan above — those are the shuffle boundaries.


In [24]:

# PHASE 4 – Task 5

target_partitions = 16

df6_by_city = df6_completed.repartition(target_partitions, "city_std")
print("Repartitioned partitions:", df6_by_city.rdd.getNumPartitions())

rev_by_city_after = (df6_by_city
                     .groupBy("city_std")
                     .agg(F.sum("amount_clean").alias("total_revenue")))

rev_by_city_after.show(10, truncate=False)


Repartitioned partitions: 16
+---------+-------------+
|city_std |total_revenue|
+---------+-------------+
|Delhi    |1639639916   |
|Chennai  |1629865247   |
|Kolkata  |1624300497   |
|Hyderabad|1642443340   |
|Pune     |1646196535   |
|Bangalore|1628527093   |
|Mumbai   |1625518096   |
+---------+-------------+



In [25]:

# PHASE 4 – Task 6
print("=== Plan BEFORE repartition ===")
rev_by_city.explain(True)

print("\n=== Plan AFTER repartition(city_std) ===")
rev_by_city_after.explain(True)


=== Plan BEFORE repartition ===
== Parsed Logical Plan ==
'Aggregate ['city_std], ['city_std, 'sum('amount_clean) AS total_revenue#445]
+- Filter (lower(status_std#74) = completed)
   +- Project [order_id#17, customer_id#18, city#19, category#20, product#21, amount#22, order_date#23, status#24, city_std#71, category_std#72, product_std#73, status_std#74, amount_clean#104, order_date_clean#115]
      +- Filter (rn#318 = 1)
         +- Project [order_id#17, customer_id#18, city#19, category#20, product#21, amount#22, order_date#23, status#24, city_std#71, category_std#72, product_std#73, status_std#74, amount_clean#104, order_date_clean#115, rn#318]
            +- Project [order_id#17, customer_id#18, city#19, category#20, product#21, amount#22, order_date#23, status#24, city_std#71, category_std#72, product_std#73, status_std#74, amount_clean#104, order_date_clean#115, rn#318, rn#318]
               +- Window [row_number() windowspecdefinition(order_id#17, order_date_clean#115 DESC NULL

PHASE 5

In [27]:
#Task 1 — Total revenue per city
from pyspark.sql import functions as F

rev_city = (df6_completed
            .groupBy("city_std")
            .agg(F.sum("amount_clean").alias("total_revenue"))
            .orderBy(F.col("total_revenue").desc()))

rev_city.show(20, truncate=False)



+---------+-------------+
|city_std |total_revenue|
+---------+-------------+
|Pune     |1646196535   |
|Hyderabad|1642443340   |
|Delhi    |1639639916   |
|Chennai  |1629865247   |
|Bangalore|1628527093   |
|Mumbai   |1625518096   |
|Kolkata  |1624300497   |
+---------+-------------+



In [28]:
#Task 2 — Total revenue per category
rev_category = (df6_completed
                .groupBy("category_std")
                .agg(F.sum("amount_clean").alias("total_revenue"))
                .orderBy(F.col("total_revenue").desc()))

rev_category.show(20, truncate=False)


+------------+-------------+
|category_std|total_revenue|
+------------+-------------+
|Home        |2868467576   |
|Electronics |2867568870   |
|Grocery     |2866272106   |
|Fashion     |2834182172   |
+------------+-------------+



In [29]:
#Task 3 — Average order value per city
avg_order_city = (df6_completed
                  .groupBy("city_std")
                  .agg(F.avg("amount_clean").alias("avg_order_value"))
                  .orderBy(F.col("avg_order_value").desc()))

avg_order_city.show(20, truncate=False)


+---------+------------------+
|city_std |avg_order_value   |
+---------+------------------+
|Bangalore|44098.867908689645|
|Pune     |43930.204013556424|
|Delhi    |43817.20780331374 |
|Mumbai   |43723.75651612556 |
|Kolkata  |43709.816662630175|
|Hyderabad|43708.74045293664 |
|Chennai  |43628.27900315863 |
+---------+------------------+



In [30]:
#Task 4 — Top 10 products by revenue

top_products = (df6_completed
                .groupBy("product_std")
                .agg(F.sum("amount_clean").alias("total_revenue"))
                .orderBy(F.col("total_revenue").desc())
                .limit(10))

top_products.show(truncate=False)


+-----------+-------------+
|product_std|total_revenue|
+-----------+-------------+
|Oil        |963572869    |
|Laptop     |962496295    |
|Tablet     |960719999    |
|Vacuum     |959149427    |
|Mixer      |957140026    |
|Rice       |954494237    |
|Airpurifier|952178123    |
|Jeans      |951286127    |
|Sugar      |948205000    |
|Shoes      |946799102    |
+-----------+-------------+



In [31]:
#Task 5 — Cities sorted by revenue (descending)
sorted_cities = rev_city
sorted_cities.show(truncate=False)



+---------+-------------+
|city_std |total_revenue|
+---------+-------------+
|Pune     |1646196535   |
|Hyderabad|1642443340   |
|Delhi    |1639639916   |
|Chennai  |1629865247   |
|Bangalore|1628527093   |
|Mumbai   |1625518096   |
|Kolkata  |1624300497   |
+---------+-------------+



PHASE 6

In [32]:

from pyspark.sql import functions as F
from pyspark.sql.window import Window


In [33]:
#Task 1 — Rank cities by revenue
from pyspark.sql import functions as F
from pyspark.sql.window import Window

city_revenue = (df6_completed
                .groupBy("city_std")
                .agg(F.sum("amount_clean").alias("total_revenue")))

w1 = Window.orderBy(F.col("total_revenue").desc())

ranked_cities = city_revenue.withColumn("city_rank", F.dense_rank().over(w1))

ranked_cities.show(truncate=False)


+---------+-------------+---------+
|city_std |total_revenue|city_rank|
+---------+-------------+---------+
|Pune     |1646196535   |1        |
|Hyderabad|1642443340   |2        |
|Delhi    |1639639916   |3        |
|Chennai  |1629865247   |4        |
|Bangalore|1628527093   |5        |
|Mumbai   |1625518096   |6        |
|Kolkata  |1624300497   |7        |
+---------+-------------+---------+



In [34]:
#Task 2 — Rank products inside each category by revenue
product_rev = (df6_completed
               .groupBy("category_std", "product_std")
               .agg(F.sum("amount_clean").alias("total_revenue")))

w2 = Window.partitionBy("category_std").orderBy(F.col("total_revenue").desc())

ranked_products = product_rev.withColumn("product_rank", F.dense_rank().over(w2))

ranked_products.show(50, truncate=False)


+------------+-----------+-------------+------------+
|category_std|product_std|total_revenue|product_rank|
+------------+-----------+-------------+------------+
|Electronics |Laptop     |962496295    |1           |
|Electronics |Tablet     |960719999    |2           |
|Electronics |Mobile     |944352576    |3           |
|Fashion     |Jeans      |951286127    |1           |
|Fashion     |Shoes      |946799102    |2           |
|Fashion     |Tshirt     |936096943    |3           |
|Grocery     |Oil        |963572869    |1           |
|Grocery     |Rice       |954494237    |2           |
|Grocery     |Sugar      |948205000    |3           |
|Home        |Vacuum     |959149427    |1           |
|Home        |Mixer      |957140026    |2           |
|Home        |Airpurifier|952178123    |3           |
+------------+-----------+-------------+------------+



In [35]:
#Task 3 — Find the top product for every category
top_product_each_category = ranked_products.filter(F.col("product_rank") == 1)

top_product_each_category.show(truncate=False)


+------------+-----------+-------------+------------+
|category_std|product_std|total_revenue|product_rank|
+------------+-----------+-------------+------------+
|Electronics |Laptop     |962496295    |1           |
|Fashion     |Jeans      |951286127    |1           |
|Grocery     |Oil        |963572869    |1           |
|Home        |Vacuum     |959149427    |1           |
+------------+-----------+-------------+------------+



In [36]:
#Task 4 — Identify the top 3 performing cities
top_3_cities = ranked_cities.filter(F.col("city_rank") <= 3)

top_3_cities.show(truncate=False)


+---------+-------------+---------+
|city_std |total_revenue|city_rank|
+---------+-------------+---------+
|Pune     |1646196535   |1        |
|Hyderabad|1642443340   |2        |
|Delhi    |1639639916   |3        |
+---------+-------------+---------+



PHASE 7

In [37]:
#Task 1 — Create the small lookup table (city → region)
from pyspark.sql import Row

# Small lookup table
lookup_data = [
    Row(city="Delhi",      region="North"),
    Row(city="Mumbai",     region="West"),
    Row(city="Bangalore",  region="South"),
    Row(city="Hyderabad",  region="South"),
    Row(city="Pune",       region="West"),
    Row(city="Chennai",    region="South"),
    Row(city="Kolkata",    region="East")
]

lookup_df = spark.createDataFrame(lookup_data)

lookup_df.show()


+---------+------+
|     city|region|
+---------+------+
|    Delhi| North|
|   Mumbai|  West|
|Bangalore| South|
|Hyderabad| South|
|     Pune|  West|
|  Chennai| South|
|  Kolkata|  East|
+---------+------+



In [38]:
#Task 2 — Apply broadcast join
from pyspark.sql.functions import broadcast

df7_joined = (df6_completed
              .join(
                  broadcast(lookup_df),
                  df6_completed.city_std == lookup_df.city,
                  "left"
              ))

df7_joined.select("order_id","city_std","region","amount_clean").show(10, truncate=False)


+-----------+---------+------+------------+
|order_id   |city_std |region|amount_clean|
+-----------+---------+------+------------+
|ORD00000001|Pune     |West  |35430       |
|ORD00000007|Pune     |West  |45362       |
|ORD00000008|Bangalore|South |10563       |
|ORD00000010|Bangalore|South |66576       |
|ORD00000011|Kolkata  |East  |50318       |
|ORD00000012|Bangalore|South |84768       |
|ORD00000014|Mumbai   |West  |79469       |
|ORD00000015|Pune     |West  |81018       |
|ORD00000017|Bangalore|South |69582       |
|ORD00000019|Mumbai   |West  |NULL        |
+-----------+---------+------+------------+
only showing top 10 rows


In [39]:
df7_joined.explain(True)
#Task 3 — Verify that BroadcastHashJoin is used

== Parsed Logical Plan ==
Join LeftOuter, (city_std#71 = city#870)
:- Filter (lower(status_std#74) = completed)
:  +- Project [order_id#17, customer_id#18, city#19, category#20, product#21, amount#22, order_date#23, status#24, city_std#71, category_std#72, product_std#73, status_std#74, amount_clean#104, order_date_clean#115]
:     +- Filter (rn#318 = 1)
:        +- Project [order_id#17, customer_id#18, city#19, category#20, product#21, amount#22, order_date#23, status#24, city_std#71, category_std#72, product_std#73, status_std#74, amount_clean#104, order_date_clean#115, rn#318]
:           +- Project [order_id#17, customer_id#18, city#19, category#20, product#21, amount#22, order_date#23, status#24, city_std#71, category_std#72, product_std#73, status_std#74, amount_clean#104, order_date_clean#115, rn#318, rn#318]
:              +- Window [row_number() windowspecdefinition(order_id#17, order_date_clean#115 DESC NULLS LAST, specifiedwindowframe(RowFrame, unboundedpreceding$(), current

PHASE 8

In [40]:
#Task 1 — Create the classification logic using UDF
from pyspark.sql import functions as F
from pyspark.sql.types import StringType

# UDF to classify order value
def classify_amount(amount):
    if amount is None:
        return None
    if amount >= 80000:
        return "High"
    elif amount >= 40000:
        return "Medium"
    else:
        return "Low"

classify_udf = F.udf(classify_amount, StringType())


In [41]:
#Task 2 — Add the new column order_value_category
df8 = df7_joined.withColumn("order_value_category",
                            classify_udf(F.col("amount_clean")))

df8.select("order_id", "amount_clean", "order_value_category").show(15, truncate=False)


+-----------+------------+--------------------+
|order_id   |amount_clean|order_value_category|
+-----------+------------+--------------------+
|ORD00000001|35430       |Low                 |
|ORD00000007|45362       |Medium              |
|ORD00000008|10563       |Low                 |
|ORD00000010|66576       |Medium              |
|ORD00000011|50318       |Medium              |
|ORD00000012|84768       |High                |
|ORD00000014|79469       |Medium              |
|ORD00000015|81018       |High                |
|ORD00000017|69582       |Medium              |
|ORD00000019|NULL        |NULL                |
|ORD00000022|48832       |Medium              |
|ORD00000023|12000       |Low                 |
|ORD00000024|18082       |Low                 |
|ORD00000025|58248       |Medium              |
|ORD00000028|70675       |Medium              |
+-----------+------------+--------------------+
only showing top 15 rows


In [42]:
#Task 3 — Analyze distribution of the order value categories

df8.groupBy("order_value_category") \
   .agg(F.count("*").alias("count")) \
   .orderBy(F.col("count").desc()) \
   .show(truncate=False)


+--------------------+------+
|order_value_category|count |
+--------------------+------+
|Low                 |121794|
|Medium              |111365|
|High                |27936 |
|NULL                |23905 |
+--------------------+------+



PHASE 9

In [43]:

#Task 1 — Convert the cleaned DataFrame (df8) to RDD
rdd = df8.rdd

In [44]:
#Task 2 — Compute total revenue using reduce()

amount_rdd = rdd.map(lambda row: row.amount_clean).filter(lambda x: x is not None)

total_revenue_rdd = amount_rdd.reduce(lambda a, b: a + b)

print("Total Revenue (RDD):", total_revenue_rdd)


Total Revenue (RDD): 11436490724


In [45]:
#Task 3 — Compute number of orders per city using map + reduceByKey

orders_per_city_rdd = (rdd
                       .map(lambda row: (row.city_std, 1))
                       .reduceByKey(lambda a, b: a + b))

orders_per_city_rdd.collect()


[('Pune', 40883),
 ('Mumbai', 40612),
 ('Hyderabad', 41041),
 ('Delhi', 40854),
 ('Bangalore', 40311),
 ('Kolkata', 40563),
 ('Chennai', 40736)]

In [46]:

for city, count in orders_per_city_rdd.collect():
    print(city, "→", count)


Pune → 40883
Mumbai → 40612
Hyderabad → 41041
Delhi → 40854
Bangalore → 40311
Kolkata → 40563
Chennai → 40736


PHASE 10

In [47]:

# PHASE 10 – Task 2
df8_cached = df8.cache()


In [48]:

# PHASE 10 – Task 3
df8_cached.count()



285000

In [49]:
#TASK 4
import time
from pyspark.sql import functions as F

# FIRST RUN (without warming cache)
start = time.time()
df8_cached.groupBy("city_std").agg(F.sum("amount_clean")).count()
end = time.time()
print("1st run time (slower – cold cache):", end - start)


1st run time (slower – cold cache): 5.382697820663452


In [50]:

# SECOND RUN (faster – using cached data)
start = time.time()
df8_cached.groupBy("city_std").agg(F.sum("amount_clean")).count()
end = time.time()
print("2nd run time (faster – hot cache):", end - start)


2nd run time (faster – hot cache): 3.4156618118286133


In [51]:

# PHASE 10 – Task 5
df8_cached.unpersist()


DataFrame[order_id: string, customer_id: string, city: string, category: string, product: string, amount: string, order_date: string, status: string, city_std: string, category_std: string, product_std: string, status_std: string, amount_clean: int, order_date_clean: date, city: string, region: string, order_value_category: string]

PHASE 11

In [53]:

from pyspark.sql import functions as F

l = df6_completed.alias("l")
r = lookup_df.alias("r")

df7_joined = (
    l.join(F.broadcast(r), F.col("l.city_std") == F.col("r.city"), "left")
     .drop(F.col("r.city"))  # drop the lookup's 'city' column to avoid duplicate name
)

# Carry on to Phase 8 and beyond using df7_joined (no duplicate 'city')


In [56]:

# Remove any column(s) literally named 'city'
df8_no_dup = df8.drop("city")

# Now write Parquet partitioned by city_std
parquet_path = "/content/out/clean_parquet_partitioned_city"
(df8_no_dup
 .write
 .mode("overwrite")
 .partitionBy("city_std")
 .parquet(parquet_path))

print("Parquet written to:", parquet_path)



Parquet written to: /content/out/clean_parquet_partitioned_city


In [58]:
#PHASE 11 – Task 1
parquet_path = "/content/out/clean_parquet_partitioned_city"

(df8_no_dup
 .write
 .mode("overwrite")
 .partitionBy("city_std")
 .parquet(parquet_path))

print("Parquet written to:", parquet_path)


Parquet written to: /content/out/clean_parquet_partitioned_city


In [59]:
#Task 2 — Write aggregated datasets to ORC
from pyspark.sql import functions as F

orc_path_city    = "/content/out/agg_orc/revenue_by_city"
orc_path_category= "/content/out/agg_orc/revenue_by_category"

rev_by_city_orc = (df8.groupBy("city_std")
                   .agg(F.sum("amount_clean").alias("total_revenue")))

rev_by_category_orc = (df8.groupBy("category_std")
                       .agg(F.sum("amount_clean").alias("total_revenue")))

(rev_by_city_orc.write.mode("overwrite").orc(orc_path_city))
(rev_by_category_orc.write.mode("overwrite").orc(orc_path_category))

print("ORC written to:")
print(" -", orc_path_city)
print(" -", orc_path_category)


ORC written to:
 - /content/out/agg_orc/revenue_by_city
 - /content/out/agg_orc/revenue_by_category


In [61]:

# PHASE 11 – Task 3
parquet_back = spark.read.parquet(parquet_path)
orc_city_back = spark.read.orc(orc_path_city)
orc_cat_back  = spark.read.orc(orc_path_category)

print("=== Parquet (clean partitioned) ===")
parquet_back.printSchema()
print("Row count (Parquet):", parquet_back.count())

print("\n=== ORC (rev by city) ===")
orc_city_back.printSchema()
print("Row count (ORC city):", orc_city_back.count())

print("\n=== ORC (rev by category) ===")
orc_cat_back.printSchema()
print("Row count (ORC category):", orc_cat_back.count())



=== Parquet (clean partitioned) ===
root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)
 |-- category_std: string (nullable = true)
 |-- product_std: string (nullable = true)
 |-- status_std: string (nullable = true)
 |-- amount_clean: integer (nullable = true)
 |-- order_date_clean: date (nullable = true)
 |-- region: string (nullable = true)
 |-- order_value_category: string (nullable = true)
 |-- city_std: string (nullable = true)

Row count (Parquet): 285000

=== ORC (rev by city) ===
root
 |-- city_std: string (nullable = true)
 |-- total_revenue: long (nullable = true)

Row count (ORC city): 7

=== ORC (rev by category) ===
root
 |-- category_std: string (nullable = true)
 |-- total_revenue: long (nullable = true)

Row count (ORC category): 4


In [62]:
#TASK 4 Compare size on disk:

import os

csv_path = "/content/out/clean_csv"
# Note: writing CSV of the entire dataset can be large; proceed anyway for comparison
(df8
 .write
 .mode("overwrite")
 .option("header", True)
 .csv(csv_path))

def dir_size_mb(path):
    total = 0
    for root, _, files in os.walk(path):
        for f in files:
            fp = os.path.join(root, f)
            total += os.path.getsize(fp)
    return round(total / (1024*1024), 2)

print("Sizes (MB):")
print("  CSV     :", dir_size_mb(csv_path))
print("  Parquet :", dir_size_mb(parquet_path))
print("  ORC city:", dir_size_mb(orc_path_city))
print("  ORC cat :", dir_size_mb(orc_path_category))


Sizes (MB):
  CSV     : 37.51
  Parquet : 6.46
  ORC city: 0.0
  ORC cat : 0.0


In [63]:
#TASK 5
import time

# CSV re-read
t0 = time.time()
csv_back = spark.read.option("header", True).csv(csv_path)
csv_cnt = csv_back.count()
t1 = time.time()

# Parquet re-read
t2 = time.time()
parquet_back2 = spark.read.parquet(parquet_path)
parquet_cnt = parquet_back2.count()
t3 = time.time()

print(f"CSV  count={csv_cnt},  read+count time: {t1 - t0:.2f}s")
print(f"PARQ count={parquet_cnt}, read+count time: {t3 - t2:.2f}s")


CSV  count=285000,  read+count time: 0.44s
PARQ count=285000, read+count time: 0.32s


PHASE 12

In [64]:
df = df.filter(df.amount > 50000).show()

{"ts": "2026-01-15 10:55:33.094", "level": "ERROR", "logger": "DataFrameQueryContextLogger", "msg": "[CAST_INVALID_INPUT] The value 'invalid' of the type \"STRING\" cannot be cast to \"BIGINT\" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018", "context": {"file": "line 1 in cell [64]", "line": "", "fragment": "__gt__", "errorClass": "CAST_INVALID_INPUT"}, "exception": {"class": "Py4JJavaError", "msg": "An error occurred while calling o650.showString.\n: org.apache.spark.SparkNumberFormatException: [CAST_INVALID_INPUT] The value 'invalid' of the type \"STRING\" cannot be cast to \"BIGINT\" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018\n== DataFrame ==\n\"__gt__\" was called from\nline 1 in cell [64]\n\n\tat org.apache.spark.sql.errors.Quer

NumberFormatException: [CAST_INVALID_INPUT] The value 'invalid' of the type "STRING" cannot be cast to "BIGINT" because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. SQLSTATE: 22018
== DataFrame ==
"__gt__" was called from
line 1 in cell [64]


The line
df = df.filter(df.amount > 50000).show()
breaks because .show() is an action that returns None.
When we assign this to df, the DataFrame variable becomes None, not a DataFrame.
Therefore Spark cannot apply further transformations on it.
The correct pattern is to apply transformations, assign the DataFrame, and call .show() separately.

PHASE 13


In [65]:

# PHASE 13 – Task 1  Confirm amount_clean is IntegerType
df8.printSchema()


root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- category: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- status: string (nullable = true)
 |-- city_std: string (nullable = true)
 |-- category_std: string (nullable = true)
 |-- product_std: string (nullable = true)
 |-- status_std: string (nullable = true)
 |-- amount_clean: integer (nullable = true)
 |-- order_date_clean: date (nullable = true)
 |-- city: string (nullable = true)
 |-- region: string (nullable = true)
 |-- order_value_category: string (nullable = true)



In [66]:
#Task 2 — Confirm order_date_clean is DateType
df8.schema["order_date_clean"].dataType

DateType()

In [67]:
#Check for NULLs in critical fields
from pyspark.sql import functions as F

critical_cols = [
    "order_id",
    "customer_id",
    "city_std",
    "category_std",
    "product_std",
    "amount_clean",
    "order_date_clean",
    "status_std"
]

null_checks = df8.select([
    F.sum(F.col(c).isNull().cast("int")).alias(c + "_nulls")
    for c in critical_cols
])

null_checks.show(truncate=False)


+--------------+-----------------+--------------+------------------+-----------------+------------------+----------------------+----------------+
|order_id_nulls|customer_id_nulls|city_std_nulls|category_std_nulls|product_std_nulls|amount_clean_nulls|order_date_clean_nulls|status_std_nulls|
+--------------+-----------------+--------------+------------------+-----------------+------------------+----------------------+----------------+
|0             |0                |0             |0                 |0                |23905             |2465                  |0               |
+--------------+-----------------+--------------+------------------+-----------------+------------------+----------------------+----------------+



In [68]:
#Task 4 — Check row counts through all phases
print("Raw Count        :", raw_count)
print("After Cleaning   :", clean_count)
print("After Dedup      :", dedup_count)
print("After Completed  :", completed_count)
print("Final DF Count   :", df8.count())


Raw Count        : 300000
After Cleaning   : 300000
After Dedup      : 300000
After Completed  : 285000
Final DF Count   : 285000
