In [1]:
from pyspark.sql import SparkSession
spark=SparkSession.builder\
    .appName("Day20ex")\
    .getOrCreate()

In [2]:
raw_orders = [
    ("ORD001","C001","Ravi"," Delhi ","Laptop","Electronics","45000","2024-01-05","Completed"),
    ("ORD002","C002","Sneha","Mumbai"," Mobile ","Electronics","32000","05/01/2024","Completed"),
    ("ORD003","C003","Aman","Bangalore","Laptop","Electronics","55000","2024/01/06","Completed"),
    ("ORD004","C004","Pooja","Delhi","Tablet"," Electronics ","","2024-01-07","Cancelled"),
    ("ORD005","C005","Neha","Chennai","Laptop","Electronics","48000","invalid_date","Completed"),
    ("ORD006","C006","Rahul","Mumbai","Mobile","Electronics",None,"2024-01-08","Completed"),
    ("ORD007","C007","Kiran","Bangalore","Tablet","Electronics","30000","2024-01-08","Completed"),
    ("ORD008","C008","Amit","Delhi","Laptop","electronics","45000","2024-01-09","Completed"),
    ("ORD009","C009","Priya"," Pune","Mobile","Electronics","28000","09-01-2024","Completed"),
    ("ORD010","C010","Suresh","Mumbai","Laptop","Electronics","55000","2024-01-10","Completed"),
    ("ORD010","C010","Suresh","Mumbai","Laptop","Electronics","55000","2024-01-10","Completed"),
    ("ORD011","C011","Meena","Chennai","Tablet","Electronics","31000","2024-01-11","Completed"),
    ("ORD012","C012","Arjun","Delhi","Mobile","Electronics","27000","2024/01/11","Completed"),
    ("ORD013","C013","Nikhil","Bangalore","Laptop","Electronics","60000","2024-01-12","Completed"),
    ("ORD014","C014","Rohit","Mumbai","Mobile","Electronics","invalid_price","2024-01-12","Completed"),
    ("ORD015","C015","Anita","Delhi","Tablet","Electronics","29000","2024-01-13","Completed"),
    ("ORD016","C016","Vikas","Chennai","Laptop","Electronics","52000","2024-01-13","Completed"),
    ("ORD017","C017","Sunita","Mumbai","Mobile","Electronics","33000","2024-01-14","Completed"),
    ("ORD018","C018","Deepak","Bangalore","Laptop","Electronics","58000","2024-01-14","Completed"),
    ("ORD019","C019","Pallavi","Delhi","Mobile","Electronics","26000","2024-01-15","Completed"),
    ("ORD020","C020","Manish","Mumbai","Tablet","Electronics","34000","2024-01-15","Completed")
]
columns=["order_id","customer_id","customer_name","city","product","category","price","order_date","order_status"]
df=spark.createDataFrame(raw_orders,columns)
df.show()

+--------+-----------+-------------+---------+--------+-------------+-------------+------------+------------+
|order_id|customer_id|customer_name|     city| product|     category|        price|  order_date|order_status|
+--------+-----------+-------------+---------+--------+-------------+-------------+------------+------------+
|  ORD001|       C001|         Ravi|   Delhi |  Laptop|  Electronics|        45000|  2024-01-05|   Completed|
|  ORD002|       C002|        Sneha|   Mumbai| Mobile |  Electronics|        32000|  05/01/2024|   Completed|
|  ORD003|       C003|         Aman|Bangalore|  Laptop|  Electronics|        55000|  2024/01/06|   Completed|
|  ORD004|       C004|        Pooja|    Delhi|  Tablet| Electronics |             |  2024-01-07|   Cancelled|
|  ORD005|       C005|         Neha|  Chennai|  Laptop|  Electronics|        48000|invalid_date|   Completed|
|  ORD006|       C006|        Rahul|   Mumbai|  Mobile|  Electronics|         NULL|  2024-01-08|   Completed|
|  ORD007|

Rename all columns to snake_case

In [3]:
def to_snake(s: str) -> str:
    return s.strip().replace(" ", "_").replace("-", "_").lower()

df = df.toDF(*[to_snake(c) for c in df.columns])
df.show()


+--------+-----------+-------------+---------+--------+-------------+-------------+------------+------------+
|order_id|customer_id|customer_name|     city| product|     category|        price|  order_date|order_status|
+--------+-----------+-------------+---------+--------+-------------+-------------+------------+------------+
|  ORD001|       C001|         Ravi|   Delhi |  Laptop|  Electronics|        45000|  2024-01-05|   Completed|
|  ORD002|       C002|        Sneha|   Mumbai| Mobile |  Electronics|        32000|  05/01/2024|   Completed|
|  ORD003|       C003|         Aman|Bangalore|  Laptop|  Electronics|        55000|  2024/01/06|   Completed|
|  ORD004|       C004|        Pooja|    Delhi|  Tablet| Electronics |             |  2024-01-07|   Cancelled|
|  ORD005|       C005|         Neha|  Chennai|  Laptop|  Electronics|        48000|invalid_date|   Completed|
|  ORD006|       C006|        Rahul|   Mumbai|  Mobile|  Electronics|         NULL|  2024-01-08|   Completed|
|  ORD007|

Add a column price_with_tax (18%)

In [15]:
from pyspark.sql import functions as F
df = df.withColumn("price_with_tax", F.expr("try_cast(price as double) * 1.18"))
df.show()


+--------+-----------+-------------+---------+--------+-------------+-------------+------------+------------+---------+--------------+--------------+
|order_id|customer_id|customer_name|     city| product|     category|        price|  order_date|order_status|price_int|price_with_tax|price_category|
+--------+-----------+-------------+---------+--------+-------------+-------------+------------+------------+---------+--------------+--------------+
|  ORD001|       C001|         Ravi|   Delhi |  Laptop|  Electronics|        45000|  2024-01-05|   Completed|    45000|       53100.0|          High|
|  ORD002|       C002|        Sneha|   Mumbai| Mobile |  Electronics|        32000|  05/01/2024|   Completed|    32000|       37760.0|          High|
|  ORD003|       C003|         Aman|Bangalore|  Laptop|  Electronics|        55000|  2024/01/06|   Completed|    55000|       64900.0|          High|
|  ORD004|       C004|        Pooja|    Delhi|  Tablet| Electronics |             |  2024-01-07|   C

Add a column price_category (Low / Medium / High)

In [20]:
from pyspark.sql import functions as F
df=df.withColumn("price_category",
           F.when(F.expr("try_cast(price as double)")<500,"Low")
          .when((F.expr("try_cast(price as double)")>=500)&(F.expr("try_cast(price as double)")<1500),"Medium")
          .otherwise("High"))
df.show()

+--------+-----------+-------------+---------+-------+-----------+-------------+------------+------------+---------+--------------+--------------+
|order_id|customer_id|customer_name|     city|product|   category|        price|  order_date|order_status|price_int|price_with_tax|price_category|
+--------+-----------+-------------+---------+-------+-----------+-------------+------------+------------+---------+--------------+--------------+
|  ORD001|       C001|         Ravi|    Delhi| Laptop|Electronics|        45000|  2024-01-05|   Completed|    45000|       53100.0|          High|
|  ORD002|       C002|        Sneha|   Mumbai| Mobile|Electronics|        32000|  05/01/2024|   Completed|    32000|       37760.0|          High|
|  ORD003|       C003|         Aman|Bangalore| Laptop|Electronics|        55000|  2024/01/06|   Completed|    55000|       64900.0|          High|
|  ORD004|       C004|        Pooja|    Delhi| Tablet|Electronics|             |  2024-01-07|   Cancelled|     NULL|  

In [21]:

from pyspark.sql import functions as F

df = (
    df.withColumn("city", F.initcap(F.trim(F.col("city"))))
      .withColumn("product", F.initcap(F.trim(F.col("product"))))
      .withColumn("category", F.initcap(F.trim(F.col("category"))))

)
df.show()


+--------+-----------+-------------+---------+-------+-----------+-------------+------------+------------+---------+--------------+--------------+
|order_id|customer_id|customer_name|     city|product|   category|        price|  order_date|order_status|price_int|price_with_tax|price_category|
+--------+-----------+-------------+---------+-------+-----------+-------------+------------+------------+---------+--------------+--------------+
|  ORD001|       C001|         Ravi|    Delhi| Laptop|Electronics|        45000|  2024-01-05|   Completed|    45000|       53100.0|          High|
|  ORD002|       C002|        Sneha|   Mumbai| Mobile|Electronics|        32000|  05/01/2024|   Completed|    32000|       37760.0|          High|
|  ORD003|       C003|         Aman|Bangalore| Laptop|Electronics|        55000|  2024/01/06|   Completed|    55000|       64900.0|          High|
|  ORD004|       C004|        Pooja|    Delhi| Tablet|Electronics|             |  2024-01-07|   Cancelled|     NULL|  

Convert price to integer

In [22]:
df=df.withColumn("price_num",F.expr("try_cast(regexp_replace(price,'[^0-9]','') as int)"))
df.show()



+--------+-----------+-------------+---------+-------+-----------+-------------+------------+------------+---------+--------------+--------------+---------+
|order_id|customer_id|customer_name|     city|product|   category|        price|  order_date|order_status|price_int|price_with_tax|price_category|price_num|
+--------+-----------+-------------+---------+-------+-----------+-------------+------------+------------+---------+--------------+--------------+---------+
|  ORD001|       C001|         Ravi|    Delhi| Laptop|Electronics|        45000|  2024-01-05|   Completed|    45000|       53100.0|          High|    45000|
|  ORD002|       C002|        Sneha|   Mumbai| Mobile|Electronics|        32000|  05/01/2024|   Completed|    32000|       37760.0|          High|    32000|
|  ORD003|       C003|         Aman|Bangalore| Laptop|Electronics|        55000|  2024/01/06|   Completed|    55000|       64900.0|          High|    55000|
|  ORD004|       C004|        Pooja|    Delhi| Tablet|Elec

Handle invalid and null prices

In [23]:
df=df.fillna({"price_num":0})
df.show()

+--------+-----------+-------------+---------+-------+-----------+-------------+------------+------------+---------+--------------+--------------+---------+
|order_id|customer_id|customer_name|     city|product|   category|        price|  order_date|order_status|price_int|price_with_tax|price_category|price_num|
+--------+-----------+-------------+---------+-------+-----------+-------------+------------+------------+---------+--------------+--------------+---------+
|  ORD001|       C001|         Ravi|    Delhi| Laptop|Electronics|        45000|  2024-01-05|   Completed|    45000|       53100.0|          High|    45000|
|  ORD002|       C002|        Sneha|   Mumbai| Mobile|Electronics|        32000|  05/01/2024|   Completed|    32000|       37760.0|          High|    32000|
|  ORD003|       C003|         Aman|Bangalore| Laptop|Electronics|        55000|  2024/01/06|   Completed|    55000|       64900.0|          High|    55000|
|  ORD004|       C004|        Pooja|    Delhi| Tablet|Elec

Normalize all dates into DateType

In [29]:

from pyspark.sql import functions as F

clean = F.regexp_replace(F.trim(F.col("order_date")), r"[./]", "-")

df = df.withColumn(
    "order_dt",
    F.when(clean.rlike(r"^\d{4}-\d{2}-\d{2}$"), F.to_date(clean, "yyyy-MM-dd"))
     .when(clean.rlike(r"^\d{2}-\d{2}-\d{4}$"), F.to_date(clean, "dd-MM-yyyy"))
     .otherwise(F.lit(None).cast("date"))
)

df.select("order_date", "order_dt").show(truncate=False)


+------------+----------+
|order_date  |order_dt  |
+------------+----------+
|2024-01-05  |2024-01-05|
|05/01/2024  |2024-01-05|
|2024/01/06  |2024-01-06|
|2024-01-07  |2024-01-07|
|invalid_date|NULL      |
|2024-01-08  |2024-01-08|
|2024-01-08  |2024-01-08|
|2024-01-09  |2024-01-09|
|09-01-2024  |2024-01-09|
|2024-01-10  |2024-01-10|
|2024-01-10  |2024-01-10|
|2024-01-11  |2024-01-11|
|2024/01/11  |2024-01-11|
|2024-01-12  |2024-01-12|
|2024-01-12  |2024-01-12|
|2024-01-13  |2024-01-13|
|2024-01-13  |2024-01-13|
|2024-01-14  |2024-01-14|
|2024-01-14  |2024-01-14|
|2024-01-15  |2024-01-15|
+------------+----------+
only showing top 20 rows


Remove duplicate orders

In [31]:
df = df.dropDuplicates(["order_id"])
df.show()


+--------+-----------+-------------+---------+-------+-----------+-------------+------------+------------+---------+--------------+--------------+---------+----------+
|order_id|customer_id|customer_name|     city|product|   category|        price|  order_date|order_status|price_int|price_with_tax|price_category|price_num|  order_dt|
+--------+-----------+-------------+---------+-------+-----------+-------------+------------+------------+---------+--------------+--------------+---------+----------+
|  ORD001|       C001|         Ravi|    Delhi| Laptop|Electronics|        45000|  2024-01-05|   Completed|    45000|       53100.0|          High|    45000|2024-01-05|
|  ORD002|       C002|        Sneha|   Mumbai| Mobile|Electronics|        32000|  05/01/2024|   Completed|    32000|       37760.0|          High|    32000|2024-01-05|
|  ORD003|       C003|         Aman|Bangalore| Laptop|Electronics|        55000|  2024/01/06|   Completed|    55000|       64900.0|          High|    55000|2024

Filter only Completed orders


In [33]:
df = df.filter(F.col("order_status") == "Completed")
df.show()

+--------+-----------+-------------+---------+-------+-----------+-------------+------------+------------+---------+--------------+--------------+---------+----------+
|order_id|customer_id|customer_name|     city|product|   category|        price|  order_date|order_status|price_int|price_with_tax|price_category|price_num|  order_dt|
+--------+-----------+-------------+---------+-------+-----------+-------------+------------+------------+---------+--------------+--------------+---------+----------+
|  ORD001|       C001|         Ravi|    Delhi| Laptop|Electronics|        45000|  2024-01-05|   Completed|    45000|       53100.0|          High|    45000|2024-01-05|
|  ORD002|       C002|        Sneha|   Mumbai| Mobile|Electronics|        32000|  05/01/2024|   Completed|    32000|       37760.0|          High|    32000|2024-01-05|
|  ORD003|       C003|         Aman|Bangalore| Laptop|Electronics|        55000|  2024/01/06|   Completed|    55000|       64900.0|          High|    55000|2024

Create order_year , order_month

In [34]:

from pyspark.sql import functions as F

df = (
    df.withColumn("order_year", F.year(F.col("order_dt")))
      .withColumn("order_month", F.month(F.col("order_dt")))
)

df.select("order_date", "order_dt", "order_year", "order_month").show(truncate=False)


+------------+----------+----------+-----------+
|order_date  |order_dt  |order_year|order_month|
+------------+----------+----------+-----------+
|2024-01-05  |2024-01-05|2024      |1          |
|05/01/2024  |2024-01-05|2024      |1          |
|2024/01/06  |2024-01-06|2024      |1          |
|invalid_date|NULL      |NULL      |NULL       |
|2024-01-08  |2024-01-08|2024      |1          |
|2024-01-08  |2024-01-08|2024      |1          |
|2024-01-09  |2024-01-09|2024      |1          |
|09-01-2024  |2024-01-09|2024      |1          |
|2024-01-10  |2024-01-10|2024      |1          |
|2024-01-11  |2024-01-11|2024      |1          |
|2024/01/11  |2024-01-11|2024      |1          |
|2024-01-12  |2024-01-12|2024      |1          |
|2024-01-12  |2024-01-12|2024      |1          |
|2024-01-13  |2024-01-13|2024      |1          |
|2024-01-13  |2024-01-13|2024      |1          |
|2024-01-14  |2024-01-14|2024      |1          |
|2024-01-14  |2024-01-14|2024      |1          |
|2024-01-15  |2024-0

Aggregate total revenue per city

In [35]:

from pyspark.sql import functions as F

revenue_by_city = df.groupBy("city").agg(F.sum("price_int").alias("total_revenue"))
revenue_by_city.show()

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|Bangalore|       203000|
|  Chennai|       131000|
|   Mumbai|       154000|
|     Pune|        28000|
|    Delhi|       172000|
+---------+-------------+



Aggregate total revenue per product

In [36]:

from pyspark.sql import functions as F
revenue_by_product = df.groupBy("product").agg(F.sum("price_int").alias("total_revenue"))
revenue_by_product.orderBy(F.desc("total_revenue")).show(truncate=False)


+-------+-------------+
|product|total_revenue|
+-------+-------------+
|Laptop |418000       |
|Mobile |146000       |
|Tablet |124000       |
+-------+-------------+



Identify top 3 cities by revenue

In [37]:

from pyspark.sql import functions as F
revenue_by_city = df.groupBy("city").agg(F.sum("price_int").alias("total_revenue"))
top3_cities = revenue_by_city.orderBy(F.desc("total_revenue")).limit(3)
top3_cities.show(truncate=False)


+---------+-------------+
|city     |total_revenue|
+---------+-------------+
|Bangalore|203000       |
|Delhi    |172000       |
|Mumbai   |154000       |
+---------+-------------+



Identify products with average price above threshold

In [38]:

from pyspark.sql import functions as F
threshold = 45000
products_avg_above = (
    df.groupBy("product")
      .agg(F.avg("price_int").alias("avg_price"))
      .filter(F.col("avg_price") > threshold)
      .orderBy(F.desc("avg_price"))
)

products_avg_above.show(truncate=False)


+-------+---------+
|product|avg_price|
+-------+---------+
|Laptop |52250.0  |
+-------+---------+



15. Write cleaned data to Parquet

In [39]:

parquet_path = "/tmp/cleaned_orders_parquet"

df.write.mode("overwrite").parquet(parquet_path)


Read Parquet back and verify schema

In [40]:
parquet_path = "/tmp/cleaned_orders_parquet"
df_parquet = spark.read.parquet(parquet_path)
df_parquet.printSchema()
df_parquet.show(truncate=False)


root
 |-- order_id: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- customer_name: string (nullable = true)
 |-- city: string (nullable = true)
 |-- product: string (nullable = true)
 |-- category: string (nullable = true)
 |-- price: string (nullable = true)
 |-- order_date: string (nullable = true)
 |-- order_status: string (nullable = true)
 |-- price_int: integer (nullable = true)
 |-- price_with_tax: double (nullable = true)
 |-- price_category: string (nullable = true)
 |-- price_num: integer (nullable = true)
 |-- order_dt: date (nullable = true)
 |-- order_year: integer (nullable = true)
 |-- order_month: integer (nullable = true)

+--------+-----------+-------------+---------+-------+-----------+-------------+------------+------------+---------+--------------+--------------+---------+----------+----------+-----------+
|order_id|customer_id|customer_name|city     |product|category   |price        |order_date  |order_status|price_int|price_with_tax|price

Write the same data to ORC

In [41]:
orc_path = "/tmp/cleaned_orders_orc"
df.write.mode("overwrite").orc(orc_path)


Check number of partitions

In [43]:

print("Number of partitions:", df.rdd.getNumPartitions())


Number of partitions: 1


Repartition before writing

In [45]:

df.repartition(4).write.mode("overwrite").parquet("/tmp/cleaned_orders_parquet")

print("Before:", df.rdd.getNumPartitions())
df = df.repartition(4)
print("After:", df.rdd.getNumPartitions())



Before: 1
After: 4


Compare file counts between Parquet and ORC

In [46]:

df_parquet = spark.read.parquet("/tmp/cleaned_orders_parquet")
df_orc     = spark.read.orc("/tmp/cleaned_orders_orc")

print("Parquet partitions:", df_parquet.rdd.getNumPartitions())
print("ORC partitions:", df_orc.rdd.getNumPartitions())


Parquet partitions: 2
ORC partitions: 1


Run explain(True) on final pipeline

In [48]:

print("Explain plan for final df (df6):")
df.explain(True)



Explain plan for final df (df6):
== Parsed Logical Plan ==
Repartition 4, true
+- Project [order_id#37, customer_id#38, customer_name#39, city#454, product#455, category#456, price#43, order_date#44, order_status#45, price_int#577, price_with_tax#307, price_category#416, price_num#535, order_dt#663, order_year#1471, month(order_dt#663) AS order_month#1472]
   +- Project [order_id#37, customer_id#38, customer_name#39, city#454, product#455, category#456, price#43, order_date#44, order_status#45, price_int#577, price_with_tax#307, price_category#416, price_num#535, order_dt#663, year(order_dt#663) AS order_year#1471]
      +- Filter (order_status#45 = Completed)
         +- Filter (order_status#45 = Completed)
            +- Deduplicate [order_id#37]
               +- Deduplicate [order_id#37]
                  +- Project [order_id#37, customer_id#38, customer_name#39, city#454, product#455, category#456, price#43, order_date#44, order_status#45, price_int#577, price_with_tax#307, price_