<a href="https://colab.research.google.com/github/codingniket/Python-Training/blob/main/23_12_2025/Milestone_1_RE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, when,regexp_replace, split, trim, array_compact, transform, get_json_object, lower
spark = SparkSession.builder.appName("MileStone1").getOrCreate()
from pyspark.sql.types import (StructType, StructField, StringType,LongType,IntegerType,ArrayType,MapType)

1. Define an explicit schema

2. Create a DataFrame using the schema
3. Print schema and validate data types

In [7]:
orders_data = [
("O001","Delhi ","Laptop","45000","2024-01-05","Completed"),
("O002","Mumbai","Mobile ","32000","05/01/2024","Completed"),
("O003","Bangalore","Tablet","30000","2024/01/06","Completed"),
("O004","Delhi","Laptop","","2024-01-07","Cancelled"),
("O005","Mumbai","Mobile","invalid","2024-01-08","Completed"),
("O006","Chennai","Tablet",None,"2024-01-08","Completed"),
("O007","Delhi","Laptop","47000","09-01-2024","Completed"),
("O008","Bangalore","Mobile","28000","2024-01-09","Completed"),
("O009","Mumbai","Laptop","55000","2024-01-10","Completed"),
("O009","Mumbai","Laptop","55000","2024-01-10","Completed")
]

In [8]:
orders_schema = StructType([
    StructField("order_id", StringType(), True),
    StructField("city", StringType(), True),
    StructField("product",StringType(),True),
    StructField("amount",StringType(),True),
    StructField("order_date",StringType(),True),
     StructField("status",StringType(),True),
])


order_df =  spark.createDataFrame(orders_data,orders_schema)

In [6]:
order_df.printSchema()
order_df.show()

root
 |-- order_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- product: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- date: string (nullable = true)
 |-- status: string (nullable = true)

+--------+---------+-------+-------+----------+---------+
|order_id|     city|product| amount|      date|   status|
+--------+---------+-------+-------+----------+---------+
|    O001|   Delhi | Laptop|  45000|2024-01-05|Completed|
|    O002|   Mumbai|Mobile |  32000|05/01/2024|Completed|
|    O003|Bangalore| Tablet|  30000|2024/01/06|Completed|
|    O004|    Delhi| Laptop|       |2024-01-07|Cancelled|
|    O005|   Mumbai| Mobile|invalid|2024-01-08|Completed|
|    O006|  Chennai| Tablet|   NULL|2024-01-08|Completed|
|    O007|    Delhi| Laptop|  47000|09-01-2024|Completed|
|    O008|Bangalore| Mobile|  28000|2024-01-09|Completed|
|    O009|   Mumbai| Laptop|  55000|2024-01-10|Completed|
|    O009|   Mumbai| Laptop|  55000|2024-01-10|Completed|
+--------+---

In [9]:
from pyspark.sql.functions import col, upper ,to_date, coalesce, split, lit, array_remove, try_to_timestamp,regexp_extract

 Trim all string columns

. Standardize city and product values

. Convert amount to IntegerType

. Handle invalid and null amount values

. Remove duplicate orders

. Keep only Completed order

In [12]:
numeric_price_str = regexp_extract(col("amount"), r"(\d+)", 0)
clean_order_df = order_df.withColumn(
    "order_date",
    coalesce(
        to_date(try_to_timestamp(col("order_date"), lit("yyyy-MM-dd"))),
        to_date(try_to_timestamp(col("order_date"), lit("dd/MM/yyyy"))),
        to_date(try_to_timestamp(col("order_date"), lit("yyyy/MM/dd")))
    )
)\
.withColumn("amount",when((numeric_price_str == "") | numeric_price_str.isNull(), lit(0)).otherwise(numeric_price_str.cast('int')))\
.withColumn("product",trim(col("product")))\
.withColumn("city",trim(col("city")))

clean_order_df = clean_order_df.dropDuplicates()

completed_df = clean_order_df.filter(col("status") == "Completed")



completed_df.show()


+--------+---------+-------+------+----------+---------+
|order_id|     city|product|amount|order_date|   status|
+--------+---------+-------+------+----------+---------+
|    O002|   Mumbai| Mobile| 32000|2024-01-05|Completed|
|    O005|   Mumbai| Mobile|     0|2024-01-08|Completed|
|    O001|    Delhi| Laptop| 45000|2024-01-05|Completed|
|    O003|Bangalore| Tablet| 30000|2024-01-06|Completed|
|    O009|   Mumbai| Laptop| 55000|2024-01-10|Completed|
|    O008|Bangalore| Mobile| 28000|2024-01-09|Completed|
|    O006|  Chennai| Tablet|     0|2024-01-08|Completed|
|    O007|    Delhi| Laptop| 47000|      NULL|Completed|
+--------+---------+-------+------+----------+---------+



In [13]:
order_df = completed_df

Tasks

. Total revenue per city

. Total revenue per product

. Average order value per city

In [14]:
total_revenue_per_city = order_df.groupBy("city").agg({"amount": "sum"})
total_revenue_per_city.show()

+---------+-----------+
|     city|sum(amount)|
+---------+-----------+
|Bangalore|      58000|
|  Chennai|          0|
|   Mumbai|      87000|
|    Delhi|      92000|
+---------+-----------+



In [15]:
total_revenue_per_product = order_df.groupBy("product").agg({"amount": "sum"})
total_revenue_per_product.show()

+-------+-----------+
|product|sum(amount)|
+-------+-----------+
| Laptop|     147000|
| Mobile|      60000|
| Tablet|      30000|
+-------+-----------+



In [16]:
avg_order_value_per_city = order_df.groupBy("city").agg({"amount": "avg"})
avg_order_value_per_city.show()

+---------+-----------+
|     city|avg(amount)|
+---------+-----------+
|Bangalore|    29000.0|
|  Chennai|        0.0|
|   Mumbai|    29000.0|
|    Delhi|    46000.0|
+---------+-----------+



Tasks

. Rank cities by total revenue

. Identify top-performing city

In [21]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, desc

city_revenue_ranked = order_df.groupBy("city").agg({"amount": "sum"}).withColumnRenamed("sum(amount)", "total_revenue")
city_revenue_ranked = city_revenue_ranked.orderBy(desc("total_revenue"))

window_spec = Window.orderBy(desc("total_revenue"))
city_revenue_ranked = city_revenue_ranked.withColumn("city_rank", rank().over(window_spec))
city_revenue_ranked.show()


+---------+-------------+---------+
|     city|total_revenue|city_rank|
+---------+-------------+---------+
|    Delhi|        92000|        1|
|   Mumbai|        87000|        2|
|Bangalore|        58000|        3|
|  Chennai|            0|        4|
+---------+-------------+---------+



In [22]:
city_revenue_ranked.limit(3).show()

+---------+-------------+---------+
|     city|total_revenue|city_rank|
+---------+-------------+---------+
|    Delhi|        92000|        1|
|   Mumbai|        87000|        2|
|Bangalore|        58000|        3|
+---------+-------------+---------+



Tasks

. Cache the cleaned DataFrame

. Run two aggregations and observe behavior

. Use explain(True) to inspect the plan

In [23]:
order_df.cache()

DataFrame[order_id: string, city: string, product: string, amount: int, order_date: date, status: string]

In [24]:
order_df.groupBy("city").sum("amount").show()
order_df.groupBy("product").avg("amount").show()

+---------+-----------+
|     city|sum(amount)|
+---------+-----------+
|   Mumbai|      87000|
|Bangalore|      58000|
|    Delhi|      92000|
|  Chennai|          0|
+---------+-----------+

+-------+-----------+
|product|avg(amount)|
+-------+-----------+
| Laptop|    49000.0|
| Mobile|    20000.0|
| Tablet|    15000.0|
+-------+-----------+



In [25]:
order_df.explain(True)

== Parsed Logical Plan ==
'Filter '`=`('status, Completed)
+- Deduplicate [city#63, order_id#31, amount#61, product#62, order_date#60, status#36]
   +- Project [order_id#31, trim(city#32, None) AS city#63, product#62, amount#61, order_date#60, status#36]
      +- Project [order_id#31, city#32, trim(product#33, None) AS product#62, amount#61, order_date#60, status#36]
         +- Project [order_id#31, city#32, product#33, CASE WHEN ((regexp_extract(amount#34, (\d+), 0) = ) OR isnull(regexp_extract(amount#34, (\d+), 0))) THEN 0 ELSE cast(regexp_extract(amount#34, (\d+), 0) as int) END AS amount#61, order_date#60, status#36]
            +- Project [order_id#31, city#32, product#33, amount#34, coalesce(to_date(try_to_timestamp(order_date#35, Some(yyyy-MM-dd), TimestampType, Some(Etc/UTC), false), None, Some(Etc/UTC), true), to_date(try_to_timestamp(order_date#35, Some(dd/MM/yyyy), TimestampType, Some(Etc/UTC), false), None, Some(Etc/UTC), true), to_date(try_to_timestamp(order_date#35, Some