In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
  .appName("Retail Sales Analysis") \
  .getOrCreate()

In [3]:
sales_data = [
("T001","North","Delhi","Store-01","Laptop","2024-01-01",75000),
("T002","North","Delhi","Store-01","Mobile","2024-01-02",32000),
("T003","North","Chandigarh","Store-02","Tablet","2024-01-03",26000),
("T004","South","Bangalore","Store-03","Laptop","2024-01-01",78000),
("T005","South","Chennai","Store-04","Mobile","2024-01-02",30000),
("T006","South","Bangalore","Store-03","Tablet","2024-01-03",24000),
("T007","East","Kolkata","Store-05","Laptop","2024-01-01",72000),
("T008","East","Kolkata","Store-05","Mobile","2024-01-02",28000),
("T009","East","Patna","Store-06","Tablet","2024-01-03",23000),
("T010","West","Mumbai","Store-07","Laptop","2024-01-01",80000),
("T011","West","Mumbai","Store-07","Mobile","2024-01-02",35000),
("T012","West","Pune","Store-08","Tablet","2024-01-03",27000),
("T013","North","Delhi","Store-01","Laptop","2024-01-04",76000),
("T014","South","Chennai","Store-04","Laptop","2024-01-04",79000),
("T015","East","Patna","Store-06","Mobile","2024-01-04",29000),
("T016","West","Pune","Store-08","Laptop","2024-01-04",77000),
("T017","North","Chandigarh","Store-02","Mobile","2024-01-05",31000),
("T018","South","Bangalore","Store-03","Mobile","2024-01-05",34000),
("T019","East","Kolkata","Store-05","Tablet","2024-01-05",25000),
("T020","West","Mumbai","Store-07","Tablet","2024-01-05",29000),
("T021","North","Delhi","Store-01","Tablet","2024-01-06",28000),
("T022","South","Chennai","Store-04","Tablet","2024-01-06",26000),
("T023","East","Patna","Store-06","Laptop","2024-01-06",74000),
("T024","West","Pune","Store-08","Mobile","2024-01-06",33000)
]
columns = [
"txn_id","region","city","store_id",
"product","sale_date","amount"
]
df_sales = spark.createDataFrame(sales_data, columns)
df_sales.show(5)
df_sales.printSchema()

+------+------+----------+--------+-------+----------+------+
|txn_id|region|      city|store_id|product| sale_date|amount|
+------+------+----------+--------+-------+----------+------+
|  T001| North|     Delhi|Store-01| Laptop|2024-01-01| 75000|
|  T002| North|     Delhi|Store-01| Mobile|2024-01-02| 32000|
|  T003| North|Chandigarh|Store-02| Tablet|2024-01-03| 26000|
|  T004| South| Bangalore|Store-03| Laptop|2024-01-01| 78000|
|  T005| South|   Chennai|Store-04| Mobile|2024-01-02| 30000|
+------+------+----------+--------+-------+----------+------+
only showing top 5 rows
root
 |-- txn_id: string (nullable = true)
 |-- region: string (nullable = true)
 |-- city: string (nullable = true)
 |-- store_id: string (nullable = true)
 |-- product: string (nullable = true)
 |-- sale_date: string (nullable = true)
 |-- amount: long (nullable = true)



exercise 1
Objective
Practice column selection, renaming, and derived columns.

1. Select only txn_id , region , product , and amount
2. Rename amount to revenue
3.Create a derived column amount_in_thousands
4. Select distinct combinations of region and product
5. Select all columns but exclude store_id
6. Create a new column sale_year extracted from sale_date
7. Reorder columns in a business-friendly format

In [4]:
df_sales.select("txn_id","region","product","amount").show(5)

+------+------+-------+------+
|txn_id|region|product|amount|
+------+------+-------+------+
|  T001| North| Laptop| 75000|
|  T002| North| Mobile| 32000|
|  T003| North| Tablet| 26000|
|  T004| South| Laptop| 78000|
|  T005| South| Mobile| 30000|
+------+------+-------+------+
only showing top 5 rows


In [5]:
df1=df_sales.withColumnRenamed("amount","revenue")
df1.show(5)

+------+------+----------+--------+-------+----------+-------+
|txn_id|region|      city|store_id|product| sale_date|revenue|
+------+------+----------+--------+-------+----------+-------+
|  T001| North|     Delhi|Store-01| Laptop|2024-01-01|  75000|
|  T002| North|     Delhi|Store-01| Mobile|2024-01-02|  32000|
|  T003| North|Chandigarh|Store-02| Tablet|2024-01-03|  26000|
|  T004| South| Bangalore|Store-03| Laptop|2024-01-01|  78000|
|  T005| South|   Chennai|Store-04| Mobile|2024-01-02|  30000|
+------+------+----------+--------+-------+----------+-------+
only showing top 5 rows


In [6]:
df2=df_sales.withColumn("amount_in_thousands",df_sales.amount/1000)
df2.show()

+------+------+----------+--------+-------+----------+------+-------------------+
|txn_id|region|      city|store_id|product| sale_date|amount|amount_in_thousands|
+------+------+----------+--------+-------+----------+------+-------------------+
|  T001| North|     Delhi|Store-01| Laptop|2024-01-01| 75000|               75.0|
|  T002| North|     Delhi|Store-01| Mobile|2024-01-02| 32000|               32.0|
|  T003| North|Chandigarh|Store-02| Tablet|2024-01-03| 26000|               26.0|
|  T004| South| Bangalore|Store-03| Laptop|2024-01-01| 78000|               78.0|
|  T005| South|   Chennai|Store-04| Mobile|2024-01-02| 30000|               30.0|
|  T006| South| Bangalore|Store-03| Tablet|2024-01-03| 24000|               24.0|
|  T007|  East|   Kolkata|Store-05| Laptop|2024-01-01| 72000|               72.0|
|  T008|  East|   Kolkata|Store-05| Mobile|2024-01-02| 28000|               28.0|
|  T009|  East|     Patna|Store-06| Tablet|2024-01-03| 23000|               23.0|
|  T010|  West| 

In [7]:
df_sales.select("region","product").distinct().show()

+------+-------+
|region|product|
+------+-------+
| North| Laptop|
| North| Tablet|
|  East| Tablet|
|  East| Laptop|
| South| Tablet|
| North| Mobile|
|  West| Tablet|
|  East| Mobile|
| South| Mobile|
| South| Laptop|
|  West| Mobile|
|  West| Laptop|
+------+-------+



In [9]:
df_sales.drop("store_id").show()

+------+------+----------+-------+----------+------+
|txn_id|region|      city|product| sale_date|amount|
+------+------+----------+-------+----------+------+
|  T001| North|     Delhi| Laptop|2024-01-01| 75000|
|  T002| North|     Delhi| Mobile|2024-01-02| 32000|
|  T003| North|Chandigarh| Tablet|2024-01-03| 26000|
|  T004| South| Bangalore| Laptop|2024-01-01| 78000|
|  T005| South|   Chennai| Mobile|2024-01-02| 30000|
|  T006| South| Bangalore| Tablet|2024-01-03| 24000|
|  T007|  East|   Kolkata| Laptop|2024-01-01| 72000|
|  T008|  East|   Kolkata| Mobile|2024-01-02| 28000|
|  T009|  East|     Patna| Tablet|2024-01-03| 23000|
|  T010|  West|    Mumbai| Laptop|2024-01-01| 80000|
|  T011|  West|    Mumbai| Mobile|2024-01-02| 35000|
|  T012|  West|      Pune| Tablet|2024-01-03| 27000|
|  T013| North|     Delhi| Laptop|2024-01-04| 76000|
|  T014| South|   Chennai| Laptop|2024-01-04| 79000|
|  T015|  East|     Patna| Mobile|2024-01-04| 29000|
|  T016|  West|      Pune| Laptop|2024-01-04| 

In [11]:
from pyspark.sql.functions import year,col
df_sales.withColumn("sale_year", year((col("sale_date")))).show()

+------+------+----------+--------+-------+----------+------+---------+
|txn_id|region|      city|store_id|product| sale_date|amount|sale_year|
+------+------+----------+--------+-------+----------+------+---------+
|  T001| North|     Delhi|Store-01| Laptop|2024-01-01| 75000|     2024|
|  T002| North|     Delhi|Store-01| Mobile|2024-01-02| 32000|     2024|
|  T003| North|Chandigarh|Store-02| Tablet|2024-01-03| 26000|     2024|
|  T004| South| Bangalore|Store-03| Laptop|2024-01-01| 78000|     2024|
|  T005| South|   Chennai|Store-04| Mobile|2024-01-02| 30000|     2024|
|  T006| South| Bangalore|Store-03| Tablet|2024-01-03| 24000|     2024|
|  T007|  East|   Kolkata|Store-05| Laptop|2024-01-01| 72000|     2024|
|  T008|  East|   Kolkata|Store-05| Mobile|2024-01-02| 28000|     2024|
|  T009|  East|     Patna|Store-06| Tablet|2024-01-03| 23000|     2024|
|  T010|  West|    Mumbai|Store-07| Laptop|2024-01-01| 80000|     2024|
|  T011|  West|    Mumbai|Store-07| Mobile|2024-01-02| 35000|   

In [12]:
df_sales.select("store_id","product","sale_date","amount","region","city",).show()

+--------+-------+----------+------+------+----------+
|store_id|product| sale_date|amount|region|      city|
+--------+-------+----------+------+------+----------+
|Store-01| Laptop|2024-01-01| 75000| North|     Delhi|
|Store-01| Mobile|2024-01-02| 32000| North|     Delhi|
|Store-02| Tablet|2024-01-03| 26000| North|Chandigarh|
|Store-03| Laptop|2024-01-01| 78000| South| Bangalore|
|Store-04| Mobile|2024-01-02| 30000| South|   Chennai|
|Store-03| Tablet|2024-01-03| 24000| South| Bangalore|
|Store-05| Laptop|2024-01-01| 72000|  East|   Kolkata|
|Store-05| Mobile|2024-01-02| 28000|  East|   Kolkata|
|Store-06| Tablet|2024-01-03| 23000|  East|     Patna|
|Store-07| Laptop|2024-01-01| 80000|  West|    Mumbai|
|Store-07| Mobile|2024-01-02| 35000|  West|    Mumbai|
|Store-08| Tablet|2024-01-03| 27000|  West|      Pune|
|Store-01| Laptop|2024-01-04| 76000| North|     Delhi|
|Store-04| Laptop|2024-01-04| 79000| South|   Chennai|
|Store-06| Mobile|2024-01-04| 29000|  East|     Patna|
|Store-08|

Understand row-level filtering and predicate pushdown.
Exercises
1. Filter transactions where amount > 50000
2. Filter only Laptop sales
3. Filter sales from North and South regions
4. Filter sales between 25000 and 75000
5. Filter transactions from Delhi stores only
6. Apply multiple filters using both filter and where
7. Change the order of filters and compare explain(True)
8. Identify which filters Spark pushes down

In [15]:
df_sales.filter(df_sales.amount>50000).show()

+------+------+---------+--------+-------+----------+------+
|txn_id|region|     city|store_id|product| sale_date|amount|
+------+------+---------+--------+-------+----------+------+
|  T001| North|    Delhi|Store-01| Laptop|2024-01-01| 75000|
|  T004| South|Bangalore|Store-03| Laptop|2024-01-01| 78000|
|  T007|  East|  Kolkata|Store-05| Laptop|2024-01-01| 72000|
|  T010|  West|   Mumbai|Store-07| Laptop|2024-01-01| 80000|
|  T013| North|    Delhi|Store-01| Laptop|2024-01-04| 76000|
|  T014| South|  Chennai|Store-04| Laptop|2024-01-04| 79000|
|  T016|  West|     Pune|Store-08| Laptop|2024-01-04| 77000|
|  T023|  East|    Patna|Store-06| Laptop|2024-01-06| 74000|
+------+------+---------+--------+-------+----------+------+



In [16]:
df_sales.filter(df_sales.product=="Laptop").show()

+------+------+---------+--------+-------+----------+------+
|txn_id|region|     city|store_id|product| sale_date|amount|
+------+------+---------+--------+-------+----------+------+
|  T001| North|    Delhi|Store-01| Laptop|2024-01-01| 75000|
|  T004| South|Bangalore|Store-03| Laptop|2024-01-01| 78000|
|  T007|  East|  Kolkata|Store-05| Laptop|2024-01-01| 72000|
|  T010|  West|   Mumbai|Store-07| Laptop|2024-01-01| 80000|
|  T013| North|    Delhi|Store-01| Laptop|2024-01-04| 76000|
|  T014| South|  Chennai|Store-04| Laptop|2024-01-04| 79000|
|  T016|  West|     Pune|Store-08| Laptop|2024-01-04| 77000|
|  T023|  East|    Patna|Store-06| Laptop|2024-01-06| 74000|
+------+------+---------+--------+-------+----------+------+



In [20]:
df_sales.filter((df_sales.region=="North")|(df_sales.region=="South")).show()

+------+------+----------+--------+-------+----------+------+
|txn_id|region|      city|store_id|product| sale_date|amount|
+------+------+----------+--------+-------+----------+------+
|  T001| North|     Delhi|Store-01| Laptop|2024-01-01| 75000|
|  T002| North|     Delhi|Store-01| Mobile|2024-01-02| 32000|
|  T003| North|Chandigarh|Store-02| Tablet|2024-01-03| 26000|
|  T004| South| Bangalore|Store-03| Laptop|2024-01-01| 78000|
|  T005| South|   Chennai|Store-04| Mobile|2024-01-02| 30000|
|  T006| South| Bangalore|Store-03| Tablet|2024-01-03| 24000|
|  T013| North|     Delhi|Store-01| Laptop|2024-01-04| 76000|
|  T014| South|   Chennai|Store-04| Laptop|2024-01-04| 79000|
|  T017| North|Chandigarh|Store-02| Mobile|2024-01-05| 31000|
|  T018| South| Bangalore|Store-03| Mobile|2024-01-05| 34000|
|  T021| North|     Delhi|Store-01| Tablet|2024-01-06| 28000|
|  T022| South|   Chennai|Store-04| Tablet|2024-01-06| 26000|
+------+------+----------+--------+-------+----------+------+



In [21]:
df_sales.filter((df_sales.amount>25000)&(df_sales.amount<75000)).show()

+------+------+----------+--------+-------+----------+------+
|txn_id|region|      city|store_id|product| sale_date|amount|
+------+------+----------+--------+-------+----------+------+
|  T002| North|     Delhi|Store-01| Mobile|2024-01-02| 32000|
|  T003| North|Chandigarh|Store-02| Tablet|2024-01-03| 26000|
|  T005| South|   Chennai|Store-04| Mobile|2024-01-02| 30000|
|  T007|  East|   Kolkata|Store-05| Laptop|2024-01-01| 72000|
|  T008|  East|   Kolkata|Store-05| Mobile|2024-01-02| 28000|
|  T011|  West|    Mumbai|Store-07| Mobile|2024-01-02| 35000|
|  T012|  West|      Pune|Store-08| Tablet|2024-01-03| 27000|
|  T015|  East|     Patna|Store-06| Mobile|2024-01-04| 29000|
|  T017| North|Chandigarh|Store-02| Mobile|2024-01-05| 31000|
|  T018| South| Bangalore|Store-03| Mobile|2024-01-05| 34000|
|  T020|  West|    Mumbai|Store-07| Tablet|2024-01-05| 29000|
|  T021| North|     Delhi|Store-01| Tablet|2024-01-06| 28000|
|  T022| South|   Chennai|Store-04| Tablet|2024-01-06| 26000|
|  T023|

In [22]:
df_sales.filter(df_sales.city=="Delhi").show()

+------+------+-----+--------+-------+----------+------+
|txn_id|region| city|store_id|product| sale_date|amount|
+------+------+-----+--------+-------+----------+------+
|  T001| North|Delhi|Store-01| Laptop|2024-01-01| 75000|
|  T002| North|Delhi|Store-01| Mobile|2024-01-02| 32000|
|  T013| North|Delhi|Store-01| Laptop|2024-01-04| 76000|
|  T021| North|Delhi|Store-01| Tablet|2024-01-06| 28000|
+------+------+-----+--------+-------+----------+------+



In [23]:
df_sales.filter((df_sales.region=="North" )).show()

+------+------+----------+--------+-------+----------+------+
|txn_id|region|      city|store_id|product| sale_date|amount|
+------+------+----------+--------+-------+----------+------+
|  T001| North|     Delhi|Store-01| Laptop|2024-01-01| 75000|
|  T002| North|     Delhi|Store-01| Mobile|2024-01-02| 32000|
|  T003| North|Chandigarh|Store-02| Tablet|2024-01-03| 26000|
|  T013| North|     Delhi|Store-01| Laptop|2024-01-04| 76000|
|  T017| North|Chandigarh|Store-02| Mobile|2024-01-05| 31000|
|  T021| North|     Delhi|Store-01| Tablet|2024-01-06| 28000|
+------+------+----------+--------+-------+----------+------+



Exercises 3
1. Total sales amount per region
2. Average sales amount per product
3. Maximum sale per city

4. Minimum sale per store
5. Count of transactions per region
6. Total revenue per store
7. Region-wise product sales count
8. Average transaction value per city
9. Identify regions with total sales above a threshold
10. Use explain(True) and identify shuffle stages

In [24]:
df_sales.show()

+------+------+----------+--------+-------+----------+------+
|txn_id|region|      city|store_id|product| sale_date|amount|
+------+------+----------+--------+-------+----------+------+
|  T001| North|     Delhi|Store-01| Laptop|2024-01-01| 75000|
|  T002| North|     Delhi|Store-01| Mobile|2024-01-02| 32000|
|  T003| North|Chandigarh|Store-02| Tablet|2024-01-03| 26000|
|  T004| South| Bangalore|Store-03| Laptop|2024-01-01| 78000|
|  T005| South|   Chennai|Store-04| Mobile|2024-01-02| 30000|
|  T006| South| Bangalore|Store-03| Tablet|2024-01-03| 24000|
|  T007|  East|   Kolkata|Store-05| Laptop|2024-01-01| 72000|
|  T008|  East|   Kolkata|Store-05| Mobile|2024-01-02| 28000|
|  T009|  East|     Patna|Store-06| Tablet|2024-01-03| 23000|
|  T010|  West|    Mumbai|Store-07| Laptop|2024-01-01| 80000|
|  T011|  West|    Mumbai|Store-07| Mobile|2024-01-02| 35000|
|  T012|  West|      Pune|Store-08| Tablet|2024-01-03| 27000|
|  T013| North|     Delhi|Store-01| Laptop|2024-01-04| 76000|
|  T014|

In [25]:
from pyspark.sql.functions import sum
df_sales.groupby("region").agg(sum("amount")).show()

+------+-----------+
|region|sum(amount)|
+------+-----------+
| South|     271000|
|  East|     251000|
|  West|     281000|
| North|     268000|
+------+-----------+



In [27]:

from pyspark.sql.functions import avg
df_sales.groupBy("product").agg(avg("amount").alias("avg_sales")).show()


+-------+---------+
|product|avg_sales|
+-------+---------+
| Laptop|  76375.0|
| Mobile|  31500.0|
| Tablet|  26000.0|
+-------+---------+



In [28]:

from pyspark.sql.functions import max
df_sales.groupBy("city").agg(max("amount").alias("max_sale")).show()


+----------+--------+
|      city|max_sale|
+----------+--------+
| Bangalore|   78000|
|     Patna|   74000|
|   Chennai|   79000|
|    Mumbai|   80000|
|   Kolkata|   72000|
|      Pune|   77000|
|     Delhi|   76000|
|Chandigarh|   31000|
+----------+--------+



In [30]:
from pyspark.sql.functions import min
df_sales.groupBy("store_id").agg(min("amount").alias("min_sale")).show()


+--------+--------+
|store_id|min_sale|
+--------+--------+
|Store-05|   25000|
|Store-06|   23000|
|Store-03|   24000|
|Store-01|   28000|
|Store-04|   26000|
|Store-07|   29000|
|Store-08|   27000|
|Store-02|   26000|
+--------+--------+



In [31]:
from pyspark.sql.functions import count
df_sales.groupBy("region").agg(count("txn_id")).show()

+------+-------------+
|region|count(txn_id)|
+------+-------------+
| South|            6|
|  East|            6|
|  West|            6|
| North|            6|
+------+-------------+



In [32]:
from pyspark.sql.functions import sum
df_sales.groupBy("store_id").agg(sum("amount")).show()

+--------+-----------+
|store_id|sum(amount)|
+--------+-----------+
|Store-05|     125000|
|Store-06|     126000|
|Store-03|     136000|
|Store-01|     211000|
|Store-04|     135000|
|Store-07|     144000|
|Store-08|     137000|
|Store-02|      57000|
+--------+-----------+



In [33]:
from pyspark.sql.functions import count
df_sales.groupBy("region","product").agg(count("txn_id")).show()

+------+-------+-------------+
|region|product|count(txn_id)|
+------+-------+-------------+
| North| Laptop|            2|
| North| Tablet|            2|
|  East| Tablet|            2|
|  East| Laptop|            2|
| South| Tablet|            2|
| North| Mobile|            2|
|  West| Tablet|            2|
|  East| Mobile|            2|
| South| Mobile|            2|
| South| Laptop|            2|
|  West| Mobile|            2|
|  West| Laptop|            2|
+------+-------+-------------+



In [34]:
from  pyspark.sql.functions import avg
df_sales.groupBy("city").agg(avg("amount")).show()

+----------+------------------+
|      city|       avg(amount)|
+----------+------------------+
| Bangalore|45333.333333333336|
|     Patna|           42000.0|
|   Chennai|           45000.0|
|    Mumbai|           48000.0|
|   Kolkata|41666.666666666664|
|      Pune|45666.666666666664|
|     Delhi|           52750.0|
|Chandigarh|           28500.0|
+----------+------------------+



In [35]:
from pyspark.sql.functions import sum
df_sales.filter(df_sales.amount>50000).groupBy("region").agg(sum("amount")).show()

+------+-----------+
|region|sum(amount)|
+------+-----------+
| South|     157000|
|  East|     146000|
|  West|     157000|
| North|     151000|
+------+-----------+



In [36]:
from pyspark.sql.functions import avg
df_sales.groupBy("region").agg(avg("amount")).show()

+------+------------------+
|region|       avg(amount)|
+------+------------------+
| South|45166.666666666664|
|  East|41833.333333333336|
|  West|46833.333333333336|
| North|44666.666666666664|
+------+------------------+



EXERCISE SET 4 â€” MULTI-DIMENSIONAL
AGGREGATION

Objective
Work with multiple grouping keys.
Exercises
1. Region + Product wise total sales
2. City + Store wise average sales
3. Region + City wise transaction count
4. Product + Store wise max sale
5. Identify top-selling product per region using aggregation only