In [2]:
'''
You are given a large dataset containing daily sales information from multiple Walmart stores. Each record in the dataset includes the following details:

Store ID
Product ID
Sale Date
Quantity Sold
Total Sales Amount
Your task is to identify, for each sale date, the top 5 products with the highest total sales summed across all stores. The final result should be ordered by sale date (ascending) and then by total sales amount (descending).

Input Schema & Example
Column Name	Data Type
store_id	Integer
product_id	Integer
sale_date	String
quantity_sold	Integer
total_sales	Double
Example Input Table
store_id	product_id	sale_date	quantity_sold	total_sales
1	101	2025-05-10	2	25.00
2	101	2025-05-10	1	15.00
1	102	2025-05-10	5	50.00
3	103	2025-05-10	3	30.00
2	104	2025-05-10	4	45.00
1	105	2025-05-10	2	60.00
1	105	2025-05-10	1	15.00
1	106	2025-05-10	2	10.00
Output Schema
Column Name	Data Type
sale_date	String
product_id	Integer
total_sales	Double
Example Output Table
sale_date	product_id	total_sales
2025-05-10	105	75.00
2025-05-10	102	50.00
2025-05-10	104	45.00
2025-05-10	101	40.00
2025-05-10	103	30.00
💡 Explanation
On 2025-05-10, product 105 had the highest combined sales (60.00 + 15.00 = 75.00).
Product 101 had two entries (25.00 + 15.00 = 40.00).
The output only shows top 5 products per day, ordered by sale date and total sales.
Starter Code
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

data = [
    # 2025-05-10
    (1, 101, "2025-05-10", 2, 25.00),
    (2, 101, "2025-05-10", 1, 15.00),
    (1, 102, "2025-05-10", 5, 50.00),
    (3, 103, "2025-05-10", 3, 30.00),
    (2, 104, "2025-05-10", 4, 45.00),
    (1, 105, "2025-05-10", 2, 60.00),
    (1, 105, "2025-05-10", 1, 15.00),
    (1, 106, "2025-05-10", 2, 10.00),

    # 2025-05-11
    (1, 201, "2025-05-11", 1, 20.00),
    (2, 201, "2025-05-11", 2, 40.00),
    (2, 202, "2025-05-11", 2, 40.00),
    (3, 203, "2025-05-11", 3, 35.00),
    (1, 204, "2025-05-11", 1, 25.00),
    (2, 205, "2025-05-11", 2, 30.00),
    (1, 206, "2025-05-11", 4, 50.00),
]

columns = ["store_id", "product_id", "sale_date", "quantity_sold", "total_sales"]

df = spark.createDataFrame(data, columns)

# Your logic goes here to create df_result

display(df_result)
'''

# Initialize Spark session
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Get or create the Spark session used by this exercise.
spark = (
    SparkSession.builder
    .appName("Spark Playground")
    .getOrCreate()
)

# Column names, in the same order as the fields of every row tuple in `data`.
columns = ["store_id", "product_id", "sale_date", "quantity_sold", "total_sales"]

# Sample sales rows. Each day's rows are written once as
# (store_id, product_id, quantity_sold, total_sales) and the shared sale date
# is inserted into the third position when the flat row list is built.
data = [
    (store_id, product_id, sale_date, quantity_sold, total_sales)
    for sale_date, day_rows in (
        (
            "2025-05-10",
            [
                (1, 101, 2, 25.00),
                (2, 101, 1, 15.00),
                (1, 102, 5, 50.00),
                (3, 103, 3, 30.00),
                (2, 104, 4, 45.00),
                (1, 105, 2, 60.00),
                (1, 105, 1, 15.00),
                (1, 106, 2, 10.00),
            ],
        ),
        (
            "2025-05-11",
            [
                (1, 201, 1, 20.00),
                (2, 201, 2, 40.00),
                (2, 202, 2, 40.00),
                (3, 203, 3, 35.00),
                (1, 204, 1, 25.00),
                (2, 205, 2, 30.00),
                (1, 206, 4, 50.00),
            ],
        ),
    )
    for store_id, product_id, quantity_sold, total_sales in day_rows
]

# Build the input DataFrame; column types are inferred from the tuples.
df = spark.createDataFrame(data, schema=columns)

# Number of best-selling products to keep for each sale date.
TOP_N = 5

# Sum total_sales over all stores for every (sale_date, product_id) pair.
df_aggregate = (
    df.groupBy("sale_date", "product_id")
    .agg(F.sum("total_sales").alias("total_sales"))
)

# Rank products within each sale date by their summed sales, highest first.
# product_id is a secondary sort key so that products with equal totals get a
# reproducible rank: row_number() over a non-unique ordering is otherwise free
# to number tied rows differently between runs, which could change which
# products survive the TOP_N cut.
window_spec = Window.partitionBy("sale_date").orderBy(
    F.col("total_sales").desc(),
    F.col("product_id").asc(),
)

# Keep the TOP_N rows per date, drop the helper rank column, and sort the
# output by date, then by sales (descending), using the same product_id
# tie-breaker so the displayed row order is stable as well.
df_result = (
    df_aggregate.withColumn("row_number", F.row_number().over(window_spec))
    .filter(F.col("row_number") <= TOP_N)
    .drop("row_number")
    .orderBy(
        F.col("sale_date").asc(),
        F.col("total_sales").desc(),
        F.col("product_id").asc(),
    )
)

# Display result
df_result.show()

+----------+----------+-----------+
| sale_date|product_id|total_sales|
+----------+----------+-----------+
|2025-05-10|       105|       75.0|
|2025-05-10|       102|       50.0|
|2025-05-10|       104|       45.0|
|2025-05-10|       101|       40.0|
|2025-05-10|       103|       30.0|
|2025-05-11|       201|       60.0|
|2025-05-11|       206|       50.0|
|2025-05-11|       202|       40.0|
|2025-05-11|       203|       35.0|
|2025-05-11|       205|       30.0|
+----------+----------+-----------+

