In [2]:
'''
You have a big dataset with daily sales information from different Walmart stores. Each row in the dataset shows details like store ID, product ID, sale date, quantity sold, and total sales amount. You need to write a PySpark program that calculates the following:

Total sales for each store on a daily basis.

Input Schema & Example
Column Name	Data Type
store_id	Integer
product_id	Integer
sale_date	String
quantity_sold	Integer
total_sales	Double
Example Input Table
store_id	product_id	sale_date	quantity_sold	total_sales
1	101	2025-05-10	2	25.00
1	102	2025-05-10	1	15.00
1	103	2025-05-11	3	30.00
2	101	2025-05-10	2	40.00
Output Schema, Example & Explanation
Column Name	Data Type
store_id	Integer
sale_date	String
daily_total_sales	Double
Example Output Table
store_id	sale_date	daily_total_sales
1	2025-05-10	40.00
1	2025-05-11	30.00
2	2025-05-10	40.00
Explanation
On 2025-05-10, store 1 made $25.00 + $15.00 = $40.00.
On 2025-05-11, store 1 made $30.00.
Store 2 on 2025-05-10 made $40.00.
Starter Code
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum

spark = SparkSession.builder.getOrCreate()

data = [
    (1, 101, "2025-05-10", 2, 25.00),
    (1, 102, "2025-05-10", 1, 15.00),
    (1, 103, "2025-05-11", 3, 30.00),
    (2, 101, "2025-05-10", 2, 40.00)
]

columns = ["store_id", "product_id", "sale_date", "quantity_sold", "total_sales"]

df = spark.createDataFrame(data, columns)

Use display(df) to show the final DataFrame.
'''

# Initialize Spark session
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName('Spark Playground').getOrCreate()

# Sample input copied from the starter code in the problem statement:
# one row per (store, product, day) sale.
data = [
    (1, 101, "2025-05-10", 2, 25.00),
    (1, 102, "2025-05-10", 1, 15.00),
    (1, 103, "2025-05-11", 3, 30.00),
    (2, 101, "2025-05-10", 2, 40.00)
]

columns = ["store_id", "product_id", "sale_date", "quantity_sold", "total_sales"]

df = spark.createDataFrame(data, columns)

# Total sales per store per day.
# groupBy/agg gives no guarantee about row order, so sort explicitly to keep
# the displayed result stable across runs and in the same order as the
# expected output table in the problem statement. sale_date is an ISO
# "yyyy-MM-dd" string, so lexicographic order is also chronological order.
df_result = (
    df.groupBy("store_id", "sale_date")
    .agg(F.sum("total_sales").alias("daily_total_sales"))
    .orderBy("store_id", "sale_date")
)

# Display the final DataFrame
df_result.show()

[Stage 0:>                                                          (0 + 8) / 8]

+--------+----------+-----------------+
|store_id| sale_date|daily_total_sales|
+--------+----------+-----------------+
|       1|2025-05-10|             40.0|
|       1|2025-05-11|             30.0|
|       2|2025-05-10|             40.0|
+--------+----------+-----------------+



                                                                                

In [3]:
'''
Bonus Challenge: Can you solve this using Spark SQL and temporary views?
'''
# Expose the sales DataFrame to the SQL engine under the view name "sales".
df.createOrReplaceTempView("sales")

# Same aggregation as the DataFrame-API solution, written as Spark SQL:
# sum total_sales per (store_id, sale_date), sorted by store and then date.
daily_sales_sql = """
    SELECT
        store_id,
        sale_date,
        SUM(total_sales) AS daily_total_sales
    FROM sales
    GROUP BY store_id, sale_date
    ORDER BY store_id, sale_date
"""
df_result_2 = spark.sql(daily_sales_sql)

# Print the aggregated rows.
df_result_2.show()

+--------+----------+-----------------+
|store_id| sale_date|daily_total_sales|
+--------+----------+-----------------+
|       1|2025-05-10|             40.0|
|       1|2025-05-11|             30.0|
|       2|2025-05-10|             40.0|
+--------+----------+-----------------+

