In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType, DateType
from pyspark.sql import Window as W
import pyspark.sql.functions as F

# Build the "RetailSales" SparkSession (getOrCreate reuses an active session if there is one).
spark = (
    SparkSession.builder
    .appName("RetailSales")
    .getOrCreate()
)
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/01 16:05:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Explicit schema for the retail sales CSV: each (column name, Spark type) pair
# below becomes a nullable StructField, in the order listed.
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("transactions_id", IntegerType()),
        ("sale_date", DateType()),
        ("sale_time", StringType()),
        ("customer_id", IntegerType()),
        ("gender", StringType()),
        ("age", IntegerType()),
        ("category", StringType()),
        ("quantity", IntegerType()),
        ("price_per_unit", FloatType()),
        ("cogs", FloatType()),  # Cost of Goods Sold
        ("total_sale", FloatType()),
    )
])

# Load the CSV (first row is the header) using the schema above rather than inferring types.
df = spark.read.csv(
    path="/data/Retail-Sales-Analysis.csv",
    schema=schema,
    header=True,
)

# Register the DataFrame as the `retail_sales` view for the SQL cells, then
# preview the first 20 rows without truncating column values.
df.createOrReplaceTempView("retail_sales")
spark.sql("SELECT * FROM retail_sales").show(n=20, truncate=False)

                                                                                

+---------------+----------+---------+-----------+------+---+-----------+--------+--------------+-----+----------+
|transactions_id|sale_date |sale_time|customer_id|gender|age|category   |quantity|price_per_unit|cogs |total_sale|
+---------------+----------+---------+-----------+------+---+-----------+--------+--------------+-----+----------+
|180            |2022-11-05|10:47:00 |117        |Male  |41 |Clothing   |3       |300.0         |129.0|900.0     |
|522            |2022-07-09|11:00:00 |52         |Male  |46 |Beauty     |3       |500.0         |145.0|1500.0    |
|559            |2022-12-12|10:48:00 |5          |Female|40 |Clothing   |4       |300.0         |84.0 |1200.0    |
|1180           |2022-01-06|08:53:00 |85         |Male  |41 |Clothing   |3       |300.0         |129.0|900.0     |
|1522           |2022-11-14|08:35:00 |48         |Male  |46 |Beauty     |3       |500.0         |235.0|1500.0    |
|1559           |2022-08-20|07:40:00 |49         |Female|40 |Clothing   |4      

In [3]:
# Total sales (sum of total_sale) for each category.
spark.sql(
    """
select category,sum(total_sale) from retail_sales group by 1
"""
).show(n=20, truncate=False)

[Stage 1:>                                                          (0 + 1) / 1]

+-----------+---------------+
|category   |sum(total_sale)|
+-----------+---------------+
|Beauty     |286840.0       |
|Clothing   |311070.0       |
|Electronics|313810.0       |
+-----------+---------------+



                                                                                

In [10]:
# Average age (rounded to 2 decimals) of customers who bought from the 'Beauty' category.
spark.sql(
    """
select round(avg(age), 2) as avg_age from retail_sales where category = 'Beauty'
"""
).show(n=20, truncate=False)

+-------+
|avg_age|
+-------+
|40.41  |
+-------+



In [15]:
# Number of transactions (count of transactions_id) for every gender / category pair.
spark.sql(
    """
select gender,category, count(transactions_id) as no_of_tran from retail_sales group by 1,2
"""
).show(n=20, truncate=False)

+------+-----------+----------+
|gender|category   |no_of_tran|
+------+-----------+----------+
|Female|Clothing   |348       |
|Female|Beauty     |332       |
|Male  |Electronics|344       |
|Male  |Beauty     |282       |
|Female|Electronics|340       |
|Male  |Clothing   |354       |
+------+-----------+----------+



In [23]:
# Write a SQL query to find the top 5 customers based on the highest total sales.
# Two equivalent formulations are shown. Both break ties on customer_id so the
# result is deterministic and the two queries always return the same rows.

# Approach 1: aggregate per customer, sort, and keep the first five rows.
spark.sql("""
select customer_id, sum(total_sale) as highest_total
from retail_sales
group by customer_id
order by highest_total desc, customer_id
limit 5
""").show(20,False)

# Approach 2: number the per-customer totals with row_number() and keep ranks 1-5.
# The window has no PARTITION BY, so Spark logs a "No Partition Defined for
# Window operation" warning and evaluates the window on a single partition.
spark.sql("""
select customer_id, highest_total from (
    select *, row_number() over (order by highest_total desc, customer_id) as rnk from (
        select customer_id, sum(total_sale) as highest_total
        from retail_sales
        group by customer_id
    ) ttl
) ranked
where rnk <= 5
order by rnk
""").show(20,False)

+-----------+-------------+
|customer_id|highest_total|
+-----------+-------------+
|3          |38440.0      |
|1          |30750.0      |
|5          |30405.0      |
|2          |25295.0      |
|4          |23580.0      |
+-----------+-------------+



24/09/01 16:41:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/01 16:41:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/01 16:41:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/01 16:41:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/01 16:41:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/09/01 16:41:19 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-----------+-------------+
|customer_id|highest_total|
+-----------+-------------+
|3          |38440.0      |
|1          |30750.0      |
|5          |30405.0      |
|2          |25295.0      |
|4          |23580.0      |
+-----------+-------------+



In [24]:
# Count of distinct customers who purchased at least one item in each category.
spark.sql(
    """
select category, count(distinct customer_id) as no_of_unique_cust from retail_sales group by 1       
"""
).show(n=20, truncate=False)


+-----------+-----------------+
|category   |no_of_unique_cust|
+-----------+-----------------+
|Beauty     |141              |
|Clothing   |149              |
|Electronics|144              |
+-----------+-----------------+



In [38]:
# Write a SQL query to create each shift and number of orders
# (Example Morning <12, Afternoon Between 12 & 17, Evening >17)
#
# sale_time is loaded as a string whose first two characters are the hour
# (e.g. '10:47:00'). The hour is extracted with a 1-based substr and cast to int
# explicitly, so the comparisons below are integer comparisons rather than
# relying on Spark's implicit string-to-number coercion.
# The CASE expression is defined once and reused by both queries so the two
# cannot drift apart. A NULL or unparsable sale_time yields a NULL shift.
shift_case_sql = """
case
    when cast(substr(sale_time, 1, 2) as int) < 12 then 'Morning'
    when cast(substr(sale_time, 1, 2) as int) between 12 and 17 then 'Afternoon'
    when cast(substr(sale_time, 1, 2) as int) > 17 then 'Evening'
end
"""

# Preview: every sale with its derived shift.
spark.sql(f"""
select *, {shift_case_sql} as shift
from retail_sales
""").show(10,False)

# Number of orders per shift.
spark.sql(f"""
select {shift_case_sql} as shift,
       count(transactions_id) as no_of_orders
from retail_sales
group by shift
""").show(20,False)

+---------------+----------+---------+-----------+------+---+-----------+--------+--------------+-----+----------+-------+
|transactions_id|sale_date |sale_time|customer_id|gender|age|category   |quantity|price_per_unit|cogs |total_sale|shift  |
+---------------+----------+---------+-----------+------+---+-----------+--------+--------------+-----+----------+-------+
|180            |2022-11-05|10:47:00 |117        |Male  |41 |Clothing   |3       |300.0         |129.0|900.0     |Morning|
|522            |2022-07-09|11:00:00 |52         |Male  |46 |Beauty     |3       |500.0         |145.0|1500.0    |Morning|
|559            |2022-12-12|10:48:00 |5          |Female|40 |Clothing   |4       |300.0         |84.0 |1200.0    |Morning|
|1180           |2022-01-06|08:53:00 |85         |Male  |41 |Clothing   |3       |300.0         |129.0|900.0     |Morning|
|1522           |2022-11-14|08:35:00 |48         |Male  |46 |Beauty     |3       |500.0         |235.0|1500.0    |Morning|
|1559           

In [66]:
# Write a SQL query to calculate the average sale for each month. Find out best selling month in each year
#
# Steps (innermost query first):
#   1. sls: average total_sale per (year, month). The year and month are derived
#      from the DATE column with year() / date_format() instead of substr(), which
#      depended on an implicit date-to-string cast and a 0 start position.
#   2. x:   rank the months within each year by the *unrounded* average, so that
#      rounding cannot create artificial ties; sale_month breaks any real tie
#      deterministically.
#   3. Keep the top-ranked month per year and round only for display.
spark.sql("""
select sale_year, sale_month, round(avg_sale, 2) as avg_sale from (
    select *, row_number() over (partition by sale_year order by avg_sale desc, sale_month) as rnk from (
        select year(sale_date) as sale_year,
               date_format(sale_date, 'yyyy-MM') as sale_month,
               avg(total_sale) as avg_sale
        from retail_sales
        group by 1, 2
    ) sls
) x
where rnk = 1
order by sale_year
""").show(20,False)

+---------+----------+--------+
|sale_year|sale_month|avg_sale|
+---------+----------+--------+
|2022     |2022-07   |541.34  |
|2023     |2023-02   |535.53  |
+---------+----------+--------+



In [4]:
# Stop the SparkSession held in `spark` now that the analysis is finished.
spark.stop()