In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType, DateType
from pyspark.sql import Window as W
import pyspark.sql.functions as F

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("WalmartSales") \
    .getOrCreate()
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/02 06:34:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Define the schema
schema = StructType([
    StructField("invoice_id", StringType(), True),
    StructField("branch", StringType(), True),
    StructField("city", StringType(), True),
    StructField("customer_type", StringType(), True),
    StructField("gender", StringType(), True),
    StructField("product_line", StringType(), True),
    StructField("unit_price", FloatType(), True),
    StructField("quantity", IntegerType(), True),
    StructField("vat", FloatType(), True),
    StructField("total", FloatType(), True),
    StructField("date", DateType(), True),
    StructField("time", StringType(), True),
    StructField("payment_method", StringType(), True),
    StructField("rating", FloatType(), True)
])

# Read the CSV file into a DataFrame
df = spark.read.csv("/data/Walmart-Sales-dataset.csv", header=True, schema=schema)

df.createOrReplaceTempView("walmart_sales")
spark.sql("SELECT * FROM walmart_sales LIMIT 10").show(20,False)

                                                                                

+-----------+------+------------+-------------+------+----------------------+----------+--------+-------+--------+----------+--------+--------------+------+
|invoice_id |branch|city        |customer_type|gender|product_line          |unit_price|quantity|vat    |total   |date      |time    |payment_method|rating|
+-----------+------+------------+-------------+------+----------------------+----------+--------+-------+--------+----------+--------+--------------+------+
|750-67-8428|D     |San Jose    |Member       |Female|Health and beauty     |74.69     |7       |26.1415|548.9715|2019-01-05|13:08:00|Ewallet       |9.1   |
|226-31-3081|A     |Philadelphia|Normal       |Female|Electronic accessories|15.28     |5       |3.82   |80.22   |2019-03-08|10:29:00|Cash          |9.6   |
|631-41-3108|A     |Phoenix     |Normal       |Male  |Home and lifestyle    |46.33     |7       |16.2155|340.5255|2019-03-03|13:23:00|Credit card   |7.4   |
|123-19-1176|B     |New York    |Member       |Male  |Heal

In [7]:
# -- ---------------------------------------------
# -- Business Problems :: Basic Level
# -- ---------------------------------------------
# Q.1 Find the total sales amount for each branch

spark.sql("""
select branch, round(sum(total), 2) as total_sales_amount from walmart_sales group by 1
""").show(20,False)

spark.sql("""
select substr(date, 0, 7) as month, avg(total) as avg_sales_amount from walmart_sales group by 1
""").show(20,False)

+------+------------------+
|branch|total_sales_amount|
+------+------------------+
|E     |60061.02          |
|A     |39383.12          |
|D     |85349.6           |
|B     |68516.88          |
|C     |69656.14          |
+------+------------------+

+-------+------------------+
|month  |avg_sales_amount  |
+-------+------------------+
|2019-02|320.8560195645877 |
|2019-03|317.26233981588615|
|2019-01|330.3746251165867 |
+-------+------------------+



In [9]:

# Q.2 Calculate the average customer rating for each city.

spark.sql("""
select city, round(avg(rating), 2) as avg_rating from walmart_sales group by 1
""").show(20,False)

+------------+----------+
|city        |avg_rating|
+------------+----------+
|Chicago     |7.0       |
|San Diego   |7.0       |
|Phoenix     |7.15      |
|San Antonio |6.73      |
|San Jose    |7.22      |
|Philadelphia|6.89      |
|New York    |6.75      |
|Dallas      |6.97      |
|Houston     |7.07      |
|Los Angeles |6.99      |
+------------+----------+



In [10]:

# Q.3 Count the number of sales transactions for each customer type.

spark.sql("""
select customer_type, count(*) from walmart_sales group by 1 
""").show(20,False)

+-------------+--------+
|customer_type|count(1)|
+-------------+--------+
|Member       |501     |
|Normal       |499     |
+-------------+--------+



In [11]:

# Q.4 Find the total quantity of products sold for each product line.

spark.sql("""
select product_line, sum(quantity) from walmart_sales group by 1 
""").show(20,False)

+----------------------+-------------+
|product_line          |sum(quantity)|
+----------------------+-------------+
|Food and beverages    |952          |
|Fashion accessories   |902          |
|Electronic accessories|971          |
|Home and lifestyle    |911          |
|Sports and travel     |920          |
|Health and beauty     |854          |
+----------------------+-------------+



In [13]:
# Q.4 v1 Calculate the total VAT collected for each payment method.
spark.sql("""
select payment_method, sum(vat) from walmart_sales group by 1 
""").show(20,False)

+--------------+-----------------+
|payment_method|sum(vat)         |
+--------------+-----------------+
|Credit card   |4798.432011127472|
|Cash          |5343.169990181923|
|Ewallet       |5237.766995728016|
+--------------+-----------------+



In [15]:
# -- ---------------------------------------------
# -- Business Problems :: Medium Level
# -- ---------------------------------------------
# Q.5 Find the total sales amount and average customer rating for each branch.
spark.sql("""
select branch, round(sum(total), 2) as total_sales_amount, round(avg(rating), 2) as avg_ratting from walmart_sales group by 1
""").show(20,False)

+------+------------------+-----------+
|branch|total_sales_amount|avg_ratting|
+------+------------------+-----------+
|E     |60061.02          |7.02       |
|A     |39383.12          |6.92       |
|D     |85349.6           |7.0        |
|B     |68516.88          |7.06       |
|C     |69656.14          |6.85       |
+------+------------------+-----------+



In [19]:
# Q.6 Calculate the total sales amount for each city and gender combination.
spark.sql("""
select city, 
round(sum(if(gender = 'Male', total, 0)), 2) as male_total_sales_amt,
round(sum(if(gender = 'Female', total, 0)), 2) as female_total_sales_amt
from walmart_sales group by 1
""").show(20,False)

+------------+--------------------+----------------------+
|city        |male_total_sales_amt|female_total_sales_amt|
+------------+--------------------+----------------------+
|Chicago     |18266.64            |16599.01              |
|San Diego   |18379.32            |18028.95              |
|Phoenix     |13027.78            |17397.02              |
|San Antonio |13262.03            |17003.61              |
|San Jose    |17475.15            |17258.71              |
|Philadelphia|15638.99            |15511.25              |
|New York    |14368.08            |13552.59              |
|Dallas      |15705.43            |15103.46              |
|Houston     |14632.76            |10744.89              |
|Los Angeles |14327.65            |26683.43              |
+------------+--------------------+----------------------+



In [20]:
# Q.7 Find the average quantity of products sold for each product line to female customers.
spark.sql("""
select product_line, avg(quantity) from walmart_sales where gender = 'Female' group by 1 
""").show(20,False)

+----------------------+-----------------+
|product_line          |avg(quantity)    |
+----------------------+-----------------+
|Food and beverages    |5.711111111111111|
|Fashion accessories   |5.520833333333333|
|Electronic accessories|5.809523809523809|
|Home and lifestyle    |6.30379746835443 |
|Sports and travel     |5.636363636363637|
|Health and beauty     |5.359375         |
+----------------------+-----------------+



In [24]:

# Q.8 Count the number of sales transactions for members in each branch.
spark.sql("""
select branch, sum(if(customer_type = 'Member', 1, 0)) as no_of_sales_tran_for_members  from walmart_sales group by 1
""").show(20,False)

+------+----------------------------+
|branch|no_of_sales_tran_for_members|
+------+----------------------------+
|E     |107                         |
|A     |59                          |
|D     |134                         |
|B     |90                          |
|C     |111                         |
+------+----------------------------+



In [47]:
# Q.9 Find the total sales amount for each day. (Return day name and their total sales order DESC by amt)
spark.sql("""
select date, round(sum(total), 2) as total_sales_amount from walmart_sales 
group by 1
order by 2 desc
""").show(20,False)

+----------+------------------+
|date      |total_sales_amount|
+----------+------------------+
|2019-03-09|7474.05           |
|2019-02-07|7228.21           |
|2019-03-14|7214.63           |
|2019-02-15|6830.79           |
|2019-03-02|6560.31           |
|2019-03-05|6230.88           |
|2019-01-23|5994.19           |
|2019-01-15|5944.26           |
|2019-02-27|5859.45           |
|2019-03-19|5740.39           |
|2019-02-03|5467.93           |
|2019-03-20|5458.2            |
|2019-01-24|5402.05           |
|2019-02-17|5299.57           |
|2019-01-08|5293.73           |
|2019-01-31|5232.5            |
|2019-01-12|5184.76           |
|2019-02-08|5084.66           |
|2019-01-28|4999.71           |
|2019-01-19|4914.72           |
+----------+------------------+
only showing top 20 rows



In [32]:
# -- ---------------------------------------------
# -- Business Problems :: Advanced Level
# -- ---------------------------------------------
# Q.10 Calculate the total sales amount for each hour of the day
spark.sql("""
select hour(time), round(sum(total), 2) as total_sales_amount from walmart_sales 
group by 1
order by 2 desc
""").show(24,False)

+----------+------------------+
|hour(time)|total_sales_amount|
+----------+------------------+
|19        |39699.51          |
|13        |34723.23          |
|10        |31421.48          |
|15        |31179.51          |
|14        |30828.4           |
|11        |30377.33          |
|12        |26065.88          |
|18        |26030.34          |
|16        |25226.32          |
|17        |24445.22          |
|20        |22969.53          |
+----------+------------------+



In [37]:
# Q.11 Find the total sales amount for each month. (return month name and their sales)
spark.sql("""
select substr(date, 0, 7), round(sum(total), 2) as total_sales_amount from walmart_sales 
group by 1
order by 2 desc
""").show(20,False)

+------------------+------------------+
|substr(date, 0, 7)|total_sales_amount|
+------------------+------------------+
|2019-01           |116291.87         |
|2019-03           |109455.51         |
|2019-02           |97219.37          |
+------------------+------------------+



In [42]:
# Q.12 Calculate the total sales amount for each branch where the average customer rating is greater than 8.
spark.sql("""
select branch, round(sum(total), 2) as total_sales_amount from walmart_sales where rating > 8 group by 1
""").show(20,False)

+------+------------------+
|branch|total_sales_amount|
+------+------------------+
|E     |22919.15          |
|A     |10589.82          |
|D     |25094.18          |
|B     |23156.45          |
|C     |17076.45          |
+------+------------------+



In [44]:
# Q.13 Find the total VAT collected for each product line where the total sales amount is more than 500.
spark.sql("""
select product_line, round(sum(vat), 2) as total_vat from walmart_sales where total > 500 group by 1
""").show(20,False)

+----------------------+---------+
|product_line          |total_vat|
+----------------------+---------+
|Food and beverages    |1272.53  |
|Fashion accessories   |1199.4   |
|Sports and travel     |1327.28  |
|Electronic accessories|1283.51  |
|Home and lifestyle    |1305.41  |
|Health and beauty     |1209.71  |
+----------------------+---------+



In [46]:
# Q.14 Calculate the average sales amount for each gender in each branch.
spark.sql("""
select branch, 
round(avg(if(gender = 'Male', total, null)), 2) as male_total_sales_amt,
round(avg(if(gender = 'Female', total, null)), 2) as female_total_sales_amt
from walmart_sales group by 1
""").show(20,False)

+------+--------------------+----------------------+
|branch|male_total_sales_amt|female_total_sales_amt|
+------+--------------------+----------------------+
|E     |314.66              |283.79                |
|A     |325.6               |382.47                |
|D     |309.85              |340.74                |
|B     |342.28              |333.08                |
|C     |276.99              |349.59                |
+------+--------------------+----------------------+



In [49]:
# Q.15 Count the number of sales transactions for each day of the week.
spark.sql("""
select dayofweek(date), count(total) from walmart_sales 
group by 1
order by 2 desc
""").show(20,False)

+---------------+------------+
|dayofweek(date)|count(total)|
+---------------+------------+
|7              |164         |
|3              |158         |
|4              |143         |
|6              |139         |
|5              |138         |
|1              |133         |
|2              |125         |
+---------------+------------+



In [51]:
# Q.16 Find the total sales amount for each city and customer type combination where the number of sales transactions is greater than 50.
spark.sql("""
select city, customer_type, sum(total) from walmart_sales 
group by 1, 2 having count(*) > 50
order by 3 desc
""").show(20,False)

+------------+-------------+------------------+
|city        |customer_type|sum(total)        |
+------------+-------------+------------------+
|Los Angeles |Member       |21068.407461166382|
|Dallas      |Member       |19981.573503494263|
|Los Angeles |Normal       |19942.671062469482|
|Chicago     |Normal       |19891.735509872437|
|San Diego   |Member       |18923.604154586792|
|Philadelphia|Normal       |18028.03791809082 |
|New York    |Member       |17583.425966262817|
|San Diego   |Normal       |17484.663130760193|
+------------+-------------+------------------+



In [53]:
# Q.17 Calculate the average unit price for each product line and payment method combination.
spark.sql("""
select product_line, payment_method, avg(unit_price) from walmart_sales 
group by 1, 2 
order by 3 desc
""").show(20,False)

+----------------------+--------------+------------------+
|product_line          |payment_method|avg(unit_price)   |
+----------------------+--------------+------------------+
|Fashion accessories   |Cash          |61.11596460509718 |
|Food and beverages    |Cash          |60.85807052411531 |
|Sports and travel     |Ewallet       |58.14055536411427 |
|Home and lifestyle    |Ewallet       |57.75687462091446 |
|Home and lifestyle    |Cash          |57.483333438050515|
|Sports and travel     |Credit card   |56.44490576690098 |
|Sports and travel     |Cash          |56.435762599363166|
|Health and beauty     |Credit card   |55.885600090026855|
|Fashion accessories   |Credit card   |55.86928571973528 |
|Electronic accessories|Ewallet       |55.823773546038936|
|Health and beauty     |Ewallet       |55.23056658258978 |
|Fashion accessories   |Ewallet       |54.78553867340088 |
|Food and beverages    |Credit card   |54.57770478920858 |
|Health and beauty     |Cash          |53.39550985608782

In [54]:
spark.stop()