# Python Apache Spark

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import DoubleType, StringType
from pyspark.sql import functions as f
from pyspark.sql.window import Window 

In [2]:
# Create (or reuse) the Spark session that the rest of the notebook reads from.
spark = (
    SparkSession.builder
    .appName("RetailSalesAnalysis")
    .getOrCreate()
)

In [3]:
# Load the retail sales CSV: the first line is treated as the header and
# column types are inferred by Spark rather than declared.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("dados/global_retail_sales_data.csv")
)

In [4]:
# Print the column names and the types Spark inferred while reading the CSV.
df.printSchema()

root
 |-- Transaction ID: string (nullable = true)
 |-- Date: date (nullable = true)
 |-- Store ID: string (nullable = true)
 |-- Product ID: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Sub-Category: string (nullable = true)
 |-- Quantity Sold: integer (nullable = true)
 |-- Price: double (nullable = true)
 |-- Discount: double (nullable = true)
 |-- Total Sales: double (nullable = true)
 |-- Customer ID: string (nullable = true)
 |-- Payment Method: string (nullable = true)
 |-- Store Location: string (nullable = true)
 |-- Customer Age: integer (nullable = true)
 |-- Customer Gender: string (nullable = true)
 |-- Customer Segment: string (nullable = true)
 |-- Loyalty Program: string (nullable = true)
 |-- Feedback Rating: integer (nullable = true)
 |-- Employee ID: string (nullable = true)
 |-- Store Type: string (nullable = true)



In [5]:
# Preview five rows; converting the small slice to pandas gives a tabular notebook display.
df.limit(5).toPandas()

Unnamed: 0,Transaction ID,Date,Store ID,Product ID,Category,Sub-Category,Quantity Sold,Price,Discount,Total Sales,Customer ID,Payment Method,Store Location,Customer Age,Customer Gender,Customer Segment,Loyalty Program,Feedback Rating,Employee ID,Store Type
0,T0000001,2023-10-13,S074,P00991,Groceries,Dairy,7,56.49,0.2,316.34,d49b8699-04cd-424c-8ffa-dd6210950328,Online Payment,"Martinstad, Guinea",41,Female,VIP,No,2,E0258,Online
1,T0000002,2022-12-13,S044,P00266,Groceries,Snacks,1,521.88,0.03,506.22,e46ee47a-c8d8-488d-89a2-3c7f58090a46,Online Payment,"New Micheleview, Paraguay",68,Female,Regular,No,3,E0081,Online
2,T0000003,2022-09-15,S030,P00711,Groceries,Fruits,6,961.08,0.02,5651.15,01255bf5-ae89-45bc-99dc-6aade26b41f9,Cash,"Martinstad, Guinea",47,Male,VIP,Yes,2,E0446,Online
3,T0000004,2023-02-28,S042,P00733,Furniture,Tables,8,417.71,0.26,2472.84,a62979a4-9370-4b08-abac-8ed0284115ab,Cash,"Lake Diana, Nepal",37,Female,VIP,Yes,5,E0152,Outlet
4,T0000005,2023-12-12,S010,P00815,Electronics,Laptops,8,70.94,0.28,408.61,296c62a6-e926-4870-9bce-219275fa542f,Online Payment,"Port Melanie, Moldova",21,Male,Regular,No,4,E0055,Outlet


### Q1. How many transactions were recorded in the dataset?

In [6]:
# Number of rows in the loaded DataFrame.
# NOTE(review): this equals the number of transactions only if each row is one
# transaction ("Transaction ID" uniqueness is not checked here) — confirm.
df.count()

10000

### Q2. What is the total revenue generated across all transactions?

In [7]:
# Sum "Total Sales" over the whole DataFrame and display it as "Total Revenue".
total_revenue = df.agg(f.sum("Total Sales").alias("Total Revenue"))
total_revenue.show()

+--------------------+
|       Total Revenue|
+--------------------+
|2.3508131990000144E7|
+--------------------+



In [8]:
### Q3. (TODO: question text missing)

### Q11. What is the total sales amount for each store across different months?

In [9]:
# Derive calendar Year and Month columns from Date for the time-based aggregations
date_col = f.col("Date")
df = (
    df.withColumn("Year", f.year(date_col))
      .withColumn("Month", f.month(date_col))
)

In [10]:
# Sum "Total Sales" for every (Store ID, Year, Month) combination
total_sales_per_store_month = (
    df.groupBy("Store ID", "Year", "Month")
    .agg(f.sum("Total Sales").alias("Total Sales"))
)

In [11]:
# Display the monthly totals in chronological order within each store
monthly_sales_sorted = total_sales_per_store_month.orderBy("Store ID", "Year", "Month")
monthly_sales_sorted.show()

+--------+----+-----+------------------+
|Store ID|Year|Month|       Total Sales|
+--------+----+-----+------------------+
|    S001|2022|    9| 5989.860000000001|
|    S001|2022|   10|          17982.27|
|    S001|2022|   11|           4408.25|
|    S001|2022|   12|16871.569999999996|
|    S001|2023|    1|          10861.85|
|    S001|2023|    2|           11993.2|
|    S001|2023|    3|            5205.3|
|    S001|2023|    4|            8344.8|
|    S001|2023|    5|          16104.75|
|    S001|2023|    6|           4629.76|
|    S001|2023|    7|          17687.41|
|    S001|2023|    8|           8633.63|
|    S001|2023|    9|           10148.9|
|    S001|2023|   10|          19128.98|
|    S001|2023|   11|10259.849999999999|
|    S001|2023|   12|          16521.89|
|    S001|2024|    1|21536.999999999996|
|    S001|2024|    2|           4073.62|
|    S001|2024|    3|            4911.1|
|    S001|2024|    4|14524.789999999999|
+--------+----+-----+------------------+
only showing top

### Q12. Which store has the highest average feedback rating?

In [12]:
# Mean feedback rating per store
avg_feedback_per_store = (
    df.groupBy("Store ID")
    .agg(f.avg("Feedback Rating").alias("Average Feedback Rating"))
)

# Sort best-first and keep only the leading row
top_store = (
    avg_feedback_per_store
    .orderBy(f.desc("Average Feedback Rating"))
    .limit(1)
)

# Display the winning store
top_store.show()

+--------+-----------------------+
|Store ID|Average Feedback Rating|
+--------+-----------------------+
|    S005|     3.4234234234234235|
+--------+-----------------------+



### Q13. How does the average discount percentage vary by product category?

In [13]:
# Mean of the "Discount" column per product category, largest average first
avg_discount_per_category = (
    df.groupBy("Category")
    .agg(f.avg("Discount").alias("Average Discount"))
    .orderBy(f.desc("Average Discount"))
)

# Display the per-category averages
avg_discount_per_category.show()

+-----------+-------------------+
|   Category|   Average Discount|
+-----------+-------------------+
|   Clothing|0.15178286852589584|
|Electronics|0.15120393120393072|
|       Toys|0.15084521384928679|
|  Furniture|0.14925607287449394|
|  Groceries|0.14816063460585016|
+-----------+-------------------+



### Q14. What is the most popular payment method used by customers in different countries?

In [14]:
# Derive the country from "Store Location".
# NOTE(review): assumes every value is "City, Country" with a single ", "
# separator; getItem(1) keeps only the second piece, so a country name that
# itself contains ", " would be truncated — confirm against the data.
df = df.withColumn("Country", f.split(f.col("Store Location"), ", ").getItem(1))

# Count transactions for every (Country, Payment Method) pair
payment_method_count = (
    df.groupBy("Country", "Payment Method")
    .agg(f.count("Transaction ID").alias("Count"))
)

# Rank payment methods inside each country, most used first.
# The secondary sort on "Payment Method" makes row_number() deterministic when
# two methods tie on Count; ordering by Count alone leaves the winner of a tie
# arbitrary, so the reported method could differ between runs.
window = Window.partitionBy("Country").orderBy(
    f.col("Count").desc(),
    f.col("Payment Method").asc(),
)

# Number the rows of each country according to that ordering
ranked_payment_methods = payment_method_count.withColumn("Rank", f.row_number().over(window))

# Keep only the top-ranked payment method per country
most_popular_payment_method = ranked_payment_methods.filter(f.col("Rank") == 1)

# Show the result
most_popular_payment_method.select("Country", "Payment Method", "Count").show()

+--------------------+--------------+-----+
|             Country|Payment Method|Count|
+--------------------+--------------+-----+
|         Afghanistan|Online Payment|   39|
|             Albania|   Credit Card|   80|
|             Andorra|          Cash|   42|
|              Angola|Online Payment|   39|
|            Anguilla|   Credit Card|   35|
|Antarctica (the t...|          Cash|   35|
| Antigua and Barbuda|Online Payment|   35|
|             Austria|   Credit Card|   50|
|             Bahamas|   Credit Card|   38|
|             Bahrain|Online Payment|   43|
|             Belarus|   Credit Card|   34|
|               Benin|Online Payment|   44|
|             Bermuda|          Cash|   39|
|              Bhutan|Online Payment|   81|
|British Virgin Is...|   Credit Card|   39|
|                Chad|Online Payment|   38|
|               Chile|          Cash|   40|
|    Christmas Island|   Credit Card|   35|
|Cocos (Keeling) I...|Online Payment|   46|
|            Colombia|          

### Q15. Identify the top 10 products with the highest total sales.

In [15]:
# Revenue per product: sum of "Total Sales" grouped by Product ID
total_sales_per_product = (
    df.groupBy("Product ID")
    .agg(f.sum("Total Sales").alias("Total Sales"))
)

# Highest revenue first; keep the ten leading products
top_10_products = (
    total_sales_per_product
    .orderBy(f.desc("Total Sales"))
    .limit(10)
)

# Display the top ten
top_10_products.show()

+----------+------------------+
|Product ID|       Total Sales|
+----------+------------------+
|    P00350|           65583.9|
|    P00225|53441.270000000004|
|    P00842|53168.240000000005|
|    P00415| 51834.43000000001|
|    P00743|51455.810000000005|
|    P00691|          50373.47|
|    P00186|50283.850000000006|
|    P00386| 49821.72999999999|
|    P00958|49193.369999999995|
|    P00340| 48476.54999999999|
+----------+------------------+



### Q16. Which customer segment (Regular, VIP) contributes the most to total sales?

In [16]:
# Revenue per customer segment, largest contributor first
total_sales_per_segment = (
    df.groupBy("Customer Segment")
    .agg(f.sum("Total Sales").alias("Total Sales"))
    .orderBy(f.desc("Total Sales"))
)

# Display the per-segment totals
total_sales_per_segment.show()

+----------------+--------------------+
|Customer Segment|         Total Sales|
+----------------+--------------------+
|         Regular|1.1948929059999974E7|
|             VIP|1.1559202930000018E7|
+----------------+--------------------+



### Q17. Analyze the sales trend over time for different product categories.

In [17]:
# (Re)derive Year and Month from Date so this cell does not rely on an earlier one
date_col = f.col("Date")
df = (
    df.withColumn("Year", f.year(date_col))
      .withColumn("Month", f.month(date_col))
)

# Monthly revenue per category, sorted chronologically within each category
trend_keys = ["Category", "Year", "Month"]
sales_trend = (
    df.groupBy(*trend_keys)
    .agg(f.sum("Total Sales").alias("Total Sales"))
    .orderBy(*trend_keys)
)

# Display the first five rows of the trend table
sales_trend.show(5)

+--------+----+-----+------------------+
|Category|Year|Month|       Total Sales|
+--------+----+-----+------------------+
|Clothing|2022|    9|180573.57000000004|
|Clothing|2022|   10|         281169.85|
|Clothing|2022|   11|184325.97999999998|
|Clothing|2022|   12|188024.62999999998|
|Clothing|2023|    1|200487.02000000008|
+--------+----+-----+------------------+
only showing top 5 rows



### Q18. Which stores have the highest sales growth rate over the past year?

In [18]:
# Extract the year from the Date column
df = df.withColumn("Year", f.year(f.col("Date")))

# Total sales for each store in each calendar year
sales_per_store_year = (
    df.groupBy("Store ID", "Year")
    .agg(f.sum("Total Sales").alias("Total Sales"))
)

# Within each store, walk the yearly rows in chronological order
window_spec = Window.partitionBy("Store ID").orderBy("Year")

# lag() returns the previous *row*, which is the previous calendar year only
# when the store has sales in consecutive years. Accept the lagged value only
# if its Year is exactly Year - 1; otherwise leave it null so that a gap year
# is not silently reported as year-over-year growth.
previous_row_year = f.lag("Year").over(window_spec)
previous_row_sales = f.lag("Total Sales").over(window_spec)
sales_per_store_year = sales_per_store_year.withColumn(
    "Previous Year Sales",
    f.when(previous_row_year == f.col("Year") - 1, previous_row_sales),
)

# Growth rate = (current year sales - previous year sales) / previous year sales
# NOTE(review): the outputs displayed earlier in this notebook start at 2022-09,
# which suggests 2022 is a partial year; growth measured against it would be
# inflated — confirm before interpreting the 2023 figures.
sales_per_store_year = sales_per_store_year.withColumn(
    "Growth Rate",
    (f.col("Total Sales") - f.col("Previous Year Sales")) / f.col("Previous Year Sales"),
)

# Drop rows with no usable previous year (a store's first year, or a year following a gap)
sales_per_store_year = sales_per_store_year.filter(f.col("Previous Year Sales").isNotNull())

# Ten (store, year) rows with the highest growth rate
top_growth_stores = sales_per_store_year.orderBy(f.col("Growth Rate").desc()).limit(10)

# Show the result
top_growth_stores.select("Store ID", "Year", "Total Sales", "Previous Year Sales", "Growth Rate").show()

+--------+----+------------------+-------------------+------------------+
|Store ID|Year|       Total Sales|Previous Year Sales|       Growth Rate|
+--------+----+------------------+-------------------+------------------+
|    S097|2023|144121.25999999998|            9851.29|13.629684031228393|
|    S053|2023|130308.31000000001|           18299.13|  6.12101121747318|
|    S061|2023|127207.56999999996|           20900.41| 5.086367205236642|
|    S040|2023|125535.66999999998| 22710.819999999996|4.5275709991977395|
|    S096|2023|         137113.59|           24839.58| 4.519964105673284|
|    S003|2023|105611.00999999995|           20788.22| 4.080329628991802|
|    S077|2023|122598.29999999999| 25560.850000000006| 3.796331107924813|
|    S039|2023|167740.20999999993|           36790.56| 3.559327447040761|
|    S041|2023|          141922.1| 32321.710000000006|3.3909217674436154|
|    S055|2023|113229.07999999999|           27062.88| 3.183925731481645|
+--------+----+------------------+----

### Q19. What is the average purchase amount by different age groups?

In [19]:
### Which product sub-categories have the highest return rate (e.g., returns not explicitly stated but implied by negative sales or quantity adjustments)?
### What is the relationship between discount percentage and customer feedback rating?
### How does the customer loyalty program impact total sales and frequency of purchases?
### Which cities have the highest number of VIP customers?
### Identify any seasonal trends in sales for specific product categories.
### What is the average transaction amount for online stores compared to physical stores?
### Analyze the relationship between store location and the preferred payment method.
### What is the average number of items purchased per transaction across different store types?
### Identify the top 5 employees with the highest total sales and analyze their performance trends.
### What is the correlation between customer age and the likelihood of purchasing high-value products?
### How do sales vary across different days of the week and hours of the day?