In [1]:
from common.session import get_spark_session


# Build the Spark session through the project helper; "04_purchases" is the
# argument it receives (presumably the application name - TODO confirm).
spark = get_spark_session("04_purchases")

# `spark.default.parallelism` is read from the SparkContext's configuration when
# the context starts, so setting it through `spark.conf.set` on a live session
# does not change it. `spark.sql.shuffle.partitions` is the runtime-adjustable
# setting that controls how many partitions DataFrame joins, aggregations and
# window operations shuffle into, which is what the cells below exercise.
# NOTE(review): if the goal was instead to control how many partitions
# createDataFrame/parallelize produce, `spark.default.parallelism` has to be
# supplied when the session is built (inside get_spark_session) - confirm intent.
spark.conf.set("spark.sql.shuffle.partitions", 3)

# Keep the session as the cell's last expression so Jupyter renders its summary.
spark

In [2]:
# --- Users -------------------------------------------------------------------
# Only column names are supplied here, so Spark infers each column's type
# from the Python values in the tuples.
users_cols = ["user_id", "name", "country"]
users_data = [
    (1, "Alice", "US"),
    (2, "Bob", "UK"),
    (3, "Charlie", "US"),
    (4, "Diana", "DE"),
]

df_users = spark.createDataFrame(users_data, users_cols)
df_users.show()


# --- Purchases ---------------------------------------------------------------
# Explicit DDL schema; note that `ts` is declared as a plain string, not a
# timestamp.
purchase_schema = 'purchase_id int, user_id int ,amount double, category string, ts string'
purchase_data = [
    (1001, 1, 50.0, "Electronics", "2025-08-01 10:00:00"),
    (1002, 2, 20.0, "Books", "2025-08-02 11:00:00"),
    (1003, 1, 70.0, "Books", "2025-08-02 15:00:00"),
    (1004, 3, 100.0, "Electronics", "2025-08-03 09:30:00"),
    (1005, 4, 35.0, "Fashion", "2025-08-03 12:00:00"),
    (1006, 2, 80.0, "Electronics", "2025-08-04 16:00:00"),
]

df_purch = spark.createDataFrame(purchase_data, purchase_schema)
df_purch.show()


+-------+-------+-------+
|user_id|   name|country|
+-------+-------+-------+
|      1|  Alice|     US|
|      2|    Bob|     UK|
|      3|Charlie|     US|
|      4|  Diana|     DE|
+-------+-------+-------+

+-----------+-------+------+-----------+-------------------+
|purchase_id|user_id|amount|   category|                 ts|
+-----------+-------+------+-----------+-------------------+
|       1001|      1|  50.0|Electronics|2025-08-01 10:00:00|
|       1002|      2|  20.0|      Books|2025-08-02 11:00:00|
|       1003|      1|  70.0|      Books|2025-08-02 15:00:00|
|       1004|      3| 100.0|Electronics|2025-08-03 09:30:00|
|       1005|      4|  35.0|    Fashion|2025-08-03 12:00:00|
|       1006|      2|  80.0|Electronics|2025-08-04 16:00:00|
+-----------+-------+------+-----------+-------------------+



In [3]:
'''Tasks
Join users with purchases to get user details with their transactions.
For each country, compute:
    Total revenue
    Average order value (AOV)
    Number of unique users who purchased
    Find the top category per country by total spend.
    Get the most valuable user (highest total spend) per country.
'''
# NOTE: importing `sum` here shadows Python's built-in sum() for the rest of
# the notebook; every `sum(...)` below is the Spark column aggregate.
from pyspark.sql.functions import sum, avg, countDistinct, format_number, col, row_number
from pyspark.sql.window import Window


def top_by_total_amount(df, group_col, key_col, total_alias, rank_alias):
    """Return, for each `group_col` value, the `key_col` with the largest summed `amount`.

    Args:
        df: DataFrame containing `group_col`, `key_col` and an `amount` column.
        group_col: Column that defines the groups (one output row per group).
        key_col: Column whose values compete within each group.
        total_alias: Name of the output column holding the summed amount.
        rank_alias: Name of the output column holding the rank (always 1).

    Returns:
        DataFrame with columns `group_col`, `key_col`, `total_alias`, `rank_alias`.
    """
    totals = df.groupBy(group_col, key_col).agg(sum('amount').alias(total_alias))
    # row_number() keeps exactly one row per group. The secondary sort on
    # `key_col` makes the winner deterministic when two keys have equal totals;
    # ordering by the total alone would let Spark pick either one.
    ranking = Window.partitionBy(group_col).orderBy(
        col(total_alias).desc(),
        col(key_col).asc(),
    )
    return (
        totals
        .withColumn(rank_alias, row_number().over(ranking))
        .where(col(rank_alias) == 1)
    )


# Join users with their purchases on user_id (default join type, so users
# without purchases do not appear in the result).
df_country_purch = df_users.join(df_purch, "user_id")

# The joined frame feeds three separate aggregations below. cache() is lazy,
# so count() is called to materialise it once up front.
df_country_purch.cache()
df_country_purch.count()

# country stats
df_country_Stats = (
    df_country_purch
    .groupBy("country")
    .agg(
        sum('amount').alias('total'),
        avg('amount').alias('AOV'),
        countDistinct('user_id').alias('dist_users'),
    )
    # format_number returns a string, so AOV is display-only after this step.
    .withColumn('AOV', format_number('AOV', 2))
)

# top category by country
df_by_country_category = top_by_total_amount(
    df_country_purch, 'country', 'category', 'category_amount', 'category_rank'
)

# top user per country
df_by_country_by_user = top_by_total_amount(
    df_country_purch, 'country', 'user_id', 'user_amount', 'user_rank'
)

df_country_Stats.show()
df_by_country_category.show()
df_by_country_by_user.show()

+-------+-----+-----+----------+
|country|total|  AOV|dist_users|
+-------+-----+-----+----------+
|     US|220.0|73.33|         2|
|     UK|100.0|50.00|         1|
|     DE| 35.0|35.00|         1|
+-------+-----+-----+----------+

+-------+-----------+---------------+-------------+
|country|   category|category_amount|category_rank|
+-------+-----------+---------------+-------------+
|     DE|    Fashion|           35.0|            1|
|     UK|Electronics|           80.0|            1|
|     US|Electronics|          150.0|            1|
+-------+-----------+---------------+-------------+

+-------+-------+-----------+---------+
|country|user_id|user_amount|user_rank|
+-------+-------+-----------+---------+
|     DE|      4|       35.0|        1|
|     UK|      2|      100.0|        1|
|     US|      1|      120.0|        1|
+-------+-------+-----------+---------+

