In [1]:
from common.session import get_spark_session

spark = get_spark_session("01_events_data")


In [4]:
df_evetns = spark.read.csv('./data/01_user_events.csv',header=True)
df_evetns.show()

df_users = spark.read.csv('./data/01_users.csv', header=True)
df_users.show()

+-------+----------+-------------------+------+
|user_id|event_type|          timestamp|amount|
+-------+----------+-------------------+------+
|    101|  purchase|2025-08-14 10:00:00|   250|
|    102|      view|2025-08-14 10:05:00|     0|
|    101|  purchase|2025-08-14 11:00:00|   400|
|    103|      view|2025-08-14 11:10:00|     0|
|    102|  purchase|2025-08-14 11:30:00|   100|
+-------+----------+-------------------+------+

+-------+-------+
|user_id|country|
+-------+-------+
|    101|    USA|
|    102|     UK|
|    103|Germany|
+-------+-------+



In [28]:
'''
    Using PySpark, process the above data to produce a Parquet file showing the total purchase amount per country, sorted by amount in descending order. 
    Save the result to /output/purchases_by_country/ in overwrite mode.
'''
from pyspark.sql.functions import col, sum, avg

# filter events of type purchase
df_purchases = df_evetns.filter("event_type = 'purchase'")
display(df_purchases.schema)
df_purchases.show()


# join with users to get country
df_purchase_country = df_purchases.join(df_users, df_purchases.user_id == df_users.user_id).select("country", "amount").withColumn("amount", col("amount").cast("double"))
display(df_purchases.schema)
df_purchase_country.show()


# group by country
# df_purchase_country.groupBy("country").agg({'amount': 'sum'}).withColumnRenamed("sum(amount)", "Total").show()
df_purchase_country.groupBy("country").agg(sum("amount").alias("Total"), avg("amount").alias("Avg")).orderBy(col("Total").desc()).show()



StructType([StructField('user_id', StringType(), True), StructField('event_type', StringType(), True), StructField('timestamp', StringType(), True), StructField('amount', StringType(), True)])

+-------+----------+-------------------+------+
|user_id|event_type|          timestamp|amount|
+-------+----------+-------------------+------+
|    101|  purchase|2025-08-14 10:00:00|   250|
|    101|  purchase|2025-08-14 11:00:00|   400|
|    102|  purchase|2025-08-14 11:30:00|   100|
+-------+----------+-------------------+------+



StructType([StructField('user_id', StringType(), True), StructField('event_type', StringType(), True), StructField('timestamp', StringType(), True), StructField('amount', StringType(), True)])

+-------+------+
|country|amount|
+-------+------+
|    USA| 250.0|
|    USA| 400.0|
|     UK| 100.0|
+-------+------+

+-------+-----+-----+
|country|Total|  Avg|
+-------+-----+-----+
|    USA|650.0|325.0|
|     UK|100.0|100.0|
+-------+-----+-----+



In [41]:
'''
    Using PySpark, produce a DataFrame showing the number of events per country, regardless of event type.
    Sort the result by event_count in descending order
'''
df_users.createOrReplaceTempView("users")
df_evetns.createOrReplaceTempView("events")

result = spark.sql("""SELECT u.country, count(1) as event_count
             FROM users AS u
             INNER JOIN events AS e
                ON u.user_id = e.user_id
             GROUP BY u.country
             ORDER BY event_count DESC
          """)
cols = ['country', 'event_count']
data = [('USA', 2), ('UK', 2), ('Germany', 1)]
expected = spark.createDataFrame(data, cols)


actual = result.collect()
expct = expected.collect()

assert sorted(actual) == sorted(expct), f"Data missmatch, result {actual}, excpected {expct}"