In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Step 1: Start a Spark session for this notebook, or reuse one that is
# already running (getOrCreate returns the existing session if there is one).
spark = (
    SparkSession.builder
    .appName("Spark SQL")
    .getOrCreate()
)

# Step 2: Load the Parquet file into a DataFrame.
df = spark.read.parquet("../week3/rplace.parquet")

# Step 3: Register the DataFrame as the temporary view "data" so that it can
# be queried by name through spark.sql(...).
df.createOrReplaceTempView("data")


In [None]:
# NOT RUNNABLE: this query crashes the Spark session (not enough memory), so the cell is left commented out.

# Goal: find the user with the highest average distance between pixel placements.

# First step: find all users with more than 100 placements.
# result = spark.sql(f"""
#     SELECT
#         user_id_numeric,
#         COUNT(*) AS placement_count
#     FROM data
#     GROUP BY user_id_numeric
#     HAVING COUNT(*) > 100
#     ORDER BY placement_count DESC
# """)

# result.show(10)

# spark.stop()

In [None]:
# NOT RUNNABLE: this query crashes the Spark session (not enough memory), so the cell is left commented out.

# grab all users who only placed one pixel
# result = spark.sql(f"""
#     SELECT
#         user_id_numeric
#     FROM data
#     GROUP BY user_id_numeric
#     HAVING COUNT(*) = 1
# """)

# result.show(10)

# spark.stop()

In [None]:
# NOT RUNNABLE: this query crashes the Spark session (not enough memory), so the cell is left commented out.

# find the user with the most colorful placements

# grab top ten users by number of unique colors placed
# result = spark.sql(f"""
#     SELECT 
#         user_id_numeric, 
#         COUNT(DISTINCT pixel_color) AS color_count
#     FROM data 
#     GROUP BY user_id_numeric
#     ORDER BY color_count DESC
#     LIMIT 10
# """)

# result = df.select("user_id_numeric", "pixel_color") \
#     .groupBy("user_id_numeric") \
#     .agg(F.countDistinct("pixel_color").alias("color_count")) \
#     .orderBy(F.desc("color_count")) \
#     .limit(10)

# result.show(10)

# spark.stop()

## Ideas
1. Most colorful user
- Initially tried this approach, but the required queries could not be completed in Apache Spark without crashing the Spark session, after which no further queries could be run until I restarted the IDE. Too inefficient to keep chasing.
2. User with the greatest average distance between pixel placements
- Suffered the same fate as idea 1: not enough memory to run a meaningful query. It may be possible to batch the queries that cause this much grief, but I suspect that would be incredibly tedious and difficult to do correctly, assuming it is even possible for what I want to do with these ideas.
3. Statistics for one-time users vs. users with 2-25 placements vs. users with 26-100 placements
- Again, not enough memory. Going to try these ideas in DuckDB first, then come back to Spark and try my hand at batching some of the queries.