# Imports

In [13]:
from pyspark.sql import DataFrame, functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, BooleanType, LongType, DoubleType, ArrayType, DateType, MapType
from matplotlib import pyplot as plt
import seaborn as sns
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, MinHashLSH, MinHashLSHModel, CountVectorizer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.stat import Summarizer
from pyspark.ml.linalg import DenseVector
from pyspark.sql.functions import col
import pandas as pd
import numpy as np
from pyspark.ml.fpm import FPGrowth
from pyspark.sql import DataFrame, functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, BooleanType, LongType, DoubleType, ArrayType, DateType, MapType
from pyspark.ml.feature import Bucketizer
from matplotlib import pyplot as plt
import seaborn as sns
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, MinHashLSH, MinHashLSHModel, CountVectorizer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.stat import Summarizer
from pyspark.ml.linalg import DenseVector
from pyspark.sql.functions import col
import pandas as pd
import numpy as np
from pyspark.sql import Window
from pyspark.ml.recommendation import ALS
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALSModel
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.fpm import FPGrowth
from pyspark.mllib.evaluation import RankingMetrics

## Create Spark session

In [14]:
from dtu_ctfds_02807_proj.context_creation import create_context

local = True
sc, spark = create_context(local)

Already been executed once, not running again!


# Load data

In [15]:
if local:
    game_reviews = spark.read.parquet('data/many_reviews_users_parquet')
    games = spark.read.parquet("data/steam_games_parquet")
else:
    game_reviews = spark.read.parquet("file:////work/ds/steam_reviews_parquet")
    games = spark.read.parquet('file:////work/ds/steam_games_parquet')
    

game_reviews: DataFrame = game_reviews.persist()
games: DataFrame = games.persist()

In [16]:
positive_reviews = game_reviews.filter(game_reviews.recommended == True)

unique_users = positive_reviews.select("author_steamid").distinct()

# Randomly split the unique users into train and test sets
users_train, users_test = unique_users.randomSplit([0.8, 0.2], seed=42)

# Join back to filter the original DataFrame
reviews_train = positive_reviews.join(users_train, on="author_steamid", how="inner")
reviews_test = positive_reviews.join(users_test, on="author_steamid", how="inner")

In [18]:
user_games_train = (
    reviews_train.groupBy("author_steamid")
    .agg(F.collect_set("app_name").alias("games"))
    .filter(col("games").isNotNull())  # Ensure no null lists
)

# Step 3: Train the FP-Growth model
fpGrowth = FPGrowth(itemsCol="games", minSupport=0.01, minConfidence=0.5)
model = fpGrowth.fit(user_games_train)

# Step 4: Display frequent itemsets and association rules (optional)
# Uncomment the lines below to view the frequent itemsets and rules
frequent_itemsets = model.freqItemsets
frequent_itemsets.show(truncate=False)

association_rules = model.associationRules
association_rules.show(truncate=False)

                                                                                

+-----------------------------------------------+----+
|items                                          |freq|
+-----------------------------------------------+----+
|[Freeman: Guerrilla Warfare]                   |626 |
|[Bloons TD 6]                                  |1983|
|[Bloons TD 6, Among Us]                        |724 |
|[Bloons TD 6, Tom Clancy's Rainbow Six Siege]  |671 |
|[Bloons TD 6, PAYDAY 2]                        |585 |
|[Bloons TD 6, Terraria]                        |1027|
|[Bloons TD 6, Grand Theft Auto V]              |627 |
|[Bloons TD 6, Garry's Mod]                     |849 |
|[Bloons TD 6, Wallpaper Engine]                |598 |
|[Borderlands 3]                                |2787|
|[Borderlands 3, Dying Light]                   |755 |
|[Borderlands 3, Fallout 4]                     |760 |
|[Borderlands 3, Tom Clancy's Rainbow Six Siege]|753 |
|[Borderlands 3, PAYDAY 2]                      |665 |
|[Borderlands 3, Terraria]                      |755 |
|[Borderla



+----------------------------------------------------------------------------+--------------------------------+------------------+------------------+--------------------+
|antecedent                                                                  |consequent                      |confidence        |lift              |support             |
+----------------------------------------------------------------------------+--------------------------------+------------------+------------------+--------------------+
|[Arma 3, PLAYERUNKNOWN'S BATTLEGROUNDS]                                     |[Tom Clancy's Rainbow Six Siege]|0.5206310679611651|1.9783084843524341|0.014941748080037615|
|[Arma 3, PLAYERUNKNOWN'S BATTLEGROUNDS]                                     |[Grand Theft Auto V]            |0.5266990291262136|1.7338132509467186|0.015115894328056703|
|[Assassin's Creed Origins, Assassin's Creed Odyssey]                        |[The Witcher 3: Wild Hunt]      |0.612568639414277 |1.9171315119405

                                                                                

In [31]:
# Step 5: Generate recommendations for the test set
# Prepare the test set by grouping games for each user
user_games_test = (
    reviews_test.groupBy("author_steamid")
    .agg(F.collect_set("app_name").alias("games"))
    .filter(col("games").isNotNull())
)

# Generate recommendations based on association rules
recommendations = (
    user_games_test.alias("ug")
    .join(
        model.associationRules.alias("ar"),
        F.expr("array_contains(ar.antecedent, ug.games[0])"),  # Match antecedents with user games
    )
    .select(
        col("ug.author_steamid").alias("user_index"),
        col("ar.consequent").alias("recommendations"),
        col("ar.confidence"),
    )
    .groupBy("user_index")
    .agg(
        F.collect_list("recommendations").alias("all_recommendations"),
    )
)

# Flatten the recommendation lists and keep the top N recommendations by confidence
N = 20  # Define N
user_recommendations = recommendations.withColumn(
    "recommendations", F.slice(F.flatten(col("all_recommendations")), 1, N)
).select("user_index", "recommendations")

# Step 6: Prepare ground truth data for the test set
ground_truth = (
    reviews_test.groupBy("author_steamid")
    .agg(F.collect_list("app_name").alias("groundTruth"))
    .withColumnRenamed("author_steamid", "user_index")
)

# Step 7: Join recommendations with ground truth
recommendation_and_truth = user_recommendations.join(
    ground_truth, on="user_index"
).select("recommendations", "groundTruth")

# Step 8: Convert data into an RDD of (predicted, actual) for RankingMetrics
rdd_recommendation_truth = recommendation_and_truth.rdd.map(
    lambda row: (row["recommendations"], row["groundTruth"])
)

# Step 9: Evaluate Precision@N, Recall@N, and F1@N using RankingMetrics
metrics = RankingMetrics(rdd_recommendation_truth)

recall_at_n = metrics.recallAt(N)
precision_at_n = metrics.precisionAt(N)

f1_at_n = (
    2 * (precision_at_n * recall_at_n) / (precision_at_n + recall_at_n)
    if (precision_at_n + recall_at_n) > 0
    else 0
)

# Print evaluation results
print(f"Precision@{N}: {precision_at_n}")
print(f"Recall@{N}: {recall_at_n}")
print(f"F1@{N}: {f1_at_n}")


                                                                                

Precision@20: 0.3041675503711556
Recall@20: 0.44367575986306856
F1@20: 0.3609092096961965


# The random case

In [33]:
N = reviews_train.select("app_id").distinct().count()
# N = games.select("app_id").distinct().count()
R = reviews_test.filter(
        F.col("recommended") == True
    )\
    .groupBy("author_steamid")\
    .count()\
    .agg(F.avg("count"))\
    .first()[0]

# Parameters
k = 20  # Number of random recommendations

# Calculate Precision, Recall, and F1
precision = R / N
recall = k / N
f1 = 2 * (precision * recall) / (precision + recall)

print(f"Precision (Random): {precision}")
print(f"Recall (Random): {recall}")
print(f"F1 Score (Random): {f1}")

Precision (Random): 0.04530308238799129
Recall (Random): 0.06349206349206349
F1 Score (Random): 0.0528771051336372
