# Imports

In [1]:
from pyspark.sql import DataFrame, functions as F
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, BooleanType, LongType, DoubleType, ArrayType, DateType, MapType
from pyspark.ml.feature import Bucketizer
from matplotlib import pyplot as plt
import seaborn as sns
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, MinHashLSH, MinHashLSHModel, CountVectorizer, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.stat import Summarizer
from pyspark.ml.linalg import DenseVector
from pyspark.sql.functions import col
import pandas as pd
import numpy as np
from pyspark.sql import Window
from pyspark.ml.recommendation import ALS
from pyspark.ml.feature import StringIndexer
from pyspark.ml.recommendation import ALSModel
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.evaluation import RankingMetrics


## Create Spark session

In [2]:
from dtu_ctfds_02807_proj.context_creation import create_context

local = True
sc, spark = create_context(local)

Cell has not been executed before, running...


24/12/05 01:19:33 WARN Utils: Your hostname, macbook.local resolves to a loopback address: 127.0.0.1; using 192.168.86.58 instead (on interface en0)
24/12/05 01:19:33 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/05 01:19:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/12/05 01:19:33 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 60805)
Traceback (most recent call last):
  File "/Users/esben/Library/Application Support/pdm/python/cpython@3.10.15/lib/python3.10/socketserver.py", line 316, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/Users/esben/Library/Application Support/pdm/python/cpython@3.10.15/lib/python3.10/socketserver.py", line 347, in process_request
    self.finish_request(request, client_address)
  File "/Users/esben/Library/Application Support/pdm/python/cpython@3.10.15/lib/python3.10/socketserver.py", line 360, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/Users/esben/Library/Application Support/pdm/python/cpython@3.10.15/lib/python3.10/socketserver.py", line 747, in __init__
    self.handle()
  File "/Users/esben/dev/dtu/03semester/CTFDS/DTU-CTFDS-02807-PROJ/.venv/lib/python3.10/site-packages/pyspark/accumulator

# Load data

In [None]:
if local:
    game_reviews = spark.read.parquet('data/many_reviews_users_parquet')
    games = spark.read.parquet("data/steam_games_parquet")
else:
    game_reviews = spark.read.parquet("file:////work/ds/steam_reviews_parquet")
    games = spark.read.parquet('file:////work/ds/steam_games_parquet')
    
game_reviews: DataFrame = game_reviews.persist()
games: DataFrame = games.persist()

In [None]:
game_reviews: DataFrame = game_reviews.persist()
games: DataFrame = games.persist()

# Content based filtering

In [8]:
# Select relevant columns for content-based filtering
game_data = games\
    .filter(games.tags.isNotNull())\
    .filter(games.detailed_description.isNotNull())\
    .select("app_id", "name", "genres", "publishers", "detailed_description", "tags")

# Preprocessing pipeline
# Tokenize the detailed description
tokenizer = Tokenizer(inputCol="detailed_description", outputCol="tokens")

# Remove stopwords
stopwords_remover = StopWordsRemover(inputCol="tokens", outputCol="filtered_tokens")

# Apply TF (Term Frequency)
hashing_tf = HashingTF(inputCol="filtered_tokens", outputCol="raw_features", numFeatures=1000)

# Apply IDF (Inverse Document Frequency)
idf = IDF(inputCol="raw_features", outputCol="text_features")

genres_vectorizer = CountVectorizer(inputCol="genres", outputCol="genres_vector", binary=True)
publishers_vectorizer = CountVectorizer(inputCol="publishers", outputCol="publishers_vector", binary=True)

# Convert tags (Map<String, Int>) into feature vector
# Extract keys and values from the tags map
tags_keys = F.map_keys(game_data.tags)
tags_values = F.map_values(game_data.tags)

# Create a DataFrame with tag names (keys) and their respective weights (values)
game_data = game_data\
    .withColumn("tags_keys", tags_keys)\
    .withColumn("tags_values", tags_values)

# Use CountVectorizer to convert the tag keys into a feature vector
tags_vectorizer = CountVectorizer(inputCol="tags_keys", outputCol="tags_key_features")

vector_assembler = VectorAssembler(
    inputCols=["text_features", "genres_vector", "publishers_vector", "tags_key_features"],
    outputCol="features"
)

# Locality Sensitive Hashing (LSH) for similarity search
minhash_lsh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=3)

# Build the pipeline
pipeline = Pipeline(stages=[
    tokenizer,
    stopwords_remover,
    hashing_tf,
    idf,
    genres_vectorizer,
    publishers_vectorizer,
    tags_vectorizer,
    vector_assembler,
    minhash_lsh
])

In [9]:
# Fit the pipeline to the data
content_based_model = pipeline.fit(game_data)
processed_games_df = content_based_model.transform(game_data)

                                                                                

In [8]:
pipeline.save("models/content_based_pipeline")

24/12/04 23:10:14 ERROR Instrumentation: java.io.IOException: Path models/content_based_pipeline already exists. To overwrite it, please use write.overwrite().save(path) for Scala and use write().overwrite().save(path) for Java and Python.
	at org.apache.spark.ml.util.FileSystemOverwrite.handleOverwrite(ReadWrite.scala:683)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:167)
	at org.apache.spark.ml.Pipeline$PipelineWriter.super$save(Pipeline.scala:204)
	at org.apache.spark.ml.Pipeline$PipelineWriter.$anonfun$save$2(Pipeline.scala:204)
	at org.apache.spark.ml.MLEvents.withSaveInstanceEvent(events.scala:174)
	at org.apache.spark.ml.MLEvents.withSaveInstanceEvent$(events.scala:169)
	at org.apache.spark.ml.util.Instrumentation.withSaveInstanceEvent(Instrumentation.scala:42)
	at org.apache.spark.ml.Pipeline$PipelineWriter.$anonfun$save$1(Pipeline.scala:204)
	at org.apache.spark.ml.Pipeline$PipelineWriter.$anonfun$save$1$adapted(Pipeline.scala:204)
	at org.apache.spark.ml.util.In

Py4JJavaError: An error occurred while calling o773.save.
: java.io.IOException: Path models/content_based_pipeline already exists. To overwrite it, please use write.overwrite().save(path) for Scala and use write().overwrite().save(path) for Java and Python.
	at org.apache.spark.ml.util.FileSystemOverwrite.handleOverwrite(ReadWrite.scala:683)
	at org.apache.spark.ml.util.MLWriter.save(ReadWrite.scala:167)
	at org.apache.spark.ml.Pipeline$PipelineWriter.super$save(Pipeline.scala:204)
	at org.apache.spark.ml.Pipeline$PipelineWriter.$anonfun$save$2(Pipeline.scala:204)
	at org.apache.spark.ml.MLEvents.withSaveInstanceEvent(events.scala:174)
	at org.apache.spark.ml.MLEvents.withSaveInstanceEvent$(events.scala:169)
	at org.apache.spark.ml.util.Instrumentation.withSaveInstanceEvent(Instrumentation.scala:42)
	at org.apache.spark.ml.Pipeline$PipelineWriter.$anonfun$save$1(Pipeline.scala:204)
	at org.apache.spark.ml.Pipeline$PipelineWriter.$anonfun$save$1$adapted(Pipeline.scala:204)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.Pipeline$PipelineWriter.save(Pipeline.scala:204)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:75)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:580)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1583)


# Finding similar games to a given game

In [10]:
# FIFA 23 = 1811260
# Counter Strike Zero = 80
# Garry's Mod = 4000

sample_game = processed_games_df.filter(games.app_id == 80).limit(1)
sample_game_features = sample_game.select("features").first()

print("Looking for games similar to:")
sample_game.show(truncate=False)

sample_game_features = DenseVector(sample_game_features["features"])

mh_model: MinHashLSHModel = content_based_model.stages[-1]

n_similar_games = 20
print(f"Finding top {n_similar_games} most similar games...")
mh_model.approxNearestNeighbors(
    dataset=processed_games_df, key=sample_game_features, numNearestNeighbors=20, distCol="JaccardDistance"
)\
    .orderBy(F.asc("JaccardDistance"))\
    .show(truncate=False)

Looking for games similar to:
+------+----+------+----------+--------------------+----+---------+-----------+------+---------------+------------+-------------+-------------+-----------------+-----------------+--------+------+
|app_id|name|genres|publishers|detailed_description|tags|tags_keys|tags_values|tokens|filtered_tokens|raw_features|text_features|genres_vector|publishers_vector|tags_key_features|features|hashes|
+------+----+------+----------+--------------------+----+---------+-----------+------+---------------+------------+-------------+-------------+-----------------+-----------------+--------+------+
+------+----+------+----------+--------------------+----+---------+-----------+------+---------------+------------+-------------+-------------+-----------------+-----------------+--------+------+



TypeError: 'NoneType' object is not subscriptable

# Finding similar games to a combination of games

In [14]:
# FIFA 23 = 1811260
# Counter Strike Zero = 80
# Garry's Mod = 4000

# Step 1: Filter the relevant games
sample_games = processed_games_df.filter(
    (processed_games_df.app_id == 80) |
    (processed_games_df.app_id == 1811260)
)

print("Looking for games similar to:")
sample_games.show(truncate=False)

# Step 2: Compute the mean feature vector using Summarizer
mean_features_row = sample_games.select(
    Summarizer.mean(col("features")).alias("mean_features")
).first()

# Extract mean features as a DenseVector
sample_game_features = DenseVector(mean_features_row["mean_features"])

# Step 3: Perform similarity search using MinHashLSH
mh_model: MinHashLSHModel = content_based_model.stages[-1]

n_similar_games = 20
print(f"Finding top {n_similar_games} most similar games...")
mh_model.approxNearestNeighbors(
    dataset=processed_games_df,
    key=sample_game_features,
    numNearestNeighbors=n_similar_games,
    distCol="JaccardDistance"
)\
    .orderBy(col("JaccardDistance").asc())\
    .show(truncate=False)


Looking for games similar to:


                                                                                

+-------+------------------------------+--------------------+-----------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

Finding top 20 most similar games...




+-------+------------------------------+--------------------------------------------------------------------+------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

                                                                                

# Validation

In [11]:
game_ratings = game_reviews\
    .filter(F.col("author_steamid").isNotNull())\
    .filter(game_reviews.recommended == True)\
    .withColumn(
        'rating',
        F.when(game_reviews.recommended == True, 1).otherwise(0)
    )

# Filter out users with less than 5 reviews
user_rating_counts = game_ratings.groupBy("author_steamid")\
    .agg(F.count("app_id").alias("rating_count"))\
    .filter(F.col("rating_count") > 5)

game_ratings = game_ratings.join(user_rating_counts, "author_steamid", 'inner')

game_ratings = game_ratings.join(processed_games_df, "app_id", 'inner')

window_spec = Window\
    .partitionBy("author_steamid")\
    .orderBy(F.desc("timestamp_created"))

df_with_row_number = game_ratings\
    .withColumn('row_number', F.row_number().over(window_spec))

reviews_test = df_with_row_number\
    .filter(F.col('row_number') == 1)\
    .drop('row_number')\
    .cache()

reviews_train = df_with_row_number\
    .filter(F.col('row_number') > 1)\
    .drop('row_number')\
    .groupBy('author_steamid')\
    .agg(
        Summarizer.mean(col("features")).alias("features"),
    ).cache()

In [None]:
from pyspark.mllib.evaluation import RankingMetrics

N = 20  # Number of recommendations to evaluate

# Step 1: Use approxSimilarityJoin to compute similarities
mh_model: MinHashLSHModel = content_based_model.stages[-1]  # Assuming the MinHashLSH model is the last stage

# Perform approximate similarity join between reviews_train and reviews_test
similarity_joined = mh_model.approxSimilarityJoin(
    reviews_train.select("author_steamid", "features"),
    processed_games_df.select("app_id", "features"),
    threshold=1,
    distCol="JaccardDistance"
)

similarity_joined.printSchema()

# Step 2: Collect top N recommendations per user based on similarity
user_recommendations = (
    similarity_joined
    .select(
        F.col("datasetA.author_steamid").alias("author_steamid"),
        F.col("datasetB.app_id").alias("recommended_app_id"),
        F.col("JaccardDistance")
    )
    .orderBy("author_steamid", F.asc("JaccardDistance"))  # Sort by distance
    .groupBy("author_steamid")
    .agg(F.collect_list("recommended_app_id").alias("recommendations"))
)

# Limit recommendations to top N
user_recommendations = user_recommendations.withColumn(
    "recommendations", F.slice("recommendations", 1, N)
)

# Step 3: Prepare ground truth data
ground_truth = reviews_test.groupBy("author_steamid").agg(
    F.collect_list("app_id").alias("groundTruth")
)

# Step 4: Join recommendations with ground truth
recommendation_and_truth = user_recommendations.join(
    ground_truth, on="author_steamid"
).select("recommendations", "groundTruth")

# Step 5: Convert to RDD for RankingMetrics
rdd_recommendation_truth = recommendation_and_truth.rdd.map(
    lambda row: (row["recommendations"], row["groundTruth"])
)

# Step 6: Evaluate using RankingMetrics
metrics = RankingMetrics(rdd_recommendation_truth)

precision_at_n = metrics.precisionAt(N)
recall_at_n = metrics.recallAt(N)
f1_at_n = 2 * (precision_at_n * recall_at_n) / (precision_at_n + recall_at_n) if (precision_at_n + recall_at_n) > 0 else 0

print(f"Precision@{N}: {precision_at_n}")
print(f"Recall@{N}: {recall_at_n}")
print(f"F1@{N}: {f1_at_n}")


root
 |-- datasetA: struct (nullable = false)
 |    |-- author_steamid: long (nullable = true)
 |    |-- features: vector (nullable = false)
 |    |-- hashes: array (nullable = true)
 |    |    |-- element: vector (containsNull = true)
 |-- datasetB: struct (nullable = false)
 |    |-- app_id: long (nullable = true)
 |    |-- features: vector (nullable = true)
 |    |-- hashes: array (nullable = true)
 |    |    |-- element: vector (containsNull = true)
 |-- JaccardDistance: double (nullable = false)



ERROR:root:KeyboardInterrupt while sending command.             (136 + 8) / 200]
Traceback (most recent call last):
  File "/Users/esben/dev/dtu/03semester/CTFDS/DTU-CTFDS-02807-PROJ/.venv/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/Users/esben/dev/dtu/03semester/CTFDS/DTU-CTFDS-02807-PROJ/.venv/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/Users/esben/Library/Application Support/pdm/python/cpython@3.10.15/lib/python3.10/socket.py", line 717, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt

KeyboardInterrupt: 

                                                                                

: 

In [19]:
# Calculate N (Total games) and R (Average relevant games per user)
# N = games.select("app_id").distinct().count()
N = 315

R = reviews_test.filter(
        F.col("recommended") == True
    )\
    .groupBy("author_steamid")\
    .count()\
    .agg(F.avg("count"))\
    .first()[0]

# Parameters
k = 20  # Number of random recommendations

# Calculate Precision, Recall, and F1
precision = R / N
recall = k / N
f1 = 2 * (precision * recall) / (precision + recall)

print(f"Precision (Random): {precision}")
print(f"Recall (Random): {recall}")
print(f"F1 Score (Random): {f1}")


Precision (Random): 0.0031746031746031746
Recall (Random): 0.06349206349206349
F1 Score (Random): 0.006046863189720333


                                                                                