In [1]:
import os
import time
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# spark imports
import sys
import pyspark
from pyspark.ml.recommendation import ALS
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import StringType, FloatType, IntegerType, LongType

# data science imports
import math
import numpy as np
import pandas as pd

# recommenders imports

from recommenders.utils.timer import Timer
from recommenders.utils.notebook_utils import is_jupyter
from recommenders.datasets.spark_splitters import spark_random_split
from recommenders.evaluation.spark_evaluation import SparkRatingEvaluation, SparkRankingEvaluation, SparkDiversityEvaluation
from recommenders.utils.spark_utils import start_or_get_spark
from recommenders.utils.notebook_utils import store_metadata

In [2]:
# Record the interpreter and Spark versions this run was produced with.
print(f"System version: {sys.version}")
print(f"Spark version: {pyspark.__version__}")

System version: 3.9.20 (main, Oct  3 2024, 07:27:41) 
[GCC 11.2.0]
Spark version: 3.5.3


In [3]:
# Number of recommendations kept per user; also passed as k to the ranking evaluation.
TOP_K = 10

# Column names used for the interaction data throughout the notebook:
# user identifier, track (item) identifier, and the play count of that pair.
COL_USER = "user_id"
COL_TRACK = "track_id"
COL_COUNT = "playcount"

In [4]:
# The following settings work well for debugging locally on a VM - change when
# running on a cluster. They set up a single large executor with a memory cap.

# Scratch directory Spark uses for temporary files. It can be overridden with
# the SPARK_TEMP_DIR environment variable so the notebook is not tied to one
# machine; the default is the path this notebook was originally run with.
SPARK_TEMP_DIR = os.environ.get("SPARK_TEMP_DIR", "/home/manuel-albino/spark-temp")

# NOTE: the former 'spark.cleaner.ttl': "true" entry was dropped. That setting
# took a duration in seconds in Spark 1.x ("true" was never a valid value) and
# is no longer read by current Spark releases.
spark = start_or_get_spark(
    "ALS PySpark",
    memory="16g",
    config={"spark.local.dir": SPARK_TEMP_DIR},
)

# Do not fail on column references that are ambiguous between DataFrames
# sharing lineage; the predictions are derived from `train` and are later
# joined back to it.
spark.conf.set("spark.sql.analyzer.failAmbiguousSelfJoin", "false")

# Start from a clean cache in case an existing session was reused.
spark.catalog.clearCache()

24/11/06 17:28:44 WARN Utils: Your hostname, manuel-albino-asus resolves to a loopback address: 127.0.1.1; using 192.168.87.143 instead (on interface wlp1s0)
24/11/06 17:28:44 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/06 17:28:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/11/06 17:28:45 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


In [5]:
# Read the tab-separated listening history into a PySpark DataFrame.
# The header row supplies the column names and inferSchema lets Spark choose
# the column types.
song_ratings_raw = (
    spark.read
    .option("header", "true")
    .option("delimiter", "\t")
    .option("inferSchema", "true")
    .csv("../remappings/data/Modified_Listening_History.txt")
)

# Keep the three modelling columns in (user, track, playcount) order.
# NOTE: the previous version copied user_id/track_id into temp columns, swapped
# them, and then renamed "track_id" -> COL_USER and "user_id" -> COL_TRACK.
# Those two swaps cancel out, so the result was exactly this projection.
song_ratings = song_ratings_raw.select(
    F.col("user_id").alias(COL_USER),
    F.col("track_id").alias(COL_TRACK),
    F.col("playcount").alias(COL_COUNT),
)

# Work on a 0.2% sample drawn without replacement (seed 0) to keep the
# notebook fast enough for local runs.
sample = song_ratings.sample(False, 0.002, 0)

# Show the first rows of the (user, track, playcount) interaction table.
sample.show(2, truncate=False)

                                                                                

+-------+--------+---------+
|user_id|track_id|playcount|
+-------+--------+---------+
|20     |19508   |5        |
|55     |1524    |1        |
+-------+--------+---------+
only showing top 2 rows



In [6]:
# Random 75/25 split of the sampled interactions with a fixed seed.
train, test = spark_random_split(sample, ratio=0.75, seed=123)

# Mark each split for caching; count() is the action that materialises it.
n_train = train.cache().count()
print("N train", n_train)
n_test = test.cache().count()
print("N test", n_test)

                                                                                

N train 14578




N test 4829


                                                                                

In [7]:
# Weight applied to the log-scaled playcount when building the confidence value.
# NOTE(review): this `alpha` is a notebook variable only; it is not passed to
# ALS below, so ALS's own alpha parameter keeps its default - confirm intended.
alpha = 1

# confidence = 1 + alpha * ln(1 + playcount)
log_playcount = F.log(1 + F.col(COL_COUNT))
train_with_confidence = train.withColumn("confidence", 1 + alpha * log_playcount)

train_with_confidence.show(10, truncate=False)

# Column wiring for ALS: the derived confidence column is used as the rating.
header = dict(userCol=COL_USER, itemCol=COL_TRACK, ratingCol="confidence")

# Implicit-feedback ALS; rows whose prediction would be NaN are dropped
# (coldStartStrategy='drop'), and the seed is fixed for repeatable factors.
als = ALS(
    **header,
    rank=10,
    maxIter=15,
    regParam=0.05,
    implicitPrefs=True,
    nonnegative=False,
    coldStartStrategy='drop',
    seed=42,
)

+-------+--------+---------+------------------+
|user_id|track_id|playcount|confidence        |
+-------+--------+---------+------------------+
|20     |19508   |5        |2.791759469228055 |
|55     |1524    |1        |1.6931471805599454|
|233    |2066    |1        |1.6931471805599454|
|263    |20555   |7        |3.0794415416798357|
|343    |30642   |1        |1.6931471805599454|
|374    |11074   |11       |3.4849066497880004|
|419    |7308    |1        |1.6931471805599454|
|446    |8652    |4        |2.6094379124341005|
|503    |28518   |3        |2.386294361119891 |
|533    |2198    |1        |1.6931471805599454|
+-------+--------+---------+------------------+
only showing top 10 rows



In [8]:
# Fit ALS on the confidence-weighted training data and time the fit.
with Timer() as train_time:
    model = als.fit(train_with_confidence)

print(f"Took {train_time.interval} seconds for training.")



Took 5.544946925000033 seconds for training.


In [9]:
with Timer() as test_time:

    # Build the cross join of every user and item seen in training and score
    # each pair with the fitted model.
    users = train.select(COL_USER).distinct()
    items = train.select(COL_TRACK).distinct()
    user_item = users.crossJoin(items)
    dfs_pred = model.transform(user_item)

    # Remove seen items: keep only predictions whose (user, track) pair does
    # not occur in the training data.
    # Joining on column *names* avoids comparing columns of two DataFrames
    # that share lineage (dfs_pred is derived from train), which previously
    # produced "trivially true equals predicate" warnings. The anti-join also
    # replaces the former full outer join + "train playcount is null" filter,
    # which would have kept a seen pair whose playcount was null.
    top_all = (
        dfs_pred
        .join(train, on=[COL_USER, COL_TRACK], how="left_anti")
        .select(COL_USER, COL_TRACK, "prediction")
    )

    # In Spark, transformations are lazily evaluated.
    # Use an action to force execution so the measured time is meaningful.
    top_all.cache().count()

    # Rank each user's unseen items by predicted score and keep the best TOP_K.
    window = Window.partitionBy(COL_USER).orderBy(F.col("prediction").desc())

    top_k_reco = (
        top_all
        .withColumn("rank", F.row_number().over(window))
        .filter(F.col("rank") <= TOP_K)
        .drop("rank")
    )

    print(top_k_reco.count())

print("Took {} seconds for prediction.".format(test_time.interval))



24/11/06 17:29:04 WARN Column: Constructing trivially true equals predicate, 'user_id#47 = user_id#47'. Perhaps you need to use aliases.
24/11/06 17:29:04 WARN Column: Constructing trivially true equals predicate, 'track_id#48 = track_id#48'. Perhaps you need to use aliases.

142700
Took 128.717059672 seconds for prediction.


                                                                                

In [10]:
# Peek at the first 20 predictions for unseen (user, track) pairs.
top_all.show(n=20, truncate=True)

+-------+--------+--------------+
|user_id|track_id|    prediction|
+-------+--------+--------------+
|     20|    2909|  -5.01851E-30|
|     20|    2926|-2.6883428E-30|
|     20|    4069|-3.0519938E-22|
|     20|    4304|-1.7599168E-32|
|     20|    5365| -5.594408E-34|
|     20|    7844| -1.759787E-31|
|     20|   11109|  8.204271E-34|
|     20|   11839| 6.3963253E-27|
|     20|   12386|  9.030782E-26|
|     20|   12565| 2.4726398E-27|
|     20|   13098|-3.0713183E-34|
|     20|   15321|  6.230454E-26|
|     20|   16849| -2.459086E-34|
|     20|   19238|  8.586324E-35|
|     20|   22071|  2.383432E-15|
|     20|   23142|  2.513782E-28|
|     20|   23407| 1.0425959E-27|
|     20|   28345| -1.627138E-27|
|     20|   29045|-5.5587636E-35|
|     20|   29650| 2.4267886E-25|
+-------+--------+--------------+
only showing top 20 rows



In [11]:
def get_ranking_results(ranking_eval):
    """Collect the top-k ranking metrics of an evaluator into a dict.

    Args:
        ranking_eval: Object exposing ``precision_at_k()``, ``recall_at_k()``,
            ``ndcg_at_k()`` and ``map_at_k()``.

    Returns:
        dict: Metric label -> value, with the keys "Precision@k", "Recall@k",
        "NDCG@k" and "Mean average precision", in that order.
    """
    precision = ranking_eval.precision_at_k()
    recall = ranking_eval.recall_at_k()
    ndcg = ranking_eval.ndcg_at_k()
    mean_avg_precision = ranking_eval.map_at_k()

    return {
        "Precision@k": precision,
        "Recall@k": recall,
        "NDCG@k": ndcg,
        "Mean average precision": mean_avg_precision,
    }

def get_diversity_results(diversity_eval):
    """Collect the diversity-style metrics of an evaluator into a dict.

    Args:
        diversity_eval: Object exposing ``catalog_coverage()``,
            ``distributional_coverage()``, ``novelty()``, ``diversity()`` and
            ``serendipity()``.

    Returns:
        dict: Metric name -> value, keyed by "catalog_coverage",
        "distributional_coverage", "novelty", "diversity" and "serendipity",
        in that order.
    """
    catalog_cov = diversity_eval.catalog_coverage()
    distributional_cov = diversity_eval.distributional_coverage()
    novelty_score = diversity_eval.novelty()
    diversity_score = diversity_eval.diversity()
    serendipity_score = diversity_eval.serendipity()

    return {
        "catalog_coverage": catalog_cov,
        "distributional_coverage": distributional_cov,
        "novelty": novelty_score,
        "diversity": diversity_score,
        "serendipity": serendipity_score,
    }



In [12]:
def generate_summary(data, algo, k, ranking_metrics, diversity_metrics):
    """Build one flat result record for an experiment run.

    Args:
        data: Identifier or size of the dataset, stored under "Data".
        algo: Algorithm name, stored under "Algo".
        k: Recommendation cut-off, stored under "K".
        ranking_metrics: Dict of ranking metrics, or None to fill the ranking
            columns with NaN placeholders.
        diversity_metrics: Dict of diversity metrics, or None to fill the
            diversity columns with NaN placeholders.

    Returns:
        dict: The "Data"/"Algo"/"K" entries followed by the ranking metrics
        and then the diversity metrics.
    """
    summary = {"Data": data, "Algo": algo, "K": k}

    # Placeholder keys must match the keys produced by get_ranking_results /
    # get_diversity_results (and the result-table columns); otherwise a missing
    # metric set would land under column names nothing else uses.
    if ranking_metrics is None:
        ranking_metrics = dict.fromkeys(
            ("Precision@k", "Recall@k", "NDCG@k", "Mean average precision"),
            np.nan,
        )
    if diversity_metrics is None:
        diversity_metrics = dict.fromkeys(
            (
                "catalog_coverage",
                "distributional_coverage",
                "novelty",
                "diversity",
                "serendipity",
            ),
            np.nan,
        )

    summary.update(ranking_metrics)
    summary.update(diversity_metrics)
    return summary

In [13]:
# Ranking evaluation: compare the predictions for unseen items against the
# held-out test interactions, using the top-k relevancy method with k = TOP_K.
ranking_eval_options = dict(
    k=TOP_K,
    col_user=COL_USER,
    col_item=COL_TRACK,
    col_rating=COL_COUNT,
    col_prediction="prediction",
    relevancy_method="top_k",
)
als_ranking_eval = SparkRankingEvaluation(test, top_all, **ranking_eval_options)

als_ranking_metrics = get_ranking_results(als_ranking_eval)

                                                                                

In [14]:
# Diversity evaluation: uses the training interactions together with the
# per-user top-k recommendation lists.
diversity_eval_options = dict(
    train_df=train,
    reco_df=top_k_reco,
    col_user=COL_USER,
    col_item=COL_TRACK,
)
als_diversity_eval = SparkDiversityEvaluation(**diversity_eval_options)

als_diversity_metrics = get_diversity_results(als_diversity_eval)

                                                                                

In [15]:
# The "Data" field records how many sampled interactions the run used.
n_interactions = sample.count()
results = generate_summary(
    data=n_interactions,
    algo="als",
    k=TOP_K,
    ranking_metrics=als_ranking_metrics,
    diversity_metrics=als_diversity_metrics,
)

                                                                                

In [16]:
# Column order of the results table: run description, ranking metrics,
# then diversity metrics.
cols = [
    "Data", "Algo", "K",
    "Precision@k", "Recall@k", "NDCG@k", "Mean average precision",
    "catalog_coverage", "distributional_coverage",
    "novelty", "diversity", "serendipity",
]
df_results = pd.DataFrame(columns=cols)

# Insert the summary dict as the row labelled 1.
df_results.loc[1] = results

In [17]:
# Display the one-row metrics table as the cell output.
df_results

Unnamed: 0,Data,Algo,K,Precision@k,Recall@k,NDCG@k,Mean average precision,catalog_coverage,distributional_coverage,novelty,diversity,serendipity
1,19407,als,10,0.001604,0.016043,0.006759,0.003883,0.010383,5.691431,9.090559,0.999871,0.999875


In [18]:
# Clean up: stop the Spark session started at the top of the notebook.
spark.stop()