In [1]:
import os
import time
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# spark imports
import sys
import pyspark
from pyspark.ml.recommendation import ALS
import pyspark.sql.functions as F
from pyspark.sql.window import Window

# data science imports
import numpy as np
import pandas as pd

# recommenders imports
from recommenders.utils.timer import Timer
from recommenders.evaluation.spark_evaluation import SparkRankingEvaluation, SparkDiversityEvaluation
from recommenders.utils.spark_utils import start_or_get_spark

from tqdm import tqdm

In [2]:
print(f"System version: {sys.version}")
print("Spark version: {}".format(pyspark.__version__))

System version: 3.9.20 (main, Oct  3 2024, 07:27:41) 
[GCC 11.2.0]
Spark version: 3.5.3


## Variables

In [3]:
# top k items to recommend
TOP_K = 50

# Column names for the dataset
COL_USER = "user_id"
COL_TRACK = "track_id"
COL_COUNT = "playcount"

## Spark Init

In [4]:
# the following settings work well for debugging locally on VM - change when running on a cluster
# set up a giant single executor with many threads and specify memory cap
spark = start_or_get_spark("ALS PySpark", memory="16g", config={'spark.local.dir': "/home/matildeschade/spark-temp", 'spark.cleaner.ttl': "true"})
spark.conf.set("spark.sql.analyzer.failAmbiguousSelfJoin", "false")

spark.catalog.clearCache()

24/12/02 09:05:20 WARN Utils: Your hostname, schade-Asus-Vivobook resolves to a loopback address: 127.0.1.1; using 172.25.20.98 instead (on interface wlo1)
24/12/02 09:05:20 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/02 09:05:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/12/02 09:05:21 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


## Load and Split Data

### Load Data

In [5]:
# Read in the dataset into pyspark DataFrame    
test_listening_history = spark.read.option("header", "true") \
    .option("delimiter", "\t") \
    .option("inferSchema", "true") \
    .csv("../remappings/data/dataset/test_listening_history_OverEqual_50_Interactions.txt")
    
train_listening_history = spark.read.option("header", "true") \
    .option("delimiter", "\t") \
    .option("inferSchema", "true") \
    .csv("../remappings/data/dataset/train_listening_history_OverEqual_50_Interactions.txt")

# Change columns to correct place (user_id, track_id, playcount)
test_listening_history = test_listening_history.withColumn("track_id_temp", test_listening_history.track_id).withColumn("user_id_temp", test_listening_history.user_id)
test_listening_history = test_listening_history.withColumn("track_id", test_listening_history.user_id_temp).withColumn("user_id", test_listening_history.track_id_temp)

train_listening_history = train_listening_history.withColumn("track_id_temp", train_listening_history.track_id).withColumn("user_id_temp", train_listening_history.user_id)
train_listening_history = train_listening_history.withColumn("track_id", train_listening_history.user_id_temp).withColumn("user_id", train_listening_history.track_id_temp)

# key = old column, value = new column
mapping = {
    "track_id": COL_USER,
    "user_id": COL_TRACK,
    "playcount": COL_COUNT
}

test_listening_history = test_listening_history.select(*[F.col(old).alias(new) for old, new in mapping.items()])
train_listening_history = train_listening_history.select(*[F.col(old).alias(new) for old, new in mapping.items()])

# Sample
test_listening_history = test_listening_history.sample(False, 0.01, 0)
train_listening_history = train_listening_history.sample(False, 0.01, 0)

test_listening_history.show(2, truncate=False)
train_listening_history.show(2, truncate=False)

                                                                                

+-------+--------+---------+
|user_id|track_id|playcount|
+-------+--------+---------+
|11     |12239   |1        |
|212    |3001    |1        |
+-------+--------+---------+
only showing top 2 rows

+-------+--------+---------+
|user_id|track_id|playcount|
+-------+--------+---------+
|11     |5578    |1        |
|15     |6991    |1        |
+-------+--------+---------+
only showing top 2 rows



### Split Data

In [6]:
train, test = train_listening_history, test_listening_history

alpha = 1 

# Transform playcount to confidence using the current alpha
train = train.withColumn("confidence", 1 + alpha * F.log(1 + F.col(COL_COUNT))).drop(COL_COUNT)

train.show(10, truncate=False)

print ("N train", train.cache().count())
print ("N test", test.cache().count())

+-------+--------+------------------+
|user_id|track_id|confidence        |
+-------+--------+------------------+
|11     |5578    |1.6931471805599454|
|15     |6991    |1.6931471805599454|
|76     |30993   |1.6931471805599454|
|121    |28093   |1.6931471805599454|
|121    |43554   |2.386294361119891 |
|142    |42504   |4.13549421592915  |
|250    |18532   |1.6931471805599454|
|328    |32483   |1.6931471805599454|
|328    |11494   |1.6931471805599454|
|337    |11916   |2.386294361119891 |
+-------+--------+------------------+
only showing top 10 rows



                                                                                

N train 14317




N test 3639


                                                                                

## Train de ALS model

### Specify ALS hyperparameters

In [7]:
ranks = [10, 20, 30, 40]
maxIters = [10, 20, 30, 40]
regParams = [.05, .1, .15]
alphas = [20, 40, 60, 80]

In [8]:
# For loop will automatically create and store ALS models
model_list = []

for r in ranks:
    for mi in maxIters:
        for rp in regParams:
            for a in alphas:
                model_list.append(ALS(userCol= COL_USER, itemCol= COL_TRACK, ratingCol= COL_TRACK, rank = r, maxIter = mi, regParam = rp, alpha = a, coldStartStrategy="drop", nonnegative = True, implicitPrefs = True))

# Print the model list, and the length of model_list
print (model_list, "Length of model_list: ", len(model_list))

# Validate
len(model_list) == (len(ranks)*len(maxIters)*len(regParams)*len(alphas))

[ALS_be57150ae4dc, ALS_ef10309223f4, ALS_5a775b6c0839, ALS_1cbd3312568e, ALS_880753c05b9c, ALS_0209bb0a6139, ALS_9b5707569849, ALS_51e7f91e1584, ALS_6a3ce187d718, ALS_632b39224f95, ALS_465a4430e159, ALS_eea0da4d22e7, ALS_624436777182, ALS_c136b7a2fbf2, ALS_4e4599410319, ALS_8d2cb218fa86, ALS_2af4e06f240e, ALS_18947d38971b, ALS_8b3fa8d1f945, ALS_4b189851dd7c, ALS_65973de2c39b, ALS_a6d53343678d, ALS_b14ce48726bb, ALS_27af8538bcaf, ALS_4ec886a8b06e, ALS_8c4ee0c8b365, ALS_bfeed325047b, ALS_4b5222dcdb0b, ALS_7a62191d0cda, ALS_e2548a469d4f, ALS_2f97c58d20d7, ALS_123a330726f6, ALS_477a9b609af6, ALS_c9fa3b645e57, ALS_497dec5f32a8, ALS_c80f4892997d, ALS_09fdbe212f0f, ALS_7ed2d0950d32, ALS_1decca6d44c2, ALS_898a21c5b1c0, ALS_f82c9c99f416, ALS_cb90dbba90c3, ALS_2cf3a0914c03, ALS_2403b888c957, ALS_7dac265ea674, ALS_430a6db66885, ALS_0bf6a4f9a9a7, ALS_e52c79d68db6, ALS_57024d23a682, ALS_db5950453b66, ALS_d575f07a262a, ALS_38bf61ae7d24, ALS_e2f39463cb91, ALS_5eb38797a33b, ALS_2301f9b01b2a, ALS_9c4cc

True

In [9]:
# Expected percentile rank error metric function
def ROEM(predictions, userCol = COL_USER, itemCol = COL_TRACK, ratingCol = COL_COUNT):
  # Creates table that can be queried
  predictions.createOrReplaceTempView("predictions")
  
  # Sum of total number of plays of all songs
  denominator = predictions.groupBy().sum(ratingCol).collect()[0][0]

  # Calculating rankings of songs predictions by user
  spark.sql("SELECT " + userCol + " , " + ratingCol + " , PERCENT_RANK() OVER (PARTITION BY " + userCol + " ORDER BY prediction DESC) AS rank FROM predictions").createOrReplaceTempView("rankings")

  # Multiplies the rank of each song by the number of plays and adds the products together
  numerator = spark.sql('SELECT SUM(' + ratingCol + ' * rank) FROM rankings').collect()[0][0]
  performance = numerator/denominator
  
  return performance

In [10]:
# # Building 5 folds within the training set.
# train1, train2, train3, train4, train5 = train.randomSplit([0.2, 0.2, 0.2, 0.2, 0.2], seed = 1)
# fold1 = train2.union(train3).union(train4).union(train5)
# fold2 = train3.union(train4).union(train5).union(train1)
# fold3 = train4.union(train5).union(train1).union(train2)
# fold4 = train5.union(train1).union(train2).union(train3)
# fold5 = train1.union(train2).union(train3).union(train4)

# foldlist = [(fold1, train1), (fold2, train2), (fold3, train3), (fold4, train4), (fold5, train5)]

# # Empty list to fill with ROEMs from each model
# ROEMS = []

# # Loops through all models and all folds
# for model in model_list:
#     for ft_pair in foldlist:

#         # Fits model to fold within training data
#         fitted_model = model.fit(ft_pair[0])

#         # Generates predictions using fitted_model on respective CV test data
#         predictions = fitted_model.transform(ft_pair[1])

#         # Generates and prints a ROEM metric CV test data
#         r = ROEM(predictions)
#         print ("ROEM: ", r)

#     # Fits model to all of training data and generates preds for test data
#     v_fitted_model = model.fit(train)
#     v_predictions = v_fitted_model.transform(test)
#     v_ROEM = ROEM(v_predictions)

#     # Adds validation ROEM to ROEM list
#     ROEMS.append(v_ROEM)
#     print ("Validation ROEM: ", v_ROEM)

In [11]:
# Extract the best_model
best_model = model_list[38]

# Extract the Rank
best_rank = best_model.getRank()
print ("Rank: ", best_rank)

# Extract the MaxIter value
best_maxIter = best_model.getMaxIter()
print ("MaxIter: ", best_maxIter)

# Extract the RegParam value
best_regParam = best_model.getRegParam()
print ("RegParam: ", best_regParam)

# Extract the Alpha value
best_alpha = best_model.getAlpha()
print ("Alpha: ", best_alpha)

Rank:  10
MaxIter:  40
RegParam:  0.05
Alpha:  60.0


In [12]:
header = {
    "userCol": COL_USER,
    "itemCol": COL_TRACK,
    "ratingCol": 'confidence',
}

als = ALS(
    rank=best_rank,
    maxIter=best_maxIter,
    implicitPrefs=True,
    regParam=best_regParam,
    alpha=best_alpha,
    coldStartStrategy='drop',
    nonnegative=True,
    seed=42,
    **header)

In [13]:
with Timer() as train_time:
    model = als.fit(train)

print("Took {} seconds for training.".format(train_time.interval))

24/12/02 09:05:45 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/12/02 09:05:45 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
                                                                                

Took 40.26203268800009 seconds for training.


## Predict and Evaluate

### Prediction

In [14]:
with Timer() as t:
    prediction = model.transform(test)
print("Took {} seconds for prediction.".format(t))

prediction.cache().show()

Took 0.1556 seconds for prediction.




+-------+--------+---------+------------+
|user_id|track_id|playcount|  prediction|
+-------+--------+---------+------------+
| 314281|    1484|        2|         0.0|
| 400982|    8652|        1|         0.0|
| 418759|   12891|        2|         0.0|
| 504351|   34697|        1|         0.0|
| 527042|    5192|        1|1.6201421E-6|
| 534217|   21726|        1|         0.0|
| 897873|   21442|        1|0.0040780953|
|  28298|     112|        2|1.4247114E-6|
| 425459|    2925|        1|         0.0|
| 425459|    1063|        1|         0.0|
| 552000|   14072|        1|         0.0|
| 556666|   46612|        1|         0.0|
|  22684|    5610|        4|         0.0|
| 262341|     655|        1|         0.0|
| 278907|   31402|        1|         0.0|
| 415140|    2657|        1|2.3151435E-5|
| 447922|    1042|        3|         0.0|
| 798940|   12050|        1|         0.0|
| 366105|   23630|        1|         0.0|
| 366105|   48497|        1|         0.0|
+-------+--------+---------+------

                                                                                

In [15]:
# top k recommendations for each user
window = Window.partitionBy(COL_USER).orderBy(F.col("prediction").desc())    

top_k_reco = prediction.select("*", F.row_number().over(window).alias("rank")).filter(F.col("rank") <= TOP_K).drop("rank")

### Evaluation

In [16]:
rank_eval = SparkRankingEvaluation(test, prediction, k = TOP_K, col_user=COL_USER, col_item=COL_TRACK, 
                                    col_rating=COL_COUNT, col_prediction="prediction", 
                                    relevancy_method="top_k")

                                                                                

In [17]:
print("Model:\tALS",
      "Precision@K:\t%f" % rank_eval.precision_at_k(),
      "Recall@K:\t%f" % rank_eval.recall_at_k(), 
      "NDCG:\t%f" % rank_eval.ndcg_at_k(),
      "MAP:\t%f" % rank_eval.map_at_k(), sep='\n')

Model:	ALS
Precision@K:	0.021597
Recall@K:	0.972130
NDCG:	0.978618
MAP:	0.972130


In [18]:
# Cleanup spark instance
spark.stop()