In [1]:
import os
import time
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# spark imports
import sys
import pyspark
from pyspark.ml.recommendation import ALS
import pyspark.sql.functions as F
from pyspark.sql.window import Window

# data science imports
import numpy as np
import pandas as pd

# recommenders imports
from recommenders.utils.timer import Timer
from recommenders.evaluation.spark_evaluation import SparkRankingEvaluation, SparkDiversityEvaluation
from recommenders.utils.spark_utils import start_or_get_spark

from tqdm import tqdm

In [2]:
print(f"System version: {sys.version}")
print("Spark version: {}".format(pyspark.__version__))

System version: 3.9.20 (main, Oct  3 2024, 07:27:41) 
[GCC 11.2.0]
Spark version: 3.5.3


## Variables

In [3]:
# top k items to recommend
TOP_K = 50

# Column names for the dataset
COL_USER = "user_id"
COL_TRACK = "track_id"
COL_COUNT = "playcount"

## Spark Init

In [4]:
# the following settings work well for debugging locally on VM - change when running on a cluster
# set up a giant single executor with many threads and specify memory cap
spark = start_or_get_spark("ALS PySpark", memory="16g", config={'spark.local.dir': "/home/matildeschade/spark-temp", 'spark.cleaner.ttl': "true"})
spark.conf.set("spark.sql.analyzer.failAmbiguousSelfJoin", "false")

spark.catalog.clearCache()

24/12/02 14:45:25 WARN Utils: Your hostname, schade-Asus-Vivobook resolves to a loopback address: 127.0.1.1; using 192.168.87.180 instead (on interface wlo1)
24/12/02 14:45:25 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/02 14:45:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/12/02 14:45:27 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).


## Load and Split Data

### Load Data

In [5]:
# Read in the dataset into pyspark DataFrame    
test_listening_history = spark.read.option("header", "true") \
    .option("delimiter", "\t") \
    .option("inferSchema", "true") \
    .csv("../remappings/data/dataset/test_listening_history_OverEqual_50_Interactions.txt")
    
train_listening_history = spark.read.option("header", "true") \
    .option("delimiter", "\t") \
    .option("inferSchema", "true") \
    .csv("../remappings/data/dataset/train_listening_history_OverEqual_50_Interactions.txt")

# Change columns to correct place (user_id, track_id, playcount)
test_listening_history = test_listening_history.withColumn("track_id_temp", test_listening_history.track_id).withColumn("user_id_temp", test_listening_history.user_id)
test_listening_history = test_listening_history.withColumn("track_id", test_listening_history.user_id_temp).withColumn("user_id", test_listening_history.track_id_temp)

train_listening_history = train_listening_history.withColumn("track_id_temp", train_listening_history.track_id).withColumn("user_id_temp", train_listening_history.user_id)
train_listening_history = train_listening_history.withColumn("track_id", train_listening_history.user_id_temp).withColumn("user_id", train_listening_history.track_id_temp)

# key = old column, value = new column
mapping = {
    "track_id": COL_USER,
    "user_id": COL_TRACK,
    "playcount": COL_COUNT
}

test_listening_history = test_listening_history.select(*[F.col(old).alias(new) for old, new in mapping.items()])
train_listening_history = train_listening_history.select(*[F.col(old).alias(new) for old, new in mapping.items()])

# Sample
test_listening_history = test_listening_history.sample(False, 0.005, 0)
train_listening_history = train_listening_history.sample(False, 0.005, 0)

test_listening_history.show(2, truncate=False)
train_listening_history.show(2, truncate=False)

                                                                                

+-------+--------+---------+
|user_id|track_id|playcount|
+-------+--------+---------+
|373    |11156   |3        |
|844    |659     |1        |
+-------+--------+---------+
only showing top 2 rows

+-------+--------+---------+
|user_id|track_id|playcount|
+-------+--------+---------+
|76     |30993   |1        |
|121    |43554   |3        |
+-------+--------+---------+
only showing top 2 rows



### Split Data

In [6]:
train, test = train_listening_history, test_listening_history

# alpha = 1 

# # Transform playcount to confidence using the current alpha
# train = train.withColumn("confidence", 1 + alpha * F.log(1 + F.col(COL_COUNT))).drop(COL_COUNT)

# train.show(10, truncate=False)

print ("N train", train.cache().count())
print ("N test", test.cache().count())

                                                                                

N train 7090




N test 1791


                                                                                

## Train de ALS model

### Specify ALS hyperparameters

In [7]:
ranks = [10, 20, 30, 40]
maxIters = [10, 20, 30, 40]
regParams = [.05, .1, .15]
alphas = [20, 40, 60, 80]

In [8]:
# For loop will automatically create and store ALS models
model_list = []

for r in tqdm(ranks):
    for mi in maxIters:
        for rp in regParams:
            for a in alphas:
                model_list.append(ALS(userCol= COL_USER, itemCol= COL_TRACK, ratingCol=COL_COUNT, rank = r, maxIter = mi, regParam = rp, alpha = a, coldStartStrategy="drop", nonnegative = True, implicitPrefs = True))

# Print the model list, and the length of model_list
print (model_list, "Length of model_list: ", len(model_list))

# Validate
len(model_list) == (len(ranks)*len(maxIters)*len(regParams)*len(alphas))

[ALS_73cb934327f5, ALS_f66da5d08785, ALS_9eb6344fd317, ALS_e11e3f207bbd, ALS_bd3247c2d7b5, ALS_66a817b20253, ALS_4165d5a51b64, ALS_b51a3076485e, ALS_81db86f9a199, ALS_c5be940d0438, ALS_eef8e4625a1b, ALS_0ee54647deff, ALS_d5d9654d9669, ALS_2ddacbda7610, ALS_cfe34f4c9c84, ALS_d24453354e6d, ALS_264033ef5981, ALS_86a2767ff4e2, ALS_aa1af09a564f, ALS_fa1608686259, ALS_b116e537a895, ALS_4ab0dd476e4a, ALS_e6a171fef842, ALS_1f915a004178, ALS_3878c02f8d0d, ALS_3cd719664d21, ALS_563f9102118c, ALS_a66b0b85ad83, ALS_ca99df918f9d, ALS_a278c32840cf, ALS_e763bdc16e4c, ALS_8c4eb49d5bd9, ALS_3c6681192d4b, ALS_6281c3c21447, ALS_1fa7b056a2ce, ALS_96acb33fc150, ALS_07917ab4c5b8, ALS_eddae49176fc, ALS_da37ad8e1b25, ALS_d9dcfcc176a4, ALS_8451f5fb3e9a, ALS_b8547cbf7575, ALS_e907695eb4c5, ALS_6a2bf350b51d, ALS_c719a8bdfad2, ALS_9ff4dc13687d, ALS_50a6021f8657, ALS_e1c4c6ed041c, ALS_7dccc45927e5, ALS_62f57638d794, ALS_a82d5f4f9b9e, ALS_37227e5322ff, ALS_c2360e045a79, ALS_4d8050189d89, ALS_c634285a44c1, ALS_1a6c4

True

In [9]:
# Expected percentile rank error metric function
def ROEM(predictions, userCol = COL_USER, itemCol = COL_TRACK, ratingCol = COL_COUNT):
  # Creates table that can be queried
  predictions.createOrReplaceTempView("predictions")
  
  # Sum of total number of plays of all songs
  denominator = predictions.groupBy().sum(ratingCol).collect()[0][0]

  # Calculating rankings of songs predictions by user
  spark.sql("SELECT " + userCol + " , " + ratingCol + " , PERCENT_RANK() OVER (PARTITION BY " + userCol + " ORDER BY prediction DESC) AS rank FROM predictions").createOrReplaceTempView("rankings")

  # Multiplies the rank of each song by the number of plays and adds the products together
  numerator = spark.sql('SELECT SUM(' + ratingCol + ' * rank) FROM rankings').collect()[0][0]
  performance = numerator/denominator
  
  return performance

In [10]:
# # Building 5 folds within the training set.
# train1, train2, train3, train4, train5 = train.randomSplit([0.2, 0.2, 0.2, 0.2, 0.2], seed = 1)
# fold1 = train2.union(train3).union(train4).union(train5)
# fold2 = train3.union(train4).union(train5).union(train1)
# fold3 = train4.union(train5).union(train1).union(train2)
# fold4 = train5.union(train1).union(train2).union(train3)
# fold5 = train1.union(train2).union(train3).union(train4)

# foldlist = [(fold1, train1), (fold2, train2), (fold3, train3), (fold4, train4), (fold5, train5)]

# # Empty list to fill with ROEMs from each model
# ROEMS = []

# # Loops through all models and all folds
# for model in model_list:
#     for ft_pair in foldlist:

#         # Fits model to fold within training data
#         fitted_model = model.fit(ft_pair[0])

#         # Generates predictions using fitted_model on respective CV test data
#         predictions = fitted_model.transform(ft_pair[1])

#         # Generates and prints a ROEM metric CV test data
#         r = ROEM(predictions)
#         print ("ROEM: ", r)

#     # Fits model to all of training data and generates preds for test data
#     v_fitted_model = model.fit(train)
#     v_predictions = v_fitted_model.transform(test)
#     v_ROEM = ROEM(v_predictions)

#     # Adds validation ROEM to ROEM list
#     ROEMS.append(v_ROEM)
#     print ("Validation ROEM: ", v_ROEM)

In [11]:
# Extract the best_model
best_model = model_list[38]

# Extract the Rank
best_rank = best_model.getRank()
print ("Rank: ", best_rank)

# Extract the MaxIter value
best_maxIter = best_model.getMaxIter()
print ("MaxIter: ", best_maxIter)

# Extract the RegParam value
best_regParam = best_model.getRegParam()
print ("RegParam: ", best_regParam)

# Extract the Alpha value
best_alpha = best_model.getAlpha()
print ("Alpha: ", best_alpha)

Rank:  10
MaxIter:  40
RegParam:  0.05
Alpha:  60.0


In [12]:
# header = {
#     "userCol": COL_USER,
#     "itemCol": COL_TRACK,
#     "ratingCol": 'confidence',
# }

# als = ALS(
#     rank=best_rank,
#     maxIter=best_maxIter,
#     implicitPrefs=True,
#     regParam=best_regParam,
#     alpha=best_alpha,
#     coldStartStrategy='drop',
#     nonnegative=True,
#     seed=42,
#     **header)

In [13]:
with Timer() as train_time:
    model = best_model.fit(train)

print("Took {} seconds for training.".format(train_time.interval))

24/12/02 14:45:52 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/12/02 14:45:52 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


Took 24.439710680000076 seconds for training.


## Predict and Evaluate

### Prediction

In [14]:
with Timer() as test_time:

    # Get the cross join of all user-item pairs and score them.
    users = train.select(COL_USER).distinct()
    items = train.select(COL_TRACK).distinct()
    user_item = users.crossJoin(items)
    dfs_pred = model.transform(user_item)

    # Remove seen items.
    dfs_pred_exclude_train = dfs_pred.alias("pred").join(
        train.alias("train"),
        (dfs_pred[COL_USER] == train[COL_USER]) & (dfs_pred[COL_TRACK] == train[COL_TRACK]),
        how='outer'
    )

    top_all = dfs_pred_exclude_train.filter(dfs_pred_exclude_train[f"train.{COL_COUNT}"].isNull()) \
        .select('pred.' + COL_USER, 'pred.' + COL_TRACK, 'pred.' + "prediction")

    # In Spark, transformations are lazy evaluation
    # Use an action to force execute and measure the test time 
    top_all.cache().count()

print("Took {} seconds for prediction.".format(test_time.interval))

24/12/02 14:46:15 WARN Column: Constructing trivially true equals predicate, 'user_id#100 = user_id#100'. Perhaps you need to use aliases.
24/12/02 14:46:15 WARN Column: Constructing trivially true equals predicate, 'track_id#101 = track_id#101'. Perhaps you need to use aliases.

Took 101.23276237799996 seconds for prediction.


                                                                                

In [15]:
top_all.show()

+-------+--------+----------+
|user_id|track_id|prediction|
+-------+--------+----------+
|     76|     981|       0.0|
|     76|    1846|       0.0|
|     76|    5574|  1.144651|
|     76|   11896|       0.0|
|     76|   13210| 1.3216627|
|     76|   14016|       0.0|
|     76|   19932|       0.0|
|     76|   20577|       0.0|
|     76|   22155|       0.0|
|     76|   22187|       0.0|
|     76|   22899|       0.0|
|     76|   24023| 1.2022697|
|     76|   24634|       0.0|
|     76|   25595|       0.0|
|     76|   29823|       0.0|
|     76|   31088|       0.0|
|     76|   36593|       0.0|
|     76|   39351|       0.0|
|     76|   40617|       0.0|
|    121|     625|       0.0|
+-------+--------+----------+
only showing top 20 rows



In [16]:
# top_k_rec = model.recommendForAllUsers(TOP_K)
# top_k_rec.show(10)



+-------+--------------------+
|user_id|     recommendations|
+-------+--------------------+
|     76|[{5610, 3.2195482...|
|    667|[{2629, 4.5185795...|
|    844|[{1939, 2.989554}...|
|   2962|[{4788, 2.0995054...|
|   3242|[{5692, 3.9883537...|
|   5695|[{5610, 1.1986922...|
|   6962|[{1939, 1.0460057...|
|  10011|[{3521, 0.0}, {33...|
|  14058|[{2687, 3.7338943...|
|  14525|[{3521, 0.0}, {33...|
+-------+--------------------+
only showing top 10 rows



                                                                                

In [17]:
# with Timer() as t:
#     predictions = model.transform(test)
# print("Took {} seconds for prediction.".format(t))

# predictions.cache().show()

### Evaluation

In [18]:
rank_eval = SparkRankingEvaluation(test, top_all, k = TOP_K, col_user=COL_USER, col_item=COL_TRACK, 
                                    col_rating=COL_COUNT, col_prediction="prediction", relevancy_method="top_k")

                                                                                

In [19]:
print("Model:\tALS",
      "Precision@K:\t%f" % rank_eval.precision_at_k(),
      "Recall@K:\t%f" % rank_eval.recall_at_k(), 
      "NDCG:\t%f" % rank_eval.ndcg_at_k(),
      "MAP:\t%f" % rank_eval.map_at_k(), sep='\n')



Model:	ALS
Precision@K:	0.000254
Recall@K:	0.012685
NDCG:	0.003078
MAP:	0.000828


                                                                                

In [20]:
# Cleanup spark instance
spark.stop()