In [None]:
import os
import time
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# spark imports
import sys
import pyspark
from pyspark.ml.recommendation import ALS
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import StringType, FloatType, IntegerType, LongType

# data science imports
import math
import numpy as np
import pandas as pd

# recommenders imports

from recommenders.utils.timer import Timer
from recommenders.utils.notebook_utils import is_jupyter
from recommenders.datasets.spark_splitters import spark_random_split
from recommenders.evaluation.spark_evaluation import SparkRatingEvaluation, SparkRankingEvaluation
from recommenders.utils.spark_utils import start_or_get_spark
from recommenders.utils.notebook_utils import store_metadata

ModuleNotFoundError: No module named 'pyspark'

In [None]:
# Record the interpreter and Spark versions this run was executed with.
print(f"System version: {sys.version}")
print(f"Spark version: {pyspark.__version__}")

In [None]:
# Column names used for the dataset throughout the notebook.
COL_USER = "user_id"
COL_TRACK = "track_id"
COL_COUNT = "playcount"

# Number of items to recommend per user when computing ranking metrics.
TOP_K = 10

In [None]:
# The following settings work well for debugging locally on a VM - change them
# when running on a cluster. They set up a single local Spark application with a
# 16g memory cap.

# Scratch directory Spark uses for shuffle/spill files. Override it with the
# SPARK_TEMP_DIR environment variable instead of editing the notebook; the
# default keeps the path this notebook was originally written with.
SPARK_TEMP_DIR = os.environ.get("SPARK_TEMP_DIR", "/home/manuel-albino/spark-temp")

# NOTE: the former 'spark.cleaner.ttl': "true" entry was dropped. That option
# expected a duration in seconds (not a boolean) and was removed in Spark 2.0,
# so it had no useful effect here.
spark = start_or_get_spark(
    "ALS PySpark",
    memory="16g",
    config={"spark.local.dir": SPARK_TEMP_DIR},
)

# Later cells join DataFrames derived from the same source (predictions built
# from `train` joined back to `train`); do not fail on ambiguous self-joins.
spark.conf.set("spark.sql.analyzer.failAmbiguousSelfJoin", "false")

# Start from a clean cache so a re-run does not reuse stale cached DataFrames.
spark.catalog.clearCache()

In [None]:
# Fraction of rows kept for this run and the seed that makes the sample repeatable.
SAMPLE_FRACTION = 0.01
SAMPLE_SEED = 0

# Read the tab-separated listening history (header row, inferred column types)
# into a PySpark DataFrame.
song_ratings_raw = (
    spark.read
    .option("header", "true")
    .option("delimiter", "\t")
    .option("inferSchema", "true")
    .csv("../remappings/data/Modified_Listening_History.txt")
)

# key = source column, value = column name used by the rest of the notebook.
#
# NOTE: the previous version of this cell swapped user_id and track_id through
# temporary columns and then swapped them back through this mapping. The net
# result was a plain select of (user_id, track_id, playcount), which is what is
# done directly here.
# NOTE(review): if the swap was intended because the file's header labels are
# reversed, that never took effect - confirm against the data.
mapping = {
    "user_id": COL_USER,
    "track_id": COL_TRACK,
    "playcount": COL_COUNT,
}

song_ratings = song_ratings_raw.select(
    *[F.col(old).alias(new) for old, new in mapping.items()]
)

# Work on a small sample drawn without replacement.
sample = song_ratings.sample(
    withReplacement=False, fraction=SAMPLE_FRACTION, seed=SAMPLE_SEED
)

# Peek at the first rows; columns are (user, track, playcount).
sample.show(2, truncate=False)

In [None]:
# Random 75/25 split of the sampled interactions, with a fixed seed.
train, test = spark_random_split(sample, ratio=0.75, seed=123)

# Cache each split and materialise it with count(), in train-then-test order.
n_train = train.cache().count()
print("N train", n_train)
n_test = test.cache().count()
print("N test", n_test)

In [None]:
# Weight applied to the log-scaled playcount when deriving the confidence column.
alpha = 1

# confidence = 1 + alpha * ln(1 + playcount), computed on the training split only.
log_playcount = F.log(1 + F.col(COL_COUNT))
train_with_confidence = train.withColumn("confidence", 1 + alpha * log_playcount)

# Column wiring for ALS: the derived confidence column is used as the rating.
header = dict(
    userCol=COL_USER,
    itemCol=COL_TRACK,
    ratingCol="confidence",
)

# Implicit-feedback ALS. Rows whose prediction is NaN are dropped at transform
# time (coldStartStrategy='drop'); the seed is fixed.
# NOTE(review): ALS's own `alpha` parameter is not set here, so it stays at the
# library default - confirm this is intended alongside the `alpha` above.
als = ALS(
    implicitPrefs=True,
    rank=10,
    maxIter=15,
    regParam=0.05,
    nonnegative=False,
    coldStartStrategy='drop',
    seed=42,
    **header,
)

In [None]:
# Fit ALS on the confidence-weighted training data and time the fit.
with Timer() as train_time:
    model = als.fit(train_with_confidence)

training_seconds = train_time.interval
print(f"Took {training_seconds} seconds for training.")



In [None]:
# Score every (user, item) combination built from the training split, then keep
# only the pairs that do NOT occur in training, so recommendations never repeat
# an interaction the model was trained on. The whole step is timed.
with Timer() as test_time:
    # Distinct users and items seen in training.
    users = train.select(COL_USER).distinct()
    items = train.select(COL_TRACK).distinct()

    # Candidate set: all user-item pairs.
    user_item = users.crossJoin(items)

    # Predicted score for every candidate pair.
    dfs_pred = model.transform(user_item)

    # Exclude seen items with an anti-join on the (user, item) key. This
    # replaces a left-outer join followed by a null check on the playcount
    # column: it yields the same unseen pairs, but does not carry the training
    # columns through the join and cannot mistake a seen pair with a null
    # playcount for an unseen one.
    seen_pairs = train.select(COL_USER, COL_TRACK)
    dfs_pred_exclude_train = dfs_pred.join(
        seen_pairs,
        on=[COL_USER, COL_TRACK],
        how="left_anti",
    )

    top_all = dfs_pred_exclude_train.select(COL_USER, COL_TRACK, "prediction")

    # Trigger the computation with an action and cache the result, so the
    # timer covers the actual scoring and later cells reuse it.
    top_all.cache().count()

# Report the time taken for the prediction process.
print("Took {} seconds for prediction.".format(test_time.interval))



In [None]:
# Preview the unseen-item predictions (show()'s defaults, written out explicitly).
top_all.show(n=20, truncate=True)

In [None]:
# Ranking evaluation: held-out test interactions against the unseen-item
# predictions, cut off at TOP_K per user.
rank_eval = SparkRankingEvaluation(
    test,
    top_all,
    k=TOP_K,
    relevancy_method="top_k",
    col_user=COL_USER,
    col_item=COL_TRACK,
    col_rating=COL_COUNT,
    col_prediction="prediction",
)

In [None]:
# Build the ranking report line by line (metrics are computed in the listed
# order), then print it as a single newline-separated block.
ranking_report = [
    "Model:\tALS",
    "Top K:\t%d" % rank_eval.k,
    "MAP:\t%f" % rank_eval.map_at_k(),
    "NDCG:\t%f" % rank_eval.ndcg_at_k(),
    "Precision@K:\t%f" % rank_eval.precision_at_k(),
    "Recall@K:\t%f" % rank_eval.recall_at_k(),
]
print("\n".join(ranking_report))

In [None]:
# Score the held-out test pairs with the fitted model, cache the result and
# display the first rows.
prediction = model.transform(test)
prediction.cache()
prediction.show()

In [None]:
# Rating-style evaluation of the test predictions against the observed playcounts.
# NOTE(review): the model was fit with implicitPrefs=True on a derived
# "confidence" column, so its predictions are presumably not on the playcount
# scale - confirm these error metrics are meaningful before relying on them.
rating_eval = SparkRatingEvaluation(
    test,
    prediction,
    col_user=COL_USER,
    col_item=COL_TRACK,
    col_rating=COL_COUNT,
    col_prediction="prediction",
)

rating_report = [
    "Model:\tALS rating prediction",
    "RMSE:\t%f" % rating_eval.rmse(),
    "MAE:\t%f" % rating_eval.mae(),
    "Explained variance:\t%f" % rating_eval.exp_var(),
    "R squared:\t%f" % rating_eval.rsquared(),
]
print("\n".join(rating_report))

In [None]:
# Cleanup: stop the Spark session created earlier in this notebook.
# Run this last; cells that use `spark` need the session to be recreated afterwards.
spark.stop()