In [None]:
import os
import time
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# spark imports
import sys
import pyspark
from pyspark.ml.recommendation import ALS
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import StringType, FloatType, IntegerType, LongType

# data science imports
import math
import numpy as np
import pandas as pd

# recommenders imports

from recommenders.utils.timer import Timer
from recommenders.utils.notebook_utils import is_jupyter
from recommenders.datasets.spark_splitters import spark_random_split
from recommenders.evaluation.spark_evaluation import SparkRatingEvaluation, SparkRankingEvaluation, SparkDiversityEvaluation
from recommenders.utils.spark_utils import start_or_get_spark
from recommenders.utils.notebook_utils import store_metadata

In [None]:
# Record the interpreter and PySpark versions so a run can be tied to its environment.
print("System version: {}".format(sys.version))
print(f"Spark version: {pyspark.__version__}")

In [None]:
# Number of items to recommend per user; also passed as k to the ranking evaluator.
TOP_K = 10

# Column names used for the dataset throughout the notebook.
COL_USER = "user_id"      # user identifier column
COL_TRACK = "track_id"    # item (track) identifier column
COL_COUNT = "playcount"   # play count column; turned into a confidence value before training

In [None]:
# The following settings work well for debugging locally on a VM - change them when running on a cluster.
# Set up a giant single executor with many threads and specify the memory cap.

# Scratch directory handed to Spark as `spark.local.dir`. It can be overridden with the
# ALS_SPARK_LOCAL_DIR environment variable so the notebook is not tied to a single
# user's home directory; the default keeps the location used so far.
SPARK_LOCAL_DIR = os.environ.get("ALS_SPARK_LOCAL_DIR", "/home/manuel-albino/spark-temp")

# NOTE: the former 'spark.cleaner.ttl': "true" entry was dropped. That setting expected a
# duration (not a boolean) and was removed from Spark in 2.0, so it had no effect here.
spark = start_or_get_spark(
    "ALS PySpark",
    memory="16g",
    config={"spark.local.dir": SPARK_LOCAL_DIR},
)

# The seen-item filter in the prediction cell joins the predictions with `train` while
# referencing columns through both parent DataFrames; presumably this check is disabled
# so Spark does not reject that join as an ambiguous self-join.
spark.conf.set("spark.sql.analyzer.failAmbiguousSelfJoin", "false")

# Drop anything cached by a previous run that used the same session.
spark.catalog.clearCache()

In [None]:
# Fraction of rows kept for this run, and the seed that makes the sample repeatable.
SAMPLE_FRACTION = 0.002
SAMPLE_SEED = 0

# Read the tab-separated listening history (with a header row) into a PySpark DataFrame,
# letting Spark infer the column types.
song_ratings = (
    spark.read
    .option("header", "true")
    .option("delimiter", "\t")
    .option("inferSchema", "true")
    .csv("../remappings/data/Modified_Listening_History.txt")
)

# Keep only the columns the model needs, in the order (user, track, playcount).
# NOTE(review): this cell used to swap `user_id` and `track_id` through temporary columns
# and then swap them back through a crossed rename mapping; the two swaps cancelled out,
# so this direct select yields exactly the same DataFrame. If the file's headers are in
# fact mislabeled, a single swap is what is needed here - confirm against the data.
song_ratings = song_ratings.select(
    F.col("user_id").alias(COL_USER),
    F.col("track_id").alias(COL_TRACK),
    F.col("playcount").alias(COL_COUNT),
)

# Work on a small random sample drawn without replacement.
sample = song_ratings.sample(False, SAMPLE_FRACTION, SAMPLE_SEED)

# Show two rows of the interaction matrix (user, track, playcount).
sample.show(2, truncate=False)

## Data Splitting

In [None]:
# Randomly split the sampled interactions into train and test sets (ratio 0.75, fixed seed).
train, test = spark_random_split(sample, ratio=0.75, seed=123)

# cache() marks each split for reuse; count() is the action that reports its size.
for split_label, split_df in (("N train", train), ("N test", test)):
    print (split_label, split_df.cache().count())

## ALS model creation with Confidence column

In [None]:
# Weight applied to the log-scaled play count when deriving the confidence value.
alpha = 1

# confidence = 1 + alpha * log(1 + playcount)
log_playcount = F.log(1 + F.col(COL_COUNT))
train_with_confidence = train.withColumn("confidence", 1 + alpha * log_playcount)

train_with_confidence.show(10, truncate=False)

# Column roles handed to ALS: the model is trained on the derived confidence column,
# not on the raw play count.
header = dict(userCol=COL_USER, itemCol=COL_TRACK, ratingCol="confidence")

# Implicit-feedback ALS; rows that cannot be scored are dropped from predictions
# (coldStartStrategy='drop').
als = ALS(
    **header,
    rank=10,
    maxIter=15,
    regParam=0.05,
    implicitPrefs=True,
    nonnegative=False,
    coldStartStrategy='drop',
    seed=42,
)

## Training

In [None]:
# Fit the ALS model on the confidence-weighted training set and time the fit.
with Timer() as train_time:
    model = als.fit(train_with_confidence)

print(f"Took {train_time.interval} seconds for training.")



## Prediction

In [None]:
with Timer() as test_time:

    # Score every (user, item) combination built from the users and items seen in training.
    users = train.select(COL_USER).distinct()
    items = train.select(COL_TRACK).distinct()
    user_item = users.crossJoin(items)
    dfs_pred = model.transform(user_item)

    # Pair each prediction with its training row (if any) on user and track.
    same_user = dfs_pred[COL_USER] == train[COL_USER]
    same_track = dfs_pred[COL_TRACK] == train[COL_TRACK]
    dfs_pred_exclude_train = dfs_pred.alias("pred").join(
        train.alias("train"),
        on=same_user & same_track,
        how='outer',
    )

    # Remove seen items: keep only rows that have no play count on the training side.
    not_in_train = dfs_pred_exclude_train[f"train.{COL_COUNT}"].isNull()
    top_all = dfs_pred_exclude_train.where(not_in_train).select(
        f"pred.{COL_USER}", f"pred.{COL_TRACK}", "pred.prediction"
    )

    # Spark transformations are lazy, so run an action here to force the scoring
    # to execute inside the timed section.
    top_all.cache().count()

    # Top-k recommendations for each user: rank predictions per user, highest first,
    # keep the first TOP_K and discard the helper rank column.
    window = Window.partitionBy(COL_USER).orderBy(F.col("prediction").desc())
    ranked = top_all.withColumn("rank", F.row_number().over(window))
    top_k_reco = ranked.where(F.col("rank") <= TOP_K).drop("rank")

    print(top_k_reco.count())

print(f"Took {test_time.interval} seconds for prediction.")



In [None]:
# Preview the scored user-track pairs that remain after removing pairs seen in training.
top_all.show()

## Evaluation

Ranking and diversity helper functions.
These helpers work with the PySpark evaluators. To evaluate BPR, use the functions in `recommenders.evaluation.python_evaluation` instead.

In [None]:
def get_ranking_results(ranking_eval):
    """Collect the ranking metrics exposed by a ranking evaluator.

    Args:
        ranking_eval: Object providing ``precision_at_k()``, ``recall_at_k()``,
            ``ndcg_at_k()`` and ``map_at_k()``.

    Returns:
        dict: Metric label mapped to the value returned by the matching method.
    """
    results = {}
    results["Precision@k"] = ranking_eval.precision_at_k()
    results["Recall@k"] = ranking_eval.recall_at_k()
    results["NDCG@k"] = ranking_eval.ndcg_at_k()
    results["Mean average precision"] = ranking_eval.map_at_k()
    return results

def get_diversity_results(diversity_eval):
    """Collect the diversity metrics exposed by a diversity evaluator.

    Args:
        diversity_eval: Object providing ``catalog_coverage()``,
            ``distributional_coverage()``, ``novelty()``, ``diversity()`` and
            ``serendipity()``.

    Returns:
        dict: Metric name mapped to the value returned by the method of the same name.
    """
    # Each result key is identical to the evaluator method that produces it.
    metric_names = (
        "catalog_coverage",
        "distributional_coverage",
        "novelty",
        "diversity",
        "serendipity",
    )
    return {name: getattr(diversity_eval, name)() for name in metric_names}



Summary of the evaluation

In [None]:
def generate_summary(data, algo, k, ranking_metrics, diversity_metrics):
    """Assemble one results row from run identifiers and metric dictionaries.

    Args:
        data: Value stored under "Data" (the notebook passes the sample row count).
        algo: Algorithm label stored under "Algo".
        k: Number of recommendations per user, stored under "K".
        ranking_metrics: Dict of ranking metrics, or None when they are unavailable.
        diversity_metrics: Dict of diversity metrics, or None when they are unavailable.

    Returns:
        dict: "Data", "Algo" and "K" followed by the ranking and diversity metrics.
        Metrics passed as None are filled with NaN under the same keys that
        get_ranking_results / get_diversity_results produce, so every summary lines
        up with the columns of the results table.
    """
    # The fallback keys must match the real metric keys; the previous "nDCG@k" / "MAP"
    # placeholders did not, so NaN rows ended up under different names than real rows.
    ranking_keys = ("Precision@k", "Recall@k", "NDCG@k", "Mean average precision")
    diversity_keys = (
        "catalog_coverage",
        "distributional_coverage",
        "novelty",
        "diversity",
        "serendipity",
    )

    summary = {"Data": data, "Algo": algo, "K": k}

    if ranking_metrics is None:
        ranking_metrics = dict.fromkeys(ranking_keys, np.nan)
    # Tolerate missing diversity metrics too instead of failing inside dict.update(None).
    if diversity_metrics is None:
        diversity_metrics = dict.fromkeys(diversity_keys, np.nan)

    summary.update(ranking_metrics)
    summary.update(diversity_metrics)
    return summary

### ALS metrics 

In [None]:
# Ranking metrics: compare the held-out test interactions with the scored unseen pairs,
# taking each user's top TOP_K predictions (relevancy_method="top_k").
als_ranking_eval = SparkRankingEvaluation(
    test,
    top_all,
    k=TOP_K,
    relevancy_method="top_k",
    col_user=COL_USER,
    col_item=COL_TRACK,
    col_rating=COL_COUNT,
    col_prediction="prediction",
)
als_ranking_metrics = get_ranking_results(als_ranking_eval)

# Diversity metrics: computed from the training interactions and the per-user top-k list.
als_diversity_eval = SparkDiversityEvaluation(
    train_df=train,
    reco_df=top_k_reco,
    col_user=COL_USER,
    col_item=COL_TRACK,
)
als_diversity_metrics = get_diversity_results(als_diversity_eval)

# One summary row for ALS; "Data" records how many sampled rows the run used.
n_sample_rows = sample.count()
als_results = generate_summary(
    data=n_sample_rows,
    algo="als",
    k=TOP_K,
    ranking_metrics=als_ranking_metrics,
    diversity_metrics=als_diversity_metrics,
)

To add more models, create another cell that computes the other model's metrics and generates a summary for it.

## Create the results dataframe

In [None]:
# Column order of the results table: run identifiers, ranking metrics, diversity metrics.
cols = [
    "Data", "Algo", "K",
    "Precision@k", "Recall@k", "NDCG@k", "Mean average precision",
    "catalog_coverage", "distributional_coverage",
    "novelty", "diversity", "serendipity",
]
df_results = pd.DataFrame(columns=cols)

# Add one row per model summary here.
df_results.loc[1] = als_results

In [None]:
# Display the results table (one row per model summary added above).
df_results

In [None]:
# Clean up the Spark instance: stop the session created at the top of the notebook.
# df_results is a pandas DataFrame, so the results table does not depend on the session.
spark.stop()