In [1]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, collect_list, when, desc, rank, mean
from pyspark.sql.window import Window
from pyspark.ml.recommendation import ALS
from pyspark.mllib.evaluation import RankingMetrics

In [2]:
def main(spark, userID, data_dir='/scratch/sjm643/sp24_bigd/rec'):
    """Fit ALS on train+validation data and print ranking metrics on the test split.

    Steps:
      1. Read ``train.parquet``, ``val.parquet`` and ``test.parquet`` from ``data_dir``.
      2. Fit an ALS model on the union of train and validation ratings using the
         hyperparameters selected during train/validation tuning.
      3. Score the test (user, movie) pairs and build, for each user, a list of
         at most 100 movie ids ordered by descending predicted rating.
      4. Use as ground truth, for each user, the test movies the user rated
         strictly above that user's own mean test rating.
      5. Print MAP, NDCG@5/100 and recall@5/100 computed by ``RankingMetrics``.

    Args:
        spark: Active ``SparkSession`` used for reading data and running jobs.
        userID: Not used by this function; kept so existing callers keep working.
        data_dir: Directory holding the three parquet splits. Defaults to the
            location the original notebook read from.

    Returns:
        None. Metrics are written to stdout.

    NOTE(review): ``model.transform(test)`` only scores (user, movie) pairs that
    are present in the test split, so each user's ranked list is a re-ranking of
    that user's held-out movies rather than a top-K over the whole catalogue.
    Confirm this is the intended evaluation protocol before comparing these
    numbers with full-catalogue recommendation metrics.
    """
    top_k = 100

    train = spark.read.parquet(f'{data_dir}/train.parquet')
    val = spark.read.parquet(f'{data_dir}/val.parquet')
    test = spark.read.parquet(f'{data_dir}/test.parquet')

    # Final model is trained on train + validation.
    # union() matches columns by position, so both splits must share a column order.
    train_full = train.union(val)

    ## from train-validation
    # best hyperparameters: reg=0.05, rank=150
    best_rank = 150
    best_reg = 0.05
    als = ALS(rank=best_rank, maxIter=10, regParam=best_reg,
              userCol='userId', itemCol='movieId', ratingCol='rating',
              coldStartStrategy='drop')

    model = als.fit(train_full)
    predictions = model.transform(test)

    # Cheap pre-filter: keep rows whose rank is within the top_k. rank() gives
    # tied scores the same rank, so this can keep slightly more than top_k rows
    # per user; the exact trim happens after the explicit sort below.
    window_spec = Window.partitionBy('userId').orderBy(desc('prediction'))
    top_candidates = (
        predictions
        .withColumn('rank', rank().over(window_spec))
        .filter(col('rank') <= top_k)
        .select('userId', 'movieId', 'prediction')
    )

    def ranked_top_k(scored_items):
        """Return movie ids ordered by descending score, trimmed to top_k."""
        # movieId is the secondary key so that ties are broken deterministically.
        ordered = sorted(scored_items, key=lambda pair: (-pair[0], pair[1]))
        return [movie_id for _, movie_id in ordered[:top_k]]

    # groupByKey() gives no ordering guarantee inside a group, and
    # RankingMetrics interprets list position as rank. The score therefore
    # travels through the shuffle and each user's list is sorted explicitly.
    top_100_per_user_rdd = (
        top_candidates.rdd
        .map(lambda row: (row['userId'], (row['prediction'], row['movieId'])))
        .groupByKey()
        .mapValues(ranked_top_k)
    )

    # Ground truth: test movies rated strictly above the user's mean test rating.
    mean_ratings_per_user = test.groupBy('userId').agg(mean("rating").alias("mean_rating"))
    movies_with_mean = test.join(mean_ratings_per_user, 'userId', 'inner')
    movies_above_mean = movies_with_mean.filter(col('rating') > col('mean_rating'))

    # Order is irrelevant for the label list, so a plain list is enough here.
    movies_above_mean_rdd = (
        movies_above_mean.select('userId', 'movieId').rdd
        .map(lambda row: (row['userId'], row['movieId']))
        .groupByKey()
        .mapValues(list)
    )

    # Inner join: only users with both predictions and at least one
    # above-mean test movie are evaluated. The pairs stay distributed (no
    # collect/parallelize round trip through the driver) and are cached because
    # five metrics are computed from the same RDD.
    preds_and_labels = top_100_per_user_rdd.join(movies_above_mean_rdd).values().cache()

    metrics = RankingMetrics(preds_and_labels)

    MAP = metrics.meanAveragePrecision
    ndcgAt5 = metrics.ndcgAt(5)
    ndcgAt100 = metrics.ndcgAt(100)
    recallAt5 = metrics.recallAt(5)
    recallAt100 = metrics.recallAt(100)

    print("MAP Score on full test dataset (users with > 10 movies watched): ", MAP)
    print("NDCG at 5 on full test dataset (users with > 10 movies watched): ", ndcgAt5)
    print("NDCG at 100 on full test dataset (users with > 10 movies watched): ", ndcgAt100)
    print("Recall at 5 on full test dataset (users with > 10 movies watched): ", recallAt5)
    print("Recall at 100 on full test dataset (users with > 10 movies watched): ", recallAt100)

    # All metrics are materialised; release the cached pairs.
    preds_and_labels.unpersist()

In [3]:
if __name__ == "__main__":

    # Build (or reuse) the Spark session: 16g for driver and executors, Kryo
    # serialization, a long broadcast timeout, and the G1 collector on both
    # driver and executor JVMs.
    spark = (
        SparkSession.builder
        .appName("Spark Application")
        .config("spark.executor.memory", "16g")
        .config("spark.driver.memory", "16g")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .config("spark.sql.broadcastTimeout", "7200")
        .config("spark.driver.extraJavaOptions", "-XX:+UseG1GC")
        .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC")
        .getOrCreate()
    )

    # Login name of the user running the job; raises KeyError if USER is unset.
    userID = os.environ['USER']

    main(spark, userID)

24/05/12 01:08:06 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/12 01:08:24 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
24/05/12 01:08:24 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
24/05/12 01:08:27 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeSystemLAPACK
24/05/12 01:08:27 WARN LAPACK: Failed to load implementation from: com.github.fommil.netlib.NativeRefLAPACK
                                                                                

MAP Score on full test dataset (users with > 10 movies watched):  0.7507122400143744
NDCG at 5 on full test dataset (users with > 10 movies watched):  0.8289041684546175
NDCG at 100 on full test dataset (users with > 10 movies watched):  0.8908813224744427
Recall at 5 on full test dataset (users with > 10 movies watched):  0.4063115459577412
Recall at 100 on full test dataset (users with > 10 movies watched):  0.9561633671706878
