In [18]:
import os
from pyspark.sql.window import Window
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.evaluation import Evaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.sql.functions import col, expr, size, collect_list, explode, array_intersect, lit
from pyspark.sql import functions as F

In [19]:
def train_als_model_with_tuning(ratings):
    # Define ALS model
    als = ALS(
        userCol="userId",
        itemCol="movieId",
        ratingCol="rating",
        coldStartStrategy="drop",
        nonnegative=True
    )

    # Define parameter grid
    param_grid = ParamGridBuilder() \
        .addGrid(als.rank, [10, 20, 30]) \
        .addGrid(als.regParam, [0.01, 0.1, 0.2]) \
        .build()

    # Define evaluator
    evaluator = RegressionEvaluator(
        metricName="rmse",
        labelCol="rating",
        predictionCol="prediction"
    )

    # Define cross-validator
    cross_validator = CrossValidator(
        estimator=als,
        estimatorParamMaps=param_grid,
        evaluator=evaluator,
        numFolds=5,
        parallelism=2
    )

    # Train the model with cross-validation
    cv_model = cross_validator.fit(ratings)

    return cv_model.bestModel

In [20]:
def get_top_n_recommendations(model, n_recommendations=100):
    user_recs = model.recommendForAllUsers(n_recommendations)
    return user_recs

In [21]:
def compute_map(top_movies, ratings, n_recommendations=100):
    movie_id_expr = get_movie_id(top_movies, n_recommendations)
    user_actual_movies = ratings.groupBy("userId").agg(
        expr("collect_list(movieId) as actual_movies")
    )
    
    precision_per_user = user_actual_movies.select(
        expr(f"""size(array_intersect(actual_movies, {movie_id_expr})) as hits"""),
        size(col("actual_movies")).alias("total_relevant"),
        lit(n_recommendations).alias("total_recommendations")
    ).selectExpr(
        "hits / total_recommendations as precision_at_k"
    )
    
    mean_average_precision = precision_per_user.selectExpr(
        "avg(precision_at_k) as MAP"
    ).first()["MAP"]
    
    return mean_average_precision

In [22]:
def get_movie_id(top_movies, n_recommendations=100):
    top_movie_ids = top_movies.select(explode("recommendations.movieId").alias("movieId")).distinct().limit(n_recommendations).collect()
    return f"array({','.join([str(row['movieId']) for row in top_movie_ids])})"

In [23]:
def process_data(spark):
    base_path = f'./ml-latest'
    train_path = f'{base_path}/train_ratings.parquet'
    val_path = f'{base_path}/val_ratings.parquet'
    test_path = f'{base_path}/test_ratings.parquet'

    train_ratings = spark.read.parquet(train_path, header=True, inferSchema=True)
    val_ratings = spark.read.parquet(val_path, header=True, inferSchema=True)
    test_ratings = spark.read.parquet(test_path, header=True, inferSchema=True)

    als_model = train_als_model_with_tuning(train_ratings)
    top_recommendations = get_top_n_recommendations(als_model)

    train_map = compute_map(top_recommendations, train_ratings)
    print(f"Train MAP: {train_map}")
    val_map = compute_map(top_recommendations, val_ratings)
    print(f"Validation MAP: {val_map}")
    test_map = compute_map(top_recommendations, test_ratings)
    print(f"Test MAP: {test_map}")

    return top_recommendations

In [24]:
def main(spark):
    process_data(spark)

In [25]:
if __name__ == "__main__":
    
    spark = SparkSession.builder \
        .appName('als_recommender') \
        .config("spark.sql.shuffle.partitions", "800") \
        .config("spark.executor.memory", "16g") \
        .config("spark.driver.memory", "16g") \
        .config("spark.memory.fraction", "0.8") \
        .config("spark.memory.storageFraction", "0.2") \
        .getOrCreate()
#     userID = os.getenv('USER')
    main(spark)

                                                                                ]

Train MAP: 0.003450595031377944


                                                                                

Validation MAP: 0.0007368802062188479


                                                                                

Test MAP: 0.0007785544136996614
