In [54]:
from pathlib import Path
import os
import itertools

DATA_DIR = Path("../../data/ml-latest-small")
#DATA_DIR = Path("data/ml-latest")
os.environ["DATA_DIR"] = str(DATA_DIR.absolute())
!echo $$DATA_DIR

/Users/georg/Desktop/CS5052-Spark/api/app/../../data/ml-latest-small


In [81]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

from data import spark, F
from model import df_movies, df_ratings, df_links, df_tags

In [104]:
def fit_model(df, reg_param):
    als = ALS(maxIter=5, regParam=1., userCol="userId", itemCol="movieId", ratingCol="rating",
            coldStartStrategy="drop")
    model = als.fit(df)
    return model

def evaluate_model(model, df):
    predictions = model.transform(test)
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    return rmse

def select_best_reg_param(df_training, df_test):
    best_rmse, best_reg_param = None, None
    for reg_param in (1., .1, .01):
        model = fit_model(df_training, reg_param)
        rmse = evaluate_model(model, df_test)
        if best_rmse is None or best_rmse > rmse:
            best_rmse, best_reg_param = rmse, reg_param
    return best_reg_param

def generate_all_recommendations(df_ratings):
    (df_training, df_test) = df_ratings.randomSplit([0.8, 0.2])
    reg_param = select_best_reg_param(df_training, df_test)
    model = fit_model(df_ratings, reg_param)
    return model.recommendForAllUsers(10)

In [105]:
df_recommendations = generate_all_recommendations(df_ratings)

In [128]:
df_recommendations.filter("userId=1").select(F.explode("recommendations.movieId").alias("movieId")).join(df_movies, how="inner", on="movieId").collect()

[Row(movieId=40491, title='Match Factory Girl, The (Tulitikkutehtaan tyttö)', genres=['Comedy', 'Drama'], year=1990),
 Row(movieId=3567, title='Bossa Nova', genres=['Comedy', 'Drama', 'Romance'], year=2000),
 Row(movieId=136850, title='Villain', genres=['Crime', 'Drama', 'Thriller'], year=1971),
 Row(movieId=25947, title='Unfaithfully Yours', genres=['Comedy'], year=1948),
 Row(movieId=5490, title='The Big Bus', genres=['Action', 'Comedy'], year=1976),
 Row(movieId=132333, title='Seve', genres=['Documentary', 'Drama'], year=2014),
 Row(movieId=67618, title='Strictly Sexual', genres=['Comedy', 'Drama', 'Romance'], year=2008),
 Row(movieId=141718, title='Deathgasm', genres=['Comedy', 'Horror'], year=2015),
 Row(movieId=162414, title='Mo', genres=['Drama'], year=None),
 Row(movieId=152711, title='Who Killed Chea Vichea?', genres=['Documentary'], year=2010)]