# Tuning the matrix factorization (MF) model

In [3]:
# basic packages
import pandas as pd
import numpy as np

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.types import *
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder  

import sys
sys.path.append('../model')
from helper import *

In [4]:
# Read the two input CSVs into pandas: the ratings table and the genome-scores table.
ratings_csv = '../ml-latest/ratings.csv'
genome_scores_csv = '../ml-latest/genome-scores.csv'

df = pd.read_csv(ratings_csv)
df_tag = pd.read_csv(genome_scores_csv)

In [5]:
# Down-sample the ratings with the project helper `sample_df`.
# NOTE(review): presumably the thresholds are minimum rating counts per user / per item;
# confirm against the helper's implementation.
sampling_thresholds = {'user_thresh': 20, 'item_thresh': 500}
df_sample = sample_df(df, **sampling_thresholds)


number of users: 20000
number of items: 1000
number of ratings: 1179969


In [7]:
# Split the sampled ratings into train and test frames (time-based, judging by the
# helper's name) and print how many ratings landed in each.
train_df, test_df = train_test_split_by_time(df_sample)
for split_frame in (train_df, test_df):
    print(len(split_frame))

# Reshape each split into a wide matrix: one row per movieId, one column per userId,
# cells holding the rating. Then print both matrix shapes.
pivot_spec = dict(index='movieId', columns='userId', values='rating')
rating_train = train_df.pivot(**pivot_spec)
rating_test = test_df.pivot(**pivot_spec)
for rating_matrix in (rating_train, rating_test):
    print(rating_matrix.shape)

952005
227964
(1000, 20000)
(1000, 20000)


In [8]:
# Start (or reuse) a Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("PySpark ALS Model")
    .getOrCreate()
)

# Convert both pandas splits into Spark DataFrames, train first, then test.
train, test = (spark.createDataFrame(split_pdf) for split_pdf in (train_df, test_df))

In [9]:
# Base ALS (alternating least squares) matrix-factorization estimator.
# - userCol / itemCol / ratingCol name the columns of the ratings DataFrame to learn from.
# - coldStartStrategy="drop" removes rows whose prediction is NaN (users or items not
#   seen during fitting), so evaluation metrics are not poisoned by NaN.
# - seed is fixed explicitly: the library default is derived from a Python hash that can
#   differ between kernel processes, which would make the factor initialisation (and
#   therefore every metric below) change from run to run.
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    rank=5,
    maxIter=20,
    regParam=0.1,
    coldStartStrategy="drop",
    seed=42,
)

In [11]:
# Hyperparameter search for ALS: 3 (maxIter) x 2 (regParam) x 3 (rank) = 18 candidates.
maxIter_list = [5, 15, 25]
regParam_list = [0.1, 0.01]
rank_list = [5, 10, 15]

paramGrid = (
    ParamGridBuilder()
    .addGrid(als.maxIter, maxIter_list)
    .addGrid(als.regParam, regParam_list)
    .addGrid(als.rank, rank_list)
    .build()
)

# Each candidate is scored by RMSE between the observed and the predicted rating.
rmse_evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                     predictionCol="prediction")

# 3-fold cross-validation over the grid. The seed pins the random assignment of rows
# to folds, so repeated runs compare the candidates on the same splits.
crossval = CrossValidator(estimator=als,
                          estimatorParamMaps=paramGrid,
                          evaluator=rmse_evaluator,
                          numFolds=3,
                          seed=42)

# Run cross-validation on the training ratings and keep the best parameter setting.
cvModel = crossval.fit(train)

# Predict ratings for the held-out test set; cvModel applies the best ALS model found.
prediction = cvModel.transform(test)

In [12]:
# Line up every candidate parameter map with its fold-averaged validation metric.
paramMaps = cvModel.getEstimatorParamMaps()
results = np.array(cvModel.avgMetrics)

# The cross-validator was configured with metricName="rmse", an error measure, so the
# best candidate is the one with the smallest average value.
best_index = np.argmin(results)
bestparams = paramMaps[best_index]
print("Best Params:", bestparams)
# Label fixed: the reported number is RMSE, not MAE.
print("The minimum RMSE:", results[best_index])

Best Params: {Param(parent='ALS_1f9bb683974a', name='maxIter', doc='max number of iterations (>= 0).'): 25, Param(parent='ALS_1f9bb683974a', name='regParam', doc='regularization parameter (>= 0).'): 0.1, Param(parent='ALS_1f9bb683974a', name='rank', doc='rank of the factorization'): 10}
The miniest mae: 0.8173732374741887
