In [None]:
# Set up the environment for using pyspark.
# findspark.init() is called before pyspark is imported in the next cell;
# presumably it locates the local Spark installation and makes it importable.
import findspark
findspark.init()

In [None]:
import pyspark

In [None]:
import time
import warnings

import numpy as np
import pandas as pd
from pyspark import SparkConf, SparkContext
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
from pyspark.sql import Row
from pyspark.sql import SparkSession

warnings.filterwarnings('ignore')

In [None]:
# Start (or reuse) the Spark session for this notebook and keep a handle on
# its underlying SparkContext.
spark = (
    SparkSession.builder
    .appName("Movie Recommendations")
    .getOrCreate()
)
sc = spark.sparkContext

### Get the movies and rating csv files

In [None]:
# Load the movie catalogue; the first row is the header and column types are inferred.
movies_sdf = (
    spark.read
    .format('csv')
    .options(header='true', inferSchema='true')
    .load('../datasets/ml-latest-small/movies.csv')
)

In [None]:
# Load the user ratings; the first row is the header and column types are inferred.
rating_sdf = (
    spark.read
    .format('csv')
    .options(header='true', inferSchema='true')
    .load('../datasets/ml-latest-small/ratings.csv')
)

### Convert to pandas dataframes

In [None]:
# Collect the ratings Spark DataFrame into a local pandas DataFrame for inspection.
rating_pd = rating_sdf.toPandas()

In [None]:
# Preview the first five ratings.
rating_pd.head(n=5)

In [None]:
# Collect the movies Spark DataFrame into a local pandas DataFrame; the
# title-lookup helpers defined later in the notebook read from this frame.
movies_pd = movies_sdf.toPandas()

In [None]:
# Preview the first five movies.
movies_pd.head(n=5)

### Check for null values

In [None]:
# One boolean per column: True if that column of the ratings contains any missing value.
rating_pd.isna().any()

In [None]:
# One boolean per column: True if that column of the movies contains any missing value.
movies_pd.isna().any()

In [None]:
# Show the column names and inferred types of the ratings data.
rating_sdf.printSchema()

In [None]:
# Show the column names and inferred types of the movies data.
movies_sdf.printSchema()

In [None]:
# Hold out roughly 20% of the ratings for testing. The seed is fixed so the
# split, and therefore the reported RMSE, is the same from one run to the next.
(training, test) = rating_sdf.randomSplit([0.8, 0.2], seed=42)

In [None]:
# Report (rows, columns) of the training split; count() triggers a Spark job.
print("Shape of training dataset: ({}, {})".format(training.count(), len(training.columns)))

In [None]:
# Report (rows, columns) of the test split; count() triggers a Spark job.
print("Shape of test dataset: ({}, {})".format(test.count(), len(test.columns)))

### Build the recommendation model using ALS on the training data
### Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics

### Use ALS from the MLLib to make recommendations
#### Parameters
- numBlocks is the number of blocks used to parallelize computation (set to -1 to auto-configure in the RDD-based API; `numUserBlocks` / `numItemBlocks` in `pyspark.ml`).  
- rank is the number of latent factors in the model.  
- iterations is the number of iterations to run (`maxIter` in `pyspark.ml`).  
- lambda specifies the regularization parameter in ALS (`regParam` in `pyspark.ml`).  
- implicitPrefs specifies whether to use the explicit feedback ALS variant or one adapted for implicit feedback data.  
- alpha is a parameter applicable to the implicit feedback variant of ALS that governs the baseline confidence in preference observations.

In [None]:
# Base ALS estimator. The maxIter and regParam given here are only starting
# values: the tuning grid built later in the notebook overrides both for every
# candidate model. The seed is pinned so the random factor initialisation, and
# therefore the fitted model, is repeatable across runs.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",  # drop rows whose prediction would be NaN
    seed=42,
)

### Parameter grid for hyperparameter tuning

In [None]:
# Candidate hyper-parameters for model tuning:
# 3 ranks x 1 maxIter x 3 regParams = 9 combinations.
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank, [4, 8, 12])
    .addGrid(als.maxIter, [10])
    .addGrid(als.regParam, [.17, .18, .19])
    .build()
)

### Create Regression Evaluator

In [None]:
# Score predictions against the true ratings using root-mean-square error.
evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='rating',
    predictionCol='prediction',
)

### Hyperparameter tuning using TrainValidationSplit (a single train/validation split, not k-fold cross validation)

In [None]:
# Tune the ALS estimator on a single random train/validation split of the
# training data, scoring every parameter combination with the RMSE evaluator.
# The seed pins that split so the selected hyper-parameters are repeatable.
train_val = TrainValidationSplit(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=evaluator,
    seed=42,
)


### Model training
The following cell takes about 4.2 minutes to run.

In [None]:
# Fit training data to ALS: one model is trained per parameter combination in
# the grid, so this is the slow step of the notebook.

model = train_val.fit(training)


In [None]:
# Get the best model found by the train/validation split over the parameter grid
bestModel = model.bestModel

### Make Predictions

In [None]:
# Make predictions on the held-out test set and evaluate them with RMSE
predictions = bestModel.transform(test)
rmse = evaluator.evaluate(predictions)

In [None]:
# print() on a Spark DataFrame only shows its column names and types;
# show() displays actual rows.
predictions.show(5)

In [None]:
# maxIter of the winning configuration, read from the parent estimator through
# the private Py4J handle.
# NOTE(review): _java_obj is a private API and may break across Spark versions.
print(bestModel._java_obj.parent().getMaxIter())

In [None]:
# regParam of the winning configuration, read from the parent estimator through
# the private Py4J handle.
# NOTE(review): _java_obj is a private API and may break across Spark versions.
print(bestModel._java_obj.parent().getRegParam())

In [None]:
# Rank (number of latent factors) of the winning model.
print(bestModel.rank)

In [None]:
# RMSE of the best model on the held-out test set (computed in the prediction cell above).
print("RMSE = " + str(rmse))

In [None]:
# Pull the scored test set into pandas, ordered by user and then by actual rating.
predictions.orderBy("userId", "rating").toPandas()

### Recommend the top 10 movies for every user

In [None]:
# For every user known to the model, generate the 10 highest-scoring movies.
user_recs = bestModel.recommendForAllUsers(numItems=10)

In [None]:
# Collect the per-user recommendations into pandas so they can be looked up by userId locally.
user_recs_pd = user_recs.toPandas()

In [None]:
def getMovieName(movieId, movies=None):
    """Return the title of the movie with the given id.

    Parameters
    ----------
    movieId : int
        Id to look up in the ``movieId`` column.
    movies : pandas.DataFrame, optional
        Lookup table with ``movieId`` and ``title`` columns. Defaults to the
        notebook-level ``movies_pd`` frame.

    Returns
    -------
    str
        The plain title, e.g. ``"Toy Story (1995)"`` (not the ``str()`` of a
        numpy array such as ``"['Toy Story (1995)']"``). An empty string is
        returned when no row matches, so callers never get an exception.
    """
    if movies is None:
        movies = movies_pd
    titles = movies.loc[movies['movieId'] == movieId, 'title']
    if titles.empty:
        return ""
    # Movie ids are expected to be unique; take the first match if they are not.
    return str(titles.iloc[0])

In [None]:
def getMovieNames(movieIds):
    """Map each id in ``movieIds`` to the result of ``getMovieName``.

    Returns a new list with one entry per input id, in the same order.
    """
    return [getMovieName(movie_id) for movie_id in movieIds]

In [None]:
def getRecords(recs):
    """Build a pandas table of recommended movies from the first row of ``recs``.

    ``recs`` must support ``select`` on ``recommendations.movieId`` and
    ``recommendations.rating``; only the first row of the result is used.

    Returns a pandas DataFrame with the columns ``movieId``, ``ratings`` and
    ``Movie Name``.
    """
    flattened = recs.select("recommendations.movieId", "recommendations.rating")

    # Each select/toPandas yields one cell per row; keep the first row's values.
    first_movie_ids = flattened.select("movieId").toPandas().iloc[0, 0]
    first_ratings = flattened.select("rating").toPandas().iloc[0, 0]

    table = pd.DataFrame(first_movie_ids, columns=["movieId"])
    table["ratings"] = first_ratings
    table['Movie Name'] = getMovieNames(first_movie_ids)
    return table

In [None]:
# Preview a handful of users rather than requesting the whole frame.
user_recs_pd.head(10)

In [None]:
import numpy as np
def getUserRecommendation(userid):
    """Return the recommendations for one user as a pandas DataFrame.

    Parameters
    ----------
    userid : int
        Value to match against the ``userId`` column of ``user_recs_pd``.

    Returns
    -------
    pandas.DataFrame
        One row per recommended movie, with the columns ``movieId``, ``name``
        (from ``getMovieName``) and ``rating`` (rounded to 4 decimals).

    Raises
    ------
    IndexError
        If ``user_recs_pd`` has no row for ``userid``.
    """
    matches = user_recs_pd.loc[user_recs_pd['userId'] == userid, 'recommendations']
    if matches.empty:
        # Same exception type the bare ``.iloc[0]`` would raise, but with a
        # message that says what actually went wrong.
        raise IndexError("No recommendations found for user id {}".format(userid))

    records = [
        {
            "movieId": rec.movieId,
            "name": getMovieName(rec.movieId),
            "rating": np.round(rec.rating, 4),
        }
        for rec in matches.iloc[0]
    ]
    # Passing columns explicitly keeps the layout stable even for an empty list.
    return pd.DataFrame(records, columns=["movieId", "name", "rating"])


In [None]:
# Recommendations for user 463; displayed in the next cell.
urec = getUserRecommendation(463)

In [None]:
# Allow up to 80 characters per column so long movie titles are less likely to be truncated.
pd.set_option('display.max_colwidth', 80)
print("Recommendation for user id 463")
urec

In [None]:
# Recommendations for user 600; note this rebinds `urec`, replacing the previous user's table.
urec = getUserRecommendation(600)

In [None]:
# Display the table computed in the previous cell.
print("Recommendation for user id 600")
urec