In [26]:
import os
from pyspark.sql import SparkSession
#from pyspark.sql.functions import monotonically_increasing_id

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import missingno as msno
import pandas_profiling

%matplotlib inline
# Customise plots
# One batched update of the global matplotlib defaults: Arial as the
# sans-serif face, size 12 for tick and axis labels, size 16 for axes titles.
mpl.rcParams.update({
    'font.sans-serif': "Arial",
    'font.family': "sans-serif",
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'axes.titlesize': 16,
    'axes.labelsize': 12,
})

# Initialise spark
# Local-mode session named 'Small_movie'; getOrCreate() hands back the
# already-active session when the cell is re-run instead of building a new one.
spark = (
    SparkSession.builder
    .appName('Small_movie')
    .master('local[*]')
    .getOrCreate()
)

# Load data
# All four CSV tables are read with identical options and registered as temp
# views for spark.sql, so the steps live in one helper instead of four copies.
def load_csv_as_view(name, data_dir='../data'):
    """Read ``<data_dir>/<name>.csv`` and register it as temp view ``name``.

    Parameters
    ----------
    name : str
        Base name of the CSV file (without extension); also used as the
        name of the temporary view.
    data_dir : str, optional
        Directory that holds the CSV files. Defaults to ``'../data'``.

    Returns
    -------
    pyspark.sql.DataFrame
        The table, read with the first row as header and column types
        inferred from the data.
    """
    table = spark.read.csv(data_dir + '/' + name + '.csv',
                           header=True,
                           inferSchema=True)
    # Registering the view makes the table addressable by name in spark.sql
    table.createOrReplaceTempView(name)
    return table


links = load_csv_as_view('links')
movies = load_csv_as_view('movies')
ratings = load_csv_as_view('ratings')
tags = load_csv_as_view('tags')



In [27]:
# Peek at the first movies record, printed one field per line
movies.show(n=1, vertical=True)

-RECORD 0-----------------------
 movieId | 1                    
 title   | Toy Story (1995)     
 genres  | Adventure|Animati... 
only showing top 1 row



In [28]:
# Peek at the first ratings record, printed one field per line
ratings.show(n=1, vertical=True)

-RECORD 0--------------
 userId    | 1         
 movieId   | 1         
 rating    | 4.0       
 timestamp | 964982703 
only showing top 1 row



In [32]:
# Build the (movieId, userId, rating) triples used for modelling.
# The query runs against the `movies` and `ratings` temp views (registered
# with createOrReplaceTempView), not the Python variables of the same name.
# The INNER JOIN keeps only ratings whose movieId also appears in movies.
query="""
SELECT mov.movieId, rat.userId, rat.rating 
FROM movies as mov
INNER JOIN ratings AS rat
ON mov.movieId=rat.movieId
"""
# df is reused by later cells (preview and train/test split)
df = spark.sql(query)
# Print the column names and types of the joined result
df.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- userId: integer (nullable = true)
 |-- rating: double (nullable = true)



In [30]:
# Peek at the first joined (movieId, userId, rating) record, one field per line
df.show(n=1, vertical=True)

-RECORD 0------
 movieId | 1   
 userId  | 1   
 rating  | 4.0 
only showing top 1 row



In [44]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

# One seed shared by the train/test split and the ALS estimator, so that a
# fresh kernel run reports the same RMSE.
SEED = 1234

# Hold out 20% of the ratings for evaluation
(training_data, test_data) = df.randomSplit([0.8, 0.2], seed=SEED)

# ALS Method
# rank: number of latent features
# maxIter: maximum number of ALS iterations
# regParam: regularisation parameter
# nonnegative: constrain the learned factors to be non-negative
# coldStartStrategy: 'drop' removes rows whose prediction is NaN (users or
#   movies that do not appear in the training split), so the RMSE computed
#   below is not NaN
# implicitPrefs: False -> ratings are treated as explicit feedback
# seed: pinned explicitly; without it the factor initialisation (and hence
#   the reported RMSE) depends on the library's default seed
als = ALS(
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
    rank=5,
    maxIter=10,
    regParam=0.05,
    nonnegative=True,
    coldStartStrategy='drop',
    implicitPrefs=False,
    seed=SEED,
)
als_model = als.fit(training_data)

# Evaluate the model by computing the RMSE on the test data
predictions = als_model.transform(test_data)
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))