# INM432 Big Data Coursework 2016/2017 Part 2: Spark Pipelines and Evaluation of Scaling of Algorithms

### Team Members: Ryan Nazareth and Aimore Dutra 

In [25]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from pyspark.ml.evaluation import RegressionEvaluator

# --- Load the MovieLens sample ratings into a DataFrame -------------------
spark = SparkSession.builder.getOrCreate()  # create (or reuse) a SparkSession
lines = spark.read.text("hdfs://saltdean/data/movielens/sample_movielens_ratings.txt").rdd
# Each line is split on "::" into userId, movieId, rating, timestamp.
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                     rating=float(p[2]), timestamp=int(p[3])))
ratings = spark.createDataFrame(ratingsRDD)
ratings.createOrReplaceTempView('ratings')  # register the DataFrame so that we can use it with Spark SQL.

# Split BEFORE anything refers to `test`. The baseline below reads `test`,
# so the split must already exist when this cell runs in a fresh kernel.
(training, test) = ratings.randomSplit([0.8, 0.2])

# --- Baseline: always predict the global mean rating ----------------------
SQL1 = 'SELECT AVG(rating) FROM ratings'
row = spark.sql(SQL1).collect()[0]  # get the single row with the result

meanRating = row['avg(rating)']  # access Row as a map
print('meanRating', meanRating)

# Squared error of the constant "mean rating" prediction on each test row.
se_rdd = test.rdd.map(lambda row: Row(se=pow(row['rating'] - meanRating, 2)))
se_df = spark.createDataFrame(se_rdd)
se_df.createOrReplaceTempView('se')
print('se_df', se_df)
SQL2 = 'SELECT AVG(se) FROM se'
row = spark.sql(SQL2).collect()[0]
# NOTE: meanSE is a mean *squared* error; take its square root before
# comparing it with the RMSE reported for the ALS model below.
meanSE = row['avg(se)']  # access Row as a map
print('meanSE', meanSE)

# --- ALS recommender with a hyper-parameter grid search -------------------
# The regParam given here is only a starting value; the grid below supplies
# the regParam (and rank) values that are actually searched.
als = ALS(maxIter=5, regParam=0.01, userCol="userId", itemCol="movieId", ratingCol="rating")

# We use a ParamGridBuilder to construct a grid of parameters to search over.
# TrainValidationSplit will try all combinations of values and determine best model using
# the evaluator.
paramGrid = ParamGridBuilder() \
    .addGrid(als.regParam, [0.03, 0.1, 0.3]) \
    .addGrid(als.rank, [5, 10, 50]).build()

# In this case the estimator is the ALS recommender.
# A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.
regEval = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
tvs = TrainValidationSplit(estimator=als,
                           estimatorParamMaps=paramGrid,
                           evaluator=regEval,
                           # 80% of the data will be used for training, 20% for validation.
                           trainRatio=0.8)

# Run TrainValidationSplit, and choose the best set of parameters.
model = tvs.fit(training)

# Evaluate the model by computing the RMSE on the held-out test data.
predictions = model.transform(test)

rmse = regEval.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

meanRating 1.7741505662891406
se_df DataFrame[se: double]
meanSE 1.2542604431985964
Root-mean-square error = 1.0005635545150617
