# INM432 Big Data Coursework 2016/207 Part 2: Spark Pipelines and Evaluation of Scaling of Algorithms

### Team Members: Ryan Nazareth and Aimore Dutra 

## a) Choice of Dataset and Task 


In [10]:
from pyspark.sql import SparkSession
from pyspark.sql import Row
# the imports are used creating the data frame

spark = SparkSession.builder.getOrCreate() # create a SparkSession 
# this gets us an RDD. (could also be done with RDD.textFile in this case)
lines = spark.read.text("hdfs://saltdean/data/movielens/sample_movielens_ratings.txt").rdd 
# now split the lines at the '::'
parts = lines.map(lambda row: row.value.split("::"))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), movieId=int(p[1]),
                                     rating=float(p[2]), timestamp=int(p[3])))
ratings = spark.createDataFrame(ratingsRDD)
ratings.createOrReplaceTempView('ratings') # register the DataFrame so that we can use it with Spark SQL.
(training, test) = ratings.randomSplit([0.8, 0.2]) # split into test and training set
print(training.describe()) # just for testing, should show the four columns
print(training.count()) # just fore testing, should be around 1188

DataFrame[summary: string, movieId: string, rating: string, timestamp: string, userId: string]
1210


## b) Machine Learning Pipeline 

Now create an ALS estimator and a parameter grid to explore different values for the `rank` and `regParam` parameter of the ALS. 

### Choice of processing steps:

### Learning algorithms: 

### Parameter settings:


In [11]:
SQL1 = 'SELECT AVG(rating) FROM ratings'
row = spark.sql(SQL1).collect()[0] # get the single row with the result

meanRating = row['avg(rating)'] # access Row as a map 
print('meanRating',meanRating)

se_rdd = test.rdd.map(lambda row: Row(se = pow(row['rating']-meanRating,2)) ) 
se_df = spark.createDataFrame(se_rdd) 
se_df.createOrReplaceTempView('se')
print('sedf',se_df)
SQL2 = 'SELECT AVG(se) FROM se'
row = spark.sql(SQL2).collect()[0]
meanSE = row['avg(se)'] # access Row as a map 
print('meanSE',meanSE)

meanRating 1.7741505662891406
sedf DataFrame[se: double]
meanSE 1.662126327662754


## c) Evaluating Performance of Pipeline using training and test sets





## d) Implementing a parameter grid 

Implementing a parameter grid (using pyspark.ml.tuning.ParamGridBuilder[source]), varying at least one feature preprocessing step, one machine learning parameter, and the training set size. 


In [13]:
from pyspark.ml.tuning import TrainValidationSplit 
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import ParamGridBuilder

# Build the recommendation model using ALS on the training data
als = ALS(maxIter=5, rank=5, regParam=0.1, userCol="userId", itemCol="movieId", ratingCol="rating")
regEval = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
paramGrid = ParamGridBuilder() \
    .addGrid(als.regParam, [0.03,0.1,0.3]) \
    .addGrid(als.rank, [5,10,50]).build()
    #    .addGrid(als.rank, [3,10,30,100,300]).build() 

tvs = TrainValidationSplit(estimator=als, estimatorParamMaps=paramGrid, evaluator=regEval)
print('starting training')
tvsModel = tvs.fit(training)
print('finished training')

starting cross-validation
finished cross-validation


In [18]:

print(tvsModel.bestModel.rank)
#paramMap = list(zip(tvsModel.getEstimatorParamMaps())
#paramMax = max(paramMap)
#print(paramMax)
# Evaluate the model by computing the RMSE on the test data
predictions = tvsModel.transform(test)
for row in predictions.take(5):
    print(row)
    
rmse = regEval.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))


50
Row(movieId=31, rating=3.0, timestamp=1424380312, userId=8, prediction=1.852697730064392)
Row(movieId=31, rating=2.0, timestamp=1424380312, userId=25, prediction=1.506085753440857)
Row(movieId=31, rating=1.0, timestamp=1424380312, userId=24, prediction=1.7374300956726074)
Row(movieId=31, rating=1.0, timestamp=1424380312, userId=0, prediction=1.0014280080795288)
Row(movieId=85, rating=5.0, timestamp=1424380312, userId=16, prediction=1.7288849353790283)
Root-mean-square error = 1.2269074565300253
