In [1]:
import pandas as pd
import numpy as np
import timeit
import time

# get the data from csv file to dataframe
DataTrain = pd.read_csv(
    'data/upload/DataTrain.csv', 
    low_memory=False)
DataTest = pd.read_csv(
    'data/upload/DataTest.csv', 
    low_memory=False)

DataTrain.drop("Unnamed: 0", 1, inplace = True)
DataTest.drop("Unnamed: 0", 1, inplace = True)


In [7]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('recommender').getOrCreate()
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [8]:
DataTrain = spark.read.csv('data/upload/DataTrain.csv', inferSchema=True, header=True)
DataTest = spark.read.csv('data/upload/DataTest.csv', inferSchema=True, header=True)

als = ALS(maxIter=5, regParam=0.01, userCol='userId', itemCol='itemId', ratingCol='rating')

model = als.fit(DataTrain)

predictions = model.transform(DataTest)

In [9]:
predictions.describe().show()

+-------+------------------+------------------+------------------+------------------+--------------------+----------+
|summary|          reviewId|            itemId|            userId|            rating|sentiment_score_norm|prediction|
+-------+------------------+------------------+------------------+------------------+--------------------+----------+
|  count|              1116|              1116|              1116|              1116|                1116|      1116|
|   mean| 2816.728494623656| 15.46236559139785|1857.6917562724013| 4.472222222222222|   3.860215053763441|       NaN|
| stddev|1577.9455919774737|10.199596376206436|1190.1411108120685|0.9442480525019428|  0.7072295143709689|       NaN|
|    min|                 9|                 1|                 1|                 1|                 1.5| -18.78609|
|    max|              5133|                40|              3815|                 5|                 5.0|       NaN|
+-------+------------------+------------------+---------

In [10]:
predictions = predictions.na.drop()
predictions.describe().show()

+-------+------------------+-----------------+-----------------+------------------+--------------------+------------------+
|summary|          reviewId|           itemId|           userId|            rating|sentiment_score_norm|        prediction|
+-------+------------------+-----------------+-----------------+------------------+--------------------+------------------+
|  count|               982|              982|              982|               982|                 982|               982|
|   mean|2509.7331975560082|13.14562118126273|2050.467413441955| 4.594704684317719|  3.8721995926680246| 2.799754872811909|
| stddev|1429.7212149203187|8.347826917328316|1134.196793964153|0.7215147620412754|  0.6517529478846156|3.1415917845968417|
|    min|                 9|                1|                1|                 1|                 1.5|         -18.78609|
|    max|              4999|               23|             3815|                 5|                 5.0|          8.841551|
+-------

In [11]:
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating')
# RMSE Root Mean Square Error
rmse = evaluator.evaluate(predictions)
rmse

3.61497011833511

In [13]:
als2 = ALS(maxIter=5, regParam=0.01, userCol='userId', itemCol='itemId', ratingCol='sentiment_score_norm')

model2 = als2.fit(DataTrain)

predictions2 = model2.transform(DataTest)

In [14]:
predictions2 = predictions2.na.drop()
predictions2.describe().show()

+-------+------------------+-----------------+-----------------+------------------+--------------------+------------------+
|summary|          reviewId|           itemId|           userId|            rating|sentiment_score_norm|        prediction|
+-------+------------------+-----------------+-----------------+------------------+--------------------+------------------+
|  count|               982|              982|              982|               982|                 982|               982|
|   mean|2509.7331975560082|13.14562118126273|2050.467413441955| 4.594704684317719|  3.8721995926680246| 2.363613892376969|
| stddev|1429.7212149203187|8.347826917328316|1134.196793964153|0.7215147620412754|  0.6517529478846156|2.7429094575079698|
|    min|                 9|                1|                1|                 1|                 1.5|        -13.279798|
|    max|              4999|               23|             3815|                 5|                 5.0|          8.283944|
+-------

In [15]:
evaluator2 = RegressionEvaluator(metricName='rmse', labelCol='sentiment_score_norm')
# RMSE Root Mean Square Error
rmse2 = evaluator2.evaluate(predictions2)
rmse2

3.124018111879493