In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('recom').getOrCreate()
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
data = spark.read.csv('dbfs:/FileStore/shared_uploads/gkantirisrafael@gmail.com/movielens_ratings.csv',header=True,inferSchema=True)

In [0]:
data.show(3)

+-------+------+------+
|movieId|rating|userId|
+-------+------+------+
|      2|   3.0|     0|
|      3|   1.0|     0|
|      5|   2.0|     0|
+-------+------+------+
only showing top 3 rows



In [0]:
data.describe().show()

+-------+------------------+------------------+------------------+
|summary|           movieId|            rating|            userId|
+-------+------------------+------------------+------------------+
|  count|              1501|              1501|              1501|
|   mean| 49.40572951365756|1.7741505662891406|14.383744170552964|
| stddev|28.937034065088994| 1.187276166124803| 8.591040424293272|
|    min|                 0|               1.0|                 0|
|    max|                99|               5.0|                29|
+-------+------------------+------------------+------------------+



In [0]:
#We want to build a recommendation system for our 3 users
#We will split the data into train and test sets
training,test = data.randomSplit([0.8,0.2])

In [0]:
#Now we will create our ALS model
als = ALS(maxIter=5,regParam=0.01,userCol='userId',itemCol='movieId',ratingCol='rating')

In [0]:
#Now we will create a model
model = als.fit(training)

In [0]:
#Νow we will see how our model performed
predictions = model.transform(test)

In [0]:
predictions.show(4)

+-------+------+------+----------+
|movieId|rating|userId|prediction|
+-------+------+------+----------+
|      0|   1.0|    19| 0.6702602|
|      1|   1.0|     5| 1.0954499|
|      1|   1.0|    28| 6.3937554|
|      2|   1.0|    23|   1.67413|
+-------+------+------+----------+
only showing top 4 rows



In [0]:
#We will see how we can more formally evaluate this model
evaluator = RegressionEvaluator(metricName='rmse',labelCol='rating',predictionCol='prediction')

In [0]:
#rmse = root mean squared error
rmse = evaluator.evaluate(predictions)

In [0]:
print('RMSE:',rmse)

RMSE: 1.8446081980141715


In [0]:
single_user = test.filter(test['userId']==11).select('movieId','userId')

In [0]:
single_user.show()

+-------+------+
|movieId|userId|
+-------+------+
|     23|    11|
|     30|    11|
|     32|    11|
|     40|    11|
|     47|    11|
|     66|    11|
|     75|    11|
|     76|    11|
|     77|    11|
|     86|    11|
|     90|    11|
|     99|    11|
+-------+------+



In [0]:
recommandations = model.transform(single_user)

In [0]:
recommandations.orderBy('prediction',ascending=False).show()

+-------+------+------------+
|movieId|userId|  prediction|
+-------+------+------------+
|     47|    11|    3.636132|
|     23|    11|   3.0507095|
|     32|    11|   2.3810744|
|     30|    11|   2.0381036|
|     99|    11|   1.3677499|
|     75|    11|-0.052416984|
|     77|    11|-0.090883374|
|     90|    11| -0.17210558|
|     40|    11| -0.17815378|
|     86|    11| -0.42581415|
|     76|    11|  -2.3493063|
|     66|    11|  -2.9051952|
+-------+------+------------+

