### Recommender System for MovieLens

In [1]:
# Bootstrap the Spark environment before any pyspark import.
# NOTE(review): findspark.init() is expected to locate the local Spark install
# and make `pyspark` importable — confirm SPARK_HOME is set on this machine.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Create the Spark session for this notebook, or reuse one that is already
# running under the same application name.
spark = (
    SparkSession.builder
    .appName('rec')
    .getOrCreate()
)

In [4]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS

In [5]:
# Load the ratings file; the first line is a header and column types are
# inferred from the values rather than read as strings.
data = spark.read.csv(
    'movielens_ratings.csv',
    header=True,
    inferSchema=True,
)

In [6]:
# Peek at the first row to confirm the column names (movieId, rating, userId).
data.head()

Row(movieId=2, rating=3.0, userId=0)

In [7]:
# Summary statistics (count, mean, stddev, min, max) for every column.
data.describe().show()

+-------+------------------+------------------+------------------+
|summary|           movieId|            rating|            userId|
+-------+------------------+------------------+------------------+
|  count|              1501|              1501|              1501|
|   mean| 49.40572951365756|1.7741505662891406|14.383744170552964|
| stddev|28.937034065088994| 1.187276166124803| 8.591040424293272|
|    min|                 0|               1.0|                 0|
|    max|                99|               5.0|                29|
+-------+------------------+------------------+------------------+



In [8]:
# Hold out roughly 20% of the ratings for evaluation and train on the rest.
# The split is seeded so the same rows land in each partition on every
# Restart & Run All; without a seed the RMSE and the per-user predictions
# below change from run to run.
(training, test) = data.randomSplit([0.8, 0.2], seed=42)

In [9]:
# Fit an ALS (alternating least squares) matrix-factorisation model on the
# training ratings.
#
# coldStartStrategy="drop": a user or movie that appears only in the test
#   split has no learned factors, so ALS would predict NaN for it and the
#   RMSE computed later would itself become NaN. Dropping those rows keeps
#   the metric well-defined.
# seed: fixes the random factor initialisation so the fit is reproducible.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=42,
)
model = als.fit(training)

In [10]:
# Score every (userId, movieId) pair in the held-out split; the result keeps
# the original columns and adds a `prediction` column.
predictions = model.transform(test)

In [11]:
# Eyeball predicted vs. actual ratings (show() prints the first 20 rows).
predictions.show()

+-------+------+------+-----------+
|movieId|rating|userId| prediction|
+-------+------+------+-----------+
|     31|   1.0|    27|  0.8955359|
|     31|   4.0|    12|-0.29367238|
|     31|   2.0|    25|  0.8474124|
|     85|   1.0|    12|  1.4147537|
|     85|   1.0|    15| -1.1256746|
|     85|   1.0|    23|  1.2074348|
|     65|   2.0|     3|-0.46211895|
|     65|   1.0|    19|  1.4064909|
|     65|   2.0|    15|  3.4476335|
|     53|   1.0|     6|  5.4405265|
|     53|   2.0|    19|  1.7516881|
|     53|   1.0|     9|-0.51127696|
|     53|   1.0|    25| -1.6916114|
|     78|   1.0|     1|    0.97278|
|     34|   1.0|    28|  1.9464095|
|     34|   1.0|    19|  1.1601496|
|     81|   3.0|    26|  1.9598632|
|     81|   1.0|    16| 0.67300534|
|     81|   1.0|    15|  1.9634713|
|     76|   1.0|    11|  0.9585729|
+-------+------+------+-----------+
only showing top 20 rows



In [12]:
# Measure how far the predicted ratings are from the true ones on the
# held-out split, using root-mean-square error (lower is better).
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.570373838596153


##### Supply recommendations to a single user

In [13]:
# Take the held-out (movieId, userId) pairs for user 11; the rating column is
# left out so the model only sees what it would see at recommendation time.
single_user = (
    test
    .where(test['userId'] == 11)
    .select('movieId', 'userId')
)

In [14]:
# List the movies that will be scored for this user.
single_user.show()

+-------+------+
|movieId|userId|
+-------+------+
|     11|    11|
|     18|    11|
|     23|    11|
|     25|    11|
|     37|    11|
|     43|    11|
|     47|    11|
|     50|    11|
|     64|    11|
|     69|    11|
|     71|    11|
|     76|    11|
|     80|    11|
+-------+------+



In [15]:
# Predict a rating for each of the selected user's held-out movies.
# NOTE(review): the variable name is misspelled ("recommendations"); it is kept
# as-is because the following cell refers to it by this name.
reccomendations = model.transform(single_user)

In [16]:
# Rank the user's candidate movies from highest to lowest predicted rating.
(
    reccomendations
    .orderBy('prediction', ascending=False)
    .show()
)

+-------+------+------------+
|movieId|userId|  prediction|
+-------+------+------------+
|     47|    11|   4.1931615|
|     23|    11|   3.4420052|
|     11|    11|    3.290887|
|     69|    11|   2.5797758|
|     18|    11|    2.349695|
|     37|    11|   1.8759484|
|     80|    11|   1.6435853|
|     50|    11|   1.3653221|
|     76|    11|   0.9585729|
|     64|    11|-0.034999847|
|     43|    11| -0.64043903|
|     25|    11|   -1.153085|
|     71|    11|   -1.791611|
+-------+------+------------+



##### END