# Recommender System

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('RS').getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
21/12/28 21:10:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
21/12/28 21:10:06 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
21/12/28 21:10:06 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [4]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [5]:
df_spark = spark.read.csv('data/movies_ratings.csv', header=True, inferSchema=True)
df_spark = df_spark.drop('timestamp')
df_spark.show()

                                                                                

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|     1|    110|   1.0|
|     1|    147|   4.5|
|     1|    858|   5.0|
|     1|   1221|   5.0|
|     1|   1246|   5.0|
|     1|   1968|   4.0|
|     1|   2762|   4.5|
|     1|   2918|   5.0|
|     1|   2959|   4.0|
|     1|   4226|   4.0|
|     1|   4878|   5.0|
|     1|   5577|   5.0|
|     1|  33794|   4.0|
|     1|  54503|   3.5|
|     1|  58559|   4.0|
|     1|  59315|   5.0|
|     1|  68358|   5.0|
|     1|  69844|   5.0|
|     1|  73017|   5.0|
|     1|  81834|   5.0|
+------+-------+------+
only showing top 20 rows



In [6]:
df_spark.describe().show()



+-------+-----------------+------------------+------------------+
|summary|           userId|           movieId|            rating|
+-------+-----------------+------------------+------------------+
|  count|         26024289|          26024289|          26024289|
|   mean| 135037.090248114|15849.109677040553|3.5280903543608817|
| stddev|78176.19722170963|31085.257531391508|1.0654427636662405|
|    min|                1|                 1|               0.5|
|    max|           270896|            176275|               5.0|
+-------+-----------------+------------------+------------------+



                                                                                

In [7]:
train, test = df_spark.randomSplit([0.75, 0.25])

In [8]:
als = ALS(maxIter=5,
          regParam=0.01,
          userCol='userId',
          itemCol='movieId',
          ratingCol='rating')

In [9]:
model = als.fit(train)

21/12/28 21:11:46 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
21/12/28 21:11:46 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
21/12/28 21:11:48 WARN InstanceBuilder$NativeLAPACK: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
                                                                                

In [10]:
predictions = model.transform(train)
predictions.show()



+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|     1|    110|   1.0| 2.1854148|
|     1|    147|   4.5| 5.0077424|
|     1|    858|   5.0| 3.9157996|
|     1|   1968|   4.0| 4.4681597|
|     1|   2762|   4.5|  3.933242|
|     1|   2918|   5.0| 4.8423142|
|     1|   2959|   4.0|  4.413575|
|     1|   4226|   4.0| 5.0325427|
|     1|   4878|   5.0| 4.4511256|
|     1|  58559|   4.0| 3.7983449|
|     1|  68358|   5.0|  3.412424|
|     1|  69844|   5.0|  4.133386|
|     1|  81834|   5.0| 4.2029743|
|     1|  91500|   2.5| 3.4208715|
|     1|  92439|   5.0| 3.8632247|
|     1|  96821|   5.0| 4.4663596|
|     1|  98809|   0.5| 2.2472618|
|     1| 112552|   5.0| 4.6257486|
| 52224|  54286|   4.0| 3.8350205|
| 52224|  59315|   3.5| 3.4924827|
+------+-------+------+----------+
only showing top 20 rows



                                                                                

In [11]:
evaluator = RegressionEvaluator(metricName='rmse', labelCol='rating', predictionCol='prediction')

In [12]:
rmse = evaluator.evaluate(predictions)

                                                                                

In [13]:
print(f'RMSE: {rmse}')

RMSE: 0.7307858469138336


In [18]:
single_user = test.filter(test['userId']==1).select(['movieId', 'userId'])

In [19]:
single_user.show()



+-------+------+
|movieId|userId|
+-------+------+
|   1221|     1|
|   1246|     1|
|   5577|     1|
|  33794|     1|
|  54503|     1|
|  59315|     1|
|  73017|     1|
|  91542|     1|
|  99114|     1|
+-------+------+



                                                                                

In [20]:
recommendations = model.transform(single_user)

In [23]:
recommendations.orderBy('prediction', ascending=False).show()



+-------+------+----------+
|movieId|userId|prediction|
+-------+------+----------+
|  73017|     1| 4.7726235|
|  91542|     1| 4.6581016|
|   5577|     1|  4.380063|
|   1246|     1|  4.282317|
|  33794|     1| 4.1999807|
|  99114|     1| 4.1401916|
|  59315|     1| 4.0991483|
|   1221|     1|  4.037142|
|  54503|     1| 3.7291517|
+-------+------+----------+



                                                                                