<a href="https://colab.research.google.com/github/gteless/Aulas_FIAP/blob/main/recomendacoes_als.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
!pip install pyspark
!pip install findspark



In [17]:
from pyspark.sql import SparkSession
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [18]:
# Start (or reuse) a Spark session that runs locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [19]:
# Read the raw ratings file as text and expose it as an RDD of Rows
# (each Row carries one line of the file).
lines = (
    spark.read
    .text("/content/06- sample_movielens_ratings.txt")
    .rdd
)

In [20]:
# Break every text line into its fields using the "::" delimiter.
parts = lines.map(
    lambda line_row: line_row.value.split("::")
)

In [23]:
# Convert each list of string fields into a typed Row:
# field 0 -> userId, field 1 -> movieId, field 2 -> rating, field 3 -> timestamp.
ratingsRDD = parts.map(
    lambda fields: Row(
        userId=int(fields[0]),
        movieId=int(fields[1]),
        rating=float(fields[2]),
        timestamp=int(fields[3]),
    )
)

In [24]:
# Build a DataFrame from the RDD of Rows; the schema is inferred from the Row fields.
ratings = spark.createDataFrame(data=ratingsRDD)

In [25]:
# Preview the first 20 ratings (the defaults of show(), spelled out).
ratings.show(n=20, truncate=True)

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     0|      2|   3.0|1424380312|
|     0|      3|   1.0|1424380312|
|     0|      5|   2.0|1424380312|
|     0|      9|   4.0|1424380312|
|     0|     11|   1.0|1424380312|
|     0|     12|   2.0|1424380312|
|     0|     15|   1.0|1424380312|
|     0|     17|   1.0|1424380312|
|     0|     19|   1.0|1424380312|
|     0|     21|   1.0|1424380312|
|     0|     23|   1.0|1424380312|
|     0|     26|   3.0|1424380312|
|     0|     27|   1.0|1424380312|
|     0|     28|   1.0|1424380312|
|     0|     29|   1.0|1424380312|
|     0|     30|   1.0|1424380312|
|     0|     31|   1.0|1424380312|
|     0|     34|   1.0|1424380312|
|     0|     37|   1.0|1424380312|
|     0|     41|   2.0|1424380312|
+------+-------+------+----------+
only showing top 20 rows



In [26]:
# Hold out roughly 20% of the ratings for evaluation and train on the rest.
# A fixed seed keeps the split — and therefore every downstream metric and
# recommendation — stable across "Restart & Run All".
training, test = ratings.randomSplit([0.8, 0.2], seed=42)

In [27]:
# Configure the ALS (alternating least squares) recommender.
# - maxIter / regParam: number of iterations and regularization strength.
# - userCol / itemCol / ratingCol: the columns of the ratings DataFrame to use.
# - coldStartStrategy="drop": per the Spark ML docs, rows whose prediction is
#   NaN (users/items unseen during training) are dropped so that the
#   evaluation metric stays defined.
# - seed: fixes the random initialisation of the factors so that fitting is
#   reproducible between runs.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=42,
)

In [30]:
# Fit the ALS estimator on the training split to obtain the recommendation model.
model = als.fit(dataset=training)

In [31]:
# Score the held-out ratings and measure the error of the predictions.
predictions = model.transform(test)

# The evaluator compares the "rating" label with the "prediction" column
# using the root-mean-square error.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(predictions)

# The metric is RMSE (root of the mean squared error), not the mean squared
# error itself, so the label names it accordingly.
print(f"Raiz do erro quadrático médio (RMSE) = {rmse}")

Erro médio quadrado = 1.8113472093514018


In [32]:
# For every user, compute the 10 movies with the highest predicted rating.
userRec = model.recommendForAllUsers(numItems=10)

In [33]:
# Preview the per-user recommendations (first 20 rows, long cells truncated).
userRec.show(n=20, truncate=True)

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|    20|[{94, 4.101472}, ...|
|    10|[{62, 3.2924948},...|
|     0|[{92, 4.111323}, ...|
|     1|[{62, 3.9785001},...|
|    21|[{29, 4.9223266},...|
|    11|[{46, 5.629353}, ...|
|    12|[{1, 7.0405626}, ...|
|    22|[{46, 6.120947}, ...|
|     2|[{8, 5.117934}, {...|
|    13|[{74, 2.9587736},...|
|     3|[{83, 4.971333}, ...|
|    23|[{9, 5.637798}, {...|
|     4|[{74, 4.6671205},...|
|    24|[{90, 5.023566}, ...|
|    14|[{32, 5.6837153},...|
|     5|[{46, 6.060383}, ...|
|    15|[{46, 4.676827}, ...|
|    25|[{51, 6.7716484},...|
|    26|[{49, 5.457879}, ...|
|     6|[{51, 3.8709667},...|
+------+--------------------+
only showing top 20 rows



In [34]:
# For every movie, compute the 10 users with the highest predicted rating.
movieRecs = model.recommendForAllItems(numUsers=10)

In [35]:
# Preview the per-movie recommendations (first 20 rows, long cells truncated).
movieRecs.show(n=20, truncate=True)

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|     20|[{17, 4.8744907},...|
|     40|[{8, 2.1154873}, ...|
|     10|[{23, 3.9910712},...|
|     50|[{12, 3.9930975},...|
|     80|[{12, 4.4972887},...|
|     70|[{8, 4.4876175}, ...|
|     60|[{8, 2.864603}, {...|
|     90|[{26, 5.426406}, ...|
|     30|[{11, 5.0885506},...|
|      0|[{23, 3.569371}, ...|
|     31|[{11, 3.9737198},...|
|     81|[{28, 4.7277374},...|
|     91|[{23, 3.7359993},...|
|      1|[{12, 7.0405626},...|
|     41|[{14, 4.286777}, ...|
|     61|[{16, 3.319751}, ...|
|     51|[{25, 6.7716484},...|
|     21|[{26, 2.760354}, ...|
|     11|[{7, 1.8847281}, ...|
|     71|[{25, 3.9516907},...|
+-------+--------------------+
only showing top 20 rows



In [36]:
# Collect the distinct user ids, using the user column name configured on the
# ALS estimator rather than repeating the literal.
users = (
    ratings
    .select(als.getUserCol())
    .distinct()
)

In [37]:
# Preview the distinct user ids (first 20 rows).
users.show(n=20, truncate=True)

+------+
|userId|
+------+
|    26|
|    29|
|    19|
|     0|
|    22|
|     7|
|    25|
|     6|
|     9|
|    27|
|    17|
|    28|
|     5|
|     1|
|    10|
|     3|
|    12|
|     8|
|    11|
|     2|
+------+
only showing top 20 rows



In [38]:
# Keep only the recommended movie ids for each user: selecting the "movieId"
# field of the recommendations array yields an array of ids per row and drops
# the predicted ratings.
UserRecsOnlyItemId = userRec.select(
    userRec["userId"],
    userRec["recommendations"]["movieId"],
)

In [41]:
# Show the first 10 users with their full, untruncated lists of recommended movie ids.
UserRecsOnlyItemId.show(n=10, truncate=False)

+------+----------------------------------------+
|userId|recommendations.movieId                 |
+------+----------------------------------------+
|20    |[94, 77, 22, 74, 90, 75, 18, 46, 88, 68]|
|10    |[62, 29, 42, 52, 9, 49, 89, 72, 0, 92]  |
|0     |[92, 18, 49, 87, 9, 52, 91, 43, 86, 26] |
|1     |[62, 17, 18, 53, 55, 28, 13, 9, 52, 85] |
|21    |[29, 53, 43, 70, 2, 52, 74, 90, 95, 96] |
|11    |[46, 30, 23, 32, 27, 79, 96, 48, 90, 66]|
|12    |[1, 46, 64, 35, 17, 44, 80, 98, 33, 94] |
|22    |[46, 75, 49, 74, 30, 88, 77, 51, 94, 72]|
|2     |[8, 93, 83, 37, 39, 81, 38, 89, 29, 92] |
|13    |[74, 53, 29, 62, 83, 52, 18, 92, 41, 4] |
+------+----------------------------------------+
only showing top 10 rows

