In [1]:
# Google Colab setup: mount Google Drive inside the Colab runtime.
# (Originally labelled as the Colab configuration step for Spark / PySpark.)
from google.colab import drive

GDRIVE_MOUNT_POINT = '/content/gdrive'  # directory where the drive is attached
drive.mount(GDRIVE_MOUNT_POINT)

Mounted at /content/gdrive


In [2]:
# verificar que tengan instalado la librería 'pyspark', no requerido en AWS EMR/Spark
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/27/67/5158f846202d7f012d1c9ca21c3549a58fd3c6707ae8ee823adcaca6473c/pyspark-3.0.2.tar.gz (204.8MB)
[K     |████████████████████████████████| 204.8MB 72kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 22.4MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.0.2-py2.py3-none-any.whl size=205186687 sha256=ca7a128a509a82b3258a3c8dc9fbedae234710abc6c2d22a7ba245db69731408
  Stored in directory: /root/.cache/pip/wheels/8b/09/da/c1f2859bcc86375dc972c5b6af4881b3603269bcc4c9be5d16
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.0.2


In [4]:
# Create the Spark session (an already-running one is reused by getOrCreate);
# not required on AWS EMR/Spark.
from pyspark.sql import SparkSession

session_builder = SparkSession.builder.appName('nlp')
spark = session_builder.getOrCreate()

In [5]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

In [16]:
# Load the ratings file as raw text lines and parse it by hand into a DataFrame
# with columns userId (int), movieId (int) and rating (float).
RATINGS_PATH = './all_4_train.csv'
SPLIT_SEED = 42  # fixed seed so the train/test split is the same on every run

lines = spark.read.text(RATINGS_PATH).rdd
# The first line of the file is treated as the column header.
header = lines.first()
# Drop every line that is equal to the header, not only the first occurrence.
lines = lines.filter(lambda line: line != header)
# Each text row carries its content in `.value`; split it on commas.
parts = lines.map(lambda row: row.value.split(","))
# Only the first three fields are used; any further fields are ignored.
ratingsRDD = parts.map(
    lambda p: Row(userId=int(p[0]), movieId=int(p[1]), rating=float(p[2]))
)
ratings = spark.createDataFrame(ratingsRDD)
# 80% / 20% random split; seeded so downstream metrics are reproducible.
(training, test) = ratings.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

In [17]:
# Build the recommendation model using ALS on the training data.
# The cold start strategy is set to 'drop' to ensure we don't get NaN
# evaluation metrics.
# A fixed seed is passed so that repeated fits on the same data give the same
# model.
ALS_SEED = 42
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=ALS_SEED,
)
model = als.fit(training)

In [18]:
# Score the held-out test set with the fitted model and report the RMSE between
# the "rating" column (label) and the "prediction" column.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)
predictions = model.transform(test)
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.9771777116504603


In [19]:
# Number of recommendations requested per user / per movie.
TOP_K = 10

# Top-K movie recommendations for every user.
userRecs = model.recommendForAllUsers(TOP_K)
# Top-K user recommendations for every movie.
movieRecs = model.recommendForAllItems(TOP_K)

In [20]:
# Column names the ALS estimator was configured with.
user_col = als.getUserCol()
item_col = als.getItemCol()

# Take three distinct users and three distinct movies from the full ratings set.
users = ratings.select(user_col).distinct().limit(3)
movies = ratings.select(item_col).distinct().limit(3)

# Top 10 movie recommendations for the selected users.
userSubsetRecs = model.recommendForUserSubset(users, 10)
# Top 10 user recommendations for the selected movies.
movieSubSetRecs = model.recommendForItemSubset(movies, 10)

# Print a preview of each recommendation table, in the same order as before:
# all users, all movies, user subset, movie subset.
for recs in (userRecs, movieRecs, userSubsetRecs, movieSubSetRecs):
    recs.show()

+------+--------------------+
|userId|     recommendations|
+------+--------------------+
|   471|[[2363, 6.448765]...|
|  1088|[[2367, 9.536041]...|
|  2122|[[2363, 6.632059]...|
|  2659|[[2481, 8.777299]...|
|  4101|[[3171, 4.6980233...|
|  6336|[[3171, 7.516101]...|
|  7554|[[3277, 7.44603],...|
|  8638|[[2367, 3.661239]...|
| 10817|[[3171, 5.3699374...|
| 14450|[[1378, 6.373323]...|
| 14832|[[2363, 5.289339]...|
| 15790|[[1166, 6.455227]...|
| 15846|[[2805, 6.48489],...|
| 16386|[[1349, 4.981197]...|
| 20735|[[3237, 5.096431]...|
| 22346|[[614, 5.2376804]...|
| 23271|[[2191, 6.4202285...|
| 25591|[[2887, 6.492427]...|
| 26706|[[828, 6.99544], ...|
| 27974|[[2745, 6.1191998...|
+------+--------------------+
only showing top 20 rows

+-------+--------------------+
|movieId|     recommendations|
+-------+--------------------+
|   1580|[[649160, 9.19072...|
|    471|[[2489911, 7.5341...|
|   1591|[[782218, 14.5245...|
|   1342|[[2590897, 8.4173...|
|   2122|[[2009688, 7.2322...|
|   21