In [1]:
import findspark
# Locate the local Spark installation and add pyspark to sys.path; this has
# to run before the first `pyspark` import in the cells below.
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Start a Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("Python Spark Recommendation System")
    .getOrCreate()
)

print(spark)

<pyspark.sql.session.SparkSession object at 0x000001BF95576CF8>


In [3]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

## Dataset

*Sumber [trustlet.org](http://www.trustlet.org/downloaded_epinions.html)*  
Dataset dibuat berdasarkan situs ulasan konsumen Epinions.com pada November hingga Desember 2003.  
Dataset terdiri atas:
 * 49290 pengguna, yang memberi rating pada
 * 139738 barang berbeda paling tidak sekali  
 
Setiap baris memiliki format :  
`user_id` `item_id` `rating`  
Contohnya :
`23` `387` `5`

Rentang nilai tiap kolom :
 * user_id [1,49290]
 * item_id [1,139738]
 * rating [1,5]

## Preprocess

### Load Data

In [4]:
# References:
#   https://spark.apache.org/docs/2.2.0/ml-collaborative-filtering.html
#   https://spark.apache.org/docs/2.2.0/api/python/pyspark.ml.html#pyspark.ml.recommendation.ALS

# Read the ratings file as plain text (one record per line) and expose it as an RDD.
ratings_text = spark.read.text("ratings_data.txt")
lines = ratings_text.rdd

# Peek at the first five raw records.
print(lines.take(5))

[Row(value='1 100 4'), Row(value='1 101 5'), Row(value='1 102 3'), Row(value='1 10 3'), Row(value='1 103 5')]


In [13]:
# Tokenise each raw line into its [user_id, item_id, rating] string fields.
# split() with no argument splits on any run of whitespace, so tabs, repeated
# spaces and leading/trailing blanks no longer produce empty or fused fields
# that would break the int()/float() conversion applied to these fields next.
parts = lines.map(lambda row: row.value.split())

In [14]:
def _to_rating_row(fields):
    """Build a typed rating Row from one record's [user_id, item_id, rating] string fields."""
    return Row(
        user_id=int(fields[0]),
        item_id=int(fields[1]),
        rating=float(fields[2]),
    )


ratingsRDD = parts.map(_to_rating_row)

In [15]:
# Sanity check: display the first five parsed rating rows.
sample_rows = ratingsRDD.take(5)
print(sample_rows)

[Row(item_id=100, rating=4.0, user_id=1), Row(item_id=101, rating=5.0, user_id=1), Row(item_id=102, rating=3.0, user_id=1), Row(item_id=10, rating=3.0, user_id=1), Row(item_id=103, rating=5.0, user_id=1)]


In [16]:
# Turn the RDD of Row objects into a DataFrame and preview its first 20 rows.
ratings = spark.createDataFrame(ratingsRDD)
ratings.show(n=20, truncate=True)

+-------+------+-------+
|item_id|rating|user_id|
+-------+------+-------+
|    100|   4.0|      1|
|    101|   5.0|      1|
|    102|   3.0|      1|
|     10|   3.0|      1|
|    103|   5.0|      1|
|    104|   2.0|      1|
|    105|   5.0|      1|
|    106|   5.0|      1|
|    107|   5.0|      1|
|    108|   5.0|      1|
|    109|   3.0|      1|
|    110|   4.0|      1|
|    111|   5.0|      1|
|    112|   4.0|      1|
|    113|   5.0|      1|
|     11|   4.0|      1|
|    114|   5.0|      1|
|    115|   5.0|      1|
|    116|   5.0|      1|
|    117|   5.0|      1|
+-------+------+-------+
only showing top 20 rows



## Create Model

In [18]:
# Hold out roughly 20% of the ratings for evaluation. randomSplit samples rows
# at random, so a fixed seed is passed to keep the train/test partition (and
# every metric computed from it) repeatable when the notebook is re-run on the
# same input data.
(training, test) = ratings.randomSplit([0.8, 0.2], seed=42)

In [19]:
# Build the recommendation model using ALS on the training data.
# coldStartStrategy="drop" discards rows whose prediction is NaN (users or items
# not seen during training) so the evaluation metric does not become NaN.
# seed fixes the random initialisation of the latent factors so that re-running
# this cell on the same training data reproduces the same model.
# NOTE(review): maxIter=5 / regParam=0.01 match the Spark documentation example;
# the RMSE recorded in this notebook (~3.07 on a 1-5 rating scale) suggests they
# under-regularise this dataset -- consider tuning regParam/maxIter.
als = ALS(maxIter=5, regParam=0.01, userCol="user_id", itemCol="item_id", ratingCol="rating",
          coldStartStrategy="drop", seed=42)
model = als.fit(training)

In [20]:
# Measure model quality as the RMSE between actual and predicted ratings on the
# held-out test set.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="rating",
    predictionCol="prediction",
)
predictions = model.transform(test)
rmse = evaluator.evaluate(predictions)

rmse_message = "Root-mean-square error = " + str(rmse)
print(rmse_message)

Root-mean-square error = 3.0742044368388055


In [21]:
# Number of recommendations to keep per user and per item.
num_recs = 10

# For every user, the 10 items with the highest predicted rating.
userRecs = model.recommendForAllUsers(num_recs)
# For every item, the 10 users with the highest predicted rating.
# (The variable is named movieRecs, but the entries are Epinions items, not movies.)
movieRecs = model.recommendForAllItems(num_recs)

In [22]:
# Preview the per-user recommendation lists (first 20 users, long cells truncated).
userRecs.show(n=20, truncate=True)

+-------+--------------------+
|user_id|     recommendations|
+-------+--------------------+
|    148|[[42662, 14.91667...|
|    463|[[12073, 8.953663...|
|    471|[[9241, 9.648809]...|
|    496|[[22262, 9.649987...|
|    833|[[60729, 9.68524]...|
|   1088|[[42662, 11.28091...|
|   1238|[[4246, 12.01042]...|
|   1342|[[20836, 5.806043...|
|   1580|[[4253, 23.45259]...|
|   1591|[[36067, 9.603536...|
|   1645|[[39664, 16.62483...|
|   1829|[[39664, 12.27674...|
|   2122|[[20806, 22.45664...|
|   2142|[[3320, 12.47198]...|
|   2659|[[105443, 11.2944...|
|   2866|[[5491, 15.770605...|
|   3175|[[39668, 9.401659...|
|   3749|[[7379, 10.86928]...|
|   3794|[[38281, 13.77242...|
|   3918|[[65144, 14.88890...|
+-------+--------------------+
only showing top 20 rows



In [23]:
# Preview the per-item recommendation lists (first 20 items, long cells truncated).
movieRecs.show(n=20, truncate=True)

+-------+--------------------+
|item_id|     recommendations|
+-------+--------------------+
|    148|[[46310, 4.373580...|
|    471|[[24223, 22.21638...|
|    496|[[12579, 47.66844...|
|    833|[[14689, 19.55903...|
|   1088|[[18765, 25.68565...|
|   1238|[[4764, 18.489847...|
|   1342|[[27955, 14.26324...|
|   1580|[[14884, 18.26739...|
|   1645|[[18262, 23.64232...|
|   1829|[[4752, 37.127506...|
|   1959|[[32172, 18.63920...|
|   2122|[[4764, 30.124872...|
|   2142|[[4646, 12.202996...|
|   2366|[[11906, 18.64879...|
|   2659|[[6526, 12.534604...|
|   2866|[[18076, 17.5351]...|
|   3175|[[10840, 9.773432...|
|   3794|[[6487, 25.394901...|
|   3918|[[29249, 21.14554...|
|   3997|[[9706, 13.552515...|
+-------+--------------------+
only showing top 20 rows

