# Pyspark Recomendation System (ALS Algorithm)

In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Create Spark Session
spark = SparkSession \
    .builder \
    .appName("Python Spark Frequent Itemsets Example") \
    .getOrCreate()

print(spark)

<pyspark.sql.session.SparkSession object at 0x10f24cef0>


In [3]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

## Praproses

Praproses digunakan untuk membaca data user rating. Sesuaikan dengan dataset.

In [4]:
df = spark.read.csv("/Users/gunstringer/Downloads/anime-recommendations-database/rating.csv", header=True, inferSchema=True)

In [5]:
df.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- anime_id: integer (nullable = true)
 |-- rating: integer (nullable = true)



In [6]:
df.count()

7813737

In [7]:
df.show()

+-------+--------+------+
|user_id|anime_id|rating|
+-------+--------+------+
|      1|      20|    -1|
|      1|      24|    -1|
|      1|      79|    -1|
|      1|     226|    -1|
|      1|     241|    -1|
|      1|     355|    -1|
|      1|     356|    -1|
|      1|     442|    -1|
|      1|     487|    -1|
|      1|     846|    -1|
|      1|     936|    -1|
|      1|    1546|    -1|
|      1|    1692|    -1|
|      1|    1836|    -1|
|      1|    2001|    -1|
|      1|    2025|    -1|
|      1|    2144|    -1|
|      1|    2787|    -1|
|      1|    2993|    -1|
|      1|    3455|    -1|
+-------+--------+------+
only showing top 20 rows



## Filter Data 
Menghilangkan data rating N/A atau bernilai kurang dari 0

In [8]:
df = df.dropna()

In [10]:
df.createOrReplaceTempView("anime")

In [11]:
ratings = spark.sql ("SELECT * FROM anime where rating >= 0")

In [12]:
ratings.show()

+-------+--------+------+
|user_id|anime_id|rating|
+-------+--------+------+
|      1|    8074|    10|
|      1|   11617|    10|
|      1|   11757|    10|
|      1|   15451|    10|
|      2|   11771|    10|
|      3|      20|     8|
|      3|     154|     6|
|      3|     170|     9|
|      3|     199|    10|
|      3|     225|     9|
|      3|     341|     6|
|      3|     430|     7|
|      3|     527|     7|
|      3|     552|     7|
|      3|     813|    10|
|      3|    1119|     7|
|      3|    1121|     7|
|      3|    1122|     7|
|      3|    1132|     8|
|      3|    1292|     6|
+-------+--------+------+
only showing top 20 rows



In [13]:
ratings.count()

6337241

## Create Model

In [14]:
(training, test) = ratings.randomSplit([0.8, 0.2])

In [15]:
# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
als = ALS(maxIter=5, regParam=0.01, userCol="user_id", itemCol="anime_id", ratingCol="rating",
          coldStartStrategy="drop")
model = als.fit(training)

In [16]:
# Evaluate the model by computing the RMSE on the test data
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 1.1769900370213127


In [17]:
# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)

In [18]:
userRecs.show()

+-------+--------------------+
|user_id|     recommendations|
+-------+--------------------+
|    148|[[8375, 32.014004...|
|    463|[[8375, 30.679579...|
|    471|[[8375, 16.239899...|
|    496|[[11795, 16.06025...|
|    833|[[8375, 24.381958...|
|   1088|[[9077, 23.748951...|
|   1238|[[8375, 28.015049...|
|   1342|[[9077, 80.397446...|
|   1580|[[9077, 26.957514...|
|   1591|[[5663, 13.937275...|
|   1645|[[8375, 43.24437]...|
|   1829|[[5663, 21.526478...|
|   1959|[[31762, 18.70122...|
|   2122|[[5569, 25.78103]...|
|   2142|[[7157, 20.862059...|
|   2366|[[8375, 30.696129...|
|   2659|[[8375, 19.454397...|
|   2866|[[8375, 37.802578...|
|   3175|[[25495, 18.02893...|
|   3749|[[7257, 22.68229]...|
+-------+--------------------+
only showing top 20 rows



In [19]:
movieRecs.show()

+--------+--------------------+
|anime_id|     recommendations|
+--------+--------------------+
|    1580|[[71885, 27.72872...|
|    5300|[[7548, 16.965534...|
|     471|[[68128, 16.72869...|
|    1591|[[68128, 22.43071...|
|    4101|[[73381, 10.21272...|
|   11141|[[344, 11.517619]...|
|   24171|[[36956, 22.24688...|
|   25591|[[344, 12.456382]...|
|    1342|[[27990, 30.99148...|
|    2122|[[63732, 18.63065...|
|    2142|[[72368, 13.05252...|
|    7982|[[39927, 11.76606...|
|   31912|[[17508, 17.17462...|
|     463|[[68128, 17.16554...|
|     833|[[17508, 19.63144...|
|   10623|[[39927, 10.73574...|
|   11033|[[17508, 20.22209...|
|   17753|[[73381, 10.45785...|
|   30903|[[7548, 27.76015]...|
|    6654|[[36956, 57.38936...|
+--------+--------------------+
only showing top 20 rows

