# PySpark Recommendation System (ALS Algorithm)

## Inisialisasi


In [1]:
# findspark locates the local Spark installation and makes the pyspark package
# importable; it has to run before any `pyspark` import in this notebook.
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session used by every later cell.
# The application name is the label Spark reports for this job, so it names
# what this notebook actually does (ALS recommendations) instead of the
# unrelated "Frequent Itemsets" example it was copied from.
spark = (
    SparkSession.builder
    .appName("PySpark ALS Recommendation System")
    .getOrCreate()
)

# Print the session object as a quick check that it was created.
print(spark)

<pyspark.sql.session.SparkSession object at 0x115ee0e48>


In [3]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

## Praproses

Membaca data rating pengguna. Sesuaikan path file dengan lokasi dataset Anda.

In [4]:
# Location of the ratings CSV. Set the RATING_CSV environment variable to point
# at your own copy of the dataset; when it is not set, the original hard-coded
# location is used, so existing setups keep working unchanged.
import os

rating_csv_path = os.environ.get(
    "RATING_CSV",
    "/Users/gunstringer/Downloads/anime-recommendations-database/rating.csv",
)

# header=True takes the column names from the first line of the file;
# inferSchema=True lets Spark derive the column types from the data.
df = spark.read.csv(rating_csv_path, header=True, inferSchema=True)

In [5]:
# Check the column names and the types Spark inferred from the CSV.
df.printSchema()

root
 |-- user_id: integer (nullable = true)
 |-- anime_id: integer (nullable = true)
 |-- rating: integer (nullable = true)



In [6]:
# Number of rows in the ratings data as loaded, before any filtering.
df.count()

7813737

In [7]:
# Preview the first 20 rows of the loaded ratings.
df.show()

+-------+--------+------+
|user_id|anime_id|rating|
+-------+--------+------+
|      1|      20|    -1|
|      1|      24|    -1|
|      1|      79|    -1|
|      1|     226|    -1|
|      1|     241|    -1|
|      1|     355|    -1|
|      1|     356|    -1|
|      1|     442|    -1|
|      1|     487|    -1|
|      1|     846|    -1|
|      1|     936|    -1|
|      1|    1546|    -1|
|      1|    1692|    -1|
|      1|    1836|    -1|
|      1|    2001|    -1|
|      1|    2025|    -1|
|      1|    2144|    -1|
|      1|    2787|    -1|
|      1|    2993|    -1|
|      1|    3455|    -1|
+-------+--------+------+
only showing top 20 rows



## Filter Data 
Menghilangkan data rating N/A atau bernilai kurang dari 0

In [8]:
# Drop rows that contain a null value. Note that this rebinds `df`, so cells
# above that displayed `df` were showing the unfiltered data.
df = df.dropna()

In [9]:
# Register `df` as the temporary view "anime" so it can be queried with spark.sql.
df.createOrReplaceTempView("anime")

In [10]:
# Keep only the rows of the "anime" view whose rating is zero or greater;
# negative ratings (such as the -1 rows seen in the preview) are excluded.
ratings = spark.sql("SELECT * FROM anime where rating >= 0")

In [11]:
# Preview the first 20 rows that passed the rating filter.
ratings.show()

+-------+--------+------+
|user_id|anime_id|rating|
+-------+--------+------+
|      1|    8074|    10|
|      1|   11617|    10|
|      1|   11757|    10|
|      1|   15451|    10|
|      2|   11771|    10|
|      3|      20|     8|
|      3|     154|     6|
|      3|     170|     9|
|      3|     199|    10|
|      3|     225|     9|
|      3|     341|     6|
|      3|     430|     7|
|      3|     527|     7|
|      3|     552|     7|
|      3|     813|    10|
|      3|    1119|     7|
|      3|    1121|     7|
|      3|    1122|     7|
|      3|    1132|     8|
|      3|    1292|     6|
+-------+--------+------+
only showing top 20 rows



In [12]:
# Number of rows left after removing the negative ratings.
ratings.count()

6337241

## Membuat Model

Membuat model rekomendasi dengan algoritma ALS (Alternating Least Squares).

In [13]:
# Split the filtered ratings into roughly 80% training and 20% test data.
# A fixed seed makes the split repeatable, so the evaluation is not driven by
# a different random partition on every run.
(training, test) = ratings.randomSplit([0.8, 0.2], seed=42)

In [14]:
# Build the recommendation model using ALS on the training data.
# coldStartStrategy="drop" discards predictions for users/items that were not
# seen during training, so the evaluation metric does not become NaN.
# seed fixes the random initialisation of the factors so that fitting the same
# training data gives repeatable results.
# NOTE(review): the recommendation scores recorded further down in this
# notebook go well above the largest rating visible in the data (10) —
# regParam=0.01 may be too weak; consider tuning it.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="user_id",
    itemCol="anime_id",
    ratingCol="rating",
    coldStartStrategy="drop",
    seed=42,
)
model = als.fit(training)

In [15]:
# Score the held-out test set with the fitted model, then measure the
# root-mean-square error between the "prediction" and "rating" columns.
predictions = model.transform(test)
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)

message = "Root-mean-square error = " + str(rmse)
print(message)

Root-mean-square error = 1.1768135397921138


In [16]:
# How many recommendations to produce per user / per anime.
num_recs = 10

# Top anime recommendations for every user known to the model.
userRecs = model.recommendForAllUsers(num_recs)
# Top user recommendations for every anime known to the model
# (items are anime here: the model's item column is "anime_id").
movieRecs = model.recommendForAllItems(num_recs)

## Tampilkan data rekomendasi

In [17]:
# Preview the recommendations for the first 20 users; the nested
# recommendations column is truncated in this display.
userRecs.show()

+-------+--------------------+
|user_id|     recommendations|
+-------+--------------------+
|    148|[[7112, 13.83287]...|
|    463|[[5688, 19.655495...|
|    471|[[5565, 21.059868...|
|    496|[[25495, 14.40900...|
|    833|[[7416, 16.756664...|
|   1088|[[25495, 26.73759...|
|   1238|[[5688, 19.01693]...|
|   1342|[[20929, 40.4739]...|
|   1580|[[25495, 21.18709...|
|   1591|[[28439, 16.50583...|
|   1645|[[25495, 15.39053...|
|   1829|[[25495, 20.45839...|
|   1959|[[10584, 19.27946...|
|   2122|[[12745, 24.72007...|
|   2142|[[8446, 14.039937...|
|   2366|[[31762, 22.44822...|
|   2659|[[20929, 18.75345...|
|   2866|[[6183, 18.69217]...|
|   3175|[[29778, 21.35224...|
|   3749|[[6183, 21.236727...|
+-------+--------------------+
only showing top 20 rows



In [18]:
# Preview the recommended users for the first 20 anime; the nested
# recommendations column is truncated in this display.
movieRecs.show()

+--------+--------------------+
|anime_id|     recommendations|
+--------+--------------------+
|    1580|[[21334, 28.36326...|
|    5300|[[47879, 14.57507...|
|    9900|[[57725, 10.40306...|
|     471|[[33981, 15.20216...|
|    1591|[[36956, 21.71910...|
|    4101|[[18937, 9.706232...|
|   11141|[[32000, 11.78403...|
|   24171|[[69605, 21.45848...|
|   25591|[[64560, 10.02326...|
|    1342|[[66242, 27.56492...|
|    2122|[[25218, 18.27583...|
|    2142|[[66242, 13.50196...|
|   31912|[[24363, 18.54298...|
|     463|[[56105, 14.79384...|
|     833|[[8325, 18.986818...|
|   10623|[[35893, 16.03964...|
|   11033|[[69605, 19.21315...|
|   17753|[[47879, 9.873876...|
|   30903|[[26722, 33.46457...|
|    6654|[[33981, 35.44406...|
+--------+--------------------+
only showing top 20 rows



## Eksperimen

In [39]:
# Show the anime recommendations for a single user (user_id 1).
# Start from the distinct user ids present in the ratings, narrow them down to
# the one user of interest, and ask the model for that user's top 10.
distinct_users = ratings.select(als.getUserCol()).distinct()
users = distinct_users.filter(distinct_users["user_id"] == 1)
userSubsetRecs = model.recommendForUserSubset(users, 10)
userSubsetRecs.show()

+-------+--------------------+
|user_id|     recommendations|
+-------+--------------------+
|      1|[[25495, 32.27197...|
+-------+--------------------+



In [51]:
# Export the single-user recommendations to JSON: collect the Spark DataFrame
# into a pandas DataFrame, then serialise it. `new` ends up holding the JSON
# text; nothing is written to disk here.
import pandas
new = userSubsetRecs.toPandas().to_json()

In [63]:
# Predict the rating one user would give one anime.
# The input is a one-row DataFrame holding the (user_id, anime_id) pair.
# Because the model was built with coldStartStrategy="drop", a pair whose user
# or anime was not seen in training produces no row instead of a NaN prediction.
single_pair = [(7726, 148)]
dataframe = spark.createDataFrame(single_pair, ["user_id", "anime_id"])
predictions2 = model.transform(dataframe)
predictions2.show()

+-------+--------+----------+
|user_id|anime_id|prediction|
+-------+--------+----------+
|   7726|     148|  7.389661|
+-------+--------+----------+

