In [1]:
import findspark
findspark.init("C:\\spark")

In [2]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row
from pyspark.sql import SparkSession

Recommendations sistemlerde 3 algoritma kullanilir.
- Collaborative filtering
- Content-based
- Hybrid technique.

NOTES:
- Bu projede bir collaborative filtering algorithm olan `Alternating least square matrix factorization method` (ALS) kullanacagiz.
- ALS, Apache Spark'ta yerleşik olarak gelir.
- PySpark, Spark için Python API'sidir.

In [3]:
spark = SparkSession.builder.appName("second_example").getOrCreate()

In [4]:
lines = spark.read.csv(".\\Pyspark_ML_Rating_Data\\ratings.csv", inferSchema = True, header= True)
lines.show()

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|     31|   2.5|1260759144|
|     1|   1029|   3.0|1260759179|
|     1|   1061|   3.0|1260759182|
|     1|   1129|   2.0|1260759185|
|     1|   1172|   4.0|1260759205|
|     1|   1263|   2.0|1260759151|
|     1|   1287|   2.0|1260759187|
|     1|   1293|   2.0|1260759148|
|     1|   1339|   3.5|1260759125|
|     1|   1343|   2.0|1260759131|
|     1|   1371|   2.5|1260759135|
|     1|   1405|   1.0|1260759203|
|     1|   1953|   4.0|1260759191|
|     1|   2105|   4.0|1260759139|
|     1|   2150|   3.0|1260759194|
|     1|   2193|   2.0|1260759198|
|     1|   2294|   2.0|1260759108|
|     1|   2455|   2.5|1260759113|
|     1|   2968|   1.0|1260759200|
|     1|   3671|   3.0|1260759117|
+------+-------+------+----------+
only showing top 20 rows



In [5]:
lines.describe().show()

+-------+------------------+------------------+------------------+--------------------+
|summary|            userId|           movieId|            rating|           timestamp|
+-------+------------------+------------------+------------------+--------------------+
|  count|            100004|            100004|            100004|              100004|
|   mean| 347.0113095476181|12548.664363425463| 3.543608255669773|1.1296390869392424E9|
| stddev|195.16383797819535|26369.198968815268|1.0580641091070326|1.9168582602710962E8|
|    min|                 1|                 1|               0.5|           789652009|
|    max|               671|            163949|               5.0|          1476640644|
+-------+------------------+------------------+------------------+--------------------+



In [31]:
training, test = lines.randomSplit([.8, .2], seed=42)

In [32]:
als = ALS(maxIter = 5, regParam= .09, userCol="userId", itemCol="movieId", ratingCol="rating",coldStartStrategy="drop",nonnegative=True)
model = als.fit(training)
predictions = model.transform(test)

In [33]:
evaluator=RegressionEvaluator(metricName="rmse",labelCol="rating",predictionCol="prediction")
predictions=model.transform(test)
rmse=evaluator.evaluate(predictions)
print("RMSE="+str(rmse))
predictions.show()

RMSE=0.9180440271621239
+------+-------+------+----------+----------+
|userId|movieId|rating| timestamp|prediction|
+------+-------+------+----------+----------+
|    30|    463|   4.0| 945277405| 3.5538173|
|   460|    471|   5.0|1072836030| 3.8362873|
|   274|    471|   5.0|1074104142| 3.6900458|
|   306|    471|   3.0| 939718996| 3.6344876|
|   452|    471|   3.0| 976422396| 3.7346225|
|    19|    471|   3.0| 855192558| 4.0051093|
|    92|    471|   4.0| 848526594| 3.8723884|
|   299|    471|   4.5|1344186741|  4.249956|
|   607|    471|   4.0|1118247731| 3.2649348|
|    15|    471|   3.0|1166586067| 2.8280983|
|   659|    471|   4.0| 853412972| 3.6692934|
|   195|    471|   3.0| 976289176| 3.4980865|
|   468|    471|   4.0|1296197444| 3.3988836|
|   521|    471|   3.5|1370072127|  3.999179|
|   509|    496|   3.0| 940013481| 2.6204333|
|   463|   1088|   3.0|1050499697| 3.2633686|
|    57|   1088|   4.0| 907764935| 3.4524126|
|   306|   1088|   4.0| 939760516|  3.310099|
|   518|  

In [34]:
single_user = test.filter(test["userId"] == 12).select(["userId", "movieId","rating"])
single_user.orderBy("rating", ascending=False).show()

+------+-------+------+
|userId|movieId|rating|
+------+-------+------+
|    12|   1235|   5.0|
|    12|   1215|   5.0|
|    12|   3793|   5.0|
|    12|    736|   4.0|
|    12|   6184|   4.0|
|    12|   3864|   3.0|
|    12|   3886|   2.0|
|    12|   2355|   2.0|
|    12|   3869|   2.0|
|    12|   1028|   1.0|
|    12|   1295|   1.0|
|    12|   3791|   1.0|
|    12|   3844|   1.0|
+------+-------+------+



In [35]:
rec = model.transform(single_user)
rec.orderBy("rating", ascending=False).show()

+------+-------+------+----------+
|userId|movieId|rating|prediction|
+------+-------+------+----------+
|    12|   1215|   5.0|  2.590385|
|    12|   1235|   5.0| 1.5256846|
|    12|   3793|   5.0| 2.7903378|
|    12|    736|   4.0|   2.31013|
|    12|   6184|   4.0|  2.769578|
|    12|   3864|   3.0|  1.597648|
|    12|   3869|   2.0| 2.0295925|
|    12|   2355|   2.0|   2.66153|
|    12|   3844|   1.0| 2.1820014|
|    12|   1295|   1.0| 1.4813555|
|    12|   3791|   1.0| 2.2531295|
|    12|   1028|   1.0| 2.4114938|
+------+-------+------+----------+

