In [1]:
import pandas as pd
import numpy as np


In [17]:
import findspark
findspark.init()

In [18]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

In [104]:
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

In [19]:
spark = SparkSession.builder.appName("BookRecommendations").getOrCreate()

In [59]:
ratings = (
    spark.read.csv(
        path ="../data/data_processed/Book_Ratings.csv",
        sep=",",
        header= True,
        quote='"',  
    )
)

In [42]:
books = (
    spark.read.csv(
        path ="../data/data_processed/Books.csv",
        sep=",",
        header= True,
        quote='"',
    )
)

In [43]:
books.show()

+------+----------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+
|BookID|      ISBN|          Book-Title|         Book-Author|Year-Of-Publication|           Publisher|         Image-URL-S|         Image-URL-M|         Image-URL-L|
+------+----------+--------------------+--------------------+-------------------+--------------------+--------------------+--------------------+--------------------+
|     0|0195153448| Classical Mythology|  Mark P. O. Morford|               2002|Oxford University...|http://images.ama...|http://images.ama...|http://images.ama...|
|     1|0002005018|        Clara Callan|Richard Bruce Wright|               2001|HarperFlamingo Ca...|http://images.ama...|http://images.ama...|http://images.ama...|
|     2|0060973129|Decision in Normandy|        Carlo D'Este|               1991|     HarperPerennial|http://images.ama...|http://images.ama...|http://images.ama...|
|   

In [62]:
ratings.show()

+-------+----------+-----------+
|User-ID|      ISBN|Book-Rating|
+-------+----------+-----------+
| 276725|034545104X|          0|
| 276726|0155061224|          5|
| 276727|0446520802|          0|
| 276729|052165615X|          3|
| 276729|0521795028|          6|
| 276733|2080674722|          0|
| 276736|3257224281|          8|
| 276737|0600570967|          6|
| 276744|038550120X|          7|
| 276745| 342310538|         10|
| 276746|0425115801|          0|
| 276746|0449006522|          0|
| 276746|0553561618|          0|
| 276746|055356451X|          0|
| 276746|0786013990|          0|
| 276746|0786014512|          0|
| 276747|0060517794|          9|
| 276747|0451192001|          0|
| 276747|0609801279|          0|
| 276747|0671537458|          9|
+-------+----------+-----------+
only showing top 20 rows



In [72]:
data = ratings.join(books, books.ISBN ==  ratings.ISBN,"inner").select("User-ID", "BookID","Book-Title","Book-Rating")

In [77]:
data = data.withColumnRenamed("User-ID","userID")\
            .withColumnRenamed("BookID","bookID")\
            .withColumnRenamed("Book-Title","title")\
            .withColumnRenamed("Book-Rating","rating")

In [78]:
data.show()

+------+------+--------------------+------+
|userID|bookID|               title|rating|
+------+------+--------------------+------+
| 23902| 42486|Nothing Can Be Be...|     0|
| 16319| 29472|       Which Colour?|     0|
| 26583| 48578|Huck Scarry's Ste...|     0|
|176062|257021|ARE YOU MY MOTHER...|     0|
|114216|154500|                Bess|     0|
| 11676|175815| THE COAL HOUSE T/PB|     0|
|131402|175815| THE COAL HOUSE T/PB|     0|
|145927|175815| THE COAL HOUSE T/PB|     0|
| 11676|163766|Glue (First Facts...|     0|
|181659|163766|Glue (First Facts...|    10|
|206979|163766|Glue (First Facts...|     0|
| 11676|182456|Count Duckula: Va...|     6|
|110029|182456|Count Duckula: Va...|     0|
|206979|182456|Count Duckula: Va...|     0|
|201526|141012|PADDINGTON GOES T...|     0|
| 93366|175085|Frederick Street:...|     8|
| 26583| 48706|           Vancouver|     8|
|101731| 48706|           Vancouver|     8|
|249111|207528|King Edward VIII:...|     0|
|222586|238817|The valour and th

In [79]:
data = data.withColumn("userID", f.col("userID").cast("int"))\
            .withColumn("bookID", f.col("bookID").cast("int"))\
            .withColumn("rating", f.col("rating").cast("double"))

In [81]:
data.dtypes

[('userID', 'int'),
 ('bookID', 'int'),
 ('title', 'string'),
 ('rating', 'double')]

In [82]:
als = ALS(
    userCol = "userID",
    itemCol = "bookID",
    ratingCol = "rating",
)

In [83]:
evaluator = RegressionEvaluator(metricName = 'rmse', labelCol = 'rating', predictionCol = 'prediction')

In [86]:
train, val = data.randomSplit([0.8, 0.2], seed = 42)

In [87]:
model = als.fit(train)

In [88]:
predictions = model.transform(val)

In [89]:
predictions. show()

+------+------+--------------------+------+----------+
|userID|bookID|               title|rating|prediction|
+------+------+--------------------+------+----------+
|   165|   133|One Hundred Years...|   0.0|  8.423924|
|   243|   210| Memoirs of a Geisha|  10.0|-1.7866623|
|   651|   842|                Hood|   0.0| -2.022131|
|   626|113397| Der Blaumilchkanal.|   7.0|       0.0|
|    32|    40|Miss Zukas and th...|   0.0|       0.0|
|   244|   246|          Life of Pi|   9.0| 3.6417093|
|   254|   291|Angels &amp; Inse...|   0.0|  1.517553|
|   306|   330|Frankenstein (Dov...|   0.0|       NaN|
|   383|   367|               Proof|   0.0|  3.972284|
|   422|   474|    The Lovely Bones|   0.0|       NaN|
|   643| 15638|The Wind in the W...|   9.0| 4.1077247|
|   202|   154|The Brimstone Wed...|   6.0|       NaN|
|   392|   430|El paraÃ­so (Tiem...|   0.0|       NaN|
|   444|   507|The Kingdom: Arab...|   0.0| 0.9014306|
|   383|   380|Victorious Christ...|   0.0|       NaN|
|   254|  

In [90]:
rmse = evaluator.evaluate(predictions.na.drop())

In [91]:
print(rmse)

4.4172150236801855


In [92]:
userFactor = model.userFactors
itemFactor = model.itemFactors

In [98]:
userFactor.sort('id').show()

+---+--------------------+
| id|            features|
+---+--------------------+
|  8|[-0.5981992, -0.3...|
|  9|[1.136716, -0.318...|
| 12|[-0.42292485, -1....|
| 14|[1.0567818, -0.25...|
| 16|[-0.404279, 1.882...|
| 17|[1.3811506, -0.27...|
| 19|[0.44821343, -0.3...|
| 20|[0.0, 0.0, 0.0, 0...|
| 22|[0.07295373, 0.02...|
| 23|[0.0, 0.0, 0.0, 0...|
| 26|[0.8107162, -0.20...|
| 32|[0.0, 0.0, 0.0, 0...|
| 36|[0.0, 0.0, 0.0, 0...|
| 39|[0.7840791, 0.058...|
| 42|[0.9239076, 0.397...|
| 44|[0.0, 0.0, 0.0, 0...|
| 51|[0.541019, -1.032...|
| 53|[1.3431232, -0.60...|
| 56|[1.9356911, 0.109...|
| 64|[-0.6327808, 0.00...|
+---+--------------------+
only showing top 20 rows



In [101]:
itemFactor.count()

238888

In [96]:
itemFactor.sort('id').show()

+---+--------------------+
| id|            features|
+---+--------------------+
|  1|[-1.1937307, -0.6...|
|  2|[-0.8358279, 0.21...|
|  3|[1.6313949, -0.85...|
|  4|[0.0, 0.0, 0.0, 0...|
|  5|[-1.2868525, -0.2...|
|  6|[0.0, 0.0, 0.0, 0...|
|  7|[0.8772846, -0.37...|
|  8|[0.0, 0.0, 0.0, 0...|
|  9|[-0.9567951, -0.5...|
| 10|[0.6354727, -1.02...|
| 11|[0.0, 0.0, 0.0, 0...|
| 12|[-0.9567951, -0.5...|
| 13|[-0.9567951, -0.5...|
| 14|[-0.42060125, -0....|
| 15|[-1.1481541, -0.6...|
| 16|[-1.1147898, -0.6...|
| 17|[-0.45291173, -1....|
| 18|[-1.0597812, -1.1...|
| 19|[2.219686, -1.090...|
| 20|[-1.8239057, 0.27...|
+---+--------------------+
only showing top 20 rows



In [102]:
user91 = model.userFactors.filter(f.col('id') == 91).select(f.col('features')).rdd.flatMap(lambda x: x).collect()[0]
item345 = model.itemFactors.filter(f.col('id') == 345).select(f.col('features')).rdd.flatMap(lambda x: x).collect()[0]

In [103]:
np.dot(user91, item345)

0.46775561565311863

In [108]:
param_grid = (
    ParamGridBuilder()
    .addGrid(als.rank,[5,10])
    .addGrid(als.maxIter,[20])
    .addGrid(als.regParam,[0.05, 0.1])
    .build()
)

In [106]:
from pprint import pprint

In [109]:
pprint(param_grid)

[{Param(parent='ALS_6d4c59245750', name='maxIter', doc='max number of iterations (>= 0).'): 20,
  Param(parent='ALS_6d4c59245750', name='regParam', doc='regularization parameter (>= 0).'): 0.05,
  Param(parent='ALS_6d4c59245750', name='rank', doc='rank of the factorization'): 5},
 {Param(parent='ALS_6d4c59245750', name='maxIter', doc='max number of iterations (>= 0).'): 20,
  Param(parent='ALS_6d4c59245750', name='regParam', doc='regularization parameter (>= 0).'): 0.1,
  Param(parent='ALS_6d4c59245750', name='rank', doc='rank of the factorization'): 5},
 {Param(parent='ALS_6d4c59245750', name='maxIter', doc='max number of iterations (>= 0).'): 20,
  Param(parent='ALS_6d4c59245750', name='regParam', doc='regularization parameter (>= 0).'): 0.05,
  Param(parent='ALS_6d4c59245750', name='rank', doc='rank of the factorization'): 10},
 {Param(parent='ALS_6d4c59245750', name='maxIter', doc='max number of iterations (>= 0).'): 20,
  Param(parent='ALS_6d4c59245750', name='regParam', doc='regu

In [None]:
model = crossval_model.bestModel

In [110]:
rec_all = model.recommendForAllUsers(5).cache()
rec_all.show()
rec_all.printSchema()

+------+--------------------+
|userID|     recommendations|
+------+--------------------+
|   463|[{161693, 16.9972...|
|   496|[{17462, 19.43290...|
|  1238|[{229350, 0.0}, {...|
|  1591|[{20152, 15.41076...|
|  2366|[{139921, 14.3973...|
|  3918|[{167275, 15.8302...|
|  4900|[{264386, 18.5042...|
|  5300|[{253887, 12.1571...|
|  5803|[{229350, 0.0}, {...|
|  6336|[{266736, 16.4045...|
|  6357|[{229350, 0.0}, {...|
|  6397|[{189533, 21.7626...|
|  6466|[{42031, 17.34398...|
|  6654|[{266736, 12.6541...|
|  7253|[{45870, 20.02798...|
|  7982|[{229350, 0.0}, {...|
|  8086|[{135672, 13.0669...|
|  9427|[{156325, 18.8128...|
|  9465|[{156325, 17.8291...|
| 11033|[{162918, 34.2928...|
+------+--------------------+
only showing top 20 rows

root
 |-- userID: integer (nullable = false)
 |-- recommendations: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- bookID: integer (nullable = true)
 |    |    |-- rating: float (nullable = true)



In [121]:
getuser = rec_all.filter(rec_all.userID == 7253).select(f.col('recommendations')).rdd

In [128]:
kq = getuser.collect()

In [140]:
dict(kq[0][0])

dict_keys([45870, 1895, 150342, 60401, 152056])