In [0]:
#imports
from pyspark.sql import SQLContext
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline
from pyspark.sql import Row
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import udf, col, when
import numpy as np

In [0]:
#reading the datasets : books details and users ratings
books_df = spark.read.csv('/Volumes/workspace/recommendation_system_data/books/books.csv',header=True, inferSchema=True) 
books_df.printSchema()

ratings_df = spark.read.csv('/Volumes/workspace/recommendation_system_data/books/ratings.csv',header=True, inferSchema=True)
ratings_df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- book_id: integer (nullable = true)
 |-- best_book_id: integer (nullable = true)
 |-- work_id: integer (nullable = true)
 |-- books_count: integer (nullable = true)
 |-- isbn: string (nullable = true)
 |-- isbn13: double (nullable = true)
 |-- authors: string (nullable = true)
 |-- original_publication_year: double (nullable = true)
 |-- original_title: string (nullable = true)
 |-- title: string (nullable = true)
 |-- language_code: string (nullable = true)
 |-- average_rating: string (nullable = true)
 |-- ratings_count: string (nullable = true)
 |-- work_ratings_count: string (nullable = true)
 |-- work_text_reviews_count: string (nullable = true)
 |-- ratings_1: double (nullable = true)
 |-- ratings_2: integer (nullable = true)
 |-- ratings_3: integer (nullable = true)
 |-- ratings_4: integer (nullable = true)
 |-- ratings_5: integer (nullable = true)
 |-- image_url: string (nullable = true)
 |-- small_image_url: string (nullable = true)


In [0]:
#viewing each dataframe
ratings_df.display(5)
books_df.display(2)

book_id,user_id,rating
1,314,5
1,439,3
1,588,5
1,1169,4
1,1185,4
1,2077,4
1,2487,4
1,2900,5
1,3662,4
1,3922,5


id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,title,language_code,average_rating,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
1,2767052,2767052,2792775,272,439023483,9780439023480.0,Suzanne Collins,2008.0,The Hunger Games,"The Hunger Games (The Hunger Games, #1)",eng,4.34,4780653,4942365,155254,66715.0,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m/2767052.jpg,https://images.gr-assets.com/books/1447303603s/2767052.jpg
2,3,3,4640799,491,439554934,9780439554930.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,"Harry Potter and the Sorcerer's Stone (Harry Potter, #1)",eng,4.44,4602479,4800065,75867,75504.0,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m/3.jpg,https://images.gr-assets.com/books/1474154022s/3.jpg
3,41865,41865,3212258,226,316015849,9780316015840.0,Stephenie Meyer,2005.0,Twilight,"Twilight (Twilight, #1)",en-US,3.57,3866839,3916824,95009,456191.0,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m/41865.jpg,https://images.gr-assets.com/books/1361039443s/41865.jpg
4,2657,2657,3275794,487,61120081,9780061120080.0,Harper Lee,1960.0,To Kill a Mockingbird,To Kill a Mockingbird,eng,4.25,3198671,3340896,72586,60427.0,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m/2657.jpg,https://images.gr-assets.com/books/1361975680s/2657.jpg
5,4671,4671,245494,1356,743273567,9780743273560.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,The Great Gatsby,eng,3.89,2683664,2773745,51992,86236.0,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m/4671.jpg,https://images.gr-assets.com/books/1490528560s/4671.jpg
6,11870085,11870085,16827462,226,525478817,9780525478810.0,John Green,2012.0,The Fault in Our Stars,The Fault in Our Stars,eng,4.26,2346404,2478609,140739,47994.0,92723,327550,698471,1311871,https://images.gr-assets.com/books/1360206420m/11870085.jpg,https://images.gr-assets.com/books/1360206420s/11870085.jpg
7,5907,5907,1540236,969,618260307,9780618260300.0,J.R.R. Tolkien,1937.0,The Hobbit or There and Back Again,The Hobbit,en-US,4.25,2071616,2196809,37653,46023.0,76784,288649,665635,1119718,https://images.gr-assets.com/books/1372847500m/5907.jpg,https://images.gr-assets.com/books/1372847500s/5907.jpg
8,5107,5107,3036731,360,316769177,9780316769170.0,J.D. Salinger,1951.0,The Catcher in the Rye,The Catcher in the Rye,eng,3.79,2044241,2120637,44920,109383.0,185520,455042,661516,709176,https://images.gr-assets.com/books/1398034300m/5107.jpg,https://images.gr-assets.com/books/1398034300s/5107.jpg
9,960,960,3338963,311,1416524797,9781416524790.0,Dan Brown,2000.0,Angels & Demons,"Angels & Demons (Robert Langdon, #1)",en-CA,3.85,2001311,2078754,25112,77841.0,145740,458429,716569,680175,https://images.gr-assets.com/books/1303390735m/960.jpg,https://images.gr-assets.com/books/1303390735s/960.jpg
10,1885,1885,3060926,3455,679783261,9780679783270.0,Jane Austen,1813.0,Pride and Prejudice,Pride and Prejudice,eng,4.24,2035490,2191465,49152,54700.0,86485,284852,609755,1155673,https://images.gr-assets.com/books/1320399351m/1885.jpg,https://images.gr-assets.com/books/1320399351s/1885.jpg


In [0]:
#spliting the test and train data
training_df, validation_df = ratings_df.randomSplit([.8, .2])

In [0]:
#hyperparameters
iterations = 10
regularization_parameter = 0.1
rank= 4

In [0]:
#creating a ALS model, training and testing using the model
#also find the RMSE 
als = ALS(maxIter=iterations, regParam=regularization_parameter, rank=4, userCol="user_id", itemCol="book_id", ratingCol="rating")
model = als.fit(training_df)
predictions = model.transform(validation_df)
new_predictions = predictions.filter(col('prediction') != np.nan)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
rmse = evaluator.evaluate(new_predictions)
print("Root-mean-square error = " + str(rmse))

Root-mean-square error = 0.8939823911691192


In [0]:
import os
os.environ["SPARKML_TEMP_DFS_PATH"] = "/Volumes/workspace/recommendation_system_data/cv_caching/"


In [0]:
#finding the best model through cross validation and hperparameter tuning
#using the paramglid and cross validation
als = ALS(maxIter=iterations, regParam=regularization_parameter, rank=rank, userCol="user_id", itemCol="book_id", ratingCol="rating")
paramGrid = ParamGridBuilder() \
    .addGrid(als.regParam, [0.1, 0.01, 0.18]) \
    .addGrid(als.rank, range(4, 6)) \
    .build()
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")
crossval = CrossValidator(estimator=als,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=5)
cv_model = crossval.fit(training_df)
final_model = cv_model.bestModel

In [0]:
#prediction using the selected model
predictions = final_model.transform(validation_df)
predictions = predictions.filter(col('prediction') != np.nan)
rmse = evaluator.evaluate(predictions)
print("the rmse for optimal grid parameters with cross validation is: {}".format(rmse))
predictions.show(n = 10)

the rmse for optimal grid parameters with cross validation is: 0.8939823911691192
+-------+-------+------+----------+
|book_id|user_id|rating|prediction|
+-------+-------+------+----------+
|    348|  18714|     4|  3.565439|
|    452|  28057|     4| 3.9035773|
|    472|  41014|     5|  4.339942|
|    671|  28057|     4| 3.9046853|
|    698|  28057|     4| 3.9500213|
|    827|  18714|     3| 3.9161487|
|    858|  18714|     5| 4.3690095|
|    972|  28057|     5| 3.6693347|
|   1012|  28057|     3| 3.8173797|
|   1170|  34159|     5|  4.270044|
+-------+-------+------+----------+
only showing top 10 rows


In [0]:

# Generate top 10 books recommendations for each user
userRecommends = final_model.recommendForAllUsers(10)
# Generate top 10 user recommendations for each movie
bookRecommends = final_model.recommendForAllItems(10)


In [0]:
#writing the books recommendations for each users and vise versa
userRecommends.write.saveAsTable("workspace.recommendation_system_data.books_recommendations")
bookRecommends.write.saveAsTable("workspace.recommendation_system_data.users_recommendations")

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-8305339459285150>, line 2[0m
[1;32m      1[0m [38;5;66;03m#writing the books recommendations for each users and vise versa[39;00m
[0;32m----> 2[0m flat_userRec[38;5;241m.[39mwrite[38;5;241m.[39msaveAsTable([38;5;124m"[39m[38;5;124mworkspace.recommendation_system_data.books_recommendations[39m[38;5;124m"[39m)
[1;32m      3[0m flat_bookRec[38;5;241m.[39mwrite[38;5;241m.[39msaveAsTable([38;5;124m"[39m[38;5;124mworkspace.recommendation_system_data.users_recommendations[39m[38;5;124m"[39m)

File [0;32m/databricks/python/lib/python3.12/site-packages/pyspark/sql/connect/readwriter.py:737[0m, in [0;36mDataFrameWriter.saveAsTable[0;34m(self, name, format, mode, partitionBy, **options)[0m
[1;32m    735[0m [38;5;28mself[39m[38;5;241m.[39m_write[38;5;241m.[39mtable