In [22]:
import pandas as pd
import numpy as np

In [23]:
# Load data from CSV files
books_df = pd.read_csv("data/Books.csv")
ratings_df = pd.read_csv("data/Ratings.csv")
users_df = pd.read_csv("data/Users.csv")

  books_df = pd.read_csv("data/Books.csv")


In [24]:
# Select relevant columns in the books DataFrame
books_df = books_df[['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher','Image-URL-M','Image-URL-L']]
books_df.columns = ['isbn', 'title', 'author', 'year', 'publisher', 'image_m', 'image_l']

In [25]:
# Clean year of publication data
invalid_years = ['Gallimard', 'DK Publishing Inc']
books_df = books_df[~books_df.year.isin(invalid_years)]
books_df.year = pd.to_numeric(books_df.year, errors='coerce')

In [26]:
# Rename the ratings and users DataFrames
ratings_df.columns = ['user_id', 'isbn', 'rating']
users_df.columns = ['user_id', 'location', 'age']

In [27]:
# Clean special characters from ISBN column in ratings DataFrame
filtered_chars = ["(", ")", "'", "\\", "/", "´", " ", ".", "+", "*"]
for char in filtered_chars:
    ratings_df.isbn = ratings_df.isbn.str.replace(char, "")

In [28]:
# Filter ratings with a rating greater than 6
high_ratings = ratings_df[ratings_df.rating > 6]

In [29]:
# Calculate the number of recommendations per ISBN
recommendations_per_isbn = high_ratings.groupby('isbn')['user_id'].count().reset_index()
recommendations_per_isbn.columns = ["isbn", "nb_recommendations"]

In [30]:
# Calculate rating statistics per ISBN
rating_stats = high_ratings.groupby('isbn').agg({"rating": ["mean", "count"]}).reset_index()
rating_stats.columns = ["isbn", "mean_rating", "count_rating"]

In [31]:
# Merge book information with rating statistics
books_df = books_df.merge(rating_stats, how='left', on='isbn')

In [32]:
# Filter ISBNs with more than 20 recommendations
recommendations_per_isbn = recommendations_per_isbn[recommendations_per_isbn.nb_recommendations > 20]

In [33]:
# Calculate the number of recommendations per user
recommendations_per_user = high_ratings.groupby('user_id')['isbn'].count().reset_index()
recommendations_per_user.columns = ["user_id", "nb_recommendations"]

In [34]:
# Filter users with more than 50 recommendations
recommendations_per_user = recommendations_per_user[recommendations_per_user.nb_recommendations > 50]

In [35]:
# Filter ratings based on ISBN and user criteria
high_ratings = high_ratings[high_ratings.user_id.isin(recommendations_per_user.user_id)]
high_ratings = high_ratings[high_ratings.isbn.isin(recommendations_per_isbn.isbn)]

In [36]:
# Create a user-book linking matrix
book_user_matrix = high_ratings.pivot_table(columns=["user_id"], index="isbn", values=["rating"])
book_user_matrix.fillna(0, inplace=True)

In [37]:
# Calculate the book similarity matrix based on user correlations
book_similarity_matrix = book_user_matrix.dot(book_user_matrix.transpose())

In [38]:
# Generate recommendations by finding the top 5 most similar books for each book
recommendations = np.empty((book_similarity_matrix.shape[0], 5))
similarity_matrix_np = book_similarity_matrix.to_numpy()
for row_num in range(similarity_matrix_np.shape[0]):
    row = similarity_matrix_np[row_num]
    recommendations[row_num] = np.argpartition(row, -5)[-5:]

In [39]:
# Create a dictionary of similar book pairs
book_similarity_dict = book_similarity_matrix.to_dict()

In [40]:
# Sort similar book pairs
sorted_similarity_dict = {}
for key, value in book_similarity_dict.items():
    sorted_similarity_dict[key] = [k for k, v in sorted(value.items(), key=lambda item: item[1], reverse=True)[:10]]

In [42]:
# Create a DataFrame of recommendations
top_recommendations_df = pd.DataFrame.from_dict(sorted_similarity_dict, orient="index").reset_index()
top_recommendations_df.columns = ['isbn', 'recommendation_1', 'recommendation_2', 'recommendation_3', 'recommendation_4', 'recommendation_5', 'recommendation_6', 'recommendation_7', 'recommendation_8', 'recommendation_9', 'recommendation_10']

In [43]:

# Export the final data to a CSV file
books_df.to_csv("./processed_books.csv", header=True, sep="|", quotechar='"')

In [148]:
books = books[['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher','Image-URL-M','Image-URL-L']]
books.columns =['isbn', 'title', 'author', 'year', 'publisher','image_m','image_l']
books = books[~books.year.isin(['Gallimard', 'DK Publishing Inc'])]
books.year =pd.to_numeric(books.year)
ratings.columns=['user_id','isbn','rating']
users.columns=["user_id","lication","age"]
filtered_chars = ["(",")","'","\\","/","´"," ",".","+","*"]
for char in filtered_chars:
    ratings.isbn = ratings.isbn.str.replace(char,"")

In [149]:
ratings.isbn.nunique()

340289

In [150]:
ratings

Unnamed: 0,user_id,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6
...,...,...,...
1149775,276704,1563526298,9
1149776,276706,0679447156,0
1149777,276709,0515107662,10
1149778,276721,0590442449,10


In [151]:
reader_recomandations = ratings[ratings.rating > 6]
nb_recomandations_per_isbn = reader_recomandations.groupby('isbn')["user_id"].count().reset_index()
nb_recomandations_per_isbn.columns=["isbn","nb_recomandations"]


In [152]:
rating_stats = ratings.groupby('isbn').agg({"rating":["mean","count"]}).reset_index()
rating_stats.columns = ["isbn","mean_rating","count_rating"]

In [153]:
books =books.merge(rating_stats, how='left', on='isbn')

In [154]:
nb_recomandations_per_isbn = nb_recomandations_per_isbn[nb_recomandations_per_isbn.nb_recomandations > 20]
nb_recomandations_per_user = reader_recomandations.groupby('user_id')["isbn"].count().reset_index()
nb_recomandations_per_user.columns = ["user_id","nb_recomandations"]
nb_recomandations_per_user = nb_recomandations_per_user[nb_recomandations_per_user.nb_recomandations > 50]


In [156]:
reader_recomandations = reader_recomandations[reader_recomandations.user_id.isin(nb_recomandations_per_user.user_id)]
reader_recomandations = reader_recomandations[reader_recomandations.isbn.isin(nb_recomandations_per_isbn.isbn)]
reader_recomandations.loc[:,"recomand"] = 1

In [157]:
book_user_link = reader_recomandations.pivot_table(columns=["user_id"], index="isbn",values=["recomand"])
book_user_link.fillna(0,inplace=True)

In [158]:
book_user_link.head(5)

Unnamed: 0_level_0,recomand,recomand,recomand,recomand,recomand,recomand,recomand,recomand,recomand,recomand,recomand,recomand,recomand,recomand,recomand,recomand,recomand,recomand,recomand,recomand,recomand
user_id,638,643,1733,2033,2110,2276,2766,3363,3827,4017,...,273718,274004,274061,274111,274301,275970,276680,277203,277427,278137
isbn,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
000649840X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0020199600,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0020442203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
002542730X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
0028604199,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [159]:
book_user_link.transpose()

Unnamed: 0_level_0,isbn,000649840X,0020199600,0020442203,002542730X,0028604199,006000438X,0060008032,0060085452,0060096195,0060168013,...,2253150711,3150000017,3257228007,3257229534,3404148665,3423202327,3426029553,3492045170,8806142100,8807813025
Unnamed: 0_level_1,user_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
recomand,638,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
recomand,643,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
recomand,1733,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
recomand,2033,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
recomand,2110,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
recomand,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
recomand,275970,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
recomand,276680,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
recomand,277203,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
recomand,277427,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [160]:
book_book_link = book_user_link.dot(book_user_link.transpose())

In [161]:
recomandations = np.empty((book_book_link.shape[0],5))
np_bbl =book_book_link.to_numpy()
for row_nbr in range(np_bbl.shape[0]):
    row = np_bbl[row_nbr]
    recomandations[row_nbr] =np.argpartition(row, -5)[-5:]

In [162]:
dict_bbl = book_book_link.to_dict()
books.set_index("isbn", inplace=True)

In [164]:
book_book_link.head()

isbn,000649840X,0020199600,0020442203,002542730X,0028604199,006000438X,0060008032,0060085452,0060096195,0060168013,...,2253150711,3150000017,3257228007,3257229534,3404148665,3423202327,3426029553,3492045170,8806142100,8807813025
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000649840X,6.0,0.0,1.0,2.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0
0020199600,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0020442203,1.0,0.0,9.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
002542730X,2.0,0.0,2.0,18.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0028604199,0.0,0.0,0.0,2.0,7.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [165]:
sorted_dict = {}
for key, value in dict_bbl.items():
    sorted_dict[key] = [k for k, v in sorted(value.items(), key=lambda item: item[1],reverse=True)[:10]] 

top_recomandation = pd.DataFrame.from_dict(sorted_dict, orient="index").reset_index()
top_recomandation.columns=['isbn', 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [169]:
top_recomandation

Unnamed: 0,isbn,0,1,2,3,4,5,6,7,8,9
0,000649840X,000649840X,0860074382,002542730X,006016848X,0060920084,0060938455,0064471101,0099747200,0142001740,0156027321
1,0020199600,0020199600,0316666343,0060938455,014028009X,0345313860,0345380371,0440967694,044651652X,0446605239,0449212602
2,0020442203,0020442203,002542730X,0345342968,034540288X,0380002930,0380018179,0380789019,0399501487,0440343194,0440998050
3,002542730X,002542730X,0345370775,0439136350,0064407667,0385492081,0399501487,000649840X,0020442203,0028604199,0060958022
4,0028604199,0028604199,0439139600,0609804138,002542730X,0140067477,0316096199,0345370775,0345380371,0385335881,0385730586
...,...,...,...,...,...,...,...,...,...,...,...
1435,3423202327,000649840X,0060008032,0060085452,006016848X,0060175400,006019491X,0060198133,0060199652,0060256672,0060256737
1436,3426029553,0345339703,0345339711,0345339738,0446670251,0747532745,0747546290,3426029553,000649840X,0020199600,0020442203
1437,3492045170,0312291639,0385335482,0385335881,0385336179,039592720X,3492045170,000649840X,0060008032,0060085452,006016848X
1438,8806142100,8806142100,000649840X,0060008032,0060085452,006016848X,0060175400,006019491X,0060198133,0060199652,0060256672


In [176]:
books = books.merge(top_recomandation, how='left', on='isbn')
books.to_csv("./processed_books.csv",header=True, sep="|",quotechar='"')

In [44]:
%%sh
head ./processed_books.csv 

|isbn|title|author|year|publisher|image_m|image_l|mean_rating|count_rating
0|0195153448|Classical Mythology|Mark P. O. Morford|2002|Oxford University Press|http://images.amazon.com/images/P/0195153448.01.MZZZZZZZ.jpg|http://images.amazon.com/images/P/0195153448.01.LZZZZZZZ.jpg||
1|0002005018|Clara Callan|Richard Bruce Wright|2001|HarperFlamingo Canada|http://images.amazon.com/images/P/0002005018.01.MZZZZZZZ.jpg|http://images.amazon.com/images/P/0002005018.01.LZZZZZZZ.jpg|8.285714285714286|7.0
2|0060973129|Decision in Normandy|Carlo D'Este|1991|HarperPerennial|http://images.amazon.com/images/P/0060973129.01.MZZZZZZZ.jpg|http://images.amazon.com/images/P/0060973129.01.LZZZZZZZ.jpg|7.5|2.0
3|0374157065|Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It|Gina Bari Kolata|1999|Farrar Straus Giroux|http://images.amazon.com/images/P/0374157065.01.MZZZZZZZ.jpg|http://images.amazon.com/images/P/0374157065.01.LZZZZZZZ.jpg|8.75|4.0
4|0393045218|The M