In [1]:
# importing important libraries
import numpy as np
import pandas as pd

In [2]:
pd.options.mode.chained_assignment = None

In [3]:
# reading the Ratings database
ratings = pd.read_csv('Ratings.csv')
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


BUILDING THE MODEL

In [4]:
# removing the system induced rating to have accurate averate ratings and actual user data
ratings.drop(index=ratings[ratings["Book-Rating"]==0].index, inplace = True)
ratings

Unnamed: 0,User-ID,ISBN,Book-Rating
1,276726,0155061224,5
3,276729,052165615X,3
4,276729,0521795028,6
6,276736,3257224281,8
7,276737,0600570967,6
...,...,...,...
1149773,276704,0806917695,5
1149775,276704,1563526298,9
1149777,276709,0515107662,10
1149778,276721,0590442449,10


In [5]:
# choosing a random ID to suggest books to that user
user_id = 215986

In [6]:
# finding all the books rated by that user
user_rating = ratings[ratings['User-ID']==user_id]
user_books = set(user_rating['ISBN'])
user_books

{'0020868308',
 '0060256672',
 '0140042407',
 '0140186425',
 '0142004235',
 '0156904365',
 '0345339703',
 '0345339711',
 '037541326X',
 '0375703764',
 '0375726403',
 '0380712520',
 '0385319673',
 '0439064864',
 '0439136350',
 '043935806X',
 '044651747x',
 '0446600253',
 '0446678678',
 '0449210928',
 '0451524934',
 '0553379534',
 '0590353403',
 '0609804022',
 '0671496107',
 '0679410325',
 '0679439382',
 '0679781358',
 '069022608X',
 '0743418174',
 '0785266453',
 '0786885084',
 '078688939X',
 '0802713661',
 '0809224763',
 '0811831981',
 '0849917182',
 '0864425139',
 '0864426003',
 '0878422420',
 '0880706317',
 '0891072926',
 '1557483159',
 '1573229326',
 '1573229571',
 '1573540455',
 '1853263648',
 '1864503750',
 '1864503769',
 '1879290197'}

In [7]:
# finding all the ratings on those books selected by that user
overlapped_data = ratings[ratings['ISBN'].isin(user_books)]
overlapped_data

Unnamed: 0,User-ID,ISBN,Book-Rating
84,276788,043935806X,7
837,277072,1573229571,9
2285,277527,0679439382,7
2353,277575,0142004235,5
4260,278356,043935806X,10
...,...,...,...
1148156,276165,043935806X,10
1148185,276165,0590353403,10
1148513,276313,0345339703,8
1148708,276463,0345339711,10


In [8]:
# finding the set of userd who have rated the same kind of books,
# selected users with only more that 2 books in common to increase the generality in interests
overlapped_users = overlapped_data.groupby('User-ID')['ISBN'].count()>2
overlapped_users=overlapped_users[overlapped_users].index

In [9]:
# finding all the ratings given by those users
rating = ratings[ratings['User-ID'].isin(overlapped_users)]
rating

Unnamed: 0,User-ID,ISBN,Book-Rating
10052,254,0060502320,7
10057,254,0060934700,9
10059,254,0060976977,7
10071,254,0064471047,7
10073,254,0066238501,5
...,...,...,...
1148191,276165,0743411420,9
1148192,276165,0743444078,10
1148193,276165,0743455827,5
1148194,276165,0743456424,10


In [10]:
# checking out all the unique books there are rated by those users
rating['ISBN'].unique()

array(['0060502320', '0060934700', '0060976977', ..., '0743444078',
       '0743455827', '0743456424'], dtype=object)

In [11]:
# creating book index and user index for the ratings database
rating['user_index']=rating['User-ID'].astype("category").cat.codes
rating['book_index']=rating['ISBN'].astype('category').cat.codes

In [12]:
rating

Unnamed: 0,User-ID,ISBN,Book-Rating,user_index,book_index
10052,254,0060502320,7,0,365
10057,254,0060934700,9,0,547
10059,254,0060976977,7,0,612
10071,254,0064471047,7,0,977
10073,254,0066238501,5,0,1024
...,...,...,...,...,...
1148191,276165,0743411420,9,102,10742
1148192,276165,0743444078,10,102,10803
1148193,276165,0743455827,5,102,10827
1148194,276165,0743456424,10,102,10829


In [13]:
print(len(rating['book_index'].unique()))
print(len(rating['user_index'].unique()))

16679
103


In [14]:
from scipy.sparse import coo_matrix

In [15]:
# creating a sparse metrix with book rating as values, user index as rows and book index as columns
matrix_coo =coo_matrix((rating['Book-Rating'], (rating['user_index'], rating['book_index'])))

In [16]:
matrix_coo

<103x16679 sparse matrix of type '<class 'numpy.int64'>'
	with 20274 stored elements in COOrdinate format>

In [17]:
#converting the matrix type to use diffrent functions
rating_matrix = matrix_coo.tocsc()

In [18]:
# finding the index given to our user
our_user =rating[rating['User-ID']==user_id]['user_index'].unique()[0]
our_user

77

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

In [20]:
# finding the similarity between our user and others
similarity = cosine_similarity(rating_matrix[our_user,:],rating_matrix).flatten()
similarity

array([0.08957266, 0.06396104, 0.17183205, 0.18777347, 0.02918065,
       0.0439481 , 0.02242731, 0.02214087, 0.14260494, 0.03578205,
       0.11336222, 0.09387863, 0.03435382, 0.05001935, 0.04829118,
       0.12669504, 0.06680219, 0.05148993, 0.22501758, 0.02737585,
       0.0791501 , 0.1244199 , 0.03283543, 0.0195201 , 0.0633884 ,
       0.04620182, 0.0495466 , 0.04133023, 0.04182025, 0.12230076,
       0.10054152, 0.06198908, 0.08986337, 0.02266707, 0.02179558,
       0.05704404, 0.03232838, 0.10834835, 0.02665749, 0.03828401,
       0.02871709, 0.03915272, 0.13816205, 0.09717481, 0.03062816,
       0.10081299, 0.12013803, 0.06232247, 0.05482122, 0.02904846,
       0.07721264, 0.12099088, 0.09628805, 0.0294692 , 0.10560391,
       0.04823891, 0.03378642, 0.05552571, 0.07146005, 0.10409786,
       0.13472559, 0.07866325, 0.10612304, 0.0704056 , 0.04341723,
       0.12935106, 0.082272  , 0.06613515, 0.23056   , 0.22501758,
       0.14355262, 0.103354  , 0.07526283, 0.07305342, 0.07861

In [21]:
len(similarity)

103

In [22]:
#  arg sorting the last 10 user indexes to find the 10 most relevant users, then storing those 10 users in an array
indices = np.argpartition(similarity,-10)[-10:]
indices

array([  3,  94, 101,  91,  69,  77,  96, 100,  18,  68], dtype=int64)

In [23]:
# finding all the ratings given by those top 10 users
similar_users = rating[rating['user_index'].isin(indices)].copy()
similar_users = similar_users[similar_users['user_index']!=our_user]
similar_users.rename(columns={'Book-Rating':'BookRating'},inplace=True)
similar_users

Unnamed: 0,User-ID,ISBN,BookRating,user_index,book_index
27114,6431,0060193395,9,3,278
27116,6431,0373217242,6,3,3455
27126,6431,0373709323,6,3,3577
27131,6431,0373710062,5,3,3584
27148,6431,0373834756,5,3,3610
...,...,...,...,...,...
1145154,275401,0552133000,9,101,7726
1145158,275401,0590353403,9,101,8545
1145163,275401,0747241406,9,101,10900
1145165,275401,0929480384,8,101,13285


In [24]:
# removing the books already read by our user
similar_users = similar_users[~similar_users['ISBN'].isin(user_books)]
similar_users

Unnamed: 0,User-ID,ISBN,BookRating,user_index,book_index
27114,6431,0060193395,9,3,278
27116,6431,0373217242,6,3,3455
27126,6431,0373709323,6,3,3577
27131,6431,0373710062,5,3,3584
27148,6431,0373834756,5,3,3610
27151,6431,0425144372,9,3,5251
27155,6431,0439139597,10,3,5503
27157,6431,0446363669,8,3,6196
27163,6431,0684853523,8,3,10136
27164,6431,0782141676,5,3,11321


In [25]:
# creating a database to mentain the average rating and rating counts of each books
recommendations = similar_users.groupby('ISBN').BookRating.agg(['count','mean'])

In [26]:
# only keeping the books whose average rating were >= 5
recommendations=recommendations[recommendations['mean']>=5]
recommendations

Unnamed: 0_level_0,count,mean
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1
0060193395,1,9.0
0310205719,1,8.0
0373217242,1,6.0
0373709323,1,6.0
0373710062,1,5.0
0373834756,1,5.0
0425144372,1,9.0
0439064872,1,10.0
0439136369,1,10.0
0439139597,7,10.0


In [27]:
books = pd.read_csv('Books.csv')

  books = pd.read_csv('Books.csv')


In [28]:
# finding the recommended books and sorting them
recommended_books = books.merge(recommendations, how='right',on='ISBN')
recommended_books.dropna(subset=['Book-Title'],inplace=True)
recommended_books=recommended_books.sort_values(by='count',ascending=False).head()
recommended_books

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,count,mean
9,439139597,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,2000,Scholastic,http://images.amazon.com/images/P/0439139597.0...,http://images.amazon.com/images/P/0439139597.0...,http://images.amazon.com/images/P/0439139597.0...,7,10.0
10,439139600,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,2002,Scholastic Paperbacks,http://images.amazon.com/images/P/0439139600.0...,http://images.amazon.com/images/P/0439139600.0...,http://images.amazon.com/images/P/0439139600.0...,2,9.5
0,60193395,Body for Life: 12 Weeks to Mental and Physical...,Bill Phillips,1999,HarperCollins,http://images.amazon.com/images/P/0060193395.0...,http://images.amazon.com/images/P/0060193395.0...,http://images.amazon.com/images/P/0060193395.0...,1,9.0
13,451213599,Four Past Midnight,Stephen King,2004,Signet Book,http://images.amazon.com/images/P/0451213599.0...,http://images.amazon.com/images/P/0451213599.0...,http://images.amazon.com/images/P/0451213599.0...,1,8.0
23,972393617,Ultimate Unofficial Guide to the Mysteries of ...,Galadriel Waters,2003,Wizarding World Press,http://images.amazon.com/images/P/0972393617.0...,http://images.amazon.com/images/P/0972393617.0...,http://images.amazon.com/images/P/0972393617.0...,1,10.0


CLUBBING EVERYTHING INTO A FUNCTION

In [29]:
#making images viewable and clickable
def show_image(val):
    return '<a href="{}"><img src="{}" width=50></img></a>'.format(val, val)

In [30]:
# function to first show the prefered books of users
def show_liked(userID):
    userID = pd.to_numeric(userID)
    
    user_rating = ratings[ratings['User-ID']==userID]
    prefered_books = user_rating.sort_values(by='Book-Rating',ascending=False).head()

    bookID= set(prefered_books['ISBN'])

    book_details = books[books['ISBN'].isin(bookID)]

    book_details.drop(columns=['Image-URL-M','Image-URL-L'], inplace=True)

    if(not book_details.empty):
        print("User prefrences:")
        display(book_details.style.format({'Image-URL-S':show_image}))
    else:
        print("User have not rated any books yet.")
    

    

In [31]:
def user_recommendation(userID):
    userID = pd.to_numeric(userID)
    if(not (userID in set(ratings['User-ID']))):
        return "User not found in database."
    show_liked(userID)
    
    # finding all the books rated by that user
    user_rating = ratings[ratings['User-ID']==userID]
    user_books = set(user_rating['ISBN'])

    # finding all the ratings on those books selected by that user
    overlapped_data = ratings[ratings['ISBN'].isin(user_books)]

    # finding the set of userd who have rated the same kind of books,
    # selected users with only more that 2 books in common to increase the generality in interests
    overlapped_users = overlapped_data.groupby('User-ID')['ISBN'].count()>2
    overlapped_users=overlapped_users[overlapped_users].index

    # finding all the ratings given by those users
    rating = ratings[ratings['User-ID'].isin(overlapped_users)]

    # creating book index and user index for the ratings database
    rating['user_index']=rating['User-ID'].astype("category").cat.codes
    rating['book_index']=rating['ISBN'].astype('category').cat.codes

    # creating a sparse metrix with book rating as values, user index as rows and book index as columns
    matrix_coo =coo_matrix((rating['Book-Rating'], (rating['user_index'], rating['book_index'])))

    #converting the matrix type to use diffrent functions
    rating_matrix = matrix_coo.tocsc()

    # finding the index given to our user
    our_user =rating[rating['User-ID']==userID]['user_index'].unique()[0]

    # finding the similarity between our user and others
    similarity = cosine_similarity(rating_matrix[our_user,:],rating_matrix).flatten()

    #  arg sorting the last userS indexes to find the most relevant users, then storing those users in an array
    if(len(similarity)>=10):
        indices = np.argpartition(similarity,-10)[-10:]
    else:
        x = len(similarity)
        indices = np.argpartition(similarity,-x)[-x:]

    # finding all the ratings given by those top 10 users
    similar_users = rating[rating['user_index'].isin(indices)].copy()
    similar_users = similar_users[similar_users['user_index']!=our_user]
    similar_users.rename(columns={'Book-Rating':'BookRating'},inplace=True)

    # removing the books already read by our user
    similar_users = similar_users[~similar_users['ISBN'].isin(user_books)]

    # creating a database to mentain the average rating and rating counts of each books
    recommendations = similar_users.groupby('ISBN').BookRating.agg(['count','mean'])

    # only keeping the books whose average rating were >= 5
    recommendations=recommendations[recommendations['mean']>=5]

    # finding the recommended books and sorting them
    recommended_books = books.merge(recommendations, how='right',on='ISBN')
    recommended_books.dropna(subset=['Book-Title'],inplace=True)
    recommended_books=recommended_books.sort_values(by='count',ascending=False).head()

    recommended_books.drop(columns=['Image-URL-M','Image-URL-L','mean','count'], inplace=True)
    if(not recommended_books.empty):
        print('Recommendations for the user')
        return recommended_books.style.format({'Image-URL-S':show_image})
    print("User information not enough to show recommendations.")
    

TESTING OUT OUR RECOMMENDATION MODEL

In [32]:
user_recommendation(215986)

User prefrences:


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S
780,345339711,"The Two Towers (The Lord of the Rings, Part 2)",J.R.R. TOLKIEN,1986,Del Rey,
878,1573229571,About a Boy (Movie Tie-In),Nick Hornby,2002,Riverhead Books,
2768,142004235,East of Eden (Oprah's Book Club),John Steinbeck,2003,Penguin Books,
15055,679410325,"Secret History, The",DONNA TARTT,1992,Knopf,
20315,20868308,"Mere Christianity: A revised and enlarged edition, with a new introduction, of the three books, The case for Christianity, Christian behaviour, and Beyond personality",C. S Lewis,1984,Macmillan Pub. Co,


Recommendations for the user


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S
9,439139597,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,2000,Scholastic,
10,439139600,Harry Potter and the Goblet of Fire (Book 4),J. K. Rowling,2002,Scholastic Paperbacks,
0,60193395,Body for Life: 12 Weeks to Mental and Physical Strength,Bill Phillips,1999,HarperCollins,
13,451213599,Four Past Midnight,Stephen King,2004,Signet Book,
23,972393617,Ultimate Unofficial Guide to the Mysteries of Harry Potter,Galadriel Waters,2003,Wizarding World Press,


In [33]:
user_recommendation(187598)

User prefrences:


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S
106828,2290321125,La tour sombre 3 - terres perdues,King Stephen Mickael,2002,J'ai lu,
106831,229012950X,"La Tour sombre, tome 1 : Le pistolero",Stephen King,2000,J'ai lu,
145524,2226034536,Ã?Â?a,Stephen King,2000,Albin Michel,
145537,2253144894,PÃ?Â©plum,AmÃ?Â©lie Nothomb,1998,LGF,
145550,2290301868,LA Ligne Verte,Stephen King,2000,Editions 84,


Recommendations for the user


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S
6738,2290311782,Je Voudrais Que Quelqu'un M'Attende Quelque Part,Anna Gavalda,2001,Editions J'Ai Lu,
2058,038549081X,The Handmaid's Tale : A Novel,Margaret Atwood,1998,Anchor,
6569,2070416801,Balzac Et LA Petite Tailleuse Chino,Daj Sijie,2002,Gallimard Jeunesse,
2960,0446672351,Reservation Blues,Sherman Alexie,1996,Warner Books,
6557,2070394727,La Classe de neige,Emmanuel CarrÃ?Â¨re,1997,Gallimard,


In [34]:
user_recommendation(6617)

'User not found in database.'

In [35]:
user_recommendation(166017)

User prefrences:


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S
42497,380472821,Birdy M/TV,William Wharton,1985,Avon Books,
161548,1582340854,As Seen on TV : Provocations,Lucy Grealy,2000,Bloomsbury USA,
182775,60466588,Principles of Human Physiology,Gerard J. Tortora,1986,HarperCollins Publishers,
182784,882334212,"An Anthology of Russian Neo-Realism: The \Znanie\"" School of Maxim Gorky""",Nicholas Luker,1982,Ardis,
182790,964561123,The Spirit of Prague and Other Essays,Ivan Klima,1995,Granta (NY),


Recommendations for the user


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S
4572,0679410325,"Secret History, The",DONNA TARTT,1992,Knopf,
450,0066212855,"Krakatoa : The Day the World Exploded: August 27, 1883",Simon Winchester,2003,HarperCollins,
4663,067976402X,Snow Falling on Cedars,David Guterson,1995,Vintage Books USA,
1469,0345422317,The Genesis Code,John Case,1998,Ballantine Books,
3395,0452279178,"Wizard and Glass (The Dark Tower, Book 4)",Stephen King,1997,Plume Books,
