In [122]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data is contained in '../Data/Processed' directory
# This cell lists all files under the input directory

import os
# The processed data directory is <parent of the current working directory>/Data/Processed.
processed_dir_parts = (os.path.dirname(os.getcwd()), 'Data', 'Processed')
INPUT_DIR = os.path.join(*processed_dir_parts)
# Walk the directory tree and print the full path of every file found, one per line.
for dirname, _, filenames in os.walk(INPUT_DIR):
    for filename in filenames:
        file_path = os.path.join(dirname, filename)
        print(file_path)

c:\Users\ASUS\Documents\Python Programming\Book recommendations\Data\Processed\average_ratings_by_user.csv
c:\Users\ASUS\Documents\Python Programming\Book recommendations\Data\Processed\Books_valid_ISBN_known_year_no_images.csv
c:\Users\ASUS\Documents\Python Programming\Book recommendations\Data\Processed\normalized_user-book_matrix.csv
c:\Users\ASUS\Documents\Python Programming\Book recommendations\Data\Processed\popular_books_with_descriptions.csv
c:\Users\ASUS\Documents\Python Programming\Book recommendations\Data\Processed\ratings_for_popular_books.csv
c:\Users\ASUS\Documents\Python Programming\Book recommendations\Data\Processed\Ratings_valid_ISBN.csv
c:\Users\ASUS\Documents\Python Programming\Book recommendations\Data\Processed\user-book_matrix.csv
c:\Users\ASUS\Documents\Python Programming\Book recommendations\Data\Processed\user-book_matrix_normalized_withna.csv
c:\Users\ASUS\Documents\Python Programming\Book recommendations\Data\Processed\users_valid_age_with_country.csv


We are going to build a simple collaborative filtering engine. First, we load the two user-book matrices we computed earlier: the fully populated normalized one and the normalized one that keeps NaNs where no rating was given.

In [123]:
# Normalized user-book matrix in which missing ratings are kept as NaN.
# The first CSV column is used as the row index.
withna_path = os.path.join(INPUT_DIR, 'user-book_matrix_normalized_withna.csv')
user_book_withna = pd.read_csv(withna_path, index_col=0)
# DataFrame.info() prints its report itself and returns None, so "None" is printed after it.
print(user_book_withna.info())
print(user_book_withna.head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10637 entries, 26 to 278844
Data columns (total 95 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   0060392452  158 non-null    float64
 1   0060502258  166 non-null    float64
 2   0060928336  320 non-null    float64
 3   0060930535  218 non-null    float64
 4   0060934417  159 non-null    float64
 5   0060938455  154 non-null    float64
 6   0060976845  186 non-null    float64
 7   0060987103  156 non-null    float64
 8   006101351X  118 non-null    float64
 9   014028009X  131 non-null    float64
 10  0140293248  165 non-null    float64
 11  0142000205  129 non-null    float64
 12  0142001740  307 non-null    float64
 13  0312195516  383 non-null    float64
 14  0312278586  226 non-null    float64
 15  0312291639  167 non-null    float64
 16  0312305060  129 non-null    float64
 17  0316096199  133 non-null    float64
 18  0316284955  171 non-null    float64
 19  0316601950  272 non-nul

In [124]:
# Normalized user-book matrix (info() below reports every listed column as fully non-null).
# The first CSV column is used as the row index.
normalized_path = os.path.join(INPUT_DIR, 'normalized_user-book_matrix.csv')
user_book_normalized = pd.read_csv(normalized_path, index_col=0)
# DataFrame.info() prints its report itself and returns None, so "None" is printed after it.
print(user_book_normalized.info())
print(user_book_normalized.head())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10637 entries, 26 to 278844
Data columns (total 95 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   0060392452  10637 non-null  float64
 1   0060502258  10637 non-null  float64
 2   0060928336  10637 non-null  float64
 3   0060930535  10637 non-null  float64
 4   0060934417  10637 non-null  float64
 5   0060938455  10637 non-null  float64
 6   0060976845  10637 non-null  float64
 7   0060987103  10637 non-null  float64
 8   006101351X  10637 non-null  float64
 9   014028009X  10637 non-null  float64
 10  0140293248  10637 non-null  float64
 11  0142000205  10637 non-null  float64
 12  0142001740  10637 non-null  float64
 13  0312195516  10637 non-null  float64
 14  0312278586  10637 non-null  float64
 15  0312291639  10637 non-null  float64
 16  0312305060  10637 non-null  float64
 17  0316096199  10637 non-null  float64
 18  0316284955  10637 non-null  float64
 19  0316601950  10637 non-n

We also load the average rating for each user.

In [125]:
average_ratings_path = os.path.join(INPUT_DIR, 'average_ratings_by_user.csv')
# squeeze() collapses the single data column, giving a Series indexed by the first CSV column.
user_average_ratings = pd.read_csv(average_ratings_path, index_col=0).squeeze()
print(user_average_ratings)

User-ID
26        10.000000
51         9.000000
91         9.000000
114       10.000000
165        9.000000
            ...    
278755     1.000000
278773     8.000000
278798     7.000000
278843     8.333333
278844     7.000000
Name: 0, Length: 10637, dtype: float64


We will look at both item-based recommendations and user-based. 

Item-based recommendations don't change much over time (an individual user might start liking other things, but if two books used to appeal to the same people, they will most probably continue to do so). Another advantage is that books are added to the database much more rarely than users, so this kind of recommendation can be pre-calculated. 

A disadvantage is that we will probably give very obvious suggestions, such as "If you liked Harry Potter and the Philosopher's Stone, you might like Harry Potter and the Chamber of Secrets".

To give item-based recommendations, we transpose the matrix we have for convenience.

In [126]:
# Flip the matrix so that each row is a book and each column is a user.
book_user_normalized = user_book_normalized.transpose()
print(book_user_normalized.head())

User-ID     26      51      91      114     165     243     244     254     \
0060392452     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
0060502258     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
0060928336     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
0060930535     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
0060934417     0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

User-ID     256     280     ...  278582  278586  278633  278653  278698  \
0060392452     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0   
0060502258     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0   
0060928336     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0   
0060930535     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0   
0060934417     0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0   

User-ID     278755  278773  278798  278843  278844  
0060392452     0.0     0.0 

Now we compute pairwise cosine similarities between books.

In [127]:
from sklearn.metrics.pairwise import cosine_similarity

# Rows of book_user_normalized are books, so this is a book-by-book similarity matrix.
similarities = cosine_similarity(book_user_normalized)
# Label both axes with the book identifiers so rows can be looked up by ISBN.
book_labels = book_user_normalized.index
cosine_similarity_df = pd.DataFrame(data=similarities, index=book_labels, columns=book_labels)
print(cosine_similarity_df.head())

            0060392452  0060502258  0060928336  0060930535  0060934417  \
0060392452    1.000000   -0.004525    0.002793   -0.065613    0.009068   
0060502258   -0.004525    1.000000    0.003011    0.005591   -0.048528   
0060928336    0.002793    0.003011    1.000000   -0.008091   -0.005145   
0060930535   -0.065613    0.005591   -0.008091    1.000000   -0.012554   
0060934417    0.009068   -0.048528   -0.005145   -0.012554    1.000000   

            0060938455  0060976845  0060987103  006101351X  014028009X  ...  \
0060392452    0.049843   -0.026154    0.005337    0.000000   -0.011627  ...   
0060502258   -0.045683   -0.089122    0.008107    0.003911    0.005387  ...   
0060928336   -0.005016   -0.000799   -0.013364   -0.000834   -0.043738  ...   
0060930535   -0.033716   -0.043934   -0.012240    0.014658   -0.026438  ...   
0060934417    0.010694   -0.065469    0.008616   -0.001533   -0.007323  ...   

            068484477X  0684872153  0743237188  0743418174  0786868716  \
006039

It would be nice to check if the similarities we computed make any sense. Let us load the book information.

In [128]:
# ISBNs are identifiers, not numbers. Read them explicitly as strings so that leading
# zeros and a trailing 'X' check digit are preserved no matter what pandas' type
# inference would choose for this particular file.
books_df = pd.read_csv(os.path.join(INPUT_DIR, 'popular_books_with_descriptions.csv'),
                       dtype={'ISBN': str})
# DataFrame.info() prints its report itself and returns None, so "None" is printed after it.
print(books_df.info())
print(books_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95 entries, 0 to 94
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   title        95 non-null     object
 1   authors      95 non-null     object
 2   description  95 non-null     object
 3   ISBN         95 non-null     object
dtypes: object(4)
memory usage: 3.1+ KB
None
                                              title            authors  \
0                                       Wild Animus   ['Rich Shapero']   
1                                  The Lovely Bones   ['Alice Sebold']   
2                                 The Da Vinci Code      ['Dan Brown']   
3  Divine secrets of the Ya-Ya Sisterhood : a novel  ['Rebecca Wells']   
4                                      The Red Tent  ['Anita Diamant']   

                                         description        ISBN  
0  Wild animus is a search for the primordial, a ...  0971880107  
1  The spirit of fourteen-ye

We will index the books by ISBN for convenience.

In [129]:
# Index the catalogue by ISBN. The guard keeps the cell re-runnable: once 'ISBN' has
# become the index it is no longer a column, and calling set_index('ISBN') again
# would raise a KeyError.
if 'ISBN' in books_df.columns:
    books_df = books_df.set_index('ISBN')
print(books_df)

                                                       title  \
ISBN                                                           
0971880107                                       Wild Animus   
0316666343                                  The Lovely Bones   
0385504209                                 The Da Vinci Code   
0060928336  Divine secrets of the Ya-Ya Sisterhood : a novel   
0312195516                                      The Red Tent   
...                                                      ...   
0142000205                                        Icy Sparks   
0449212602                               The Handmaid's Tale   
080410753X                            The Kitchen God's Wife   
0345351525                           The Queen of the Damned   
014028009X                             Bridget Jones's Diary   

                         authors  \
ISBN                               
0971880107      ['Rich Shapero']   
0316666343      ['Alice Sebold']   
0385504209         ['Da

Let us find books that are most similar to Harry Potter and the Philosopher's Stone, which has ISBN 059035342X.

In [130]:
potter_ISBN = '059035342X'
# Similarity of every book to the chosen one, most similar first.
potter_row = cosine_similarity_df.loc[potter_ISBN]
potter_similarities_ordered = potter_row.sort_values(ascending=False)
print(potter_similarities_ordered.head(10))
# Look up the catalogue entries in the same order to see the titles behind the ISBNs.
potter_books_ordered = books_df.loc[potter_similarities_ordered.index]
print(potter_books_ordered.head(10))

059035342X    1.000000
0060938455    0.108225
0345342968    0.070941
0440221471    0.064838
0316096199    0.059336
0440211727    0.053631
0345361792    0.052023
0312195516    0.050152
0446610038    0.026487
0449212602    0.026390
Name: 059035342X, dtype: float64
                                            title              authors  \
059035342X  Harry Potter and the Sorcerer's Stone    ['J. K. Rowling']   
0060938455                       Fast Food Nation   ['Eric Schlosser']   
0345342968                         Fahrenheit 451     ['Ray Bradbury']   
0440221471                       The Runaway Jury     ['John Grisham']   
0316096199                                  Lucky     ['Alice Sebold']   
0440211727                         A Time to Kill     ['John Grisham']   
0345361792                A Prayer for Owen Meany      ['John Irving']   
0312195516                           The Red Tent    ['Anita Diamant']   
0446610038                             1st to Die  ['James Patterson'] 

The result is quite puzzling - out of the top 10 books none is even a kid's book (except for the book itself obviously). It might be that the book we chose is just too popular and is liked by all sorts of people. Let us try our luck with Bridget Jones's Diary, ISBN 014028009X.

In [131]:
bridget_ISBN = '014028009X'
# Similarity of every book to the chosen one, most similar first.
bridget_row = cosine_similarity_df.loc[bridget_ISBN]
bridget_similarities_ordered = bridget_row.sort_values(ascending=False)
print(bridget_similarities_ordered.head(10))
# Look up the catalogue entries in the same order to see the titles behind the ISBNs.
bridget_books_ordered = books_df.loc[bridget_similarities_ordered.index]
print(bridget_books_ordered.head(10))

014028009X    1.000000
0553572997    0.067094
0446606812    0.061517
0440236673    0.035990
0385335482    0.030780
0743237188    0.026424
044023722X    0.022410
006101351X    0.021034
0380789035    0.020333
0385484518    0.019147
Name: 014028009X, dtype: float64
                                  title                  authors  \
014028009X        Bridget Jones's Diary       ['Helen Fielding']   
0553572997                 The Alienist           ['Caleb Carr']   
0446606812          Message in a Bottle      ['Nicholas Sparks']   
0440236673                 The Brethren         ['John Grisham']   
0385335482  Confessions of a Shopaholic      ['Sophie Kinsella']   
0743237188           Fall on your knees  ['Ann-Marie MacDonald']   
044023722X              A Painted House         ['John Grisham']   
006101351X            The Perfect Storm     ['Sebastian Junger']   
0380789035                American Gods          ['Neil Gaiman']   
0385484518         Tuesdays with Morrie          ['Mitch 

Here at least some of the top recommendations seem likely to appeal to working women in their 30s, which is the audience Bridget Jones's Diary is aimed at. To be fair, all the similarities we get are pretty small, both for the Harry Potter book and for Bridget Jones's Diary. 

Let us also compute the user-user similarity matrix.

In [132]:
# Rows of user_book_normalized are users, so this is a user-by-user similarity matrix.
user_similarities = cosine_similarity(user_book_normalized)
# Label both axes with the matrix's row index (the user identifiers).
user_labels = user_book_normalized.index
user_similarities_df = pd.DataFrame(data=user_similarities, index=user_labels, columns=user_labels)
print(user_similarities_df.head())

User-ID  26      51      91      114     165     243     244     254     \
User-ID                                                                   
26          0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
51          0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
91          0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
114         0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
165         0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

User-ID  256     280     ...  278582  278586  278633  278653  278698  278755  \
User-ID                  ...                                                   
26          0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
51          0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
91          0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
114         0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0

We notice that we are not getting 1.0 on the diagonal as we should. It seems there is an issue in sklearn with cosine similarities when the number of features is high (see https://github.com/scikit-learn/scikit-learn/issues/21939). Instead, we will use the pairwise cosine distances and compute the similarity as 1 - distance. 

In [133]:
from sklearn.metrics import pairwise_distances

# Similarity is derived from the cosine distance as 1 - distance. The distance matrix is
# not bound to a name of its own, so only one users-by-users array stays alive.
# NOTE(review): a row that is entirely zero has no well-defined cosine similarity; the 1.0
# values on the diagonal for such rows presumably come from the self-distance being
# reported as 0 -- confirm this is the intended treatment.
user_similarities = 1 - pairwise_distances(user_book_normalized.values, metric="cosine")
user_labels = user_book_normalized.index
user_similarities_df = pd.DataFrame(data=user_similarities, index=user_labels, columns=user_labels)
print(user_similarities_df.head())

User-ID  26      51      91      114     165     243     244     254     \
User-ID                                                                   
26          1.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
51          0.0     1.0     0.0     0.0     0.0     0.0     0.0     0.0   
91          0.0     0.0     1.0     0.0     0.0     0.0     0.0     0.0   
114         0.0     0.0     0.0     1.0     0.0     0.0     0.0     0.0   
165         0.0     0.0     0.0     0.0     1.0     0.0     0.0     0.0   

User-ID  256     280     ...  278582  278586  278633  278653  278698  278755  \
User-ID                  ...                                                   
26          0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
51          0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
91          0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
114         0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0

It seems that we have a lot of zeroes in our matrix. Let us compute for every user the sum of absolute values of similarities with other users.

In [134]:
# Per-user sum of absolute similarities; each row sum includes the diagonal (self) entry.
abs_similarity_totals = user_similarities_df.abs().sum(axis=1)
# Show the 20 users with the largest totals.
print(abs_similarity_totals.sort_values(ascending=False).head(20))

User-ID
81318     197.111244
89602     192.192294
164581    190.535953
13273     190.238831
253005    188.898626
271448    188.698244
16795     187.407440
240567    186.873161
86189     186.506445
225595    186.041138
10560     186.003656
144549    185.317548
185955    185.317548
176916    185.317548
28289     185.317548
11676     184.558313
11224     183.333987
229551    181.503115
35859     181.394168
40299     180.457546
dtype: float64


So, there are some users who have non-zero similarities with others; that's a relief. Let us take the user with User-ID 35859, who has some 'similar' (or very 'dissimilar', since we were looking at absolute values) users, and try to predict the ratings this user would give to the books they have not read.

First, we check what the user has read. We load ratings dataframe for that.

In [135]:
# ISBNs are identifiers, not numbers. Read them explicitly as strings so that leading
# zeros and a trailing 'X' check digit are preserved no matter what pandas' type
# inference would choose, and so that they keep matching the ISBN labels used elsewhere.
ratings_df = pd.read_csv(os.path.join(INPUT_DIR, 'ratings_for_popular_books.csv'),
                         dtype={'ISBN': str})
# DataFrame.info() prints its report itself and returns None, so "None" is printed after it.
print(ratings_df.info())
print(ratings_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17837 entries, 0 to 17836
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   User-ID      17837 non-null  int64 
 1   ISBN         17837 non-null  object
 2   Book-Rating  17837 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 418.2+ KB
None
   User-ID        ISBN  Book-Rating
0   276788  043935806X            7
1   276925  0385504209            8
2   276953  0446310786           10
3   276964  0440220602            9
4   277042  0971880107            2


In [136]:
# User-ID of the target user whose ratings the following cells try to predict.
user = 35859

In [137]:
# ISBNs of all books the target user has rated, followed by their catalogue entries.
rated_by_user = ratings_df['User-ID'] == user
rated_isbns = ratings_df.loc[rated_by_user, 'ISBN']
print(books_df.loc[rated_isbns])

                                               title  \
ISBN                                                   
0060502258    Divine Secrets of the Ya-Ya Sisterhood   
0060934417                                 Bel Canto   
0142001740                   The Secret Life of Bees   
0312195516                              The Red Tent   
0312278586                         The Nanny Diaries   
0316666343                          The Lovely Bones   
0345342968                            Fahrenheit 451   
0375700757                             Cold Mountain   
0375707972                                The Reader   
0385722206  Balzac and the Little Chinese Seamstress   
044021145X                                  The Firm   
0446610038                                1st to Die   
059035342X     Harry Potter and the Sorcerer's Stone   
068484477X                     Stones from the River   
0743418174                               Good in Bed   
0804106304                         The Joy Luck 

We would like to predict what rating this user would give to Bridget Jones's Diary. To do that, we will use the KNN algorithm to find the closest users (by cosine similarity of their ratings) who have read this book and compute the mean of their ratings.

First, we create a list of ratings for our target user without the rating for Bridget Jones book.

In [138]:
# One-row frame (double brackets keep it 2-D) with the target user's normalized ratings,
# minus the column of the book we want to predict.
target_user_row = user_book_normalized.loc[[user]]
target_user_x = target_user_row.drop(columns=[bridget_ISBN])
print(target_user_x)

         0060392452  0060502258  0060928336  0060930535  0060934417  \
User-ID                                                               
35859           0.0   -1.294118         0.0         0.0    0.705882   

         0060938455  0060976845  0060987103  006101351X  0140293248  ...  \
User-ID                                                              ...   
35859           0.0         0.0         0.0         0.0         0.0  ...   

         068484477X  0684872153  0743237188  0743418174  0786868716  \
User-ID                                                               
35859      1.705882         0.0         0.0   -1.294118         0.0   

         0804106304  080410753X  0842329129  0971880107  1400034779  
User-ID                                                              
35859     -0.294118         0.0    0.705882         0.0         0.0  

[1 rows x 94 columns]


Then, we take all the other users and create a list of ratings they gave to Bridget Jones book (including NaNs which signify no rating given).

In [139]:
# Every user's entry for the target book, taken from the NaN-preserving matrix
# (NaN means the user gave no rating).
other_users_y = user_book_withna.loc[:, bridget_ISBN]
print(other_users_y)

User-ID
26             NaN
51             NaN
91             NaN
114            NaN
165            NaN
            ...   
278755         NaN
278773         NaN
278798         NaN
278843   -0.333333
278844         NaN
Name: 014028009X, Length: 10637, dtype: float64


We now get rid of the users that didn't give a rating to our target book.

In [140]:
# Keep only the users who rated the target book. Rows are selected by User-ID membership
# rather than with a boolean mask taken directly from other_users_y: once the NaNs have
# been dropped from other_users_y, such a mask no longer lines up with the full matrix
# and re-running this cell would fail.
rated_user_ids = other_users_y.dropna().index
other_users_x = user_book_normalized.loc[user_book_normalized.index.isin(rated_user_ids)]
# The target book itself must not be part of the features.
other_users_x = other_users_x.drop(columns=[bridget_ISBN])
print(other_users_x)

         0060392452  0060502258  0060928336  0060930535  0060934417  \
User-ID                                                               
4017            0.0    0.000000    0.000000         0.0    0.000000   
4228            0.0    0.000000    0.000000         0.0    0.000000   
5582            0.0    0.000000    0.000000         0.0    0.000000   
6575            0.0    0.695652    0.695652         0.0    1.695652   
8067            0.0    0.000000    0.000000         0.0    0.000000   
...             ...         ...         ...         ...         ...   
269831          0.0    0.000000    0.000000         0.0    0.000000   
272573          0.0    0.000000    0.000000         0.0    0.000000   
273307          0.0    0.000000    0.000000         0.0    0.000000   
278633          0.0    0.000000    0.000000         0.0    0.000000   
278843          0.0    0.000000    0.000000         0.0    0.000000   

         0060938455  0060976845  0060987103  006101351X  0140293248  ...  \


In [141]:
# Keep only the actual ratings of the target book (discard users with NaN).
other_users_y = other_users_y[other_users_y.notna()]
print(other_users_y)

User-ID
4017     -0.333333
4228     -1.500000
5582      1.250000
6575     -3.304348
8067     -1.500000
            ...   
269831    0.000000
272573   -0.100000
273307   -2.250000
278633   -5.000000
278843   -0.333333
Name: 014028009X, Length: 131, dtype: float64


Finally, we are using scikit-learn's KNN regressor to predict the rating that our user would give. Let us not forget that we normalized the ratings for the users, so we need to add the average rating the user gives.

In [145]:
from sklearn.neighbors import KNeighborsRegressor

# Fit on the users who rated the book; fit() returns the estimator itself.
user_knn = KNeighborsRegressor(n_neighbors=3, metric='cosine').fit(other_users_x, other_users_y)
user_user_pred = user_knn.predict(target_user_x)
# The prediction is on the normalized scale, so add the user's average rating back.
target_user_average = user_average_ratings.loc[user]
print(user_user_pred + target_user_average)

[8.46078431]


It seems that our algorithm thinks that the target user will like Bridget Jones a lot.

In [146]:
def predict_rating_for_user(target_user, book_ISBN, n_neighbors=3):
    """Predict the rating that ``target_user`` would give to ``book_ISBN``.

    A scikit-learn KNeighborsRegressor with the cosine metric is fitted on the
    users who rated the book, using their normalized ratings of all *other*
    books as features and their normalized rating of the book as target. The
    prediction for the target user is shifted back by the user's average rating.

    Reads the notebook-level objects ``user_book_normalized``,
    ``user_book_withna`` and ``user_average_ratings``.

    Parameters
    ----------
    target_user
        Row label of the user in ``user_book_normalized``.
    book_ISBN
        Column label of the book in both user-book matrices.
    n_neighbors : int, default 3
        Number of neighbours to average over. It is capped at the number of
        users who rated the book, so sparsely rated books can still be predicted.

    Returns
    -------
    numpy.ndarray
        One-element array holding the predicted rating.

    Raises
    ------
    KeyError
        If ``target_user`` or ``book_ISBN`` is not present in the matrices.
    ValueError
        If no other user has rated the book.
    """
    # Feature row for the target user: every book except the one being predicted.
    target_user_x = user_book_normalized.loc[[target_user]].drop(columns=[book_ISBN])

    # Ratings of the target book by all users; NaN means no rating was given.
    book_ratings = user_book_withna[book_ISBN]

    # Neighbour candidates are users who rated the book. The target user is excluded:
    # their own row would be at distance 0 and leak the very rating being predicted.
    is_candidate = book_ratings.notnull() & (book_ratings.index != target_user)
    candidate_ids = book_ratings.index[is_candidate]
    if len(candidate_ids) == 0:
        raise ValueError(
            "No other user has rated book " + str(book_ISBN) + "; cannot predict a rating."
        )

    other_users_x = user_book_normalized.loc[user_book_normalized.index.isin(candidate_ids)]
    other_users_x = other_users_x.drop(columns=[book_ISBN])
    # Take the targets in exactly the row order of the features so x and y stay aligned.
    other_users_y = book_ratings.loc[other_users_x.index]

    # KNeighborsRegressor refuses to predict with more neighbours than fitted samples.
    effective_neighbors = min(n_neighbors, len(other_users_y))
    user_knn = KNeighborsRegressor(metric='cosine', n_neighbors=effective_neighbors)
    user_knn.fit(other_users_x, other_users_y)
    user_user_pred = user_knn.predict(target_user_x)

    # Ratings were normalized per user, so add the user's average rating back.
    return user_user_pred + user_average_ratings.loc[target_user]

Let us give predictions for all the books that our user has not rated.

In [147]:
# Collect the ISBNs the user has already rated once, instead of re-filtering
# ratings_df for every book in the catalogue.
rated_isbns = set(ratings_df.loc[ratings_df['User-ID'] == user, 'ISBN'])
for ISBN in books_df.index:
    # Only predict for books the user has not rated yet.
    if ISBN not in rated_isbns:
        print("The predicted rating for "+ books_df['title'].loc[ISBN] \
              + " is " + str(predict_rating_for_user(user, ISBN)))

The predicted rating for Wild Animus is [5.04411765]
The predicted rating for The Da Vinci Code is [8.96078431]
The predicted rating for Divine secrets of the Ya-Ya Sisterhood : a novel is [8.36078431]
The predicted rating for A Painted House is [7.23856209]
The predicted rating for Memoirs of a Geisha is [8.96078431]
The predicted rating for Snow Falling on Cedars is [9.20840336]
The predicted rating for Angels & Demons is [8.19411765]
The predicted rating for The Pilot's Wife is [7.70522876]
The predicted rating for House of Sand and Fog is [9.07745098]
The predicted rating for Girl with a Pearl Earring is [9.29411765]
The predicted rating for The Pelican Brief is [7.3496732]
The predicted rating for A Time to Kill is [8.79411765]
The predicted rating for Interview with the Vampire is [7.23856209]
The predicted rating for The Poisonwood Bible is [8.96078431]
The predicted rating for Summer Sisters is [9.28856209]
The predicted rating for She's Come Undone is [6.06237162]
The predicte