In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data is contained in '../Data/Processed' directory
# This cell lists all files under the input directory

import os
# The processed data lives in 'Data/Processed' next to the directory the notebook runs from.
INPUT_DIR = os.path.join(os.path.dirname(os.getcwd()), 'Data', 'Processed')
# Walk the whole tree so that files in nested sub-folders are listed as well.
for folder, _subfolders, file_names in os.walk(INPUT_DIR):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

c:\Users\ASUS\Documents\Python Programming\Book recommendations\Data\Processed\Books_valid_ISBN_known_year_no_images.csv
c:\Users\ASUS\Documents\Python Programming\Book recommendations\Data\Processed\popular_books_with_descriptions.csv
c:\Users\ASUS\Documents\Python Programming\Book recommendations\Data\Processed\ratings_for_popular_books.csv
c:\Users\ASUS\Documents\Python Programming\Book recommendations\Data\Processed\Ratings_valid_ISBN.csv
c:\Users\ASUS\Documents\Python Programming\Book recommendations\Data\Processed\users_valid_age_with_country.csv


Let us try to find similarities between books using their descriptions. We will use the TF-IDF metric, which stands for "term frequency-inverse document frequency" and is computed by the following formula: 
$$\text{TF-IDF}=\frac{\text{number of occurrences of the word in the document}}{\text{number of words in the document}}\cdot\log\left(\frac{\text{number of documents}}{\text{number of documents containing the word}}\right).$$

We first load the information about the books.

In [2]:
# Load the popular books together with their descriptions.
books_df = pd.read_csv(os.path.join(INPUT_DIR, 'popular_books_with_descriptions.csv'))
# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
books_df.info()
print(books_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 94 entries, 0 to 93
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   94 non-null     int64 
 1   title        94 non-null     object
 2   authors      94 non-null     object
 3   description  94 non-null     object
 4   ISBN         94 non-null     object
dtypes: int64(1), object(4)
memory usage: 3.8+ KB
None
   Unnamed: 0                                             title  \
0           0                                       Wild Animus   
1           1                                  The Lovely Bones   
2           2                                 The Da Vinci Code   
3           3  Divine secrets of the Ya-Ya Sisterhood : a novel   
4           4                                      The Red Tent   

             authors                                        description  \
0   ['Rich Shapero']  Wild animus is a search for the primordial, a ...   
1  

We import TfidfVectorizer and initialize it so that we only keep words that appear in at least two descriptions (a word found in a single description won't help us with similarity) and in no more than 75% of the descriptions (so that we ignore words like 'the', 'a', 'for' and so on). 

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# min_df=2    -> drop terms that appear in fewer than two descriptions
# max_df=0.75 -> drop terms that appear in more than 75% of the descriptions
tfidfvec = TfidfVectorizer(
    min_df=2,
    max_df=0.75,
)

Now we fit our TfidfVectorizer to the book descriptions and extract the words (features) together with their TF-IDF weights for every description.

In [6]:
# Learn the vocabulary and IDF weights from the descriptions and encode them in one step.
descriptions_vectorized = tfidfvec.fit_transform(books_df.loc[:, 'description'])
# The retained vocabulary terms, one per column of the matrix above.
features = tfidfvec.get_feature_names_out()
print(features)

['000' '100' '2003' '50' 'about' 'accident' 'acclaimed' 'account'
 'accused' 'achievement' 'adult' 'adventure' 'affair' 'after' 'again'
 'against' 'age' 'agent' 'alice' 'alive' 'all' 'alone' 'along' 'also'
 'american' 'an' 'ancient' 'angeles' 'any' 'anyone' 'are' 'around' 'array'
 'as' 'assistant' 'at' 'attempts' 'attorney' 'aunt' 'author' 'avenue'
 'award' 'away' 'baby' 'back' 'bank' 'baptist' 'be' 'beautiful' 'became'
 'because' 'become' 'becomes' 'been' 'before' 'begins' 'behavior' 'behind'
 'being' 'believe' 'beloved' 'best' 'bestseller' 'bestselling' 'betrayal'
 'between' 'beyond' 'biggest' 'black' 'blend' 'blood' 'bonds' 'book'
 'books' 'both' 'boy' 'boyfriend' 'boys' 'brazil' 'breaking' 'brief'
 'bright' 'brilliant' 'brings' 'brother' 'brought' 'brutally' 'but' 'buy'
 'by' 'california' 'called' 'calling' 'came' 'can' 'captures' 'car' 'care'
 'caring' 'carolina' 'case' 'celebrate' 'centuries' 'century' 'changed'
 'changes' 'characters' 'chicago' 'child' 'childhood' 'children'
 'c

Now we create a dataframe with the books (identified by ISBN) as the index and the features as columns; each cell holds the TF-IDF weight of that word in that book's description.

In [7]:
# One row per book (indexed by ISBN) and one column per vocabulary term; each cell is
# the TF-IDF weight of that term in that book's description.
tfidf_df = pd.DataFrame(descriptions_vectorized.toarray(), columns=features, index=books_df['ISBN'])
# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
tfidf_df.info()
print(tfidf_df.head())

<class 'pandas.core.frame.DataFrame'>
Index: 94 entries, 0971880107 to 014028009X
Columns: 770 entries, 000 to your
dtypes: float64(770)
memory usage: 566.2+ KB
None
                 000  100  2003        50     about  accident  acclaimed  \
ISBN                                                                       
0971880107  0.000000  0.0   0.0  0.000000  0.000000       0.0        0.0   
0316666343  0.187292  0.0   0.0  0.200221  0.000000       0.0        0.0   
0385504209  0.000000  0.0   0.0  0.000000  0.000000       0.0        0.0   
0060928336  0.000000  0.0   0.0  0.000000  0.269716       0.0        0.0   
0312195516  0.000000  0.0   0.0  0.000000  0.078475       0.0        0.0   

            account  accused  achievement  ...  writer  writing   ya  \
ISBN                                       ...                         
0971880107      0.0      0.0     0.000000  ...     0.0      0.0  0.0   
0316666343      0.0      0.0     0.000000  ...     0.0      0.0  0.0   
0385504209   

Now we will use cosine similarity to measure how close the books are to each other, that is, we will compute the cosine of the angle between every pair of the vectors that we created for our books. 

In [8]:
from sklearn.metrics.pairwise import cosine_similarity

# Given a single matrix, every row (book) is compared with every row,
# which yields a square books-by-books similarity matrix.
cosine_similarity_array = cosine_similarity(X=tfidf_df)
print(cosine_similarity_array)

[[1.         0.03359607 0.0213571  ... 0.0927257  0.03033521 0.07000027]
 [0.03359607 1.         0.09032357 ... 0.13652754 0.08103767 0.0898522 ]
 [0.0213571  0.09032357 1.         ... 0.02379762 0.01113739 0.02934729]
 ...
 [0.0927257  0.13652754 0.02379762 ... 1.         0.10624941 0.16409527]
 [0.03033521 0.08103767 0.01113739 ... 0.10624941 1.         0.12142527]
 [0.07000027 0.0898522  0.02934729 ... 0.16409527 0.12142527 1.        ]]


So, we got our table of pairwise cosine similarities. On the diagonal we have 1s, as we should, since that is the similarity between a book and itself. Let us find the largest similarity that is not on the diagonal and check which pair of books is that similar.

In [9]:
# Work on a copy and zero out the self-similarities so that the maximum
# is attained by two different books.
no_diagonal = cosine_similarity_array.copy()
np.fill_diagonal(no_diagonal, 0)
# argmax() returns a position in the flattened array; convert it back to (row, column).
ind = np.unravel_index(no_diagonal.argmax(), no_diagonal.shape)
print(ind, no_diagonal[ind])
# Show both books of the most similar pair.
for book_position in ind:
    print(books_df.iloc[book_position])

(21, 55) 0.9484526538675051
Unnamed: 0                                                    21
title                                          The Nanny Diaries
authors                      ['Emma Mclaughlin', 'Nicola Kraus']
description    A satirical glimpse into Manhattan's upper cla...
ISBN                                                  0312278586
Name: 21, dtype: object
Unnamed: 0                                                    55
title                                          The Nanny Diaries
authors                      ['Emma McLaughlin', 'Nicola Kraus']
description    Nanny, a struggling NYU student, takes a posit...
ISBN                                                  0312291639
Name: 55, dtype: object


Unsurprisingly, the two most similar books in our list appear to be two editions of the same book. It is unclear why this book has two different ISBNs, but there are indeed two different pages for it on Amazon, one for an edition published in 2002 and the other for one published in 2003.

Let us try to find Harry Potter books and see if they are similar to each other by our metric.

In [10]:
# Boolean mask over the titles: True where the title mentions 'Harry Potter'.
title_mentions_harry_potter = books_df['title'].str.contains('Harry Potter')
harry_potter_books_df = books_df[title_mentions_harry_potter]
print(harry_potter_books_df)

    Unnamed: 0                                      title            authors  \
10          10      Harry Potter and the Sorcerer's Stone  ['J. K. Rowling']   
64          64  Harry Potter and the Order of the Phoenix  ['J. K. Rowling']   

                                          description        ISBN  
10  Rescued from the outrageous neglect of his aun...  059035342X  
64  Collects the complete series that relates the ...  043935806X  


So, we have two Harry Potter books in our dataset. Let us check what their similarity is.

In [11]:
# Rows 10 and 64 of books_df are the two Harry Potter titles.
print(cosine_similarity_array[10, 64])

0.41135157773840547


They seem to be quite similar!

Now it is time to create user profiles. For each user, we will recommend books they haven't read yet that are similar to the books they have read and liked (that is, rated at least 7).

Let us create a list of users that gave a rating of at least 7 to at least one book.

In [12]:
# Load the ratings that users gave to the popular books.
ratings_df = pd.read_csv(os.path.join(INPUT_DIR, 'ratings_for_popular_books.csv'))
# DataFrame.info() prints its summary itself and returns None, so wrapping it in
# print() would only add a stray "None" line to the output.
ratings_df.info()
print(ratings_df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17737 entries, 0 to 17736
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Unnamed: 0   17737 non-null  int64 
 1   User-ID      17737 non-null  int64 
 2   ISBN         17737 non-null  object
 3   Book-Rating  17737 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 554.4+ KB
None
   Unnamed: 0  User-ID        ISBN  Book-Rating
0          80   276788  043935806X            7
1         414   276925  0385504209            8
2         624   276953  0446310786           10
3         665   276964  0440220602            9
4         785   277042  0971880107            2


In [13]:
# Number of liked books (rating of at least 7) per user. Computed once and reused
# below instead of repeating the same filter + groupby for the index.
liked_counts_per_user = ratings_df[ratings_df['Book-Rating'] >= 7].groupby('User-ID').size()
print(liked_counts_per_user)
# The IDs of all users that liked at least one book.
users = liked_counts_per_user.index
print(users)

User-ID
26        1
51        1
91        1
114       1
165       1
         ..
278698    1
278773    1
278798    1
278843    3
278844    1
Length: 8760, dtype: int64
Int64Index([    26,     51,     91,    114,    165,    243,    244,    254,
               256,    280,
            ...
            278535, 278541, 278552, 278582, 278633, 278698, 278773, 278798,
            278843, 278844],
           dtype='int64', name='User-ID', length=8760)


Now we will take a random user and try to recommend some books for that user.

In [14]:
# Fix the seed so that the same user is drawn on every run.
np.random.seed(42)
random_position = np.random.randint(len(users))
user_0 = users[random_position]
print(user_0)

231264


In [15]:
# The same book table keyed by ISBN, so that books can be looked up
# by the ISBN values stored in the ratings.
books_by_isbn_df = books_df.set_index(keys='ISBN')
print(books_by_isbn_df.head(n=5))

            Unnamed: 0                                             title  \
ISBN                                                                       
0971880107           0                                       Wild Animus   
0316666343           1                                  The Lovely Bones   
0385504209           2                                 The Da Vinci Code   
0060928336           3  Divine secrets of the Ya-Ya Sisterhood : a novel   
0312195516           4                                      The Red Tent   

                      authors  \
ISBN                            
0971880107   ['Rich Shapero']   
0316666343   ['Alice Sebold']   
0385504209      ['Dan Brown']   
0060928336  ['Rebecca Wells']   
0312195516  ['Anita Diamant']   

                                                  description  
ISBN                                                           
0971880107  Wild animus is a search for the primordial, a ...  
0316666343  The spirit of fourteen-year-old

In [16]:
def recommendations_for_user(user, min_rating=7):
    """Print content-based book recommendations for one user.

    The user profile is the mean TF-IDF vector of the books the user rated at
    least ``min_rating``. Every book the user has not rated is then scored by
    its cosine similarity to that profile, and the scores are printed from the
    most to the least similar book.

    Relies on the notebook-level ``ratings_df``, ``tfidf_df`` and
    ``books_by_isbn_df``. Rated ISBNs that are missing from ``tfidf_df`` are
    ignored instead of raising ``KeyError``.

    Parameters
    ----------
    user : int
        Value to look up in the 'User-ID' column of ``ratings_df``.
    min_rating : int, default 7
        Lowest 'Book-Rating' for a book to count as liked.

    Returns
    -------
    None
        The liked books and the ranked recommendations are printed.
    """
    user_ratings_df = ratings_df[ratings_df['User-ID'] == user]
    books_read = user_ratings_df['ISBN']
    liked_mask = user_ratings_df['Book-Rating'] >= min_rating
    books_liked = user_ratings_df.loc[liked_mask, 'ISBN']
    # Only books that have a TF-IDF row can contribute to the profile; looking up
    # an unknown ISBN with .loc would raise KeyError.
    books_liked = books_liked[books_liked.isin(tfidf_df.index)]
    if books_liked.empty:
        # An empty selection would give an all-NaN profile, which cosine_similarity rejects.
        print(f"No liked books with a known description for user {user}; nothing to recommend.")
        return

    user_profile = tfidf_df.loc[books_liked].mean()
    print("The books the user has liked:")
    print(books_by_isbn_df.loc[books_liked])

    # errors='ignore': the user may have rated books that are not in tfidf_df.
    unread_books_tfidf = tfidf_df.drop(books_read, axis=0, errors='ignore')
    if unread_books_tfidf.empty:
        print(f"User {user} has already rated every book; nothing to recommend.")
        return

    # The profile is reshaped to a single-row matrix, so the result has one row
    # with one similarity per unread book; it is transposed into a column below.
    user_profile_similarities = cosine_similarity(user_profile.values.reshape(1, -1), unread_books_tfidf)
    user_profile_similarities_df = pd.DataFrame(
        user_profile_similarities.T,
        index=unread_books_tfidf.index,
        columns=["similarity score"],
    )
    sorted_similarities_df = user_profile_similarities_df.sort_values(by="similarity score", ascending=False)
    sorted_similarities_df['title'] = books_by_isbn_df['title'][sorted_similarities_df.index]
    print(sorted_similarities_df)


# Example: a user who liked several books.
recommendations_for_user(278843)

The books the user has liked:
            Unnamed: 0                                  title  \
ISBN                                                            
014028009X          93                  Bridget Jones's Diary   
0142000205          90                             Icy Sparks   
059035342X          10  Harry Potter and the Sorcerer's Stone   

                         authors  \
ISBN                               
014028009X    ['Helen Fielding']   
0142000205  ['Gwyn Hyman Rubio']   
059035342X     ['J. K. Rowling']   

                                                  description  
ISBN                                                           
014028009X  USA Today's top 100 books to read while stuck ...  
0142000205  A New York Times Notable Book and the March 20...  
059035342X  Rescued from the outrageous neglect of his aun...  
            similarity score                        title
ISBN                                                     
0385335482          0.37912