# Booklore - Kaggle Goodreads Dataset

In [1]:
import ast
import os
import pickle

import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler


## Load dataset

In [2]:
# Load the raw Goodreads CSV; the path is relative to the notebook's working directory.
goodreads_df = pd.read_csv('../raw_data/goodreads.csv')
# Preview the first three rows as a quick sanity check of the parsed columns.
goodreads_df.head(3)

Unnamed: 0,bookId,title,series,author,rating,description,language,isbn,genres,characters,...,firstPublishDate,awards,numRatings,ratingsByStars,likedPercent,setting,coverImg,bbeScore,bbeVotes,price
0,2767052-the-hunger-games,The Hunger Games,The Hunger Games #1,Suzanne Collins,4.33,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,English,9780439023481,"['Young Adult', 'Fiction', 'Dystopia', 'Fantas...","['Katniss Everdeen', 'Peeta Mellark', 'Cato (H...",...,,['Locus Award Nominee for Best Young Adult Boo...,6376780,"['3444695', '1921313', '745221', '171994', '93...",96.0,"['District 12, Panem', 'Capitol, Panem', 'Pane...",https://i.gr-assets.com/images/S/compressed.ph...,2993816,30516,5.09
1,2.Harry_Potter_and_the_Order_of_the_Phoenix,Harry Potter and the Order of the Phoenix,Harry Potter #5,"J.K. Rowling, Mary GrandPré (Illustrator)",4.5,There is a door at the end of a silent corrido...,English,9780439358071,"['Fantasy', 'Young Adult', 'Fiction', 'Magic',...","['Sirius Black', 'Draco Malfoy', 'Ron Weasley'...",...,06/21/03,['Bram Stoker Award for Works for Young Reader...,2507623,"['1593642', '637516', '222366', '39573', '14526']",98.0,['Hogwarts School of Witchcraft and Wizardry (...,https://i.gr-assets.com/images/S/compressed.ph...,2632233,26923,7.38
2,2657.To_Kill_a_Mockingbird,To Kill a Mockingbird,To Kill a Mockingbird,Harper Lee,4.28,The unforgettable novel of a childhood in a sl...,English,9999999999999,"['Classics', 'Fiction', 'Historical Fiction', ...","['Scout Finch', 'Atticus Finch', 'Jem Finch', ...",...,07/11/60,"['Pulitzer Prize for Fiction (1961)', 'Audie A...",4501075,"['2363896', '1333153', '573280', '149952', '80...",95.0,"['Maycomb, Alabama (United States)']",https://i.gr-assets.com/images/S/compressed.ph...,2269402,23328,


## Preprocessing

In [3]:
# Show each column's dtype and non-null count to spot fields with missing values.
goodreads_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52478 entries, 0 to 52477
Data columns (total 25 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   bookId            52478 non-null  object 
 1   title             52478 non-null  object 
 2   series            23470 non-null  object 
 3   author            52478 non-null  object 
 4   rating            52478 non-null  float64
 5   description       51140 non-null  object 
 6   language          48672 non-null  object 
 7   isbn              52478 non-null  object 
 8   genres            52478 non-null  object 
 9   characters        52478 non-null  object 
 10  bookFormat        51005 non-null  object 
 11  edition           4955 non-null   object 
 12  pages             50131 non-null  object 
 13  publisher         48782 non-null  object 
 14  publishDate       51598 non-null  object 
 15  firstPublishDate  31152 non-null  object 
 16  awards            52478 non-null  object

In [4]:
# Drop rows with missing essential data and parse genres
def _parse_genres(value):
    """Return the genres of one book as a Python list.

    - A list is returned unchanged, so re-running this cell does not wipe
      genres that were already parsed.
    - A string is treated as a Python list literal (e.g. "['Fiction', 'Fantasy']")
      and evaluated with ast.literal_eval.
    - Anything else yields an empty list.
    """
    if isinstance(value, list):
        return value
    if isinstance(value, str):
        return ast.literal_eval(value)
    return []


# Build the cleaned frame in one chained expression: .assign works on the fresh
# frame returned by dropna instead of mutating it in place.
goodreads_df = (
    goodreads_df
    .dropna(subset=['genres', 'rating', 'numRatings'])
    .assign(genres=lambda frame: frame['genres'].apply(_parse_genres))
)

In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
goodreads_df.describe()

Unnamed: 0,rating,numRatings,likedPercent,bbeScore,bbeVotes
count,52478.0,52478.0,51856.0,52478.0,52478.0
mean,4.021878,17878.65,92.231545,1984.023,22.529003
std,0.367146,103944.8,5.990689,35153.14,369.158541
min,0.0,0.0,0.0,0.0,-4.0
25%,3.82,341.0,90.0,84.0,1.0
50%,4.03,2307.0,94.0,97.0,1.0
75%,4.23,9380.5,96.0,187.0,2.0
max,5.0,7048471.0,100.0,2993816.0,30516.0


In [6]:
# Keep only books that are both widely rated (>= 1000 ratings) and well rated (>= 3.5)
has_enough_ratings = goodreads_df['numRatings'].ge(1000)
is_well_rated = goodreads_df['rating'].ge(3.5)
filtered_df = goodreads_df[has_enough_ratings & is_well_rated]

In [7]:
# Re-check the numeric summary after filtering to confirm the thresholds took effect.
filtered_df.describe()

Unnamed: 0,rating,numRatings,likedPercent,bbeScore,bbeVotes
count,31874.0,31874.0,31874.0,31874.0,31874.0
mean,4.028117,28816.33,93.084865,3179.137,36.056943
std,0.237601,132044.9,3.61302,45056.07,473.044107
min,3.5,1000.0,73.0,0.0,-4.0
25%,3.86,2736.0,91.0,87.0,1.0
50%,4.03,6628.5,94.0,100.0,2.0
75%,4.19,17969.5,96.0,287.0,4.0
max,4.82,7048471.0,100.0,2993816.0,30516.0


In [8]:
# One-hot encode genres
# MultiLabelBinarizer turns each book's list of genres into a binary indicator row,
# with one column per distinct genre label found in filtered_df['genres'].
mlb = MultiLabelBinarizer()
genre_features = mlb.fit_transform(filtered_df['genres'])

In [9]:
# Show every genre label learned by the binarizer; their order matches the columns of genre_features.
list(mlb.classes_)

['10th Century',
 '11th Century',
 '12th Century',
 '13th Century',
 '14th Century',
 '15th Century',
 '16th Century',
 '17th Century',
 '18th Century',
 '19th Century',
 '1st Grade',
 '20th Century',
 '21st Century',
 '2nd Grade',
 '40k',
 'Abuse',
 'Academia',
 'Academic',
 'Academics',
 'Action',
 'Activism',
 'Adolescence',
 'Adoption',
 'Adult',
 'Adult Fiction',
 'Adventure',
 'Africa',
 'African American',
 'African American Literature',
 'African American Romance',
 'African Literature',
 'Agriculture',
 'Albanian Literature',
 'Alchemy',
 'Alcohol',
 'Alexandria',
 'Algeria',
 'Algorithms',
 'Aliens',
 'Alternate History',
 'Alternate Universe',
 'Alternative Medicine',
 'Amateur Sleuth',
 'Amazon',
 'American',
 'American Civil War',
 'American Classics',
 'American Fiction',
 'American History',
 'American Revolution',
 'American Revolutionary War',
 'Americana',
 'Amish',
 'Amish Fiction',
 'Anarchism',
 'Ancient',
 'Ancient History',
 'Angels',
 'Anglo Saxon',
 'Angola',
 

In [10]:
# Wrap the one-hot genre matrix in a DataFrame whose columns are the genre labels
genre_features_df = pd.DataFrame(genre_features, columns=mlb.classes_)

# Renumber the book rows 0..n-1 so they align row-for-row with the genre frame
# (which already carries a fresh 0..n-1 index), then append the genre columns.
filtered_df = pd.concat(
    [filtered_df.reset_index(drop=True), genre_features_df],
    axis=1,
)

filtered_df.head(2)

Unnamed: 0,bookId,title,series,author,rating,description,language,isbn,genres,characters,...,Young Adult Historical Fiction,Young Adult Paranormal,Young Adult Romance,Young Adult Science Fiction,Young Readers,Yuri,Zambia,Zen,Zimbabwe,Zombies
0,2767052-the-hunger-games,The Hunger Games,The Hunger Games #1,Suzanne Collins,4.33,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,English,9780439023481,"[Young Adult, Fiction, Dystopia, Fantasy, Scie...","['Katniss Everdeen', 'Peeta Mellark', 'Cato (H...",...,0,0,0,0,0,0,0,0,0,0
1,2.Harry_Potter_and_the_Order_of_the_Phoenix,Harry Potter and the Order of the Phoenix,Harry Potter #5,"J.K. Rowling, Mary GrandPré (Illustrator)",4.5,There is a door at the end of a silent corrido...,English,9780439358071,"[Fantasy, Young Adult, Fiction, Magic, Childre...","['Sirius Black', 'Draco Malfoy', 'Ron Weasley'...",...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Scale the rating to be compatible with genre encoding
# MinMaxScaler (default range) maps the lowest rating in filtered_df to 0 and the
# highest to 1, putting the rating on the same 0-1 scale as the binary genre columns.
scaler = MinMaxScaler()
# Double brackets keep the input 2-D, as the scaler requires; the result is an (n_books, 1) array.
rating_features = scaler.fit_transform(filtered_df[['rating']])

In [12]:
# Add scaled rating features to df
# NOTE: this mutates filtered_df in place by adding a 'rating_features' column.
filtered_df['rating_features'] = rating_features
filtered_df.head()

Unnamed: 0,bookId,title,series,author,rating,description,language,isbn,genres,characters,...,Young Adult Paranormal,Young Adult Romance,Young Adult Science Fiction,Young Readers,Yuri,Zambia,Zen,Zimbabwe,Zombies,rating_features
0,2767052-the-hunger-games,The Hunger Games,The Hunger Games #1,Suzanne Collins,4.33,WINNING MEANS FAME AND FORTUNE.LOSING MEANS CE...,English,9780439023481,"[Young Adult, Fiction, Dystopia, Fantasy, Scie...","['Katniss Everdeen', 'Peeta Mellark', 'Cato (H...",...,0,0,0,0,0,0,0,0,0,0.628788
1,2.Harry_Potter_and_the_Order_of_the_Phoenix,Harry Potter and the Order of the Phoenix,Harry Potter #5,"J.K. Rowling, Mary GrandPré (Illustrator)",4.5,There is a door at the end of a silent corrido...,English,9780439358071,"[Fantasy, Young Adult, Fiction, Magic, Childre...","['Sirius Black', 'Draco Malfoy', 'Ron Weasley'...",...,0,0,0,0,0,0,0,0,0,0.757576
2,2657.To_Kill_a_Mockingbird,To Kill a Mockingbird,To Kill a Mockingbird,Harper Lee,4.28,The unforgettable novel of a childhood in a sl...,English,9999999999999,"[Classics, Fiction, Historical Fiction, School...","['Scout Finch', 'Atticus Finch', 'Jem Finch', ...",...,0,0,0,0,0,0,0,0,0,0.590909
3,1885.Pride_and_Prejudice,Pride and Prejudice,,"Jane Austen, Anna Quindlen (Introduction)",4.26,Alternate cover edition of ISBN 9780679783268S...,English,9999999999999,"[Classics, Fiction, Romance, Historical Fictio...","['Mr. Bennet', 'Mrs. Bennet', 'Jane Bennet', '...",...,0,0,0,0,0,0,0,0,0,0.575758
4,41865.Twilight,Twilight,The Twilight Saga #1,Stephenie Meyer,3.6,About three things I was absolutely positive.\...,English,9780316015844,"[Young Adult, Fantasy, Romance, Vampires, Fict...","['Edward Cullen', 'Jacob Black', 'Laurent', 'R...",...,0,0,0,0,0,0,0,0,0,0.075758


In [13]:
# Assemble the model's feature matrix: one-hot genre columns followed by the scaled rating
genre_part = pd.DataFrame(genre_features, columns=mlb.classes_)
rating_part = pd.DataFrame(rating_features, columns=['scaled_rating'])
book_features = pd.concat([genre_part, rating_part], axis=1)
book_features

Unnamed: 0,10th Century,11th Century,12th Century,13th Century,14th Century,15th Century,16th Century,17th Century,18th Century,19th Century,...,Young Adult Paranormal,Young Adult Romance,Young Adult Science Fiction,Young Readers,Yuri,Zambia,Zen,Zimbabwe,Zombies,scaled_rating
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.628788
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.757576
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.590909
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.575758
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.075758
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31869,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.651515
31870,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.484848
31871,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.431818
31872,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0.500000


## Similarity matrix for content-based filtering (superseded; skip ahead to the KNN model section)

In [14]:
# Calculate similarity matrix
# NOTE: cosine_similarity returns a dense (n_books x n_books) array. With the ~31.9k
# filtered books shown above that is on the order of a billion entries, so this cell
# needs several GB of RAM.
similarity_matrix = cosine_similarity(book_features)

In [15]:
# Store similarity matrix in DataFrame for easier access
# Rows and columns are both labelled by book title so scores can be looked up by name.
# NOTE(review): nothing here guarantees titles are unique; duplicate labels would make
# similarity_df[title] return several columns — confirm before relying on label lookups.
similarity_df = pd.DataFrame(similarity_matrix, index=filtered_df['title'], columns=filtered_df['title'])
similarity_df.head()

title,The Hunger Games,Harry Potter and the Order of the Phoenix,To Kill a Mockingbird,Pride and Prejudice,Twilight,The Book Thief,Animal Farm,The Chronicles of Narnia,J.R.R. Tolkien 4-Book Boxed Set: The Hobbit and The Lord of the Rings,Gone with the Wind,...,Sweet Possession,The Natural Way of Things,Algedonic,Arafel's Saga,Theodosia and the Last Pharaoh,Heal Your Body: The Mental Causes for Physical Illness and the Metaphysical Way to Overcome Them,Attracted to Fire,Elemental,Unbelievable,Marked
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
The Hunger Games,1.0,0.426959,0.228644,0.227921,0.494929,0.231825,0.410963,0.420908,0.429064,0.229362,...,0.230427,0.295634,0.170054,0.302803,0.420908,0.039353,0.22345,0.512276,0.22421,0.401203
Harry Potter and the Order of the Phoenix,0.426959,1.0,0.329574,0.233083,0.297241,0.428346,0.315035,0.807135,0.717633,0.234872,...,0.236197,0.09892,0.018474,0.302037,0.615784,0.047012,0.131433,0.320583,0.132439,0.30256
To Kill a Mockingbird,0.228644,0.329574,1.0,0.613154,0.20094,0.518631,0.606082,0.323028,0.427021,0.613823,...,0.132233,0.197912,0.014565,0.106549,0.419737,0.037065,0.125001,0.219641,0.125779,0.205163
Pride and Prejudice,0.227921,0.233083,0.613154,1.0,0.200999,0.4218,0.410463,0.225669,0.33105,0.806709,...,0.324414,0.198047,0.169271,0.204707,0.32246,0.036146,0.221639,0.219191,0.12515,0.205112
Twilight,0.494929,0.297241,0.20094,0.200999,1.0,0.200623,0.201358,0.299353,0.296116,0.200877,...,0.200774,0.100141,0.15947,0.200825,0.299353,0.004833,0.201263,0.696605,0.20123,0.8


In [16]:
# Define the recommendation function
def get_recommendations(title, similarity_df, top_n=5):
    """Return the top N most similar books to `title` from a similarity matrix.

    Parameters:
    - title: Title of the book for which to find recommendations.
    - similarity_df: Square DataFrame of similarity scores whose index and
      columns are both book titles.
    - top_n: Number of recommendations to return (default is 5). Values <= 0
      yield an empty Series.

    Returns:
    - recommendations: Series of similarity scores indexed by recommended
      title, sorted from most to least similar. If `title` is not present,
      a "not found" message string is returned instead.
    """
    # Check if the title exists in the DataFrame
    if title not in similarity_df.index:
        return f"Book titled '{title}' not found in the dataset."

    scores = similarity_df[title]
    # When several books share this title, column selection yields a DataFrame
    # (one column per duplicate); use the first such book's scores.
    if isinstance(scores, pd.DataFrame):
        scores = scores.iloc[:, 0]

    # Remove the query book by label rather than by assuming it sorts first:
    # another book with similarity 1.0 can tie with it, in which case a
    # positional slice would keep the query book and drop a real neighbour.
    candidates = scores[scores.index != title]

    recommendations = candidates.sort_values(ascending=False).head(max(top_n, 0))

    return recommendations

In [17]:
# Demo: look up the five books most similar to one example title.
title = "Infinite Jest"
recommendations = get_recommendations(title, similarity_df)
print("Recommended books based on similarity to:", title)
# Prints either a Series of scores or the "not found" message string.
print(recommendations)

Recommended books based on similarity to: Infinite Jest
title
Another Roadside Attraction    0.899879
A Fraction of the Whole        0.804994
Nobody's Fool                  0.804684
The World According to Garp    0.804356
Bluebeard                      0.803456
Name: Infinite Jest, dtype: float64


## KNN model

In [18]:
# NearestNeighbors is imported in the notebook's top import cell.
# Fit a brute-force nearest-neighbour index over the combined genre + rating features,
# using cosine distance between book feature rows.
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(book_features)

In [19]:
# os and pickle are imported in the notebook's top import cell.
# Persist the fitted KNN model so it can be loaded outside this notebook.
# The target is resolved relative to the current working directory; the
# package_folder/models directory must already exist or open() raises.
notebook_dir = os.getcwd()

# Save the model to a .pkl file
model_path = os.path.join(notebook_dir, '..', 'package_folder', 'models', 'model-reco-1.pkl')
with open(model_path, 'wb') as f:
    pickle.dump(knn_model, f)
# Report the real destination; the previous message named "model.pkl", which is not the file written.
print(f"Model trained and saved to {model_path}")

Model trained and saved as model.pkl


In [None]:
def knn_recommendations(input_title, df, knn_model, top_n=5, features=None):
    """
    Get top N book recommendations using a KNN model, allowing partial title input.

    Parameters:
    - input_title (str): Partial or full title of the book for recommendations.
      It is matched as a literal, case-insensitive substring (not as a regex).
    - df (pd.DataFrame): DataFrame containing book information; must have the
      columns 'title', 'author', 'publisher' and 'rating'.
    - knn_model (NearestNeighbors): Fitted KNN model.
    - top_n (int): Number of recommendations to return (default is 5).
    - features (pd.DataFrame, optional): Feature matrix the model was fitted on.
      Defaults to the notebook-level `book_features`.
      Assumes row i of `features` describes row i (by position) of `df` — confirm
      when passing a custom frame.

    Returns:
    - recommendations (pd.DataFrame or str): DataFrame with the title, author,
      publisher, rating and cosine distance of each recommended book, or a
      message string if no title matches.

    Notes:
    - When several titles match, the user is prompted via input() to pick one,
      so the call blocks until a valid number is entered.
    """
    if features is None:
        # Backward-compatible fallback to the notebook-level feature matrix.
        features = book_features

    not_found = f"No books found with title containing '{input_title}'."

    # A blank query would match every title; treat it as "nothing found".
    if not input_title or not input_title.strip():
        return not_found

    # regex=False: user input such as "C++" or "(" must not be parsed as a pattern.
    title_hits = df['title'].str.contains(input_title, case=False, na=False, regex=False)
    # Work with row POSITIONS throughout so the lookup into `features` stays
    # correct even when df's index is not 0..n-1 or titles are duplicated.
    match_positions = [pos for pos, hit in enumerate(title_hits) if hit]

    if not match_positions:
        return not_found

    titles = df['title']

    # If multiple matches are found, prompt the user to select one
    if len(match_positions) > 1:
        print("Multiple matches found. Please select the number corresponding to your book:")
        for idx, pos in enumerate(match_positions, 1):
            print(f"{idx}. {titles.iloc[pos]}")

        # Get the user's selection
        while True:
            try:
                selection = int(input("Enter the number of your selected book: "))
            except ValueError:
                print("Invalid input. Please enter a number.")
                continue
            if 1 <= selection <= len(match_positions):
                book_pos = match_positions[selection - 1]
                break
            print("Please enter a valid number.")
    else:
        book_pos = match_positions[0]
        print(f"Found match: '{titles.iloc[book_pos]}' for input '{input_title}'")

    # Ask for one extra neighbour because the query book is normally among its own
    # neighbours; never request more neighbours than there are rows.
    n_neighbors = min(top_n + 1, len(features))
    # Passing a one-row DataFrame keeps the column names the model was fitted with.
    distances, indices = knn_model.kneighbors(features.iloc[[book_pos]], n_neighbors=n_neighbors)

    # Drop the query book by position instead of assuming it is the first hit:
    # a book with identical features ties at distance ~0 and may be listed first.
    neighbours = [
        (int(pos), float(dist))
        for pos, dist in zip(indices[0], distances[0])
        if int(pos) != book_pos
    ][:max(top_n, 0)]

    # Extract information for recommended books
    neighbour_positions = [pos for pos, _ in neighbours]
    recommended_books = df.iloc[neighbour_positions][['title', 'author', 'publisher', 'rating']].copy()
    recommended_books['distance'] = [dist for _, dist in neighbours]

    return recommended_books

# Interactive driver for the KNN recommender.
# NOTE: input() blocks until the user types a value, so this cell cannot finish
# under an unattended "Run All".
if __name__ == "__main__":
    # Interactive input for the user
    user_input_title = input("Enter a book title or partial title: ")
    recommendations = knn_recommendations(user_input_title, filtered_df, knn_model)

    # Display recommendations
    print("Recommended books:")
    # Prints either the recommendations table or the "no books found" message string.
    print(recommendations)


Multiple matches found. Please select the number corresponding to your book:
1. Infinite Jest
2. His Majesty's Dragon
3. The Jester
4. A Murder for Her Majesty
5. On Her Majesty's Secret Service
6. Death's Jest-Book
7. Vládkyně jestřábů
8. Chicot the Jester (The Last Valois, #2)
9. Wszyscy jesteśmy podejrzani
10. Koniec jest moim początkiem
11. A Jest of God
Recommended books:
                   title                          author      publisher  \
16839  Victory of Eagles  Naomi Novik (Goodreads Author)            NaN   
19434   Crucible of Gold  Naomi Novik (Goodreads Author)   Random House   
12104   Black Powder War  Naomi Novik (Goodreads Author)  Del Rey Books   
20207   Blood of Tyrants  Naomi Novik (Goodreads Author)        Del Rey   
20768    Empire of Ivory  Naomi Novik (Goodreads Author)        Del Rey   

       rating      distance  
16839    4.04  2.220446e-16  
19434    3.95  2.260112e-04  
12104    3.86  9.081092e-04  
20207    3.97  9.868741e-02  
20768    3.96  9.87

