# Booksplore - Kaggle Goodreads Dataset

In [3]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import ast


ModuleNotFoundError: No module named 'pandas'

## Load dataset

In [None]:
# Read the Goodreads "Best Books Ever" CSV export into a DataFrame and
# preview its first three rows.
goodreads_df = pd.read_csv(
    filepath_or_buffer='/Users/gbuck/code/gbuck04/booksplore/books_1.Best_Books_Ever.csv',
)
goodreads_df.head(n=3)

## Preprocessing

In [None]:
# Print a summary of the raw frame (columns, dtypes, non-null counts) to see
# which columns have missing values before cleaning.
goodreads_df.info()

In [4]:
# Keep only rows that have genres, a rating and a rating count, then turn the
# stringified genre list of each row into a real Python list. Any value that
# is not a string becomes an empty list.
goodreads_df = (
    goodreads_df
    .dropna(subset=['genres', 'rating', 'numRatings'])
    .assign(
        genres=lambda frame: frame['genres'].map(
            lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else []
        )
    )
)

In [None]:
# Summary statistics of the cleaned frame; useful for choosing the filter
# thresholds applied in the next step.
goodreads_df.describe()

In [6]:
# Keep only books that have been rated at least 1000 times AND whose average
# rating is 3.5 or higher.
filtered_df = goodreads_df.loc[
    goodreads_df['numRatings'].ge(1000) & goodreads_df['rating'].ge(3.5)
]

In [None]:
# Summary statistics after filtering, to check how many books remain and the
# range of their ratings.
filtered_df.describe()

In [8]:
# One-hot encode genres: learn the set of genre labels from the filtered
# books, then build one indicator column per genre for every book.
mlb = MultiLabelBinarizer()
mlb.fit(filtered_df['genres'])
genre_features = mlb.transform(filtered_df['genres'])

In [None]:
# Show the genre labels learned by the binarizer; they are used below as the
# column names for genre_features.
list(mlb.classes_)

In [None]:
# Wrap the one-hot genre matrix in a DataFrame (one column per genre) and
# place it side by side with the book data. Both frames are re-indexed from 0
# so that rows line up by position.
genre_features_df = pd.DataFrame(data=genre_features, columns=mlb.classes_)

filtered_df = pd.concat(
    objs=[
        filtered_df.reset_index(drop=True),
        genre_features_df.reset_index(drop=True),
    ],
    axis='columns',
)

filtered_df.head(n=2)

In [11]:
# Min-max scale the rating column so it is on a scale comparable to the 0/1
# genre indicator columns.
scaler = MinMaxScaler()
scaler.fit(filtered_df[['rating']])
rating_features = scaler.transform(filtered_df[['rating']])

In [None]:
# Store the scaled rating next to the original columns. rating_features has a
# single column, so that column is written as a 1-D array.
filtered_df['rating_features'] = rating_features[:, 0]
filtered_df.head(n=5)

In [None]:
# Build the feature matrix used by both recommenders: the one-hot genre
# columns followed by a single 'scaled_rating' column.
book_features = pd.concat(
    objs=[
        pd.DataFrame(data=genre_features, columns=mlb.classes_),
        pd.DataFrame(data=rating_features, columns=['scaled_rating']),
    ],
    axis='columns',
)
book_features

## Similarity matrix for content-based filtering model

In [None]:
# Calculate similarity matrix: cosine similarity between the feature rows of
# every pair of books. The result is used below as a square books x books
# table.
# NOTE(review): this holds one score per pair of books, so memory grows with
# the square of the number of books - confirm it fits for the full dataset.
similarity_matrix = cosine_similarity(book_features)

In [None]:
# Label both axes of the similarity matrix with book titles so that scores
# can be looked up by title instead of by row number.
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=filtered_df['title'],
    columns=filtered_df['title'],
)
similarity_df.head(n=5)

In [None]:
# Define the recommendation function
def get_recommendations(title, similarity_df, top_n=5):
    """Get the top N book recommendations based on the similarity matrix.

    Parameters:
    - title: Title of the book for which to find recommendations.
    - similarity_df: Square DataFrame of pairwise similarity scores whose
      index and columns are both book titles.
    - top_n: Number of recommendations to return (default is 5).

    Returns:
    - recommendations: Series of similarity scores indexed by recommended
      title, highest score first. The queried title itself is never part of
      the result. If the title is not in ``similarity_df.index``, a
      "not found" message string is returned instead.
    """
    # Check if the title exists in the DataFrame
    if title not in similarity_df.index:
        return f"Book titled '{title}' not found in the dataset."

    scores = similarity_df[title]

    # A title that appears more than once selects several columns at once
    # (a DataFrame, which cannot be sorted without a key). Use the first
    # matching book as the query in that case.
    if scores.ndim == 2:
        scores = scores.iloc[:, 0]

    # Remove the queried book by label rather than by "skip the first sorted
    # row": when another book has the same top score, the queried book is not
    # guaranteed to be sorted first.
    candidates = scores.drop(labels=title)

    # Highest similarity first, limited to the requested number of books
    return candidates.sort_values(ascending=False).head(top_n)

In [None]:
# Example: look up recommendations for one title and print them.
title = "Infinite Jest"
recommendations = get_recommendations(
    title=title,
    similarity_df=similarity_df,
)
print("Recommended books based on similarity to:", title)
print(recommendations)

## KNN model

In [None]:
from sklearn.neighbors import NearestNeighbors

# Brute-force nearest-neighbour index over the book feature rows, using
# cosine distance.
knn_model = NearestNeighbors(algorithm='brute', metric='cosine')
knn_model.fit(X=book_features)

In [15]:
def knn_recommendations(title, df, knn_model, top_n=5, features=None):
    """Get the top N book recommendations using a KNN model.

    Parameters:
    - title: Title of the book for which to find recommendations.
    - df: DataFrame containing book information; must have a 'title' column
      whose rows are in the same order as the rows the model was fitted on.
    - knn_model: Fitted KNN model exposing ``kneighbors``.
    - top_n: Number of recommendations to return (default is 5).
    - features: DataFrame of feature rows the model was fitted on. Defaults
      to the notebook-level ``book_features`` when omitted.

    Returns:
    - recommendations: Series named "Recommendations" whose values are the
      recommended titles and whose index holds each title's distance to the
      queried book (smaller means more similar), nearest first. If the title
      is not in ``df['title']``, a "not found" message string is returned
      instead.
    """
    # Locate the book by row POSITION. The position is used with .iloc below,
    # so an index label would only be correct for a default 0..n-1 index.
    title_hits = (df['title'] == title).to_numpy().nonzero()[0]
    if title_hits.size == 0:
        return f"Book titled '{title}' not found in the dataset."
    book_pos = int(title_hits[0])

    if features is None:
        features = book_features

    # Ask for one extra neighbour because the queried book is itself part of
    # the fitted data; never ask for more neighbours than there are rows.
    n_neighbors = min(top_n + 1, len(features))

    # Passing a one-row DataFrame keeps the column names seen during fit.
    distances, indices = knn_model.kneighbors(
        features.iloc[[book_pos]], n_neighbors=n_neighbors
    )

    # Drop the queried book wherever it appears. With tied distances it is
    # not guaranteed to be the first neighbour returned.
    is_other_book = indices[0] != book_pos
    neighbor_positions = indices[0][is_other_book][:top_n]
    neighbor_distances = distances[0][is_other_book][:top_n]

    recommended_titles = df.iloc[neighbor_positions]['title'].values

    # Titles as values, distances as index (same layout as before)
    recommendations = pd.Series(
        recommended_titles, index=neighbor_distances, name="Recommendations"
    )

    return recommendations

In [None]:
# Example: KNN-based recommendations for one title.
test_title = "Infinite Jest"
knn_recommendations(
    title=test_title,
    df=filtered_df,
    knn_model=knn_model,
)