In [13]:
# 1. Import necessary libraries

import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity


In [14]:
# 2. Load datasets

# ISBN is the join key between ratings and books. Read it as text in both
# files so that leading zeros and non-digit characters are preserved and the
# key has the same dtype on both sides of the later merge.
ratings_df = pd.read_csv('../data/ratings.csv', dtype={'ISBN': str})

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns.
# NOTE(review): the saved output of this cell echoes this read_csv line, which
# is how a parser warning is reported -- presumably a DtypeWarning; confirm
# that it is gone after re-running.
books_df = pd.read_csv('../data/books.csv', dtype={'ISBN': str}, low_memory=False)

users_df = pd.read_csv('../data/users.csv')

  books_df = pd.read_csv('../data/books.csv')


In [15]:
# Visual sanity check of the loaded users table (long frames are shown truncated).
users_df

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",
...,...,...,...
278853,278854,"portland, oregon, usa",
278854,278855,"tacoma, washington, united kingdom",50.0
278855,278856,"brampton, ontario, canada",
278856,278857,"knoxville, tennessee, usa",


In [16]:
# 3. Initial filtering to reduce noise

# Books that received 20 or more ratings
book_counts = ratings_df['ISBN'].value_counts()
books_to_keep = book_counts.loc[book_counts.ge(20)].index

# Users who submitted 50 or more ratings
user_counts = ratings_df['User-ID'].value_counts()
users_to_keep = user_counts.loc[user_counts.ge(50)].index

# Keep a rating only when both its user and its book passed the thresholds.
# NOTE: this rebinds ratings_df, so running the cell a second time filters the
# already-filtered frame again; reload the data before re-running it.
ratings_df = ratings_df.loc[
    ratings_df['User-ID'].isin(users_to_keep)
    & ratings_df['ISBN'].isin(books_to_keep)
]

In [17]:
# 4. Merge ratings with books to get titles
# Both joins are left joins: every rating row is kept, and the looked-up
# columns are NaN when the ISBN or User-ID has no match in the lookup table.
merged = (
    ratings_df
    .merge(books_df[['ISBN', 'Book-Title']], how='left', on='ISBN')
    .merge(users_df[['User-ID', 'Location', 'Age']], how='left', on='User-ID')
)

In [18]:
# This block serves just for finding a book title

keyword = "Lord of the Rings"  # text to look for anywhere in the title (case-insensitive)

# regex=False makes the keyword a literal substring, so characters such as
# "+", "(" or "?" in a title fragment are matched as written instead of being
# interpreted as a regular expression. Rows without a title never match
# (na=False).
matching_books = merged[merged['Book-Title'].str.contains(keyword, case=False, na=False, regex=False)]

# Show count
print(f"Number of books with '{keyword}' in the title: {matching_books['Book-Title'].nunique()}")

print(matching_books['Book-Title'].dropna().unique())


Number of books with 'Lord of the Rings' in the title: 7
['The Hobbit : The Enchanting Prelude to The Lord of the Rings'
 'The Fellowship of the Ring (The Lord of the Rings, Part 1)'
 'The Two Towers (The Lord of the Rings, Part 2)'
 'The Return of the King (The Lord of The Rings, Part 3)'
 'The Return of the King (The Lord of the Rings, Part 3)'
 'The Lord of the Rings (Movie Art Cover)'
 "Bored of the Rings: A Parody of J.R.R. Tolkien's the Lord of the Rings"]


In [19]:
# 5. Further filtering for collaborative filtering (reduce noise)

min_reviews_books = 20
min_reviews_users = 30

# Rating counts are taken from ratings_df; keep the keys that reach the minimum.
valid_users = (
    ratings_df['User-ID']
    .value_counts()
    .loc[lambda counts: counts >= min_reviews_users]
    .index
)
valid_books = (
    ratings_df['ISBN']
    .value_counts()
    .loc[lambda counts: counts >= min_reviews_books]
    .index
)

# Restrict the merged ratings to those users and books.
filtered = merged.loc[
    merged['User-ID'].isin(valid_users) & merged['ISBN'].isin(valid_books)
]

# Cap the matrix size: at most the 5000 most active users and the 5000 most
# rated books among the rows that are left.
top_users = filtered['User-ID'].value_counts().nlargest(5000).index
top_books = filtered['ISBN'].value_counts().nlargest(5000).index

filtered = filtered.loc[
    filtered['User-ID'].isin(top_users) & filtered['ISBN'].isin(top_books)
]

In [20]:
# 6. Create user-book matrix (pivot table)

# One row per user, one column per ISBN, cells hold the rating. Repeated
# (user, ISBN) pairs are averaged (pivot_table's default aggregation) and
# pairs without any rating are filled with 0.
user_book_matrix = pd.pivot_table(
    filtered,
    values='Book-Rating',
    index='User-ID',
    columns='ISBN',
    fill_value=0,
)
print(f"User-book matrix shape: {user_book_matrix.shape}")

User-book matrix shape: (2076, 4554)


In [21]:
# 7. Compute cosine similarity between books

# Sparse copy of the ratings matrix (rows = users, columns = books).
user_book_sparse = csr_matrix(user_book_matrix.to_numpy())

# Transposing puts books on the rows, so the pairwise result is book-by-book;
# dense_output=False keeps it sparse.
item_similarity = cosine_similarity(user_book_sparse.transpose(), dense_output=False)

# Dense, ISBN-labelled table so similarities can be looked up by ISBN.
item_similarity_df = pd.DataFrame(
    item_similarity.toarray(),
    index=user_book_matrix.columns,
    columns=user_book_matrix.columns,
)

In [22]:
# 8. Create mapping between ISBN and book titles

# ISBN -> title. Rows without a title (ISBNs that found no match in books_df
# during the left merge) are dropped, and each ISBN is kept once so the
# Series index is unique and label lookups return a single title.
isbn_to_title = (
    filtered[['ISBN', 'Book-Title']]
    .dropna(subset=['Book-Title'])
    .drop_duplicates(subset='ISBN')
    .set_index('ISBN')['Book-Title']
)

# Title -> ISBN. The same title can exist under several ISBNs; map the title
# to the ISBN with the most ratings in `filtered` instead of whichever one
# happens to be seen last.
title_to_isbn = (
    isbn_to_title
    .reset_index()
    .assign(n_ratings=lambda frame: frame['ISBN'].map(filtered['ISBN'].value_counts()))
    .sort_values('n_ratings', ascending=False, kind='stable')
    .drop_duplicates(subset='Book-Title')
    .set_index('Book-Title')['ISBN']
    .to_dict()
)

In [23]:
# 9. Recommend Similar Books Function

def recommend_similar_books(book_title, item_sim_df=item_similarity_df, isbn_to_title=isbn_to_title, title_to_isbn=title_to_isbn, top_n=5):
    """Return the titles of the books most similar to ``book_title``.

    Parameters
    ----------
    book_title : str
        Exact title to look up in ``title_to_isbn``.
    item_sim_df : pandas.DataFrame
        Square similarity table whose index and columns are ISBNs.
    isbn_to_title : pandas.Series
        Book titles indexed by ISBN.
    title_to_isbn : mapping
        Title -> ISBN lookup.
    top_n : int
        Maximum number of titles to return; values below 1 give an empty list.

    Returns
    -------
    list of str or str
        Up to ``top_n`` distinct titles ordered from most to least similar,
        or a message string when the title or its ISBN cannot be found.

    Notes
    -----
    The default arguments are bound when this ``def`` is executed, so the cell
    must be re-run after the similarity table or the mappings are rebuilt.
    """
    if book_title not in title_to_isbn:
        return f"Book '{book_title}' not found in dataset."

    book_isbn = title_to_isbn[book_title]

    if book_isbn not in item_sim_df.index:
        return f"Book ISBN '{book_isbn}' not found in similarity matrix."

    # Similarity of every other ISBN to the query, best first.
    sim_scores = item_sim_df.loc[book_isbn].drop(book_isbn).sort_values(ascending=False)

    # reindex needs a unique index; keep the first title seen for each ISBN.
    titles_by_isbn = isbn_to_title[~isbn_to_title.index.duplicated(keep='first')]

    # Titles in similarity order. ISBNs with no known title become NaN and are
    # dropped instead of raising or showing up as "nan" in the result.
    ranked_titles = titles_by_isbn.reindex(sim_scores.index).dropna()

    # Skip other ISBNs that carry the queried title and list each title once.
    ranked_titles = ranked_titles[ranked_titles != book_title].drop_duplicates()

    return ranked_titles.head(max(top_n, 0)).tolist()

In [24]:
# 10. Example usage

book_input = "1984"
print(f"Books similar to '{book_input}':")
# The helper returns either a list of titles or a "not found" message string;
# both are printed as-is.
recommendations = recommend_similar_books(book_input)
print(recommendations)

Books similar to '1984':
['Auntie Mayhem (Bed-And-Breakfast Mysteries (Paperback))', 'Lovely in Her Bones', 'The Color Purple', 'Liberty Falling (Anna Pigeon Mysteries (Paperback))', 'Woman of Independence']
