In the first section, the libraries and data are loaded. I created a repo and downloaded the datasets.

In [177]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler


In [178]:
# Book metadata: ISBN, title, author, publication year, publisher and image URLs.
# ISBN is an identifier (leading zeros and a trailing "X" are significant) and is the
# key used to join with the ratings, so read it as text instead of relying on inference.
# low_memory=False makes pandas infer the remaining column dtypes from the whole file at once.
books_df = pd.read_csv("data/Books.csv", low_memory=False, dtype={"ISBN": str})
books_df.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [179]:
# One row per (User-ID, ISBN, Book-Rating).
# ISBN is an identifier, not a number: values such as "0155061224" carry leading zeros
# and others end in "X". Pin it to text so chunked dtype inference can never turn part
# of the column into integers and break the join with the books table.
ratings_df = pd.read_csv("data/Ratings.csv", dtype={"ISBN": str})
ratings_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [180]:
# User table: User-ID, free-text Location and Age (Age is missing for some users).
users_df = pd.read_csv("data/Users.csv")
users_df.head(5)

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [181]:
# Minimum number of ratings a book / a user must have to be kept
MIN_RATINGS_PER_BOOK = 20
MIN_RATINGS_PER_USER = 50

# Step 1: Find books with at least MIN_RATINGS_PER_BOOK ratings
book_counts = ratings_df['ISBN'].value_counts()
books_to_keep = book_counts.loc[book_counts.ge(MIN_RATINGS_PER_BOOK)].index

# Step 2: Find users with at least MIN_RATINGS_PER_USER ratings
user_counts = ratings_df['User-ID'].value_counts()
users_to_keep = user_counts.loc[user_counts.ge(MIN_RATINGS_PER_USER)].index

# Step 3: Keep only ratings of a retained book made by a retained user.
# Both thresholds are evaluated on the counts taken *before* filtering, so a
# surviving user (or book) can end up with fewer ratings than its threshold.
# NOTE: this rebinds ratings_df, so re-running the cell filters the already
# filtered frame again; reload the CSV before re-running.
has_popular_book = ratings_df['ISBN'].isin(books_to_keep)
has_active_user = ratings_df['User-ID'].isin(users_to_keep)
ratings_df = ratings_df.loc[has_popular_book & has_active_user]

# Ratings left per user after filtering
ratings_df["User-ID"].value_counts()

User-ID
11676     3484
35859     1560
76352     1329
153662    1300
16795     1075
          ... 
148154       1
2442         1
24624        1
102532       1
196898       1
Name: count, Length: 3382, dtype: int64

In [182]:
# Enrich every rating with the book title and with the rater's location and age.
# Both joins are left joins: all rating rows are kept, and a rating whose ISBN or
# User-ID has no match simply gets NaN in the added columns.
book_titles = books_df[['ISBN', 'Book-Title']]
user_profiles = users_df[['User-ID', 'Location', 'Age']]
merged = (
    ratings_df
    .merge(book_titles, on='ISBN', how='left')
    .merge(user_profiles, on='User-ID', how='left')
)

In [183]:
# Look up titles in `merged` (ratings joined with book info) whose
# 'Book-Title' contains a given phrase.

keyword = "Lord of the Rings"  # phrase to search for (matched case-insensitively)

# Keep rows whose title contains the keyword.
# regex=False treats the keyword as literal text, so characters such as "(", "+",
# "." or "?" in a title fragment are matched as-is instead of being interpreted
# as (possibly invalid) regular-expression syntax.
# na=False treats rows without a title as non-matching.
matching_books = merged[merged['Book-Title'].str.contains(keyword, case=False, na=False, regex=False)]

# Show how many distinct titles matched
print(f"Number of books with '{keyword}' in the title: {matching_books['Book-Title'].nunique()}")

# Optionally, display the matching titles (unique)
print(matching_books['Book-Title'].dropna().unique())


Number of books with 'Lord of the Rings' in the title: 7
['The Hobbit : The Enchanting Prelude to The Lord of the Rings'
 'The Fellowship of the Ring (The Lord of the Rings, Part 1)'
 'The Two Towers (The Lord of the Rings, Part 2)'
 'The Return of the King (The Lord of The Rings, Part 3)'
 'The Return of the King (The Lord of the Rings, Part 3)'
 'The Lord of the Rings (Movie Art Cover)'
 "Bored of the Rings: A Parody of J.R.R. Tolkien's the Lord of the Rings"]


In [184]:
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Work on a copy so the merged frame itself is left untouched
ratings = merged.copy()

# Keep only users and books that appear in at least `min_ratings` rows (to reduce noise)
min_ratings = 3
ratings_per_user = ratings['User-ID'].value_counts()
ratings_per_book = ratings['ISBN'].value_counts()
valid_users = ratings_per_user[ratings_per_user >= min_ratings].index
valid_books = ratings_per_book[ratings_per_book >= min_ratings].index

by_valid_user = ratings['User-ID'].isin(valid_users)
of_valid_book = ratings['ISBN'].isin(valid_books)
filtered = ratings[by_valid_user & of_valid_book]

# Optional performance cap: keep at most the 5000 most frequent users and books
top_users = filtered['User-ID'].value_counts().nlargest(5000).index
top_books = filtered['ISBN'].value_counts().nlargest(5000).index
by_top_user = filtered['User-ID'].isin(top_users)
of_top_book = filtered['ISBN'].isin(top_books)
filtered = filtered[by_top_user & of_top_book]

# User-item matrix: one row per user, one column per ISBN, cell = rating.
# pivot_table aggregates with the mean by default, so repeated (user, book)
# pairs are averaged; pairs with no rating are filled with 0.
# NOTE(review): the data also contains stored ratings of 0, which this matrix
# cannot tell apart from "not rated" - confirm that is intended.
user_book_matrix = filtered.pivot_table(
    index='User-ID',
    columns='ISBN',
    values='Book-Rating',
    fill_value=0,
)

print(f"User-book matrix shape: {user_book_matrix.shape}")


User-book matrix shape: (3281, 5000)


In [185]:
# Sparse (CSR) version of the user-by-book ratings matrix
user_book_sparse = csr_matrix(user_book_matrix.to_numpy())

# cosine_similarity compares the rows of its input with each other, so the
# matrix is transposed first to make every book (column) a row.
# dense_output=False keeps the result sparse.
item_similarity = cosine_similarity(user_book_sparse.T, dense_output=False)

# Dense, ISBN-labelled book-by-book table for easy lookups
item_similarity_df = pd.DataFrame(
    item_similarity.toarray(),
    index=user_book_matrix.columns,
    columns=user_book_matrix.columns,
)


In [186]:
# Map ISBN to Book Titles (for easy lookup)
isbn_to_title = filtered[['ISBN', 'Book-Title']].drop_duplicates().set_index('ISBN')['Book-Title']

# Reverse lookup: title -> ISBN.
# ISBNs without a title (NaN after the left merge with the books table) are
# skipped so that NaN never becomes a dictionary key.
# NOTE: when several ISBNs share one title, the last one encountered wins.
title_to_isbn = {title: isbn for isbn, title in isbn_to_title.dropna().items()}

def recommend_similar_books(book_title, item_sim_df=item_similarity_df, isbn_to_title=isbn_to_title, title_to_isbn=title_to_isbn, top_n=5):
    """Return the titles of the books most similar to ``book_title``.

    Parameters
    ----------
    book_title : str
        Title to look up; it must match a key of ``title_to_isbn`` exactly.
    item_sim_df : pandas.DataFrame
        Square book-by-book similarity table whose index and columns are ISBNs.
    isbn_to_title : pandas.Series
        Titles indexed by ISBN.
    title_to_isbn : dict
        Reverse lookup from title to ISBN.
    top_n : int
        Maximum number of titles to return.

    Returns
    -------
    list of str or str
        Up to ``top_n`` distinct titles, most similar first. If the title (or
        its ISBN) is unknown, a human-readable error message is returned
        instead of a list.

    Notes
    -----
    The same title can be stored under several ISBNs, and some titles differ
    only in letter case. Recommendations are therefore de-duplicated by
    case-folded title, the queried title itself is never recommended, and
    ISBNs without a known title are skipped.
    """
    if book_title not in title_to_isbn:
        return f"Book '{book_title}' not found in dataset."

    book_isbn = title_to_isbn[book_title]

    if book_isbn not in item_sim_df.index:
        return f"Book ISBN '{book_isbn}' not found in similarity matrix."

    # Similarity of the queried book to every other ISBN, best first
    sim_scores = item_sim_df.loc[book_isbn].drop(book_isbn).sort_values(ascending=False)

    # Titles in ranked order; ISBNs with no (or a missing) title are skipped
    known_isbns = sim_scores.index[sim_scores.index.isin(isbn_to_title.index)]
    ranked_titles = isbn_to_title.loc[known_isbns].dropna()

    # Compare titles case-insensitively so that other editions of the queried
    # book and repeated titles are not recommended
    title_keys = ranked_titles.astype(str).str.casefold()
    is_other_book = (title_keys != str(book_title).casefold()).to_numpy()
    is_first_hit = (~title_keys.duplicated()).to_numpy()

    recommended_titles = ranked_titles[is_other_book & is_first_hit].head(top_n).tolist()

    return recommended_titles


In [187]:
# Title to get recommendations for (replace with any title from the dataset)
book_input = "The Two Towers (The Lord of the Rings, Part 2)"

print(f"Books similar to '{book_input}':")
similar_books = recommend_similar_books(book_input)
print(similar_books)


Books similar to 'The Two Towers (The Lord of the Rings, Part 2)':
['The Return of the King (The Lord of the Rings, Part 3)', 'The Fellowship of the Ring (The Lord of the Rings, Part 1)', 'The Silmarillion', 'The Hobbit : The Enchanting Prelude to The Lord of the Rings', 'City of Golden Shadow (Otherland, Volume 1)']
