# Imports

In [1]:
import pandas as pd
import numpy as np

In [2]:
books = pd.read_csv('BX-Books.csv', sep=';', encoding='latin-1', on_bad_lines='skip')

  books = pd.read_csv('BX-Books.csv', sep=';', encoding='latin-1', on_bad_lines='skip')


In [3]:
users = pd.read_csv('BX-Users.csv', sep=';', encoding='latin-1', on_bad_lines='skip')

In [4]:
ratings = pd.read_csv('BX-Book-Ratings.csv', sep=';', encoding='latin-1', on_bad_lines='skip')

# Data Cleaning

## Books

In [5]:
books.shape

(271360, 8)

In [6]:
books.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L'],
      dtype='object')

In [7]:
books.drop(columns=['Image-URL-S', 'Image-URL-M'], inplace=True)

In [8]:
books.rename(columns={'Book-Title': 'title', 
                      'Book-Author': 'author', 
                      'Year-Of-Publication': 'year', 
                      'Publisher': 'publisher',
                      'Image-URL-L': 'image_url'}, inplace=True)

In [9]:
books.head()

Unnamed: 0,ISBN,title,author,year,publisher,image_url
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...


## Users

In [10]:
users.columns

Index(['User-ID', 'Location', 'Age'], dtype='object')

In [11]:
users.shape

(278858, 3)

In [12]:
users.rename(columns={'User-ID': 'user_id', 
                      'Location': 'location', 
                      'Age': 'age'}, inplace=True)

In [13]:
users.head()

Unnamed: 0,user_id,location,age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


## Ratings

In [14]:
ratings.shape

(1149780, 3)

In [15]:
ratings.columns

Index(['User-ID', 'ISBN', 'Book-Rating'], dtype='object')

In [16]:
ratings.rename(columns={'User-ID': 'user_id', 
                      'Book-Rating': 'rating'}, inplace=True)

In [17]:
ratings.head()

Unnamed: 0,user_id,ISBN,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


# Exploratory Analysis

In [18]:
ratings['user_id'].value_counts()

user_id
11676     13602
198711     7550
153662     6109
98391      5891
35859      5850
          ...  
119573        1
276706        1
276697        1
276679        1
276676        1
Name: count, Length: 105283, dtype: int64

In [19]:
rating_mask = ratings['user_id'].value_counts() > 200 #we only count ratings of users who have read more than 50
rating_mask[rating_mask].shape

(899,)

In [20]:
index = rating_mask[rating_mask].index
index

Index([ 11676, 198711, 153662,  98391,  35859, 212898, 278418,  76352, 110973,
       235105,
       ...
       116122,  44296,  28634,  59727,  73681, 274808, 188951,   9856, 155916,
       268622],
      dtype='int64', name='user_id', length=899)

In [21]:
ratings = ratings[ratings['user_id'].isin(index)]
ratings.head()

Unnamed: 0,user_id,ISBN,rating
1456,277427,002542730X,10
1457,277427,0026217457,0
1458,277427,003008685X,8
1459,277427,0030615321,0
1460,277427,0060002050,0


In [22]:
ratings_with_books = ratings.merge(books, on='ISBN')
ratings_with_books.head()

Unnamed: 0,user_id,ISBN,rating,title,author,year,publisher,image_url
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...
1,277427,0026217457,0,Vegetarian Times Complete Cookbook,Lucy Moll,1995,John Wiley &amp; Sons,http://images.amazon.com/images/P/0026217457.0...
2,277427,003008685X,8,Pioneers,James Fenimore Cooper,1974,Thomson Learning,http://images.amazon.com/images/P/003008685X.0...
3,277427,0030615321,0,"Ask for May, Settle for June (A Doonesbury book)",G. B. Trudeau,1982,Henry Holt &amp; Co,http://images.amazon.com/images/P/0030615321.0...
4,277427,0060002050,0,On a Wicked Dawn (Cynster Novels),Stephanie Laurens,2002,Avon Books,http://images.amazon.com/images/P/0060002050.0...


In [23]:
num_rating = ratings_with_books.groupby('title')['rating'].count().reset_index()
num_rating.head()

Unnamed: 0,title,rating
0,A Light in the Storm: The Civil War Diary of ...,2
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,Beyond IBM: Leadership Marketing and Finance ...,1
4,Clifford Visita El Hospital (Clifford El Gran...,1


In [24]:
num_rating.rename(columns={'rating': 'num_of_rating'}, inplace=True)

In [25]:
final_rating = ratings_with_books.merge(num_rating, on='title')
final_rating.head()

Unnamed: 0,user_id,ISBN,rating,title,author,year,publisher,image_url,num_of_rating
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
1,277427,0026217457,0,Vegetarian Times Complete Cookbook,Lucy Moll,1995,John Wiley &amp; Sons,http://images.amazon.com/images/P/0026217457.0...,7
2,277427,003008685X,8,Pioneers,James Fenimore Cooper,1974,Thomson Learning,http://images.amazon.com/images/P/003008685X.0...,1
3,277427,0030615321,0,"Ask for May, Settle for June (A Doonesbury book)",G. B. Trudeau,1982,Henry Holt &amp; Co,http://images.amazon.com/images/P/0030615321.0...,1
4,277427,0060002050,0,On a Wicked Dawn (Cynster Novels),Stephanie Laurens,2002,Avon Books,http://images.amazon.com/images/P/0060002050.0...,13


In [26]:
# lets drop the duplicates
final_rating.drop_duplicates(['user_id','title'],inplace=True)
final_rating.isnull().sum()

user_id          0
ISBN             0
rating           0
title            0
author           1
year             0
publisher        2
image_url        3
num_of_rating    0
dtype: int64

In [27]:
final_rating

Unnamed: 0,user_id,ISBN,rating,title,author,year,publisher,image_url,num_of_rating
0,277427,002542730X,10,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner,1994,John Wiley &amp; Sons Inc,http://images.amazon.com/images/P/002542730X.0...,82
1,277427,0026217457,0,Vegetarian Times Complete Cookbook,Lucy Moll,1995,John Wiley &amp; Sons,http://images.amazon.com/images/P/0026217457.0...,7
2,277427,003008685X,8,Pioneers,James Fenimore Cooper,1974,Thomson Learning,http://images.amazon.com/images/P/003008685X.0...,1
3,277427,0030615321,0,"Ask for May, Settle for June (A Doonesbury book)",G. B. Trudeau,1982,Henry Holt &amp; Co,http://images.amazon.com/images/P/0030615321.0...,1
4,277427,0060002050,0,On a Wicked Dawn (Cynster Novels),Stephanie Laurens,2002,Avon Books,http://images.amazon.com/images/P/0060002050.0...,13
...,...,...,...,...,...,...,...,...,...
487666,275970,1931868123,0,There's a Porcupine in My Outhouse: Misadventu...,Mike Tougias,2002,Capital Books (VA),http://images.amazon.com/images/P/1931868123.0...,1
487667,275970,3411086211,10,Die Biene.,Sybil GrÃ?Â¤fin SchÃ?Â¶nfeldt,1993,"Bibliographisches Institut, Mannheim",http://images.amazon.com/images/P/3411086211.0...,1
487668,275970,3829021860,0,The Penis Book,Joseph Cohen,1999,Konemann,http://images.amazon.com/images/P/3829021860.0...,1
487669,275970,4770019572,0,Musashi,Eiji Yoshikawa,1995,Kodansha International (JPN),http://images.amazon.com/images/P/4770019572.0...,1


# Model Building

All the sections are built to run separately.

In [28]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
import re
from collections import Counter

In [30]:
def prepare_book_features(books_df):
    """
    Prepare book features for content-based filtering
    """
    books_features = books_df.copy()
    books_features['combined_text'] = (
        books_features['title'].fillna('') + ' ' +
        books_features['author'].fillna('') + ' ' +
        books_features['publisher'].fillna('')
    )
    books_features['combined_text'] = books_features['combined_text'].apply(
        lambda x: re.sub(r'[^a-zA-Z\s]', '', str(x).lower())
    )
    books_features['year'] = pd.to_numeric(books_features['year'], errors='coerce')
    books_features['year'] = books_features['year'].fillna(books_features['year'].median())
    return books_features

books_with_features = prepare_book_features(final_rating).drop_duplicates('ISBN').reset_index(drop=True)

print(f"Prepared features for {len(books_with_features)} books")
print("\nSample book features:")
print(books_with_features[['title', 'author', 'year', 'publisher', 'combined_text']].head())

Prepared features for 177518 books

Sample book features:
                                               title                 author  \
0  Politically Correct Bedtime Stories: Modern Ta...      James Finn Garner   
1                 Vegetarian Times Complete Cookbook             Lucy  Moll   
2                                           Pioneers  James Fenimore Cooper   
3   Ask for May, Settle for June (A Doonesbury book)          G. B. Trudeau   
4                  On a Wicked Dawn (Cynster Novels)      Stephanie Laurens   

     year                  publisher  \
0  1994.0  John Wiley &amp; Sons Inc   
1  1995.0      John Wiley &amp; Sons   
2  1974.0           Thomson Learning   
3  1982.0        Henry Holt &amp; Co   
4  2002.0                 Avon Books   

                                       combined_text  
0  politically correct bedtime stories modern tal...  
1  vegetarian times complete cookbook lucy  moll ...  
2    pioneers james fenimore cooper thomson learning  
3  ask

In [31]:
# Create TF-IDF vectors for book text features
def create_tfidf_features(books_df, max_features=1000):
    """
    Create TF-IDF features from book text
    """
    tfidf = TfidfVectorizer(
        max_features=max_features,
        stop_words='english',
        ngram_range=(1, 2),  # Include both unigrams and bigrams
        min_df=2,  # Ignore terms that appear in less than 2 documents
        max_df=0.8  # Ignore terms that appear in more than 80% of documents
    )
    
    tfidf_matrix = tfidf.fit_transform(books_df['combined_text'])
    return tfidf_matrix, tfidf

tfidf_matrix, tfidf_vectorizer = create_tfidf_features(books_with_features)
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
print(f"Number of features: {tfidf_matrix.shape[1]}")

# Show some feature names
feature_names = tfidf_vectorizer.get_feature_names_out()
print(f"\nSample feature names: {feature_names[:20]}")

TF-IDF matrix shape: (177518, 1000)
Number of features: 1000

Sample feature names: ['ace' 'ace books' 'adams' 'adventure' 'adventures' 'agatha'
 'agatha christie' 'age' 'aladdin' 'alan' 'albert' 'alex' 'alexander'
 'alfred' 'alfred knopf' 'alice' 'allan' 'allen' 'amer' 'amer library']


## Content Based Filtering

In [32]:
from sklearn.preprocessing import normalize
import numpy as np

# L2-normalize rows in sparse CSR tfidf_matrix (in-place copy to keep original if needed)
tfidf_norm = normalize(tfidf_matrix, norm='l2', axis=1, copy=True)

def cosine_similarities_sparse(user_profile: np.ndarray):
    """
    Compute cosine similarities between a dense user profile (1 x D)
    and all books using sparse, row-normalized TF-IDF (N x D).
    Returns a dense 1-D numpy array of length N.
    """
    up = user_profile.astype(np.float32, copy=True)
    norm = np.linalg.norm(up)
    if norm > 0:
        up = up / norm
    sims = tfidf_norm @ up  # (N, D) @ (D,) -> (N,)
    return np.asarray(sims).ravel()

In [33]:
class ContentBasedRecommender:
    def __init__(self, books_df, tfidf_matrix, tfidf_vectorizer):
        self.books_df = books_df
        self.tfidf_matrix = tfidf_matrix
        self.tfidf_vectorizer = tfidf_vectorizer
        
    def create_user_profile(self, liked_books_isbns, ratings=None):
        """
        Create a user profile based on books they liked
        
        Args:
            liked_books_isbns: List of ISBNs of books the user liked
            ratings: Optional list of ratings for the books (1-10 scale)
        
        Returns:
            user_profile: TF-IDF vector representing user preferences
        """
        if ratings is None:
            ratings = [1.0] * len(liked_books_isbns)  # Default equal weight
        
        # Normalize ratings to 0-1 scale
        ratings = np.array(ratings) / 10.0
        
        # Find book indices
        book_indices = []
        valid_ratings = []
        
        for isbn, rating in zip(liked_books_isbns, ratings):
            book_idx = self.books_df[self.books_df['ISBN'] == isbn].index
            if len(book_idx) > 0:
                book_indices.append(book_idx[0])
                valid_ratings.append(rating)
        
        if len(book_indices) == 0:
            print("Warning: No valid books found in the dataset")
            return None
            
        # Create weighted user profile
        user_profile = np.zeros(self.tfidf_matrix.shape[1])
        
        for idx, rating in zip(book_indices, valid_ratings):
            user_profile += rating * self.tfidf_matrix[idx].toarray().flatten()
        
        # Normalize by number of books
        user_profile = user_profile / len(book_indices)
        
        return user_profile
    
    def get_recommendations(self, user_profile, n_recommendations=10, exclude_books=None):
        """
        Get book recommendations based on user profile
        
        Args:
            user_profile: TF-IDF vector representing user preferences
            n_recommendations: Number of recommendations to return
            exclude_books: List of ISBNs to exclude from recommendations
        
        Returns:
            recommendations: DataFrame with recommended books
        """
        if user_profile is None:
            return pd.DataFrame()
        
        # Calculate similarity between user profile and all books (sparse CPU path)
        similarities = cosine_similarities_sparse(user_profile)
        
        # Create recommendations dataframe
        recommendations = self.books_df.copy()
        recommendations['similarity_score'] = similarities
        
        # Exclude books if specified
        if exclude_books:
            recommendations = recommendations[~recommendations['ISBN'].isin(exclude_books)]
        
        # Sort by similarity and return top recommendations
        recommendations = recommendations.sort_values('similarity_score', ascending=False).drop_duplicates(subset='ISBN', keep='first')
        
        return recommendations.head(n_recommendations)
    
    def get_similar_books(self, book_isbn, n_similar=5):
        """
        Get books similar to a given book
        
        Args:
            book_isbn: ISBN of the reference book
            n_similar: Number of similar books to return
        
        Returns:
            similar_books: DataFrame with similar books
        """
        # Find book index
        book_idx = self.books_df[self.books_df['ISBN'] == book_isbn].index
        if len(book_idx) == 0:
            print(f"Book with ISBN {book_isbn} not found")
            return pd.DataFrame()
        
        book_idx = book_idx[0]
        
        # Compute similarity to all books using the selected book's TF-IDF vector
        ref_vec = self.tfidf_matrix[book_idx].toarray().ravel()
        similarities = cosine_similarities_sparse(ref_vec)
        
        # Create similar books dataframe
        similar_books = self.books_df.copy()
        similar_books['similarity_score'] = similarities
        
        # Exclude the reference book itself
        similar_books = similar_books[similar_books['ISBN'] != book_isbn]
        
        # Sort by similarity and return top similar books
        similar_books = similar_books.sort_values('similarity_score', ascending=False).drop_duplicates(subset='ISBN', keep='first')
        
        return similar_books.head(n_similar)

# Initialize the content-based recommender
content_recommender = ContentBasedRecommender(books_with_features, tfidf_matrix, tfidf_vectorizer)
print("Content-based recommender initialized successfully!")

Content-based recommender initialized successfully!


## Memory Efficient

In [53]:
# Memory-Efficient Content-Based Filtering Solutions

import torch
import torch.nn.functional as F
from sklearn.neighbors import NearestNeighbors
import gc

print("=== Memory-Efficient Solutions for Large-Scale Content-Based Filtering ===\n")
print(f"Current TF-IDF matrix shape: {tfidf_matrix.shape}")
print(f"Full similarity matrix would be: {tfidf_matrix.shape[0]} x {tfidf_matrix.shape[0]} = {tfidf_matrix.shape[0]**2:,} elements")
print(f"Estimated memory needed: {(tfidf_matrix.shape[0]**2 * 8) / (1024**3):.2f} GB (float64)")
print("\nSolutions implemented below:")

=== Memory-Efficient Solutions for Large-Scale Content-Based Filtering ===

Current TF-IDF matrix shape: (177518, 1000)
Full similarity matrix would be: 177518 x 177518 = 31,512,640,324 elements
Estimated memory needed: 234.79 GB (float64)

Solutions implemented below:


In [54]:
# Solution 1: GPU-Based Batch Similarity Computation (optional)
class MemoryEfficientContentRecommender:
    def __init__(self, books_df, tfidf_matrix, tfidf_vectorizer, device='cuda' if torch.cuda.is_available() else 'cpu'):
        self.books_df = books_df
        self.tfidf_matrix = tfidf_matrix
        self.tfidf_vectorizer = tfidf_vectorizer
        self.device = device
        
        # Convert TF-IDF matrix to PyTorch tensor on GPU
        print(f"Converting TF-IDF matrix to PyTorch tensor on {device}...")
        self.tfidf_tensor = torch.tensor(tfidf_matrix.toarray(), dtype=torch.float32, device=device)
        print(f"TF-IDF tensor shape: {self.tfidf_tensor.shape}")
        print(f"TF-IDF tensor memory usage: {self.tfidf_tensor.element_size() * self.tfidf_tensor.nelement() / (1024**2):.2f} MB")
        
        # Pre-compute norms for efficient cosine similarity
        self.tfidf_norms = F.normalize(self.tfidf_tensor, p=2, dim=1)

    def create_user_profile(self, liked_books_isbns, ratings=None):
        """
        Create a user profile based on books they liked
        
        Args:
            liked_books_isbns: List of ISBNs of books the user liked
            ratings: Optional list of ratings for the books (1-10 scale)
        
        Returns:
            user_profile: TF-IDF vector representing user preferences
        """
        if ratings is None:
            ratings = [1.0] * len(liked_books_isbns)  # Default equal weight
        
        # Normalize ratings to 0-1 scale
        ratings = np.array(ratings) / 10.0
        
        # Find book indices
        book_indices = []
        valid_ratings = []
        
        for isbn, rating in zip(liked_books_isbns, ratings):
            book_idx = self.books_df[self.books_df['ISBN'] == isbn].index
            if len(book_idx) > 0:
                book_indices.append(book_idx[0])
                valid_ratings.append(rating)
        
        if len(book_indices) == 0:
            print("Warning: No valid books found in the dataset")
            return None
            
        # Create weighted user profile
        user_profile = np.zeros(self.tfidf_matrix.shape[1])
        
        for idx, rating in zip(book_indices, valid_ratings):
            user_profile += rating * self.tfidf_matrix[idx].toarray().flatten()
        
        # Normalize by number of books
        user_profile = user_profile / len(book_indices)
        
        return user_profile
    
    def compute_similarity_batch(self, query_vector, batch_size=1000):
        """
        Compute similarity between query vector and all books in batches
        """
        query_vector = torch.tensor(query_vector, dtype=torch.float32, device=self.device)
        query_norm = F.normalize(query_vector.unsqueeze(0), p=2, dim=1)
        
        similarities = torch.zeros(self.tfidf_tensor.shape[0], device=self.device)
        
        # Process in batches to avoid memory issues
        for i in range(0, self.tfidf_tensor.shape[0], batch_size):
            end_idx = min(i + batch_size, self.tfidf_tensor.shape[0])
            batch_norms = self.tfidf_norms[i:end_idx]
            
            # Compute cosine similarity
            batch_similarities = torch.mm(query_norm, batch_norms.T).squeeze()
            similarities[i:end_idx] = batch_similarities
            
            # Clear batch from GPU memory
            del batch_norms, batch_similarities
            torch.cuda.empty_cache() if self.device == 'cuda' else None
        
        return similarities.cpu().numpy()
    
    def get_recommendations(self, user_profile, n_recommendations=10, exclude_books=None, batch_size=1000):
        """
        Get recommendations using GPU batch processing
        """
        if user_profile is None:
            return pd.DataFrame()
        
        # Compute similarities in batches
        similarities = self.compute_similarity_batch(user_profile, batch_size)
        
        # Create recommendations dataframe
        recommendations = self.books_df.copy()
        recommendations['similarity_score'] = similarities
        
        # Exclude books if specified
        if exclude_books:
            recommendations = recommendations[~recommendations['ISBN'].isin(exclude_books)]
        
        # Sort by similarity and return top recommendations
        recommendations = recommendations.sort_values('similarity_score', ascending=False)
        
        return recommendations.head(n_recommendations)
    
    def get_similar_books(self, book_isbn, n_similar=5, batch_size=1000):
        """
        Get similar books using GPU batch processing
        """
        # Find book index
        book_idx = self.books_df[self.books_df['ISBN'] == book_isbn].index
        if len(book_idx) == 0:
            print(f"Book with ISBN {book_isbn} not found")
            return pd.DataFrame()
        
        book_idx = book_idx[0]
        
        # Get the book's TF-IDF vector
        book_vector = self.tfidf_tensor[book_idx]
        
        # Compute similarities in batches
        similarities = self.compute_similarity_batch(book_vector.cpu().numpy(), batch_size)
        
        # Create similar books dataframe
        similar_books = self.books_df.copy()
        similar_books['similarity_score'] = similarities
        
        # Exclude the reference book itself
        similar_books = similar_books[similar_books['ISBN'] != book_isbn]
        
        # Sort by similarity and return top similar books
        similar_books = similar_books.sort_values('similarity_score', ascending=False)
        
        return similar_books.head(n_similar)


print("Initializing GPU-based content recommender...")
gpu_recommender = MemoryEfficientContentRecommender(books_with_features, tfidf_matrix, tfidf_vectorizer)
print("GPU-based recommender initialized successfully!")


Initializing GPU-based content recommender...
Converting TF-IDF matrix to PyTorch tensor on cuda...
TF-IDF tensor shape: torch.Size([177518, 1000])
TF-IDF tensor memory usage: 677.18 MB
GPU-based recommender initialized successfully!


## ANN

In [40]:
import torch
import torch.nn.functional as F
from sklearn.neighbors import NearestNeighbors
import gc

In [47]:
# Solution 2: Approximate Nearest Neighbors (FAISS/Annoy)
class ApproximateContentRecommender:
    def __init__(self, books_df, tfidf_matrix, tfidf_vectorizer, n_neighbors=100):
        self.books_df = books_df
        self.tfidf_matrix = tfidf_matrix
        self.tfidf_vectorizer = tfidf_vectorizer
        self.n_neighbors = n_neighbors
        
        # Build approximate nearest neighbors index
        print("Building approximate nearest neighbors index...")
        self.nn_model = NearestNeighbors(
            n_neighbors=min(n_neighbors, tfidf_matrix.shape[0]),
            metric='cosine',
            algorithm='brute'  # Can use 'ball_tree' or 'kd_tree' for larger datasets
        )
        self.nn_model.fit(tfidf_matrix)
        print("ANN index built successfully!")

    def create_user_profile(self, liked_books_isbns, ratings=None):
        """
        Create a user profile based on books they liked
        
        Args:
            liked_books_isbns: List of ISBNs of books the user liked
            ratings: Optional list of ratings for the books (1-10 scale)
        
        Returns:
            user_profile: TF-IDF vector representing user preferences
        """
        if ratings is None:
            ratings = [1.0] * len(liked_books_isbns)  # Default equal weight
        
        # Normalize ratings to 0-1 scale
        ratings = np.array(ratings) / 10.0
        
        # Find book indices
        book_indices = []
        valid_ratings = []
        
        for isbn, rating in zip(liked_books_isbns, ratings):
            book_idx = self.books_df[self.books_df['ISBN'] == isbn].index
            if len(book_idx) > 0:
                book_indices.append(book_idx[0])
                valid_ratings.append(rating)
        
        if len(book_indices) == 0:
            print("Warning: No valid books found in the dataset")
            return None
            
        # Create weighted user profile
        user_profile = np.zeros(self.tfidf_matrix.shape[1])
        
        for idx, rating in zip(book_indices, valid_ratings):
            user_profile += rating * self.tfidf_matrix[idx].toarray().flatten()
        
        # Normalize by number of books
        user_profile = user_profile / len(book_indices)
        
        return user_profile
    
    def get_recommendations(self, user_profile, n_recommendations=10, exclude_books=None):
        """
        Get recommendations using approximate nearest neighbors
        """
        if user_profile is None:
            return pd.DataFrame()
        
        # Find nearest neighbors
        user_profile = user_profile.reshape(1, -1)
        distances, indices = self.nn_model.kneighbors(user_profile)
        
        # Create recommendations dataframe
        recommendations = self.books_df.iloc[indices[0]].copy()
        recommendations['similarity_score'] = 1 - distances[0]  # Convert distance to similarity
        
        # Exclude books if specified
        if exclude_books:
            recommendations = recommendations[~recommendations['ISBN'].isin(exclude_books)]
        
        return recommendations.head(n_recommendations)
    
    def get_similar_books(self, book_isbn, n_similar=5):
        """
        Get similar books using approximate nearest neighbors
        """
        # Find book index
        book_idx = self.books_df[self.books_df['ISBN'] == book_isbn].index
        if len(book_idx) == 0:
            print(f"Book with ISBN {book_isbn} not found")
            return pd.DataFrame()
        
        book_idx = book_idx[0]
        
        # Get the book's TF-IDF vector
        book_vector = self.tfidf_matrix[book_idx]
        
        # Find nearest neighbors
        distances, indices = self.nn_model.kneighbors(book_vector)
        
        # Create similar books dataframe
        similar_books = self.books_df.iloc[indices[0]].copy()
        similar_books['similarity_score'] = 1 - distances[0]  # Convert distance to similarity
        
        # Exclude the reference book itself
        similar_books = similar_books[similar_books['ISBN'] != book_isbn]
        
        return similar_books.head(n_similar)

# Initialize approximate nearest neighbors recommender
print("Initializing approximate nearest neighbors recommender...")
ann_recommender = ApproximateContentRecommender(books_with_features, tfidf_matrix, tfidf_vectorizer)
print("ANN recommender initialized successfully!")


Initializing approximate nearest neighbors recommender...
Building approximate nearest neighbors index...
ANN index built successfully!
ANN recommender initialized successfully!


# Demo

In [55]:
recommendation_model = gpu_recommender

In [65]:
def demo_content_based_recommendations(recommendation_model):
    """
    Demonstrate how to use content-based filtering for new users
    """
    print("=== Content-Based Filtering Demo ===\n")
    
    # Example 1: New user who likes fantasy books
    print("1. New User Profile: Fantasy Book Lover")
    print("-" * 40)
    
    # Find some fantasy books in the dataset
    fantasy_keywords = ['fantasy', 'magic', 'dragon', 'wizard', 'lord', 'ring']
    fantasy_books = books_with_features[
        books_with_features['combined_text'].str.contains('|'.join(fantasy_keywords), case=False, na=False)
    ]
    
    if len(fantasy_books) > 0:
        # Take first 3 fantasy books as user's liked books
        liked_books = fantasy_books.head(3)
        liked_isbns = liked_books['ISBN'].tolist()
        liked_ratings = [8, 9, 7]  # User ratings for these books
        
        print("User's liked books:")
        for i, (_, book) in enumerate(liked_books.iterrows()):
            print(f"  {i+1}. {book['title']} by {book['author']} (Rating: {liked_ratings[i]})")
        
        # Create user profile
        user_profile = recommendation_model.create_user_profile(liked_isbns, liked_ratings)
        
        if user_profile is not None:
            # Get recommendations
            recommendations = recommendation_model.get_recommendations(
                user_profile, 
                n_recommendations=10, 
                exclude_books=liked_isbns
            )
            
            print(f"\nTop 5 Recommendations:")
            for i, (_, book) in enumerate(recommendations.iterrows()):
                print(f"  {i+1}. {book['title']} by {book['author']} (Similarity: {book['similarity_score']:.3f}); ISBN {book['ISBN']}")
    
    print("\n" + "="*60 + "\n")
    
    # Example 2: Book-to-book similarity
    print("2. Book Similarity Example")
    print("-" * 40)
    
    # Find a popular book to use as reference
    popular_books = books_with_features[books_with_features['title'].str.contains('Alchemist', case=False, na=False)]
    if len(popular_books) > 0:
        reference_book = popular_books.iloc[0]
        print(f"Reference book: {reference_book['title']} by {reference_book['author']}")
        
        # Get similar books
        similar_books = recommendation_model.get_similar_books(reference_book['ISBN'], n_similar=10)
        
        if len(similar_books) > 0:
            print(f"\nSimilar books:")
            for i, (_, book) in enumerate(similar_books.iterrows()):
                print(f"  {i+1}. {book['title']} by {book['author']} (Similarity: {book['similarity_score']:.3f}); ISBN {book['ISBN']}")
    
    print("\n" + "="*60 + "\n")

demo_content_based_recommendations(recommendation_model)

=== Content-Based Filtering Demo ===

1. New User Profile: Fantasy Book Lover
----------------------------------------
User's liked books:
  1. The Woman's Comfort Book : A Self-Nurturing Guide for Restoring Balance in Your Life by Jennifer Louden (Rating: 8)
  2. Dragonwings : Golden Mountain Chronicles: 1903 (Golden Mountain Chronicles) by Laurence Yep (Rating: 9)
  3. The Hobbit : The Enchanting Prelude to The Lord of the Rings by J.R.R. TOLKIEN (Rating: 7)

Top 5 Recommendations:
  1. Child of the Owl : Golden Mountain Chronicles: 1965 (Golden Mountain Chronicles) by Laurence Yep (Similarity: 0.623); ISBN 006440336X
  2. The Traitor : Golden Mountain Chronicles: 1885 (Golden Mountain Chronicles) by Laurence Yep (Similarity: 0.608); ISBN 0060275227
  3. Dragon's Gate (Golden Mountain Chronicles, 1867) by Laurence Yep (Similarity: 0.544); ISBN 0064404897
  4. Lord Foul's Bane (The Chronicles of Thomas Covenant the Unbeliever, Book One) by Stephen R. Donaldson (Similarity: 0.522); ISB

In [62]:
# Helper function for easy recommendations
def get_recommendations_for_new_user(recommendation_model, liked_books_isbns, ratings=None, n_recommendations=10):
    """
    Get recommendations for a new user based on their book preferences
    
    Args:
        liked_books_isbns: List of ISBNs of books the user liked
        ratings: Optional list of ratings (1-10 scale)
        n_recommendations: Number of recommendations to return
    
    Returns:
        DataFrame with recommended books
    """
    user_profile = recommendation_model.create_user_profile(liked_books_isbns, ratings)
    if user_profile is not None:
        return recommendation_model.get_recommendations(
            user_profile, 
            n_recommendations=n_recommendations,
            exclude_books=liked_books_isbns
        )
    return pd.DataFrame()

# Example usage for new users
print("=== Example Usage for New Users ===\n")

# Example 1: User who likes mystery books
print("Example 1: Mystery Book Lover")
print("-" * 30)

# Find some mystery books
mystery_books = books_with_features[
    books_with_features['combined_text'].str.contains('mystery|detective|crime|murder', case=False, na=False)
].head(2)

if len(mystery_books) > 0:
    mystery_isbns = mystery_books['ISBN'].tolist()
    mystery_ratings = [8, 9]
    
    print("User's liked mystery books:")
    for i, (_, book) in enumerate(mystery_books.iterrows()):
        print(f"  - {book['title']} by {book['author']} (Rating: {mystery_ratings[i]})")
    
    # Get recommendations
    mystery_recommendations = get_recommendations_for_new_user(
        recommendation_model,
        mystery_isbns, 
        mystery_ratings, 
        n_recommendations=10
    )
    
    print(f"\nRecommendations:")
    for i, (_, book) in enumerate(mystery_recommendations.iterrows()):
        print(f"  {i+1}. {book['title']} by {book['author']} (Score: {book['similarity_score']:.3f})")

print("\n" + "="*50 + "\n")

# Example 2: User who likes romance books
print("Example 2: Romance Book Lover")
print("-" * 30)

# Find some romance books
romance_books = books_with_features[
    books_with_features['combined_text'].str.contains('romance|love|wedding|heart', case=False, na=False)
].head(2)

if len(romance_books) > 0:
    romance_isbns = romance_books['ISBN'].tolist()
    romance_ratings = [7, 8]
    
    print("User's liked romance books:")
    for i, (_, book) in enumerate(romance_books.iterrows()):
        print(f"  - {book['title']} by {book['author']} (Rating: {romance_ratings[i]})")
    
    # Get recommendations
    romance_recommendations = get_recommendations_for_new_user(
        recommendation_model,
        romance_isbns, 
        romance_ratings, 
        n_recommendations=10
    )
    
    print(f"\nRecommendations:")
    for i, (_, book) in enumerate(romance_recommendations.iterrows()):
        print(f"  {i+1}. {book['title']} by {book['author']} (Score: {book['similarity_score']:.3f})")


=== Example Usage for New Users ===

Example 1: Mystery Book Lover
------------------------------
User's liked mystery books:
  - The Murder Book by Jonathan Kellerman (Rating: 8)
  - Dead Man'S Fingers (Wwl Mystery) by Barbara Lee (Rating: 9)

Recommendations:
  1. Murder Takes Two by Bernie Lee (Score: 0.670)
  2. Clear Cut Murder by Lee Wallingford (Score: 0.670)
  3. Murder At Musket Beach by Bernie Lee (Score: 0.670)
  4. Murder Without Reservation by Bernie Lee (Score: 0.670)
  5. The Murder Book by JONATHAN KELLERMAN (Score: 0.664)
  6. Manmade For Murder (Wwl Mystery) by Sabrina Burton (Score: 0.647)
  7. The Hydrogen Murder: A Gloria Lamerino Mystery by Camille Minichino (Score: 0.647)
  8. Murder Flies Left Seat (Wwl Mystery) by Jackie Lewin (Score: 0.647)
  9. Dead On Her Feet (Stella the Stargazer Mystery) by Christine T. Jorgenson (Score: 0.638)
  10. Deadbeat (Wwl Mystery) by Wendi Lee (Score: 0.636)


Example 2: Romance Book Lover
------------------------------
User's li

In [58]:
# Fixed Evaluation of Content-Based Filtering Model

def evaluate_content_based_model_fixed(recommendation_model):
    """
    Fixed evaluation that properly handles the data structure
    """
    print("=== Fixed Content-Based Model Evaluation ===\n")
    
    # Get unique books that are actually in our TF-IDF matrix
    available_isbns = set(books_with_features['ISBN'].unique())
    print(f"Books available in TF-IDF matrix: {len(available_isbns):,}")
    
    # Filter final_rating to only include books that are in our TF-IDF matrix
    filtered_ratings = final_rating[final_rating['ISBN'].isin(available_isbns)]
    print(f"Ratings for available books: {len(filtered_ratings):,}")
    
    # Get users who have rated multiple books in our available set
    user_book_counts = filtered_ratings.groupby('user_id')['ISBN'].nunique()
    valid_users = user_book_counts[user_book_counts >= 5].index  # At least 5 different books
    
    print(f"Users with 5+ different books: {len(valid_users)}")
    
    if len(valid_users) == 0:
        print("No valid users found for evaluation!")
        return []
    
    # Take first 3 users for evaluation
    sample_users = valid_users[:3]
    evaluation_results = []
    
    for user_id in sample_users:
        print(f"\nEvaluating for User {user_id}")
        print("-" * 40)
        
        # Get user's ratings for available books
        user_ratings = filtered_ratings[filtered_ratings['user_id'] == user_id]
        
        # Get unique books this user has rated
        user_books = user_ratings.groupby('ISBN')['rating'].mean().reset_index()
        user_books = user_books.sort_values('rating', ascending=False)
        
        print(f"User has rated {len(user_books)} unique books")
        
        if len(user_books) < 5:
            print("Skipping user with too few unique books")
            continue
        
        # Split into known (70%) and unknown (30%) books
        n_known = max(2, int(len(user_books) * 0.7))
        known_books = user_books.head(n_known)
        unknown_books = user_books.tail(len(user_books) - n_known)
        
        print(f"Known books: {len(known_books)}, Unknown books: {len(unknown_books)}")
        
        # Create user profile from known books
        known_isbns = known_books['ISBN'].tolist()
        known_ratings = known_books['rating'].tolist()
        
        print(f"Creating profile from: {known_isbns[:3]}... (showing first 3)")
        
        user_profile = recommendation_model.create_user_profile(known_isbns, known_ratings)
        
        if user_profile is not None:
            # Get recommendations
            recommendations = recommendation_model.get_recommendations(
                user_profile, 
                n_recommendations=50,  # Get more recommendations
                exclude_books=known_isbns
            )
            
            print(f"Got {len(recommendations)} recommendations")
            
            # Check how many of the unknown books are in top recommendations
            recommended_isbns = set(recommendations['ISBN'].tolist())
            unknown_isbns = set(unknown_books['ISBN'].tolist())
            
            # Calculate metrics
            hits = len(recommended_isbns.intersection(unknown_isbns))
            precision = hits / len(recommendations) if len(recommendations) > 0 else 0
            recall = hits / len(unknown_books) if len(unknown_books) > 0 else 0
            f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
            
            evaluation_results.append({
                'user_id': user_id,
                'known_books': len(known_books),
                'unknown_books': len(unknown_books),
                'hits': hits,
                'precision': precision,
                'recall': recall,
                'f1_score': f1_score
            })
            
            print(f"Precision: {precision:.3f}")
            print(f"Recall: {recall:.3f}")
            print(f"F1-Score: {f1_score:.3f}")
            print(f"Hits: {hits}/{len(unknown_books)}")
            
            # Show some actual recommendations
            print(f"\nTop 5 recommendations:")
            for i, (_, book) in enumerate(recommendations.head(5).iterrows()):
                print(f"  {i+1}. {book['title'][:50]}... (Score: {book['similarity_score']:.3f})")
        else:
            print("Failed to create user profile")
        
        print()
    
    # Calculate average metrics
    if evaluation_results:
        avg_precision = np.mean([r['precision'] for r in evaluation_results])
        avg_recall = np.mean([r['recall'] for r in evaluation_results])
        avg_f1 = np.mean([r['f1_score'] for r in evaluation_results])
        
        print("=== Overall Evaluation Results ===")
        print(f"Average Precision: {avg_precision:.3f}")
        print(f"Average Recall: {avg_recall:.3f}")
        print(f"Average F1-Score: {avg_f1:.3f}")
        print(f"Number of users evaluated: {len(evaluation_results)}")
    else:
        print("No evaluation results - check data availability")
    
    return evaluation_results

evaluation_results = evaluate_content_based_model_fixed(recommendation_model)

=== Fixed Content-Based Model Evaluation ===

Books available in TF-IDF matrix: 177,518
Ratings for available books: 483,423
Users with 5+ different books: 899

Evaluating for User 254
----------------------------------------
User has rated 293 unique books
Known books: 205, Unknown books: 88
Creating profile from: ['0380789035', '1931081727', '0590353403']... (showing first 3)
Got 50 recommendations
Precision: 0.040
Recall: 0.023
F1-Score: 0.029
Hits: 2/88

Top 5 recommendations:
  1. The Kindly Ones (Sandman, Book 9)... (Score: 0.516)
  2. Fables and Reflections (Sandman, Book 6)... (Score: 0.516)
  3. Preludes and Nocturnes (Sandman, Book 1)... (Score: 0.516)
  4. Preludes and Nocturnes (Sandman, Book 1)... (Score: 0.516)
  5. The Doll's House (Sandman, Book 2)... (Score: 0.481)


Evaluating for User 2276
----------------------------------------
User has rated 456 unique books
Known books: 319, Unknown books: 137
Creating profile from: ['1562059513', '1893224902', '0886771897']... (

In [59]:
# Quick Test: Verify the model is working
def test_model_basic(recommendation_model):
    """
    Basic test to verify the content-based model is working
    """
    print("=== Basic Model Test ===\n")
    
    # Get some books from our dataset
    test_books = books_with_features.head(3)
    test_isbns = test_books['ISBN'].tolist()
    test_ratings = [8, 7, 9]
    
    print("Test user profile:")
    for i, (_, book) in enumerate(test_books.iterrows()):
        print(f"  {i+1}. {book['title'][:60]}... (Rating: {test_ratings[i]})")
    
    # Create user profile
    user_profile = recommendation_model.create_user_profile(test_isbns, test_ratings)
    
    if user_profile is not None:
        print(f"\nUser profile created successfully (shape: {user_profile.shape})")
        
        # Get recommendations
        recommendations = recommendation_model.get_recommendations(
            user_profile, 
            n_recommendations=5, 
            exclude_books=test_isbns
        )
        
        print(f"\nGot {len(recommendations)} recommendations:")
        for i, (_, book) in enumerate(recommendations.iterrows()):
            print(f"  {i+1}. {book['title'][:60]}... (Score: {book['similarity_score']:.3f})")
        
        # Test book-to-book similarity
        print(f"\nTesting book-to-book similarity:")
        similar_books = recommendation_model.get_similar_books(test_isbns[0], n_similar=3)
        print(f"Books similar to '{test_books.iloc[0]['title'][:50]}...':")
        for i, (_, book) in enumerate(similar_books.iterrows()):
            print(f"  {i+1}. {book['title'][:60]}... (Score: {book['similarity_score']:.3f})")
        
        return True
    else:
        print("Failed to create user profile!")
        return False

# Run basic test
test_model_basic(recommendation_model)


=== Basic Model Test ===

Test user profile:
  1. Politically Correct Bedtime Stories: Modern Tales for Our Li... (Rating: 8)
  2. Vegetarian Times Complete Cookbook... (Rating: 7)
  3. Pioneers... (Rating: 9)

User profile created successfully (shape: (1000,))

Got 5 recommendations:
  1. Politically Correct Bedtime Stories: Modern Tales for Our Li... (Score: 0.825)
  2. Upgrade &amp; Maintain Your PC... (Score: 0.749)
  3. A Time: The Busy Managers Action Plan for Effective Self Man... (Score: 0.730)
  4. International Economics... (Score: 0.715)
  5. Leadership Challenge: How to Get Extraordinary Things Done i... (Score: 0.711)

Testing book-to-book similarity:
Books similar to 'Politically Correct Bedtime Stories: Modern Tales ...':
  1. Politically Correct Bedtime Stories: Modern Tales for Our Li... (Score: 1.000)
  2. Upgrade &amp; Maintain Your PC... (Score: 0.783)
  3. Politically Correct Holiday Stories: For an Enlightened Yule... (Score: 0.769)


True

In [64]:
for recommendation_model in [content_recommender, gpu_recommender, ann_recommender]:
    demo_content_based_recommendations(recommendation_model)

=== Content-Based Filtering Demo ===

1. New User Profile: Fantasy Book Lover
----------------------------------------
User's liked books:
  1. The Woman's Comfort Book : A Self-Nurturing Guide for Restoring Balance in Your Life by Jennifer Louden (Rating: 8)
  2. Dragonwings : Golden Mountain Chronicles: 1903 (Golden Mountain Chronicles) by Laurence Yep (Rating: 9)
  3. The Hobbit : The Enchanting Prelude to The Lord of the Rings by J.R.R. TOLKIEN (Rating: 7)

Top 5 Recommendations:
  1. Child of the Owl : Golden Mountain Chronicles: 1965 (Golden Mountain Chronicles) by Laurence Yep (Similarity: 0.623)
  2. The Traitor : Golden Mountain Chronicles: 1885 (Golden Mountain Chronicles) by Laurence Yep (Similarity: 0.608)
  3. Dragon's Gate (Golden Mountain Chronicles, 1867) by Laurence Yep (Similarity: 0.544)
  4. Lord Foul's Bane (The Chronicles of Thomas Covenant the Unbeliever, Book 1) by Stephen R. Donaldson (Similarity: 0.522)
  5. Lord Foul's Bane (The Chronicles of Thomas Covenant 