# In [13]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors


# Step 1. Load the three Book-Crossing tables (books, users, ratings) directly
# from the raw CSV files hosted on GitHub.

books_url = "https://raw.githubusercontent.com/ashwanidv100/Recommendation-System---Book-Crossing-Dataset/master/BX-CSV-Dump/BX-Books.csv"
users_url = "https://raw.githubusercontent.com/ashwanidv100/Recommendation-System---Book-Crossing-Dataset/master/BX-CSV-Dump/BX-Users.csv"
ratings_url = "https://raw.githubusercontent.com/ashwanidv100/Recommendation-System---Book-Crossing-Dataset/master/BX-CSV-Dump/BX-Book-Ratings.csv"

# Every file is parsed with the same options: semicolon-separated, Latin-1
# encoded, and malformed rows are skipped instead of aborting the load.
csv_options = {
    'sep': ';',
    'encoding': 'latin-1',
    'on_bad_lines': 'skip',
    'low_memory': False,
}
books, users, ratings = (
    pd.read_csv(url, **csv_options)
    for url in (books_url, users_url, ratings_url)
)

# Set the column labels explicitly so later steps can rely on these exact
# names (they are expected to match the headers already present in the files).
users.columns = ['User-ID', 'Location', 'Age']
ratings.columns = ['User-ID', 'ISBN', 'Book-Rating']
books.columns = [
    'ISBN',
    'Book-Title',
    'Book-Author',
    'Year-Of-Publication',
    'Publisher',
    'Image-URL-S',
    'Image-URL-M',
    'Image-URL-L',
]

# # Filter users with at least 50 ratings
# user_counts = ratings['User-ID'].value_counts()
# filtered_ratings = ratings[ratings['User-ID'].isin(user_counts[user_counts >= 50].index)]


# Step 2: Clean the data
# Keep only strictly positive ratings; a rating of 0 is treated as
# "no rating given".
ratings = ratings[ratings['Book-Rating'] > 0]

# Discard books that lack an author or a publisher.
books = books.dropna(subset=['Book-Author', 'Publisher'])

# Step 3: Build an ISBN -> Book-Title lookup from the cleaned books table
isbn_to_title = books.set_index('ISBN')['Book-Title'].to_dict()

# Step 4: Filter books and users to reduce the matrix size
# Keep only books that received at least 20 ratings.
book_counts = ratings['ISBN'].value_counts()
ratings = ratings[ratings['ISBN'].isin(book_counts[book_counts >= 20].index)]

# Keep only users with at least 10 ratings. The counts are taken AFTER the
# book filter above, so they only cover the books that survived it.
user_counts = ratings['User-ID'].value_counts()
ratings = ratings[ratings['User-ID'].isin(user_counts[user_counts >= 10].index)]

# Step 5: Translate ISBNs to titles via the dictionary (no merge needed)
# `assign` returns a new frame instead of writing a column into the filtered
# selection above, which avoids pandas' SettingWithCopyWarning.
ratings = ratings.assign(**{'Book-Title': ratings['ISBN'].map(isbn_to_title)})

# ISBNs absent from the lookup map to NaN; drop those rows.
ratings = ratings.dropna(subset=['Book-Title'])

# Step 6: Collapse duplicate (Book-Title, User-ID) pairs by averaging
# After the ISBN -> title mapping one user can hold several ratings for the
# same title (e.g. several ISBNs sharing one title); keep their mean.
ratings = ratings.groupby(['Book-Title', 'User-ID'])['Book-Rating'].mean().reset_index()

# Step 7: Manually create the book x user rating matrix
# Row order follows `unique_books`, column order follows `unique_users`.
unique_books = list(ratings['Book-Title'].unique())
unique_users = list(ratings['User-ID'].unique())

# Column position of every user, precomputed once so that filling the matrix
# does not need a linear `list.index` scan for each rating row.
user_positions = {user: position for position, user in enumerate(unique_users)}

# One zero-filled row per book; 0 means "this user did not rate this book".
user_item_matrix = {book: [0] * len(unique_users) for book in unique_books}

# Write every rating into its (book, user) cell.
for book, user, rating in zip(
    ratings['Book-Title'], ratings['User-ID'], ratings['Book-Rating']
):
    user_item_matrix[book][user_positions[user]] = rating

# Stack the per-book rows into a 2-D array (rows: books, columns: users).
# Dict insertion order keeps the rows aligned with `unique_books`.
book_user_matrix = np.array(list(user_item_matrix.values()))

# Step 8: Fit a brute-force nearest-neighbour model over the book rows,
# comparing books by the cosine distance between their rating vectors.
model = NearestNeighbors(algorithm='brute', metric='cosine').fit(book_user_matrix)

# Step 9: Create the get_recommends() Function
def get_recommends(book_title, n_recommendations=5):
    """Return the books whose rating vectors are closest to *book_title*.

    Args:
        book_title: Title to look up in ``unique_books``.
        n_recommendations: Maximum number of similar books to return
            (defaults to 5).

    Returns:
        ``[book_title, [[similar_title, distance], ...]]`` with the similar
        books ordered by increasing cosine distance, or a "not found" message
        string when *book_title* is not in ``unique_books``.
    """
    # Single lookup: `list.index` both checks membership and yields the row.
    try:
        book_idx = unique_books.index(book_title)
    except ValueError:
        return f"Book '{book_title}' not found in dataset."

    # Ask for one extra neighbour because the query book is normally its own
    # nearest neighbour; never ask for more neighbours than there are books.
    n_neighbors = min(n_recommendations + 1, len(unique_books))
    distances, indices = model.kneighbors(
        book_user_matrix[book_idx].reshape(1, -1), n_neighbors=n_neighbors
    )

    # Drop the query book by its index rather than by its position in the
    # result: when another book has an identical rating vector (distance 0)
    # the query book is not guaranteed to come first.
    recommendations = [
        [unique_books[neighbor_idx], distance]
        for neighbor_idx, distance in zip(indices.flatten(), distances.flatten())
        if neighbor_idx != book_idx
    ][:n_recommendations]

    # Order recommendations by distance, ascending.
    recommendations.sort(key=lambda pair: pair[1])

    return [book_title, recommendations]

# Smoke-test the recommender with the originally requested title
query_title = "The Queen of the Damned (Vampire Chronicles (Paperback))"
get_recommends(query_title)


['The Queen of the Damned (Vampire Chronicles (Paperback))',
 [['The Tale of the Body Thief (Vampire Chronicles (Paperback))',
   0.44809595393604096],
  ['The Vampire Lestat (Vampire Chronicles, Book II)', 0.48877937652046544],
  ['Interview with the Vampire', 0.610596672563833],
  ['Memnoch the Devil (Vampire Chronicles, No 5)', 0.6718862952175826],
  ['Feast of All Saints', 0.7491947111915684]]]