In [1]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [2]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

--2024-09-01 20:08:41--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.2.33, 104.26.3.33, 172.67.70.149, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.2.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip’


2024-09-01 20:08:42 (73.1 MB/s) - ‘book-crossings.zip’ saved [26085508/26085508]

Archive:  book-crossings.zip
  inflating: BX-Book-Ratings.csv     
  inflating: BX-Books.csv            
  inflating: BX-Users.csv            


In [3]:
# import csv data into dataframes
df_books = pd.read_csv(
    books_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'})

df_ratings = pd.read_csv(
    ratings_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})

In [4]:
# add your code here - consider creating a new cell for each section of code
df_books.head()

df_users = pd.read_csv('BX-Users.csv',
                        encoding="ISO-8859-1",
                        sep=';',
                        header=0,
                        names=['user', 'location', 'age'],
                        usecols=['user', 'location', 'age'],
                       dtype={'user': 'Int32', 'location': 'str', 'age': 'Int32'})
df_users.head()

Unnamed: 0,user,location,age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [5]:
# Merge ratings with books
df_merged = pd.merge(df_ratings, df_books, on='isbn')

# Merge the result with users
df_combined = pd.merge(df_merged, df_users, on='user')

# Now df_combined contains all the information from books, ratings, and users
print(df_combined.head())
print(df_combined.columns)

     user        isbn  rating  \
0  276725  034545104X     0.0   
1    2313  034545104X     5.0   
2    2313  0812533550     9.0   
3    2313  0679745580     8.0   
4    2313  0060173289     9.0   

                                              title            author  \
0                              Flesh Tones: A Novel        M. J. Rose   
1                              Flesh Tones: A Novel        M. J. Rose   
2     Ender's Game (Ender Wiggins Saga (Paperback))  Orson Scott Card   
3             In Cold Blood (Vintage International)     TRUMAN CAPOTE   
4  Divine Secrets of the Ya-Ya Sisterhood : A Novel     Rebecca Wells   

                location   age  
0      tyler, texas, usa  <NA>  
1  cincinnati, ohio, usa    23  
2  cincinnati, ohio, usa    23  
3  cincinnati, ohio, usa    23  
4  cincinnati, ohio, usa    23  
Index(['user', 'isbn', 'rating', 'title', 'author', 'location', 'age'], dtype='object')


In [6]:
# Count ratings for each user and book
user_ratings_count = df_combined.groupby('user')['rating'].count()
book_ratings_count = df_combined.groupby('isbn')['rating'].count()

# Filter users with at least 200 ratings
users_to_keep = user_ratings_count[user_ratings_count >= 200].index

# Filter books with at least 100 ratings
books_to_keep = book_ratings_count[book_ratings_count >= 100].index

# Apply filters to the combined dataframe
df_filtered = df_combined[
    (df_combined['user'].isin(users_to_keep)) &
    (df_combined['isbn'].isin(books_to_keep))
]

print(f"Original shape: {df_combined.shape}")
print(f"Filtered shape: {df_filtered.shape}")

Original shape: (1031175, 7)
Filtered shape: (48358, 7)


In [7]:
# Create pivot table
pivot_table = df_filtered.pivot_table(
    index='user', columns='title', values='rating', fill_value=0)

# Fill NaN values with 0
pivot_table = pivot_table.fillna(0)

print(f"Pivot table shape: {pivot_table.shape}")
print(pivot_table.head())

Pivot table shape: (812, 673)
title  1984  1st to Die: A Novel  2nd Chance  4 Blondes  \
user                                                      
254     9.0                  0.0         0.0        0.0   
2276    0.0                  0.0        10.0        0.0   
2766    0.0                  0.0         0.0        0.0   
2977    0.0                  0.0         0.0        0.0   
3363    0.0                  0.0         0.0        0.0   

title  A Beautiful Mind: The Life of Mathematical Genius and Nobel Laureate John Nash  \
user                                                                                    
254                                                  0.0                                
2276                                                 0.0                                
2766                                                 0.0                                
2977                                                 0.0                                
3363                  

In [8]:
# Convert pivot table to sparse matrix
sparse_matrix = csr_matrix(pivot_table.values)

# Get book titles (features)
book_titles = pivot_table.columns

print(f"Sparse matrix shape: {sparse_matrix.shape}")
print(f"Number of book titles: {len(book_titles)}")

Sparse matrix shape: (812, 673)
Number of book titles: 673


In [9]:
# Initialize the NearestNeighbors model
model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=10)

# Fit the model with the sparse matrix
model.fit(sparse_matrix)

print("NearestNeighbors model has been initialized and fitted.")
# Verify book titles
print("Sample book titles:")
print(book_titles[:5])

print(f"Pivot table shape: {pivot_table.shape}")
print(f"Sparse matrix shape: {sparse_matrix.shape}")
print(f"Number of book titles: {len(book_titles)}")

NearestNeighbors model has been initialized and fitted.
Sample book titles:
Index(['1984', '1st to Die: A Novel', '2nd Chance', '4 Blondes',
       'A Beautiful Mind: The Life of Mathematical Genius and Nobel Laureate John Nash'],
      dtype='object', name='title')
Pivot table shape: (812, 673)
Sparse matrix shape: (812, 673)
Number of book titles: 673


In [10]:
# function to return recommended books - this will be tested
def get_recommends(book=""):
    if book not in book_titles:
        return f"The book '{book}' is not in the dataset."

    book_index = np.where(book_titles == book)[0][0]
    distances, indices = model.kneighbors(
        sparse_matrix[book_index].reshape(1, -1), n_neighbors=20)

    recommended_books = []
    for idx, dist in zip(indices[0][1:], distances[0][1:]):
        if idx < len(book_titles):
            recommended_books.append([book_titles[idx], float(dist)])
        if len(recommended_books) == 5:
            break

    return [book, recommended_books]
    # return recommended_books

In [11]:
# books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")

books = get_recommends(
    "The Queen of the Damned (Vampire Chronicles (Paperback))")
print(books)

['The Queen of the Damned (Vampire Chronicles (Paperback))', [['The Perks of Being a Wallflower', 1.0], ['The Phantom Tollbooth', 1.0], ['The Pillars of the Earth', 1.0], ['The Perfect Husband', 1.0], ['The Perfect Storm : A True Story of Men Against the Sea', 1.0]]]


In [12]:
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  test_pass = True
  recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
  if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
    test_pass = False
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
  for i in range(2):
    if recommends[1][i][0] not in recommended_books:
      test_pass = False
    if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
      test_pass = False
  if test_pass:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

test_book_recommendation()

["Where the Heart Is (Oprah's Book Club (Paperback))", [['Misery', 0.6239112615585327], ['The Gunslinger (The Dark Tower, Book 1)', 0.645896315574646], ['Degree of Guilt', 0.6845024228096008], ['The Girl Who Loved Tom Gordon', 0.6918960809707642], ['Interview with the Vampire', 0.7340651154518127]]]
You haven't passed yet. Keep trying!
