In [18]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [2]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

--2025-02-26 14:21:44--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 104.26.2.33, 172.67.70.149, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip’


2025-02-26 14:21:44 (76.5 MB/s) - ‘book-crossings.zip’ saved [26085508/26085508]

Archive:  book-crossings.zip
  inflating: BX-Book-Ratings.csv     
  inflating: BX-Books.csv            
  inflating: BX-Users.csv            


In [3]:
# import csv data into dataframes
df_books = pd.read_csv(
    books_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'})

df_ratings = pd.read_csv(
    ratings_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})

In [6]:
# Clean data - remove users whith less than 200 ratings and books with less than 100
df_filtered_ratings = df_ratings[df_ratings.groupby('user')['rating'].transform('count') >= 200]
df_filtered_ratings = df_filtered_ratings[df_ratings.groupby('isbn')['rating'].transform('count') >= 100]
df_filtered_ratings




  df_filtered_ratings = df_filtered_ratings[df_ratings.groupby('isbn')['rating'].transform('count') >= 100]


Unnamed: 0,user,isbn,rating
1456,277427,002542730X,10.0
1469,277427,0060930535,0.0
1471,277427,0060934417,0.0
1474,277427,0061009059,9.0
1484,277427,0140067477,0.0
...,...,...,...
1147304,275970,0804111359,0.0
1147436,275970,140003065X,0.0
1147439,275970,1400031346,0.0
1147440,275970,1400031354,0.0


In [8]:
# Create new dataframe
rating_count = df_filtered_ratings.groupby('isbn')['rating'].count()
rating_avg = df_filtered_ratings.groupby('isbn')['rating'].mean()
new_ratings_df = pd.DataFrame({'ratings_count':rating_count, 'rating_avg': rating_avg})
df = df_books.merge(new_ratings_df, on='isbn')
df

Unnamed: 0,isbn,title,author,ratings_count,rating_avg
0,0440234743,The Testament,John Grisham,124,1.435484
1,0452264464,Beloved (Plume Contemporary Fiction),Toni Morrison,53,2.490566
2,0971880107,Wild Animus,Rich Shapero,365,0.435616
3,0345402871,Airframe,Michael Crichton,61,1.393443
4,0345417623,Timeline,MICHAEL CRICHTON,99,2.090909
...,...,...,...,...,...
722,0425178765,Easy Prey,John Sandford,47,1.808511
723,0449223604,M Is for Malice,Sue Grafton,72,2.263889
724,0345444884,The Talisman,STEPHEN KING,39,3.641026
725,0060008032,Angels,Marian Keyes,49,1.367347


In [54]:
# Drop duplicates
df.drop_duplicates('title')
df_filtered_ratings.drop_duplicates('isbn')

Unnamed: 0,user,isbn,rating
1456,277427,002542730X,10.0
1469,277427,0060930535,0.0
1471,277427,0060934417,0.0
1474,277427,0061009059,9.0
1484,277427,0140067477,0.0
...,...,...,...
72328,15408,0671867156,0.0
72408,15408,0743457358,0.0
74774,16106,1878424319,0.0
79746,16795,0553292722,0.0


In [58]:
# Prepare features
user_book_matrix = df_filtered_ratings.pivot(index='isbn', columns='user', values='rating').fillna(0)
user_book_matrix

# Scale the features to avoid biasing KNN due to different scales
# scaler = StandardScaler()
# x_scaled = scaler.fit_transform(x)
# print(x_scaled)



user,254,2276,2766,2977,3363,4017,4385,6242,6251,6323,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
002542730X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0
0060008032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0060096195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
006016848X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0060173289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1573227331,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1573229326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1573229571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1592400876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [59]:
knn = NearestNeighbors(n_neighbors=5, metric='euclidean')
knn.fit(user_book_matrix)

ValueError: X has 2 features, but NearestNeighbors is expecting 888 features as input.

In [53]:
# function to return recommended books - this will be tested
def get_recommends(book_title):
  book_index = df[df['title'] == book_title].index

  distances, indices = knn.kneighbors(x_scaled[book_index])

  # Get book titles
  recommended_books = df.iloc[indices[0]]['title']


  return recommended_books.tolist()

get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")

["Where the Heart Is (Oprah's Book Club (Paperback))",
 'Interview with the Vampire',
 'Angels &amp; Demons',
 'Jurassic Park',
 "The Pilot's Wife : A Novel"]

In [34]:
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  test_pass = True
  recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
  if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
    test_pass = False
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
  for i in range(2):
    if recommends[1][i][0] not in recommended_books:
      test_pass = False
    if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
      test_pass = False
  if test_pass:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

test_book_recommendation()

['Interview with the Vampire', 'Angels &amp; Demons', 'Jurassic Park', "The Pilot's Wife : A Novel"]


IndexError: string index out of range