In [1]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt


In [2]:
# get data files
#!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

#!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'


In [3]:
# import csv data into dataframes
df_books = pd.read_csv(
    books_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'})

df_ratings = pd.read_csv(
    ratings_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})

del books_filename, ratings_filename


In [4]:
# Create a df that contains only the relevant information
data = pd.merge(df_ratings,df_books,how='left', on='isbn')[['user','isbn','rating']]

# Compute users count
count_users = data['user'].value_counts().reset_index()
count_users.columns = ['user', 'count_users']

# Compute books count
count_books = data['isbn'].value_counts().reset_index()
count_books.columns = ['isbn', 'count_books']

# Join data with counts
data = data.merge(count_users, how='left', on='user')
data = data.merge(count_books, how='left', on='isbn')

# Data Cleaning: remove from the dataset users with less than 200 ratings...
data = data[data['count_users'] >= 200]

# ... and books with less than 100 ratings.
data = data[data['count_books'] >= 100]

# Keep only 'user', 'isbn' and 'rating' columns
data = data[['user', 'isbn', 'rating']].reset_index(drop=True)

data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49781 entries, 0 to 49780
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   user    49781 non-null  int32  
 1   isbn    49781 non-null  object 
 2   rating  49781 non-null  float32
dtypes: float32(1), int32(1), object(1)
memory usage: 778.0+ KB


In [5]:
# Convert table to a 2D matrix, and fill the missing values with zeros, since ...
# ... we will calculate distances between rating vectors
data_wide = data.pivot(index="isbn",columns="user",values="rating").fillna(0)

#  Transform the values (ratings) of the matrix dataframe into ...
# ... a scipy sparse matrix for more efficient calculations.
data_csr_matrix = csr_matrix(data_wide.values)


In [6]:
# Train model and make recommendations
model_knn = NearestNeighbors(metric="cosine",algorithm="brute")
model_knn.fit(data_csr_matrix)


In [7]:
# function to return recommended books - this will be tested
def get_recommends(book = ""):
  # Get isbn from the title
  book_isbn = df_books[df_books['title'] == book]['isbn'].iloc[0]
  # Create a DataFrame for queries
  data_for_queries = pd.DataFrame(data_wide.copy(deep=True).reset_index()['isbn']).reset_index()
  # Create an empty list for recommended books (output file)
  recommended_books = []
  recommended_books.append(book)
  # Compute distances and indices of 5 similar books
  distance, n_index = model_knn.kneighbors(data_wide[data_wide.index == book_isbn].values.reshape(1,-1),n_neighbors=6)
  distance = distance.flatten()
  n_index = n_index.flatten()
  # Create an empty list to collect recommendations during the loop
  recommendations_all = []
  # Loop through arrays to append results to the desired output
  for i in range(1, 6):
    isbn_query = data_for_queries[data_for_queries['index'] == n_index[i]]['isbn'].iloc[0]
    title_query = df_books[df_books['isbn'] == isbn_query]['title'].iloc[0]
    # Appent to the temporal list
    list_to_append = [title_query, distance[i]]
    # Append to recommendations to the list for all recommendatios
    recommendations_all.append(list_to_append)
  
  # Append these to the output list
  recommended_books.append(recommendations_all)

  return recommended_books

get_recommends(book="Where the Heart Is (Oprah's Book Club (Paperback))")


["Where the Heart Is (Oprah's Book Club (Paperback))",
 [['The Lovely Bones: A Novel', 0.7234864],
  ['I Know This Much Is True', 0.7677075],
  ['The Surgeon', 0.7699411],
  ['The Weight of Water', 0.77085835],
  ["I'll Be Seeing You", 0.8016211]]]

In [8]:
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  test_pass = True
  recommends = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
  if recommends[0] != "Where the Heart Is (Oprah's Book Club (Paperback))":
    test_pass = False
  recommended_books = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  recommended_books_dist = [0.8, 0.77, 0.77, 0.77]
  for i in range(2): 
    if recommends[1][i][0] not in recommended_books:
      test_pass = False
    if abs(recommends[1][i][1] - recommended_books_dist[i]) >= 0.05:
      test_pass = False
  if test_pass:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

test_book_recommendation()


["Where the Heart Is (Oprah's Book Club (Paperback))", [['The Lovely Bones: A Novel', 0.7234864], ['I Know This Much Is True', 0.7677075], ['The Surgeon', 0.7699411], ['The Weight of Water', 0.77085835], ["I'll Be Seeing You", 0.8016211]]]
You haven't passed yet. Keep trying!
