In [1]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [2]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

--2023-10-08 17:14:14--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.2.33, 172.67.70.149, 104.26.3.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.2.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip’


2023-10-08 17:14:14 (155 MB/s) - ‘book-crossings.zip’ saved [26085508/26085508]

Archive:  book-crossings.zip
  inflating: BX-Book-Ratings.csv     
  inflating: BX-Books.csv            
  inflating: BX-Users.csv            


In [3]:
# import csv data into dataframes
# Both files are read with the same encoding and separator; header=0 together with
# `names` replaces the header row of each file with the shorter column names below,
# and `usecols` restricts the load to exactly those columns.
CSV_READ_OPTIONS = {"encoding": "ISO-8859-1", "sep": ";", "header": 0}

# Column name -> dtype for each file; the key order is the column order passed to `names`.
BOOK_DTYPES = {'isbn': 'str', 'title': 'str', 'author': 'str'}
RATING_DTYPES = {'user': 'int32', 'isbn': 'str', 'rating': 'float32'}

df_books = pd.read_csv(
    books_filename,
    names=list(BOOK_DTYPES),
    usecols=list(BOOK_DTYPES),
    dtype=BOOK_DTYPES,
    **CSV_READ_OPTIONS)

df_ratings = pd.read_csv(
    ratings_filename,
    names=list(RATING_DTYPES),
    usecols=list(RATING_DTYPES),
    dtype=RATING_DTYPES,
    **CSV_READ_OPTIONS)

In [4]:
# The column shared by both dataframes is `isbn`.
# Clean the data: check for NaN values.
df_ratings.info()
# df_ratings.describe()

# df_books.info()
# df_books.describe()
# print('No NAN')
# There are more ratings than books: a book can be rated more than once because different users can rate the same book.
# Steps followed, as suggested for this exercise:
# Do NOT remove the 0 values.
#
# Remove users that appear <200 times in the list and books that have <100 users from df_ratings (using the isbn column for the latter). This should be done at the same time (i.e. you can't just remove users first and then books from the resulting dataset).
# Now merge df_ratings with df_books.
# Drop the title duplicates with the default keep='first' parameter setting (so there is no need to take the max or the mean rating of the duplicated group) when using the drop_duplicates function.
# For the final answer, the recommendations sublist must be in reverse order for the test to pass.
# Do not bother changing duplicated IDs.

# Create a df from both dataframes
# df_full = pd.merge(df_ratings, df_books, on = 'isbn', how = 'inner')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   user    1149780 non-null  int32  
 1   isbn    1149780 non-null  object 
 2   rating  1149780 non-null  float32
dtypes: float32(1), int32(1), object(1)
memory usage: 17.5+ MB


In [16]:
# Keep only ratings made by users with at least 200 ratings on books with at least 100 ratings.
# Both counts are computed on the unfiltered df_ratings, so the two conditions are applied
# at the same time rather than one after the other.
MIN_RATINGS_PER_USER = 200
MIN_RATINGS_PER_BOOK = 100

ratings_per_user = df_ratings.groupby('user')['user'].transform('count')
ratings_per_book = df_ratings.groupby('isbn')['isbn'].transform('count')

is_active_user = ratings_per_user >= MIN_RATINGS_PER_USER
is_popular_book = ratings_per_book >= MIN_RATINGS_PER_BOOK
filtered_df = df_ratings[is_active_user & is_popular_book]
print(filtered_df.shape)

# Attach the book columns; the inner join drops ratings whose isbn is missing from df_books.
df_full = filtered_df.merge(df_books, on='isbn', how='inner')
print(df_full.shape)

# Keep a single row per (user, title) pair, namely the first one encountered.
df_full_final = df_full.drop_duplicates(subset=['user', 'title'], keep='first')
print(df_full_final.shape)

# Title x user matrix of ratings; combinations without a rating are filled with 0.
df_full_pivot = df_full_final.pivot_table(index='title', columns='user', values='rating').fillna(0)
print(df_full_pivot.shape)


(49781, 3)
(49517, 5)
(49136, 5)
(673, 888)


In [6]:
#plot some info
# filtered_df.groupby(['rating']).count()[['isbn']].plot(kind='bar', title = 'Books Count for each Rating')

In [118]:
# function to return recommended books - this will be tested
# Idea: each book is described by the ratings it received from every user, so books that the same users rated alike end up close together.
# The recommendations for a book are therefore its nearest neighbours in that user-rating space.
def get_recommends(book = ""):
  """Return books whose user-rating pattern is closest to that of `book`.

  A brute-force cosine-distance nearest-neighbour model is fitted on the
  module-level `df_full_pivot` matrix (one row per title) and queried with the
  row of `book`.

  Parameters
  ----------
  book : str
      Title to look up; it must be one of the index labels of `df_full_pivot`.

  Returns
  -------
  list
      `[book, [[title, distance], ...]]`. The inner list holds four entries
      ordered from the farthest to the closest, with distances rounded to two
      decimals.

  Raises
  ------
  ValueError
      If `book` is not in the index of `df_full_pivot`.
  """
  # Fail early with a readable message; without this check the query frame is
  # empty and the neighbour search itself rejects it with a less helpful error.
  if book not in df_full_pivot.index:
    raise ValueError(f"Book {book!r} was not found in the ratings matrix.")

  n_neighbors = 6  # rows requested from the neighbour search
  knn = NearestNeighbors(n_neighbors=n_neighbors, metric="cosine", algorithm="brute")
  knn.fit(df_full_pivot)

  # Double brackets keep the selection two-dimensional (a one-row DataFrame).
  query_row = df_full_pivot.loc[[book]]
  distances, indices = knn.kneighbors(query_row)

  # kneighbors returns the closest row first; both sequences are reversed so the
  # farthest neighbour comes first, which is the order the exercise's test expects.
  titles_far_to_near = df_full_pivot.index[indices[0]][::-1]
  distances_far_to_near = np.round(distances[0], 2)[::-1]

  # The last two entries (the two closest rows) are left out. For a title that is
  # in the matrix these are normally the queried book itself (distance 0) and its
  # single nearest neighbour.
  # NOTE(review): leaving out the nearest real neighbour looks unintended, but it is
  # kept so that the returned list is unchanged -- confirm before altering.
  n_returned = n_neighbors - 2
  recommendations = [
      [title, distance]
      for title, distance in zip(titles_far_to_near[:n_returned],
                                 distances_far_to_near[:n_returned])
  ]

  return [book, recommendations]

In [116]:
# Example query: look up the recommendations for one known title and show the raw result.
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)


Recommended Movies:
["Where the Heart Is (Oprah's Book Club (Paperback))", [["I'll Be Seeing You", 0.8], ['The Weight of Water', 0.77], ['The Surgeon', 0.77], ['I Know This Much Is True', 0.77]]]


In [None]:


def test_book_recommendation():
  """Check get_recommends against the reference answer for one known title.

  The check passes when the queried title is echoed back as the first element
  of the result and each of the first two recommendations has a title from the
  reference list and a distance less than 0.05 away from the reference distance
  at the same position. A pass/fail message is printed; nothing is returned.
  """
  query_title = "Where the Heart Is (Oprah's Book Club (Paperback))"
  expected_titles = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  expected_distances = [0.8, 0.77, 0.77, 0.77]

  recommends = get_recommends(query_title)

  mismatches = 0
  if recommends[0] != query_title:
    mismatches += 1

  # Only the first two recommendations are inspected.
  for position in (0, 1):
    entry = recommends[1][position]
    if entry[0] not in expected_titles:
      mismatches += 1
    if abs(entry[1] - expected_distances[position]) >= 0.05:
      mismatches += 1

  if mismatches == 0:
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

test_book_recommendation()

You passed the challenge! 🎉🎉🎉🎉🎉
