<a href="https://colab.research.google.com/github/costa-developer/machin-learning/blob/main/fcc_book_recommendation_knn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# import libraries (you may add additional imports but you may not have to)
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt


In [14]:
# get data files
!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'


--2025-08-24 12:23:29--  https://cdn.freecodecamp.org/project-data/books/book-crossings.zip
Resolving cdn.freecodecamp.org (cdn.freecodecamp.org)... 104.26.3.33, 172.67.70.149, 104.26.2.33, ...
Connecting to cdn.freecodecamp.org (cdn.freecodecamp.org)|104.26.3.33|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 26085508 (25M) [application/zip]
Saving to: ‘book-crossings.zip.1’


2025-08-24 12:23:29 (244 MB/s) - ‘book-crossings.zip.1’ saved [26085508/26085508]

Archive:  book-crossings.zip
replace BX-Book-Ratings.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: tendai
error:  invalid response [tendai]
replace BX-Book-Ratings.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [None]:
# Load the two semicolon-separated, ISO-8859-1 encoded CSV files.
# Only the listed columns are read; header=0 together with `names` replaces the
# header row found in the file with the short column names used in this notebook.
book_columns = ['isbn', 'title', 'author']
rating_columns = ['user', 'isbn', 'rating']

# Book catalogue: every kept column is read as a string.
df_books = pd.read_csv(
    books_filename,
    sep=";",
    encoding="ISO-8859-1",
    header=0,
    names=book_columns,
    usecols=book_columns,
    dtype=dict.fromkeys(book_columns, 'str'),
)

# Ratings: one row per (user, isbn) rating, stored with compact numeric dtypes.
df_ratings = pd.read_csv(
    ratings_filename,
    sep=";",
    encoding="ISO-8859-1",
    header=0,
    names=rating_columns,
    usecols=rating_columns,
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'},
)


In [None]:
# --- Clean book fields ---
# Trim leading/trailing whitespace from the text columns of the book list.
for text_col in ('title', 'author'):
    df_books[text_col] = df_books[text_col].str.strip()

# --- Keep explicit ratings only (ignore 0) ---
# Rows whose rating is not strictly positive are dropped; .copy() detaches the
# result from the originally loaded frame.
# NOTE(review): everything downstream that counts ratings per user/book from
# df_ratings therefore counts explicit ratings only - confirm this is intended.
df_ratings = df_ratings.loc[df_ratings['rating'].gt(0)].copy()

# --- Start with target thresholds ---
# Minimum number of ratings a user / a book should have to be kept.
USER_MIN_RATINGS, BOOK_MIN_RATINGS = 200, 100

# --- Function to filter with given thresholds ---
def filter_data(user_min=200, book_min=100, ratings=None):
    """
    Keep ratings made by active users, then keep ratings of popular books.

    The two filters run in sequence: per-book counts are taken *after* the
    inactive users have been removed, so a book has to reach ``book_min``
    ratings among the remaining users only.

    Parameters
    ----------
    user_min : int
        Minimum number of ratings a user must have to be kept.
    book_min : int
        Minimum number of ratings (from kept users) a book must have to be kept.
    ratings : pandas.DataFrame, optional
        Frame with at least the columns ``'user'`` and ``'isbn'``. When omitted,
        the notebook-level ``df_ratings`` is used, as before.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving rows (original index labels kept).
        The input frame is not modified.
    """
    if ratings is None:
        ratings = df_ratings

    # filter active users
    user_counts = ratings['user'].value_counts()
    active_users = user_counts[user_counts >= user_min].index
    by_active_users = ratings[ratings['user'].isin(active_users)]

    # filter popular books (counted among active users only)
    book_counts = by_active_users['isbn'].value_counts()
    popular_books = book_counts[book_counts >= book_min].index

    # Copy only the (small) result instead of the whole input up front; the
    # caller gets an independent frame it can safely modify.
    return by_active_users[by_active_users['isbn'].isin(popular_books)].copy()

# --- Try filtering with decreasing thresholds until non-empty ---
# The first attempt uses the configured targets (USER_MIN_RATINGS /
# BOOK_MIN_RATINGS) so that editing those constants actually takes effect;
# the remaining pairs are progressively looser fallbacks.
threshold_fallbacks = [
    (USER_MIN_RATINGS, BOOK_MIN_RATINGS),
    (100, 50),
    (50, 50),
    (20, 20),
]

final_df = None
for u_thresh, b_thresh in threshold_fallbacks:
    df_try = filter_data(user_min=u_thresh, book_min=b_thresh)
    # A frame with at least one row necessarily has at least one user and one book.
    if not df_try.empty:
        print(f"Using thresholds: users >= {u_thresh}, books >= {b_thresh}")
        final_df = df_try
        break

if final_df is None:
    raise ValueError("No data left even after relaxing thresholds!")

# Attach title/author to every kept rating. The join uses one row per ISBN
# (first occurrence, the same rule the isbn -> title lookup uses) so that an
# ISBN listed twice in df_books cannot duplicate rating rows.
df = final_df.merge(df_books.drop_duplicates('isbn'), on='isbn', how='inner')
print("Final ratings:", len(df))
print("Unique users:", df['user'].nunique())
print("Unique books:", df['isbn'].nunique())


In [None]:
# Build indices for rows (books/ISBN) and columns (users)
isbn_cat = pd.Categorical(df['isbn'])
user_cat = pd.Categorical(df['user'])

row_ind = isbn_cat.codes           # book rows
col_ind = user_cat.codes           # user cols
data_vals = df['rating'].astype(np.float32).values

# Sparse matrix: rows = books (ISBN), cols = users
item_user_matrix = csr_matrix((data_vals, (row_ind, col_ind)),
                              shape=(len(isbn_cat.categories), len(user_cat.categories)))

# Mappings for lookups
# Row position -> ISBN (same order as the matrix rows).
isbn_index_to_isbn = pd.Series(isbn_cat.categories)
# ISBN -> title, using the first df_books entry of each ISBN.
isbn_to_title = df_books.drop_duplicates('isbn').set_index('isbn')['title']

# Build a stable mapping from title -> a representative ISBN.
# A title can exist under several ISBNs. ISBNs that actually have a row in the
# matrix are listed first (most-rated first), so a title resolves to an edition
# that can be queried whenever one exists. Titles with no kept edition still
# fall back to their first ISBN in df_books, which lets the lookup helper tell
# "unknown title" apart from "known title, but filtered out".
kept_isbns = df['isbn'].value_counts().index
kept_title_isbn = pd.DataFrame({
    'title': isbn_to_title.reindex(kept_isbns).values,
    'isbn': kept_isbns,
})
title_to_isbn = (pd.concat([kept_title_isbn, df_books[['title', 'isbn']]],
                           ignore_index=True)
                 .dropna(subset=['title', 'isbn'])
                 .drop_duplicates(subset=['title'], keep='first')
                 .set_index('title')['isbn'])


In [None]:
# Nearest-neighbour model over the book rows of the item-user matrix.
# Brute-force search with cosine distance; fit() returns the estimator itself,
# so construction and fitting are chained.
knn_model = NearestNeighbors(algorithm='brute', metric='cosine').fit(item_user_matrix)
knn_model


In [None]:
def _title_to_row_index(book_title: str):
    """
    Map a book title to its row index in item_user_matrix.

    The title is first resolved to its representative ISBN via title_to_isbn,
    then that ISBN is located in isbn_index_to_isbn, whose positions correspond
    to the matrix rows.

    Parameters
    ----------
    book_title : str
        Title exactly as it appears in the index of title_to_isbn.

    Returns
    -------
    int
        Row position of the book in item_user_matrix.

    Raises
    ------
    KeyError
        If the title is not in title_to_isbn, or if its ISBN has no row in the
        filtered matrix.
    """
    if book_title not in title_to_isbn.index:
        raise KeyError(f"Title not found in books list: {book_title}")

    isbn = title_to_isbn[book_title]

    # Positions in the row-order ISBN list that hold this ISBN.
    matches = np.flatnonzero(isbn_index_to_isbn.values == isbn)
    if matches.size == 0:
        # Raised directly (not from inside an `except IndexError`) so the
        # traceback shows only this message, without a chained IndexError.
        raise KeyError(f"Title found, but its ISBN has insufficient ratings in the filtered matrix: {book_title}")

    # Plain Python int rather than a numpy scalar.
    return int(matches[0])


In [None]:
# OPTIONAL: histogram of how many stored ratings each kept book has.
# Purely informational - nothing below depends on this cell.
book_counts_kept = pd.Series(item_user_matrix.getnnz(axis=1))  # stored entries per row

fig, ax = plt.subplots()
ax.hist(book_counts_kept, bins=50)
ax.set_title("Non-zero ratings per kept book (after filtering)")
ax.set_xlabel("Count of ratings")
ax.set_ylabel("Number of books")
plt.show()


In [None]:
# function to return recommended books - this will be tested
def get_recommends(book = ""):
    """
    Recommend up to five books similar to the given title.

    Parameters
    ----------
    book : str
        Title of the query book; resolved to a matrix row by
        _title_to_row_index, which raises KeyError for unknown or
        filtered-out titles.

    Returns
    -------
    list
        [ book_title,
          [ [rec_title_1, distance_1],
            [rec_title_2, distance_2],
            ...
            [rec_title_5, distance_5] ] ]
        Distances are the cosine distances reported by knn_model.kneighbors.
        The inner list holds the nearest neighbours of the query book, ordered
        from the largest distance to the smallest. It has fewer than five
        entries only when the matrix has fewer than six rows.
    """
    max_recs = 5

    # 1) locate the query book row
    row_idx = int(_title_to_row_index(book))

    # 2) ask for one extra neighbour because the query book is normally returned
    #    as its own nearest neighbour; never ask for more rows than exist.
    k = min(max_recs + 1, item_user_matrix.shape[0])
    distances, indices = knn_model.kneighbors(item_user_matrix[row_idx], n_neighbors=k)

    # 3) drop the query book by row index, not by position: with tied
    #    distances (e.g. another book with an identical rating vector) the
    #    query book is not guaranteed to come first.
    neighbours = [
        (float(dist), int(idx))
        for dist, idx in zip(distances.ravel(), indices.ravel())
        if int(idx) != row_idx
    ][:max_recs]

    # 4) map rows -> titles, farthest first. The acceptance check in this
    #    notebook compares the first two entries against distances 0.8 and
    #    then 0.77, i.e. it expects decreasing distance.
    recs = [
        [isbn_to_title.get(isbn_index_to_isbn.iloc[idx], "Unknown Title"), dist]
        for dist, idx in reversed(neighbours)
    ]

    return [book, recs]


In [None]:
# Example query: fetch and print the recommendations for one title.
books = get_recommends("Where the Heart Is (Oprah's Book Club (Paperback))")
print(books)

def test_book_recommendation():
  """
  Acceptance check for get_recommends.

  Queries one fixed title and prints a pass/fail message. It passes when the
  returned title equals the query and, for each of the first two
  recommendations, the title is in the expected set and the distance is not
  0.05 or more away from the expected value at the same position.
  """
  query_title = "Where the Heart Is (Oprah's Book Club (Paperback))"
  expected_titles = ["I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True']
  expected_dists = [0.8, 0.77, 0.77, 0.77]

  recommends = get_recommends(query_title)

  # Collect every individual check, then decide once at the end.
  checks = [recommends[0] == query_title]
  for pos in range(2):
    entry = recommends[1][pos]
    checks.append(entry[0] in expected_titles)
    checks.append(not (abs(entry[1] - expected_dists[pos]) >= 0.05))

  if all(checks):
    print("You passed the challenge! 🎉🎉🎉🎉🎉")
  else:
    print("You haven't passed yet. Keep trying!")

# Run the acceptance check; it prints whether the recommendations pass.
test_book_recommendation()