In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt

In [2]:
# get data files
#!wget https://cdn.freecodecamp.org/project-data/books/book-crossings.zip

#!unzip book-crossings.zip

books_filename = 'BX-Books.csv'
ratings_filename = 'BX-Book-Ratings.csv'

In [3]:
# import csv data into dataframes
dfb = pd.read_csv(
    books_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['isbn', 'title', 'author'],
    usecols=['isbn', 'title', 'author'],
    dtype={'isbn': 'str', 'title': 'str', 'author': 'str'})

dfr = pd.read_csv(
    ratings_filename,
    encoding = "ISO-8859-1",
    sep=";",
    header=0,
    names=['user', 'isbn', 'rating'],
    usecols=['user', 'isbn', 'rating'],
    dtype={'user': 'int32', 'isbn': 'str', 'rating': 'float32'})

In [204]:
# Remove users that appear <200 times in the list and books that have <100 users from df_ratings
maskA = dfr.groupby(by="user").rating.transform('count') > 200
maskB = dfr.groupby(by="isbn").rating.transform('count') > 100

df1 = dfr[maskA & maskB]
df1

Unnamed: 0,user,isbn,rating
1456,277427,002542730X,10.0
1469,277427,0060930535,0.0
1471,277427,0060934417,0.0
1474,277427,0061009059,9.0
1484,277427,0140067477,0.0
...,...,...,...
1147304,275970,0804111359,0.0
1147436,275970,140003065X,0.0
1147439,275970,1400031346,0.0
1147440,275970,1400031354,0.0


In [205]:
# Merge both tables on isbn
df = pd.merge(df1, dfb, on='isbn')
df

Unnamed: 0,user,isbn,rating,title,author
0,277427,002542730X,10.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner
1,3363,002542730X,0.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner
2,11676,002542730X,6.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner
3,12538,002542730X,10.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner
4,13552,002542730X,0.0,Politically Correct Bedtime Stories: Modern Ta...,James Finn Garner
...,...,...,...,...,...
48985,238864,0399149325,0.0,Portrait of a Killer: Jack the Ripper -- Case ...,Patricia Cornwell
48986,251843,0399149325,1.0,Portrait of a Killer: Jack the Ripper -- Case ...,Patricia Cornwell
48987,253821,0399149325,0.0,Portrait of a Killer: Jack the Ripper -- Case ...,Patricia Cornwell
48988,265115,0399149325,0.0,Portrait of a Killer: Jack the Ripper -- Case ...,Patricia Cornwell


In [206]:
# Drop the title duplicates with the default keep='first' parameter setting
print("Size before - ", df.size)
df = df.drop_duplicates(subset=['title', "user"], keep='first')
print("Size After - ", df.size)

Size before -  244950
Size After -  243075


In [248]:
# Find a book by title
df[df.title == 'Lasher: Lives of the Mayfair Witches (Lives of the Mayfair Witches)']

Unnamed: 0,user,isbn,rating,title,author
42380,11601,345397819,0.0,Lasher: Lives of the Mayfair Witches (Lives of...,Anne Rice
42381,11676,345397819,0.0,Lasher: Lives of the Mayfair Witches (Lives of...,Anne Rice
42382,35859,345397819,0.0,Lasher: Lives of the Mayfair Witches (Lives of...,Anne Rice
42383,37712,345397819,10.0,Lasher: Lives of the Mayfair Witches (Lives of...,Anne Rice
42384,43246,345397819,0.0,Lasher: Lives of the Mayfair Witches (Lives of...,Anne Rice
42385,46398,345397819,0.0,Lasher: Lives of the Mayfair Witches (Lives of...,Anne Rice
42386,50547,345397819,0.0,Lasher: Lives of the Mayfair Witches (Lives of...,Anne Rice
42387,52584,345397819,0.0,Lasher: Lives of the Mayfair Witches (Lives of...,Anne Rice
42388,56447,345397819,0.0,Lasher: Lives of the Mayfair Witches (Lives of...,Anne Rice
42389,69355,345397819,0.0,Lasher: Lives of the Mayfair Witches (Lives of...,Anne Rice


In [208]:
# Create a spreadsheet-style pivot table as a DataFrame. More flexible than pivot function.

pt = pd.pivot_table(df, values='rating', index=['title'], columns=['user'], fill_value=0)
pt

user,254,2276,2766,2977,3363,4017,4385,6242,6251,6323,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1st to Die: A Novel,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2nd Chance,0,10,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4 Blondes,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A Beautiful Mind: The Life of Mathematical Genius and Nobel Laureate John Nash,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Without Remorse,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Year of Wonders,0,0,0,7,0,0,0,7,0,0,...,0,0,0,0,0,0,0,0,0,0
You Belong To Me,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [247]:
# Find a book by title

pt[pt.index == 'Lasher: Lives of the Mayfair Witches (Lives of the Mayfair Witches)']

user,254,2276,2766,2977,3363,4017,4385,6242,6251,6323,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Lasher: Lives of the Mayfair Witches (Lives of the Mayfair Witches),0,0,0,0,0,0,0,0,0,0,...,9,0,0,0,0,0,0,0,0,0


In [228]:
# Training Data
A = csr_matrix(pt.values)
A.toarray()

array([[ 9,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0, 10,  0, ...,  0,  0,  0],
       ...,
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0],
       [ 0,  0,  0, ...,  0,  0,  0]], dtype=int64)

In [244]:
# Create a KNN model that can give us 5 nearest neighbors.
#knn = NearestNeighbors(n_neighbors=5)
#knn = NearestNeighbors(algorithm='auto', metric = 'cosine',n_neighbors=5)
#knn = NearestNeighbors(algorithm='auto', metric = 'cosine',n_neighbors=5)
#knn = NearestNeighbors(algorithm='auto', metric = 'cosine',n_neighbors=5)
knn = NearestNeighbors(algorithm='auto', metric="cosine", p=1, n_neighbors=5)

# Now train the model
knn.fit(A.toarray())

NearestNeighbors(metric='cosine', p=1)

In [245]:
# Test the model
"""
"I'll Be Seeing You", 'The Weight of Water', 'The Surgeon', 'I Know This Much Is True'
"""
book = pt[pt.index == "Where the Heart Is (Oprah's Book Club (Paperback))"].values
print(len(book))
distance, suggestions = knn.kneighbors(book, n_neighbors= 6)
print(pt.index[suggestions[0]])
print(distance[0])

1
Index(['Where the Heart Is (Oprah's Book Club (Paperback))',
       'The Lovely Bones: A Novel', 'I Know This Much Is True', 'The Surgeon',
       'The Weight of Water', 'I'll Be Seeing You'],
      dtype='object', name='title')
[0.         0.72301844 0.76770751 0.7699411  0.77085836 0.80162106]


In [246]:
"""
  'The Queen of the Damned (Vampire Chronicles (Paperback))',
  [
    ['Catch 22', 0.793983519077301], 
    ['The Witching Hour (Lives of the Mayfair Witches)', 0.7448656558990479], 
    ['Interview with the Vampire', 0.7345068454742432],
    ['The Tale of the Body Thief (Vampire Chronicles (Paperback))', 0.5376338362693787],
    ['The Vampire Lestat (Vampire Chronicles, Book II)', 0.5178412199020386]
  ]

"""
book = pt[pt.index == 'The Queen of the Damned (Vampire Chronicles (Paperback))'].values
print(len(book))
distance, suggestions = knn.kneighbors(book, n_neighbors= 7)
print(pt.index[suggestions[0]])
print(distance[0])

1
Index(['The Queen of the Damned (Vampire Chronicles (Paperback))',
       'The Vampire Lestat (Vampire Chronicles, Book II)',
       'The Tale of the Body Thief (Vampire Chronicles (Paperback))',
       'Interview with the Vampire',
       'The Witching Hour (Lives of the Mayfair Witches)',
       'Lasher: Lives of the Mayfair Witches (Lives of the Mayfair Witches)',
       'Catch 22'],
      dtype='object', name='title')
[1.11022302e-16 5.17841186e-01 5.29854435e-01 7.34506886e-01
 7.36278710e-01 7.83343323e-01 7.93983542e-01]


In [None]:
# Save the trained model

