In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [3]:
# Columns loaded from each CSV, mapped to the dtype they are parsed as.
# ISBN is read as a string so values such as '034545104X' are kept as text.
# NOTE(review): the 'D:...' paths are hardcoded and machine-specific - consider a configurable data directory.
book_columns = {'ISBN': 'str', 'Book-Title': 'str'}
rating_columns = {'User-ID': 'int32', 'ISBN': 'str', 'Book-Rating': 'float32'}
user_columns = {'User-ID': 'int32', 'Location': 'str'}

book_df = pd.read_csv('D:Books.csv', usecols=list(book_columns), dtype=book_columns)
rating_df = pd.read_csv('D:Ratings.csv', usecols=list(rating_columns), dtype=rating_columns)
user_df = pd.read_csv('D:Users.csv', usecols=list(user_columns), dtype=user_columns)

In [4]:
# Preview the first rows of the books table (only ISBN and Book-Title were loaded).
book_df.head()

Unnamed: 0,ISBN,Book-Title
0,195153448,Classical Mythology
1,2005018,Clara Callan
2,60973129,Decision in Normandy
3,374157065,Flu: The Story of the Great Influenza Pandemic...
4,393045218,The Mummies of Urumchi


In [5]:
# Preview the first rows of the ratings table (User-ID, ISBN, Book-Rating).
rating_df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0.0
1,276726,0155061224,5.0
2,276727,0446520802,0.0
3,276729,052165615X,3.0
4,276729,0521795028,6.0


In [6]:
# Preview the first rows of the users table (User-ID and Location).
user_df.head()

Unnamed: 0,User-ID,Location
0,1,"nyc, new york, usa"
1,2,"stockton, california, usa"
2,3,"moscow, yukon territory, russia"
3,4,"porto, v.n.gaia, portugal"
4,5,"farnborough, hants, united kingdom"


In [8]:
# Attach the book title to every rating by joining on ISBN.
# merge defaults to an inner join, so ratings whose ISBN is not in book_df are dropped.
df = rating_df.merge(book_df, on='ISBN')
df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title
0,276725,034545104X,0.0,Flesh Tones: A Novel
1,2313,034545104X,5.0,Flesh Tones: A Novel
2,6543,034545104X,0.0,Flesh Tones: A Novel
3,8680,034545104X,5.0,Flesh Tones: A Novel
4,10314,034545104X,9.0,Flesh Tones: A Novel


In [11]:
# Add each rater's Location by joining on User-ID (inner join by default).
ddf = df.merge(user_df, on='User-ID')
ddf.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Location
0,276725,034545104X,0.0,Flesh Tones: A Novel,"tyler, texas, usa"
1,2313,034545104X,5.0,Flesh Tones: A Novel,"cincinnati, ohio, usa"
2,2313,0812533550,9.0,Ender's Game (Ender Wiggins Saga (Paperback)),"cincinnati, ohio, usa"
3,2313,0679745580,8.0,In Cold Blood (Vintage International),"cincinnati, ohio, usa"
4,2313,0060173289,9.0,Divine Secrets of the Ya-Ya Sisterhood : A Novel,"cincinnati, ohio, usa"


In [14]:
# Keep only rows that have a title, then count how many ratings each title received.
combine_book_rating = ddf.dropna(axis=0, subset=['Book-Title'])

# count() yields a Series indexed by Book-Title; reset_index(name=...) turns it
# into a two-column frame: Book-Title, totalRatingCount.
ratings_per_title = combine_book_rating.groupby('Book-Title')['Book-Rating'].count()
book_ratingCount = ratings_per_title.reset_index(name='totalRatingCount')
book_ratingCount.head()

Unnamed: 0,Book-Title,totalRatingCount
0,A Light in the Storm: The Civil War Diary of ...,4
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,"Ask Lily (Young Women of Faith: Lily Series, ...",1
4,Beyond IBM: Leadership Marketing and Finance ...,1


In [15]:
# Broadcast the per-title rating count back onto every individual rating row.
# A left join keeps every row of combine_book_rating.
rating_with_totalRatingCount = combine_book_rating.merge(
    book_ratingCount,
    how='left',
    on='Book-Title',
)
rating_with_totalRatingCount.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Location,totalRatingCount
0,276725,034545104X,0.0,Flesh Tones: A Novel,"tyler, texas, usa",60
1,2313,034545104X,5.0,Flesh Tones: A Novel,"cincinnati, ohio, usa",60
2,2313,0812533550,9.0,Ender's Game (Ender Wiggins Saga (Paperback)),"cincinnati, ohio, usa",249
3,2313,0679745580,8.0,In Cold Blood (Vintage International),"cincinnati, ohio, usa",55
4,2313,0060173289,9.0,Divine Secrets of the Ya-Ya Sisterhood : A Novel,"cincinnati, ohio, usa",130


In [16]:
def _three_decimals(value):
    """Render a float with exactly three digits after the decimal point."""
    return '%.3f' % value


# Applies to all later float output in the notebook, not just this cell.
pd.set_option('display.float_format', _three_decimals)

# Distribution of the number of ratings per title.
rating_count_summary = book_ratingCount['totalRatingCount'].describe()
print(rating_count_summary)

count   241071.000
mean         4.277
std         16.739
min          1.000
25%          1.000
50%          1.000
75%          3.000
max       2502.000
Name: totalRatingCount, dtype: float64


In [17]:
# Minimum number of ratings a title needs to be kept for the recommender.
popularity_threshold = 50

# Boolean mask over the rating rows: True where the row's title meets the threshold.
is_popular = rating_with_totalRatingCount['totalRatingCount'] >= popularity_threshold
rating_popular_book = rating_with_totalRatingCount[is_popular]
rating_popular_book.head()

Unnamed: 0,User-ID,ISBN,Book-Rating,Book-Title,Location,totalRatingCount
0,276725,034545104X,0.0,Flesh Tones: A Novel,"tyler, texas, usa",60
1,2313,034545104X,5.0,Flesh Tones: A Novel,"cincinnati, ohio, usa",60
2,2313,0812533550,9.0,Ender's Game (Ender Wiggins Saga (Paperback)),"cincinnati, ohio, usa",249
3,2313,0679745580,8.0,In Cold Blood (Vintage International),"cincinnati, ohio, usa",55
4,2313,0060173289,9.0,Divine Secrets of the Ya-Ya Sisterhood : A Novel,"cincinnati, ohio, usa",130


In [18]:
# (rows, columns) of the rating rows that survived the popularity filter.
rating_popular_book.shape

(288740, 6)

In [19]:
# Title x user rating matrix: one row per Book-Title, one column per User-ID.
# pivot_table averages duplicate (title, user) pairs by default; pairs with no
# rating come out as NaN and are replaced by 0.
book_features_df = (
    rating_popular_book
    .pivot_table(values='Book-Rating', index='Book-Title', columns='User-ID')
    .fillna(0)
)
book_features_df.head()

User-ID,8,9,14,16,17,19,23,26,32,39,...,278820,278824,278828,278832,278836,278843,278844,278846,278851,278854
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Lb. Penalty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16 Lighthouse Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010: Odyssey Two,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# csr_matrix and NearestNeighbors are imported in the notebook's import cell,
# so every dependency is visible in one place.

# The title x user matrix is mostly zeros (see the head() display), so store it
# in compressed sparse row form before fitting.
book_features_df_matrix = csr_matrix(book_features_df.values)

# Unsupervised nearest-neighbour index over the book rows, using cosine distance
# with exhaustive (brute-force) search.
model_knn = NearestNeighbors(metric='cosine', algorithm='brute')
model_knn.fit(book_features_df_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [21]:
# (number of titles, number of users) in the pivoted rating matrix.
book_features_df.shape

(2444, 47994)

In [22]:
# Row position (in book_features_df) of the book to get recommendations for.
# The previous version drew an unseeded random index, printed it, and then
# overwrote it with 2, so the printed number never matched the book actually
# queried. The index is now fixed up front and the printed value is the one used.
query_index = 2
print(query_index)

1064


In [23]:
# Ratings row of the query book, reshaped to (1, n_columns) because kneighbors
# expects a 2-D array of query points.
query_vector = book_features_df.iloc[query_index, :].values.reshape(1, -1)

# Ask for 6 neighbours: the printing cell below skips the first hit and lists the other 5.
distances, indices = model_knn.kneighbors(query_vector, n_neighbors=6)

In [24]:
# Show the title x user matrix again; query_index is a row position in this frame.
book_features_df.head()

User-ID,8,9,14,16,17,19,23,26,32,39,...,278820,278824,278828,278832,278836,278843,278844,278846,278851,278854
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Lb. Penalty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16 Lighthouse Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1984,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2010: Odyssey Two,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Flatten the (1, k) result arrays once instead of on every loop iteration.
neighbor_positions = indices.flatten()
neighbor_distances = distances.flatten()

# Header names the query book; printed only when the search returned something.
if neighbor_distances.size > 0:
    print('Recommendations for {0}:\n'.format(book_features_df.index[query_index]))

# The first hit is skipped (treated as the query book itself), so ranks start at 1.
for rank, distance in enumerate(neighbor_distances[1:], start=1):
    title = book_features_df.index[neighbor_positions[rank]]
    print('{0}: {1}, with distance of {2}:'.format(rank, title, distance))

Recommendations for 1984:

1: Animal Farm, with distance of 0.8498674631118774:
2: Brave New World, with distance of 0.8773695826530457:
3: American Psycho (Vintage Contemporaries), with distance of 0.9119734764099121:
4: Slaughterhouse Five or the Children's Crusade: A Duty Dance With Death, with distance of 0.9161115288734436:
5: Lying Awake, with distance of 0.9162192344665527:
