In [106]:
import pandas as pd 
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

import warnings; warnings.simplefilter('ignore')

In [2]:
books = pd.read_csv('./book_data/Books.csv')
ratings = pd.read_csv('./book_data/Ratings.csv')
users = pd.read_csv('./book_data/Users.csv')

In [3]:
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [4]:
df = users.merge(ratings, how='left', on='User-ID')
df.head()

Unnamed: 0,User-ID,Location,Age,ISBN,Book-Rating
0,1,"nyc, new york, usa",,,
1,2,"stockton, california, usa",18.0,195153448.0,0.0
2,3,"moscow, yukon territory, russia",,,
3,4,"porto, v.n.gaia, portugal",17.0,,
4,5,"farnborough, hants, united kingdom",,,


In [5]:
df = books.merge(df, left_on = 'ISBN',right_on = 'ISBN')

In [6]:
d = {}
for i, j in enumerate(df['ISBN'].unique()):
    d[j] = i
df['ISBN'] = df['ISBN'].map(d)

In [7]:
md = df.copy()

In [8]:
df = df[['User-ID', 'ISBN', 'Book-Rating']]

In [23]:
df.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,2,0,0.0
1,8,1,5.0
2,11400,1,0.0
3,11676,1,8.0
4,41385,1,0.0


In [9]:
reader = Reader()

In [10]:
data = Dataset.load_from_df(ratings[['User-ID', 'ISBN', 'Book-Rating']], reader)

In [11]:
svd = SVD()

In [12]:
cross_validate(svd,data,measures=['RMSE', 'MAE'],cv=5, verbose=False, n_jobs=-1)

{'test_rmse': array([3.50289744, 3.50580509, 3.50768769, 3.50562021, 3.4999434 ]),
 'test_mae': array([2.98138417, 2.97974415, 2.98350748, 2.98275594, 2.97550705]),
 'fit_time': (31.033143758773804,
  38.24634599685669,
  33.32606053352356,
  30.26677179336548,
  22.33441686630249),
 'test_time': (17.374815940856934,
  37.811400413513184,
  40.42589020729065,
  26.25611710548401,
  11.297343254089355)}

In [13]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7faa5a0e83a0>

In [14]:
smd = md[['ISBN','Book-Title', 'Book-Author', 'Publisher']]

In [15]:
author_counts= smd['Book-Author'].value_counts().to_dict()
smd['author_counts'] = smd['Book-Author'].map(author_counts)
smd = smd.drop_duplicates()
m = smd['author_counts'].quantile(0.95)

In [16]:
smd = smd[smd['author_counts'] > m]

In [17]:
count = CountVectorizer(analyzer='word', ngram_range=(1,2),
                     min_df=0, stop_words='english')
count_matrix = count.fit_transform(smd['Book-Title'])

In [18]:
count_matrix.shape

(13425, 25364)

In [19]:
%%time
cosine_sim = cosine_similarity(count_matrix, count_matrix)

CPU times: user 423 ms, sys: 699 ms, total: 1.12 s
Wall time: 1.14 s


In [34]:
smd = smd.reset_index(drop=True)
titles = smd['Book-Title']
indices = pd.Series(smd.index, index=smd['Book-Title'])

In [45]:
smd = smd.set_index('Book-Title')

In [58]:
smd.head()

Unnamed: 0_level_0,ISBN,Book-Author,Publisher,author_counts
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
The Kitchen God's Wife,5,Amy Tan,Putnam Pub Group,1769.0
PLEADING GUILTY,7,Scott Turow,Audioworks,1013.0
A Second Chicken Soup for the Woman's Soul (Chicken Soup for the Soul Series),14,Jack Canfield,Health Communications,1832.0
The Testament,18,John Grisham,Dell,6010.0
Beloved (Plume Contemporary Fiction),19,Toni Morrison,Plume,1021.0


In [102]:
def recommend(user_id, title):
    idx = indices[title]
    isbn = smd.loc[title]['ISBN']
    
    sim_scores = list(enumerate(cosine_sim[idx]))

    sim_scores = sorted(sim_scores, key=lambda x: x[1].any(), reverse=True)
    sim_scores = sim_scores[1:26]
    title_indices = [i[0] for i in sim_scores]
    
    books = smd.iloc[title_indices][['ISBN','Book-Author', 'Publisher']]
    books['est'] = books['ISBN'].apply(lambda x: svd.predict(user_id, x).est)
    books = books.sort_values('est', ascending=False)
    return books.reset_index().head(10)

In [103]:
recommend(1, 'Beloved (Plume Contemporary Fiction)')

Unnamed: 0,Book-Title,ISBN,Book-Author,Publisher,est
0,Hearts In Atlantis : New Fiction,3780,Stephen King,Scribner,2.86695
1,Beloved,29445,Toni Morrison,Ediciones B,2.86695
2,Sword and Sorceress X: An Anthology of Heroic ...,46856,Marion Zimmer Bradley,New Amer Library,2.86695
3,100 Great Science Fiction Short Short Stories,46321,Isaac Asimov,Avon Books,2.86695
4,Gold : The Final Science Fiction Collection (T...,45607,Isaac Asimov,Eos,2.86695
5,Beloved,41563,Toni Morrison,Collectible First Editions,2.86695
6,Jazz (Plume Contemporary Fiction),41333,Toni Morrison,Plume Books,2.86695
7,How to Write Science Fiction &amp; Fantasy,39974,Orson Scott Card,Writer's Digest Books,2.86695
8,Characters and Viewpoint (Elements of Fiction ...,39967,Orson Scott Card,Writer's Digest Books,2.86695
9,"Hard Times: An Authoritative Text, Backgrounds...",36725,Charles Dickens,W W Norton &amp; Co Inc,2.86695


In [104]:
recommend(1, 'PLEADING GUILTY')

Unnamed: 0,Book-Title,ISBN,Book-Author,Publisher,est
0,Guilty as Sin,595,TAMI HOAG,Bantam,2.86695
1,A Second Chicken Soup for the Woman's Soul (Ch...,14,Jack Canfield,Health Communications,2.86695
2,Isle of Dogs,47,Patricia Cornwell,Berkley Publishing Group,2.86695
3,I'll Be Seeing You,45,Mary Higgins Clark,Pocket,2.86695
4,Icebound,44,Dean R. Koontz,Bantam Books,2.86695
5,Pride and Prejudice,41,Jane Austen,Bantam,2.86695
6,Pigs in Heaven,39,Barbara Kingsolver,Harpercollins,2.86695
7,Timeline,28,MICHAEL CRICHTON,Ballantine Books,2.86695
8,Airframe,27,Michael Crichton,Ballantine Books,2.86695
9,Wild Animus,26,Rich Shapero,Too Far,2.86695
