# Libraries

In [163]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
import scipy

In [2]:
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)

# Import DF

In [3]:
df_ratings = pd.read_csv('ratings.csv').drop('Unnamed: 0', axis=1)
df_users = pd.read_csv('users.csv').drop('Unnamed: 0', axis=1)
df_books = pd.read_csv('books.csv').drop('Unnamed: 0', axis=1)

# Correlation - Pearson-R

In [4]:
irate_pivot = df_ratings.pivot(index='User-ID', columns='ISBN')["Book-Rating"]
userID = irate_pivot.index
ISBN = irate_pivot.columns
irate_pivot[irate_pivot > 0] = np.NaN
irate_pivot[irate_pivot == 0] = 1
irate_pivot = irate_pivot.fillna(0)
print(irate_pivot.shape)
irate_pivot.head()

(7951, 4533)


ISBN,000649840X,0006547834,0006550789,0007110928,0007154615,0020198906,0020199600,002026478X,0020427859,0020442009,...,3596150655,3596259924,3746614007,8408043641,8495618605,8806142100,8807813025,8817131628,8845205118,884590184X
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
243,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
254,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
383,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
import time

ts = time.time()
corr_matrix = irate_pivot.corr(method='pearson')
print ("Complete, time taken: " + str(time.time() - ts) + " seconds")

Complete, time taken: 304.07276153564453 seconds


In [6]:
ts = time.time()
corr_matrix.to_csv("irate_corr.csv")
print ("Complete, time taken: " + str(time.time() - ts) + " seconds")

Complete, time taken: 31.990086555480957 seconds


In [7]:
df_books.loc[df_books["Book-Title"].str.contains("The Wheel of Time")].head(10)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
683,0812513754,"Lord of Chaos (The Wheel of Time, Book 6)",Robert Jordan,1995,Tor Fantasy
684,0812550307,"The Fires of Heaven (The Wheel of Time, Book 5)",Robert Jordan,1994,Tor Fantasy
685,0812513738,"The Shadow Rising (The Wheel of Time, Book 4)",Robert Jordan,1993,Tor Fantasy
686,0812511816,"The Eye of the World (The Wheel of Time, Book 1)",Robert Jordan,1990,Tor Fantasy
840,0812517725,"The Great Hunt (The Wheel of Time, Book 2)",Robert Jordan,1991,Tor Fantasy
928,0812513711,"The Dragon Reborn (The Wheel of Time, Book 3)",Robert Jordan,1992,Tor Fantasy
2427,081257558X,"Winter's Heart (The Wheel of Time, Book 9)",Robert Jordan,2002,Tor Books
2621,0812550293,"The Path of Daggers (The Wheel of Time, Book 8)",Robert Jordan,1999,Tor Fantasy
4406,0812550285,"A Crown of Swords (The Wheel of Time, Book 7)",Robert Jordan,1997,Tor Fantasy


In [8]:
corr_matrix['0812511816'].sort_values(ascending=False).head(10)

ISBN
0812511816    1.000000
0812517725    0.344198
0812513738    0.255608
0812513754    0.181599
0812548051    0.161702
0812550307    0.157628
0812513711    0.153831
0345352661    0.149548
0553277839    0.137663
0553571834    0.137663
Name: 0812511816, dtype: float64

In [9]:
books_corr = pd.DataFrame(corr_matrix['0812511816'].sort_values(ascending=False).head(20).index, 
                                  index=np.arange(20), columns=['ISBN'])
corr_books = pd.merge(books_corr, df_books, on='ISBN')
corr_books

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,812511816,"The Eye of the World (The Wheel of Time, Book 1)",Robert Jordan,1990,Tor Fantasy
1,812517725,"The Great Hunt (The Wheel of Time, Book 2)",Robert Jordan,1991,Tor Fantasy
2,812513738,"The Shadow Rising (The Wheel of Time, Book 4)",Robert Jordan,1993,Tor Fantasy
3,812513754,"Lord of Chaos (The Wheel of Time, Book 6)",Robert Jordan,1995,Tor Fantasy
4,812548051,"Wizard's First Rule (Sword of Truth, Book 1)",Terry Goodkind,1997,Tor Fantasy
5,812550307,"The Fires of Heaven (The Wheel of Time, Book 5)",Robert Jordan,1994,Tor Fantasy
6,812513711,"The Dragon Reborn (The Wheel of Time, Book 3)",Robert Jordan,1992,Tor Fantasy
7,345352661,Guardians of the West (Book 1 of the Malloreon),David Eddings,1988,Del Rey Books
8,553277839,Faerie Tale,Raymond E. Feist,1989,Bantam
9,553571834,Haunting Rachel,KAY HOOPER,1999,Bantam


# Book Titles to dictionary definitions

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(min_df=1, max_df=0.5)

count_features = count_vect.fit_transform(df_books["Book-Title"])

In [18]:
indices = count_features.nonzero()
indices

(array([   0,    0,    1, ..., 4530, 4531, 4532], dtype=int32),
 array([4205, 4213, 1548, ..., 4213, 1959, 4769], dtype=int32))

In [19]:
df_books_desc = df_books.copy()

In [20]:
df_books_desc["Title-Desc"] = ""

In [21]:
df_books_desc.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Title-Desc
0,440234743,The Testament,John Grisham,1999,Dell,
1,452264464,Beloved (Plume Contemporary Fiction),Toni Morrison,1994,Plume,
2,1841721522,New Vegetarian: Bold and Beautiful Recipes for...,Celia Brooks Brown,2001,Ryland Peters & Small Ltd,
3,971880107,Wild Animus,Rich Shapero,2004,Too Far,
4,345402871,Airframe,Michael Crichton,1997,Ballantine Books,


In [22]:
from nltk.corpus import wordnet
import time

In [23]:
ts = time.time()

for i in range(0, len(indices[0])):
    df_books_desc["Title-Desc"][indices[0][i]] = df_books_desc["Title-Desc"][indices[0][i]] + " " + count_vect.get_feature_names()[indices[1][i]]
    if len(wordnet.synsets(count_vect.get_feature_names()[indices[1][i]])) != 0:
        df_books_desc["Title-Desc"][indices[0][i]] = df_books_desc["Title-Desc"][indices[0][i]] + " " + wordnet.synsets(count_vect.get_feature_names()[indices[1][i]])[0].definition()
    if i%100 == 0:
        print("Completed " + str(i) + " of " + str(len(indices[0])) + ", time taken: " + str(time.time() - ts) + " seconds")
        ts = time.time()

Completed 0 of 18797, time taken: 1.3175113201141357 seconds
Completed 100 of 18797, time taken: 11.041009902954102 seconds
Completed 200 of 18797, time taken: 11.293351888656616 seconds
Completed 300 of 18797, time taken: 11.86027979850769 seconds
Completed 400 of 18797, time taken: 11.351639747619629 seconds
Completed 500 of 18797, time taken: 12.228218078613281 seconds
Completed 600 of 18797, time taken: 11.476305723190308 seconds
Completed 700 of 18797, time taken: 11.727633714675903 seconds
Completed 800 of 18797, time taken: 11.743646621704102 seconds
Completed 900 of 18797, time taken: 12.640191078186035 seconds
Completed 1000 of 18797, time taken: 12.14953327178955 seconds
Completed 1100 of 18797, time taken: 12.162486791610718 seconds
Completed 1200 of 18797, time taken: 12.175436019897461 seconds
Completed 1300 of 18797, time taken: 11.966963768005371 seconds
Completed 1400 of 18797, time taken: 12.392881393432617 seconds
Completed 1500 of 18797, time taken: 12.70101451873779

# TF-IDF

In [24]:
df_books_desc["Title-Desc"].head()

0                 testament a profession of belief the
1     fiction a literary work based on the imaginat...
2     occasion an event that occurs at a critical t...
3     animus a feeling of ill will arousing active ...
4     airframe the framework and covering of an air...
Name: Title-Desc, dtype: object

In [25]:
df_books_desc.shape

(4533, 6)

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(sublinear_tf=True, min_df=1, max_df=0.5, norm='l2', encoding='latin-1', ngram_range=(1, 3))
features = tfidf.fit_transform(df_books_desc["Title-Desc"])

In [27]:
features.shape

(4533, 91834)

In [132]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarities = pd.DataFrame(cosine_similarity(features,features), index=df_books["ISBN"], columns=df_books["ISBN"])

In [29]:
ts = time.time()
cosine_similarities.to_csv("desc_coscorr.csv")
print ("Complete, time taken: " + str(time.time() - ts) + " seconds")

Complete, time taken: 24.75649094581604 seconds


In [30]:
df_books.loc[df_books["Book-Title"].str.contains("Orient Express")].head(30)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
702,425173755,Murder on the Orient Express (Hercule Poirot M...,Agatha Christie,2000,Berkley Publishing Group
3788,61002747,Murder on the Orient Express,Agatha Christie,1991,Harper Mass Market Paperbacks (Mm)


In [31]:
cosine_similarities['0425173755'].sort_values(ascending=False).head(10)

ISBN
0425173755    1.000000
0061002747    0.765284
0380717581    0.357299
0553572350    0.314676
042513024X    0.309292
0380793660    0.306119
0425144429    0.289209
0553575406    0.287194
0345413903    0.283288
0345452534    0.283288
Name: 0425173755, dtype: float64

In [32]:
books_coscorr = pd.DataFrame(cosine_similarities['0425173755'].sort_values(ascending=False).head(40).index, 
                                  index=np.arange(40), columns=['ISBN'])
coscorr_books = pd.merge(books_coscorr, df_books, on='ISBN')
coscorr_books

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,0425173755,Murder on the Orient Express (Hercule Poirot M...,Agatha Christie,2000,Berkley Publishing Group
1,0061002747,Murder on the Orient Express,Agatha Christie,1991,Harper Mass Market Paperbacks (Mm)
2,0380717581,Murder on the Iditarod Trail (Alaska Mysteries...,Sue Henry,1993,Avon
3,0553572350,Murder at Monticello (Mrs. Murphy Mysteries (P...,RITA MAE BROWN,1995,Bantam
4,042513024X,The A.B.C. Murders,Agatha Christie,1993,Berkley Publishing Group
5,0380793660,Murder Gets a Life: : A Southern Sisters Myste...,Anne George,1999,Avon
6,0425144429,Mr. Murder,Dean R. Koontz,1996,Berkley Publishing Group
7,0553575406,Murder on the Prowl,RITA MAE BROWN,1999,Bantam
8,0345413903,The Murder Book,Jonathan Kellerman,2003,Ballantine Books
9,0345452534,The Murder Book,JONATHAN KELLERMAN,2002,Ballantine Books


# Book Descriptions

In [68]:
df_books_scrape = df_books.copy()

In [69]:
df_books_scrape["Category"] = np.NaN
df_books_scrape["Desc"] = np.NaN

In [None]:
api_key1 = ''
api_key2 = ''
api_key3 = ''
api_key4 = ''
api_key5 = ''

In [70]:
import requests
from bs4 import BeautifulSoup
import json
ts = time.time()

for i in range(0, 1000):
    if i%100 == 0:
        print("Processing " + str(i) + " of " + str(len(df_books_scrape['ISBN'])) + ", time taken: " + str(time.time() - ts) + " seconds")
        ts = time.time()
        
    currbook = requests.get('https://www.googleapis.com/books/v1/volumes?q=isbn:' + 
                            df_books_scrape["ISBN"][i] + 
                            '&key=' + api_key1)

    soup = BeautifulSoup(currbook.text, "html.parser")
    try: 
        currbookjson=json.loads(str(soup))
        try:
            df_books_scrape["Desc"][i] = currbookjson["items"][0]["volumeInfo"]["description"]
        except:
            pass
        try:
            df_books_scrape["Category"][i] = currbookjson["items"][0]["volumeInfo"]["categories"][0]
        except:
            pass
    except:
        continue

Processing 0 of 4533, time taken: 0.0 seconds
Processing 100 of 4533, time taken: 64.62970805168152 seconds
Processing 200 of 4533, time taken: 64.04486536979675 seconds
Processing 300 of 4533, time taken: 57.76146960258484 seconds
Processing 400 of 4533, time taken: 63.631770610809326 seconds
Processing 500 of 4533, time taken: 57.57495093345642 seconds
Processing 600 of 4533, time taken: 64.76246404647827 seconds
Processing 700 of 4533, time taken: 59.50667905807495 seconds
Processing 800 of 4533, time taken: 62.79475808143616 seconds
Processing 900 of 4533, time taken: 61.10536026954651 seconds


In [72]:
for i in range(1000, 2000):
    if i%100 == 0:
        print("Processing " + str(i) + " of " + str(len(df_books_scrape['ISBN'])) + ", time taken: " + str(time.time() - ts) + " seconds")
        ts = time.time()
        
    currbook = requests.get('https://www.googleapis.com/books/v1/volumes?q=isbn:' + 
                            df_books_scrape["ISBN"][i] + 
                            '&key=' + api_key2)

    soup = BeautifulSoup(currbook.text, "html.parser")
    try: 
        currbookjson=json.loads(str(soup))
        try:
            df_books_scrape["Desc"][i] = currbookjson["items"][0]["volumeInfo"]["description"]
        except:
            pass
        try:
            df_books_scrape["Category"][i] = currbookjson["items"][0]["volumeInfo"]["categories"][0]
        except:
            pass
    except:
        continue

Processing 1000 of 4533, time taken: 146.4106593132019 seconds
Processing 1100 of 4533, time taken: 67.40260887145996 seconds
Processing 1200 of 4533, time taken: 70.97190999984741 seconds
Processing 1300 of 4533, time taken: 71.89488816261292 seconds
Processing 1400 of 4533, time taken: 71.88876581192017 seconds
Processing 1500 of 4533, time taken: 73.63596868515015 seconds
Processing 1600 of 4533, time taken: 71.60944509506226 seconds
Processing 1700 of 4533, time taken: 66.76437640190125 seconds
Processing 1800 of 4533, time taken: 73.26886916160583 seconds
Processing 1900 of 4533, time taken: 68.90411162376404 seconds


In [73]:
for i in range(2000, 3000):
    if i%100 == 0:
        print("Processing " + str(i) + " of " + str(len(df_books_scrape['ISBN'])) + ", time taken: " + str(time.time() - ts) + " seconds")
        ts = time.time()
        
    currbook = requests.get('https://www.googleapis.com/books/v1/volumes?q=isbn:' + 
                            df_books_scrape["ISBN"][i] + 
                            '&key=' + api_key3)

    soup = BeautifulSoup(currbook.text, "html.parser")
    try: 
        currbookjson=json.loads(str(soup))
        try:
            df_books_scrape["Desc"][i] = currbookjson["items"][0]["volumeInfo"]["description"]
        except:
            pass
        try:
            df_books_scrape["Category"][i] = currbookjson["items"][0]["volumeInfo"]["categories"][0]
        except:
            pass
    except:
        continue

Processing 2000 of 4533, time taken: 70.78629350662231 seconds
Processing 2100 of 4533, time taken: 70.41436100006104 seconds
Processing 2200 of 4533, time taken: 74.21485257148743 seconds
Processing 2300 of 4533, time taken: 68.67269921302795 seconds
Processing 2400 of 4533, time taken: 75.5420663356781 seconds
Processing 2500 of 4533, time taken: 72.60682272911072 seconds
Processing 2600 of 4533, time taken: 74.00299596786499 seconds
Processing 2700 of 4533, time taken: 75.32691693305969 seconds
Processing 2800 of 4533, time taken: 70.64651679992676 seconds
Processing 2900 of 4533, time taken: 72.02114987373352 seconds


In [74]:
for i in range(3000, 4000):
    if i%100 == 0:
        print("Processing " + str(i) + " of " + str(len(df_books_scrape['ISBN'])) + ", time taken: " + str(time.time() - ts) + " seconds")
        ts = time.time()
        
    currbook = requests.get('https://www.googleapis.com/books/v1/volumes?q=isbn:' + 
                            df_books_scrape["ISBN"][i] + 
                            '&key=' + api_key4)

    soup = BeautifulSoup(currbook.text, "html.parser")
    try: 
        currbookjson=json.loads(str(soup))
        try:
            df_books_scrape["Desc"][i] = currbookjson["items"][0]["volumeInfo"]["description"]
        except:
            pass
        try:
            df_books_scrape["Category"][i] = currbookjson["items"][0]["volumeInfo"]["categories"][0]
        except:
            pass
    except:
        continue

Processing 3000 of 4533, time taken: 72.05255556106567 seconds
Processing 3100 of 4533, time taken: 77.97649455070496 seconds
Processing 3200 of 4533, time taken: 75.57093787193298 seconds
Processing 3300 of 4533, time taken: 67.14936685562134 seconds
Processing 3400 of 4533, time taken: 76.40157318115234 seconds
Processing 3500 of 4533, time taken: 67.16757345199585 seconds
Processing 3600 of 4533, time taken: 72.79154968261719 seconds
Processing 3700 of 4533, time taken: 67.41567802429199 seconds
Processing 3800 of 4533, time taken: 81.64157557487488 seconds
Processing 3900 of 4533, time taken: 74.1880042552948 seconds


In [75]:
for i in range(4000, len(df_books)):
    if i%100 == 0:
        print("Processing " + str(i) + " of " + str(len(df_books_scrape['ISBN'])) + ", time taken: " + str(time.time() - ts) + " seconds")
        ts = time.time()
        
    currbook = requests.get('https://www.googleapis.com/books/v1/volumes?q=isbn:' + 
                            df_books_scrape["ISBN"][i] + 
                            '&key=' + api_key5)

    soup = BeautifulSoup(currbook.text, "html.parser")
    try: 
        currbookjson=json.loads(str(soup))
        try:
            df_books_scrape["Desc"][i] = currbookjson["items"][0]["volumeInfo"]["description"]
        except:
            pass
        try:
            df_books_scrape["Category"][i] = currbookjson["items"][0]["volumeInfo"]["categories"][0]
        except:
            pass
    except:
        continue

Processing 4000 of 4533, time taken: 69.37743091583252 seconds
Processing 4100 of 4533, time taken: 74.52195882797241 seconds
Processing 4200 of 4533, time taken: 70.39898037910461 seconds
Processing 4300 of 4533, time taken: 67.13504409790039 seconds
Processing 4400 of 4533, time taken: 68.45227718353271 seconds
Processing 4500 of 4533, time taken: 75.71544599533081 seconds


In [76]:
df_books_scrape.to_csv("books_desc.csv")

In [79]:
df_books_scrape.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4533 entries, 0 to 4532
Data columns (total 7 columns):
ISBN                   4533 non-null object
Book-Title             4533 non-null object
Book-Author            4533 non-null object
Year-Of-Publication    4533 non-null int64
Publisher              4533 non-null object
Category               3721 non-null object
Desc                   3672 non-null object
dtypes: int64(1), object(6)
memory usage: 248.0+ KB


In [94]:
df_books_scrape['Desc'] = df_books_scrape['Desc'].fillna(df_books_scrape['Book-Title'])

# TF-IDF Take Two

In [129]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_2 = TfidfVectorizer(sublinear_tf=True, min_df=2, max_df=0.5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
features_2 = tfidf_2.fit_transform(df_books_scrape["Desc"])

In [130]:
features_2.shape

(4533, 17803)

In [157]:
author_dummy = pd.get_dummies(df_books_scrape["Book-Author"]).values
publisher_dummy = pd.get_dummies(df_books_scrape["Publisher"]).values
category_dummy = pd.get_dummies(df_books_scrape["Category"]).values

In [149]:
for i in range(0, features_2.shape[0]):
    try:
        features_2[i] = features_2[i]/np.sum(features_2[i])
    except:
        print("Error on row " + str(i))

In [164]:
mfeatures_2 = scipy.sparse.csr_matrix(features_2).toarray()

In [197]:
author_weight = 0.25
publisher_weight = 0.02
category_weight = 0.1

In [198]:
features_combined = np.concatenate((mfeatures_2, 
                                    author_dummy*author_weight, 
                                    publisher_dummy*publisher_weight, 
                                    category_dummy*category_weight), axis=1)

In [199]:
sum(features_combined[0])

1.3699999999999999

In [200]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarities_2 = pd.DataFrame(cosine_similarity(features_combined,features_combined), index=df_books["ISBN"], columns=df_books["ISBN"])

In [204]:
ts = time.time()
cosine_similarities_2.to_csv("desc_coscorr_2.csv")
print ("Complete, time taken: " + str(time.time() - ts) + " seconds")

Complete, time taken: 20.512391567230225 seconds


In [201]:
df_books.loc[df_books["Book-Title"].str.contains("Orient Express")].head(30)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
702,425173755,Murder on the Orient Express (Hercule Poirot M...,Agatha Christie,2000,Berkley Publishing Group
3788,61002747,Murder on the Orient Express,Agatha Christie,1991,Harper Mass Market Paperbacks (Mm)


In [202]:
cosine_similarities_2['0425173755'].sort_values(ascending=False).head(10)

ISBN
0425173755    1.000000
042513024X    0.670572
0312979479    0.660938
0425129586    0.624099
0061002747    0.496142
0312962452    0.144525
0449219569    0.139688
0399149244    0.132350
0671695126    0.118184
0380778556    0.116233
Name: 0425173755, dtype: float64

In [203]:
books_coscorr_2 = pd.DataFrame(cosine_similarities_2['0425173755'].sort_values(ascending=False).head(40).index, 
                                  index=np.arange(40), columns=['ISBN'])
coscorr_books_2 = pd.merge(books_coscorr_2, df_books, on='ISBN')
coscorr_books_2

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher
0,0425173755,Murder on the Orient Express (Hercule Poirot M...,Agatha Christie,2000,Berkley Publishing Group
1,042513024X,The A.B.C. Murders,Agatha Christie,1993,Berkley Publishing Group
2,0312979479,And Then There Were None : A Novel,Agatha Christie,2001,St. Martin's Paperbacks
3,0425129586,And Then There Were None,Agatha Christie,1995,Berkley Publishing Group
4,0061002747,Murder on the Orient Express,Agatha Christie,1991,Harper Mass Market Paperbacks (Mm)
5,0312962452,Medusa's Child,John J. Nance,1997,St. Martin's Press
6,0449219569,Comeback,Dick Francis,1994,Fawcett Books
7,0399149244,Tricky Business,Dave Barry,2002,Putnam Publishing Group
8,0671695126,Secrets of the Morning (Cutler),V.C. Andrews,1991,Pocket
9,0380778556,Rebecca,Daphne Du Maurier,1994,Avon


# Matrix Factorization

### Mostly copied from https://github.com/albertauyeung/matrix-factorization-in-python

In [234]:
class MF():
    
    def __init__(self, R, K, alpha, beta, stop_threshold):
        """
        Perform matrix factorization to predict empty
        entries in a matrix.
        
        Arguments
        - R (ndarray)   : user-item rating matrix
        - K (int)       : number of latent dimensions
        - alpha (float) : learning rate
        - beta (float)  : regularization parameter
        """
        
        self.R = R
        self.num_users, self.num_items = R.shape
        self.K = K
        self.alpha = alpha
        self.beta = beta
        self.stop_threshold = stop_threshold

    def train(self):
        # Initialize user and item latent feature matrice
        self.P = np.random.normal(scale=1./self.K, size=(self.num_users, self.K))
        self.Q = np.random.normal(scale=1./self.K, size=(self.num_items, self.K))
        
        # Initialize the biases
        self.b_u = np.zeros(self.num_users)
        self.b_i = np.zeros(self.num_items)
        self.b = np.mean(self.R[np.where(self.R != 0)])
        
        # Create a list of training samples
        self.samples = [
            (i, j, self.R[i, j])
            for i in range(self.num_users)
            for j in range(self.num_items)
            if self.R[i, j] > 0
        ]
        
        # Perform stochastic gradient descent for number of iterations
        training_process = []
        ts = time.time()
        iteration = 1
        mse = 999
        threshold_count = 0
        while True:
            np.random.shuffle(self.samples)
            self.sgd()
            prev_mse = mse
            mse = self.mse()
            training_process.append((i, mse))
            
            if iteration%10==0:
                print("Iteration: %d ; error = %.4f ; time taken = %.4f seconds" % (iteration, mse, time.time() - ts))
                ts = time.time()
            iteration += 1
            
            if (mse/prev_mse) > 0.999:
                threshold_count += 1
                if threshold_count == self.stop_threshold:
                    print("Stopping training")
                    break
            else:
                threshold_count = 0
                continue
        return training_process

    def mse(self):
        """
        A function to compute the total mean square error
        """
        xs, ys = self.R.nonzero()
        predicted = self.full_matrix()
        error = 0
        for x, y in zip(xs, ys):
            error += pow(self.R[x, y] - predicted[x, y], 2)
        return np.sqrt(error)

    def sgd(self):
        """
        Perform stochastic gradient descent
        """
        for i, j, r in self.samples:
            # Computer prediction and error
            prediction = self.get_rating(i, j)
            e = (r - prediction)
            
            # Update biases
            self.b_u[i] += self.alpha * (e - self.beta * self.b_u[i])
            self.b_i[j] += self.alpha * (e - self.beta * self.b_i[j])
            
            # Update user and item latent feature matrices
            self.P[i, :] += self.alpha * (e * self.Q[j, :] - self.beta * self.P[i,:])
            self.Q[j, :] += self.alpha * (e * self.P[i, :] - self.beta * self.Q[j,:])

    def get_rating(self, i, j):
        """
        Get the predicted rating of user i and item j
        """
        prediction = self.b + self.b_u[i] + self.b_i[j] + self.P[i, :].dot(self.Q[j, :].T)
        return prediction
    
    def full_matrix(self):
        """
        Computer the full matrix using the resultant biases, P and Q
        """
        return mf.b + mf.b_u[:,np.newaxis] + mf.b_i[np.newaxis:,] + mf.P.dot(mf.Q.T)

In [211]:
erate_pivot = df_ratings.pivot(index='User-ID', columns='ISBN')["Book-Rating"]
erate_pivot = erate_pivot.fillna(0)
erate_pivot = erate_pivot.astype('int32')
erate_pivot = erate_pivot.loc[(erate_pivot.astype(bool).sum(axis=1) >= 10)]

In [232]:
erate_pivot.shape

(2412, 4533)

In [233]:
R = erate_pivot.values

In [235]:
mf = MF(R, K=32, alpha=0.01, beta=0.001, stop_threshold=10)
training_process = mf.train()
print()
print("P x Q:")
print(mf.full_matrix())
print()
print("Global bias:")
print(mf.b)
print()
print("User bias:")
print(mf.b_u)
print()
print("Item bias:")
print(mf.b_i)

Iteration: 10 ; error = 312.9239 ; time taken = 12.9444 seconds
Iteration: 20 ; error = 255.2766 ; time taken = 13.1478 seconds
Iteration: 30 ; error = 170.4308 ; time taken = 14.5537 seconds
Iteration: 40 ; error = 108.7113 ; time taken = 13.3722 seconds
Iteration: 50 ; error = 71.1797 ; time taken = 13.0970 seconds
Iteration: 60 ; error = 48.6587 ; time taken = 13.0929 seconds
Iteration: 70 ; error = 34.6517 ; time taken = 13.1259 seconds
Iteration: 80 ; error = 25.5373 ; time taken = 13.1419 seconds
Iteration: 90 ; error = 19.3536 ; time taken = 13.4869 seconds
Iteration: 100 ; error = 15.0130 ; time taken = 13.1040 seconds
Iteration: 110 ; error = 11.8787 ; time taken = 13.1279 seconds
Iteration: 120 ; error = 9.5639 ; time taken = 13.1139 seconds
Iteration: 130 ; error = 7.8208 ; time taken = 13.1398 seconds
Iteration: 140 ; error = 6.4878 ; time taken = 13.4351 seconds
Iteration: 150 ; error = 5.4520 ; time taken = 13.1378 seconds
Iteration: 160 ; error = 4.6382 ; time taken = 13

In [236]:
mf_weights = mf.Q

In [237]:
mf_weights.shape

(4533, 32)

In [238]:
np.save("mf_weights.npy", mf_weights)