**Book Recommender System**

I will build a hybrid book recommender system that combines content-based and collaborative filtering. I'm using Ruchi Bhatia's Book-Crossing dataset and the LightFM package.

In [1]:
#importing necessary packages
import numpy as np 
import pandas as pd
import pycountry
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from surprise import accuracy
from lightfm import LightFM, cross_validation
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score
from lightfm.evaluation import recall_at_k
from lightfm.evaluation import reciprocal_rank
pd.options.mode.chained_assignment = None
from scipy.sparse import coo_matrix
from scipy.sparse import csr_matrix


1. Put each table of dataset in dataframe.

In [2]:
# Column names assigned to each raw table. Because the names are supplied
# explicitly, pandas does not consume a header line from the files.
user_c = ['u_id', 'country', 'age']
books_c = ['isbn', 'title', 'author', 'year_of_publication', 'publisher', 'img1', 'img2', 'img3']
ratings_c = ['u_id', 'isbn', 'rating']

# Parser options shared by all three semicolon-separated, latin-1 encoded files.
csv_options = dict(sep=';', encoding='latin-1', low_memory=False)

users = pd.read_csv('../input/bookcrossing-dataset/Book reviews/BX-Users.csv', names=user_c, **csv_options)
books = pd.read_csv('../input/bookcrossing-dataset/Book reviews/BX_Books.csv', names=books_c, **csv_options)
ratings = pd.read_csv('../input/bookcrossing-dataset/Book reviews/BX-Book-Ratings.csv', names=ratings_c, **csv_options)


2. Remove invalid rows from every frame and trim the `country` column so that it contains only the country name. Then delete all rows with user ages above 90 or below 5, and drop the image columns from the books dataset.




In [3]:
# Split each `country` value on commas (at most 3 splits) and keep the third
# field. `n` is passed by keyword: pandas 2.0 made every argument of
# `Series.str.split` other than `pat` keyword-only, so a positional `3`
# raises TypeError there.
temp = users['country'].str.split(',', n=3, expand=True)
users['country'] = temp[2]
# Discard the first row of the frame (presumably the CSV header line, which was
# read as data because explicit column names were supplied - confirm against the file).
users.drop(users.head(1).index, inplace=True)

In [4]:
# Turn the literal strings 'NaN' / 'NaT' into real missing values and then drop
# every row that has any missing field, in place, for each of the three tables.
for frame in (users, books, ratings):
    frame.replace(to_replace=['NaN', 'NaT'], value=np.nan, inplace=True)
    frame.dropna(how='any', inplace=True)

# Ages that cannot be parsed as numbers become <NA> in a nullable integer column.
users.age = pd.to_numeric(users.age, errors='coerce').astype('Int64')

# Keep ages 5 through 90 inclusive: the stated rule is to delete ages >90 and <5,
# so the boundary ages themselves must survive (strict comparisons dropped them).
# Rows whose age is <NA> are excluded explicitly.
valid_age = users.age.between(5, 90).fillna(False)
users = users.loc[valid_age]

In [5]:
# Remove the three image columns from `books` in a single in-place call.
books.drop(columns=['img1', 'img2', 'img3'], inplace=True)

In [6]:
# Drop the first remaining row of `books` and of `ratings`
# (presumably the CSV header line that was read as data - confirm against the files).
books.drop(index=books.index[:1], inplace=True)
ratings.drop(index=ratings.index[:1], inplace=True)


3. Create a single matrix where each row is a user and each column is the rating they gave each book (0 if they haven't rated it). All ratings must be on (-1,1) scale.

In [7]:
# Join every rating with its book record on the shared `isbn` key
# (default inner join: ratings without a matching book are dropped).
ratings2 = ratings.merge(books, on='isbn')

In [8]:
# Remove the book metadata columns that the rating matrix does not need.
ratings2.drop(columns=['author', 'year_of_publication', 'publisher'], inplace=True)

In [9]:
# Work with only the first 50000 merged rows.
ratings2 = ratings2.head(50000)

In [10]:
# Linearly rescale the ratings so the smallest observed value maps to -1
# and the largest to 1; fitting and transforming happen in one call.
scaler = MinMaxScaler(feature_range=(-1, 1))
temp = ratings2[['rating']]
ratings2['rating'] = scaler.fit_transform(temp)

In [11]:
# Display the merged ratings after rescaling.
ratings2

Unnamed: 0,u_id,isbn,rating,title
0,276725,034545104X,-1.0,Flesh Tones: A Novel
1,2313,034545104X,0.0,Flesh Tones: A Novel
2,6543,034545104X,-1.0,Flesh Tones: A Novel
3,8680,034545104X,0.0,Flesh Tones: A Novel
4,10314,034545104X,0.8,Flesh Tones: A Novel
...,...,...,...,...
49995,277427,0345436911,-1.0,The Dress Lodger (Ballantine Reader's Circle)
49996,1167,0345436911,-1.0,The Dress Lodger (Ballantine Reader's Circle)
49997,6543,0345436911,0.6,The Dress Lodger (Ballantine Reader's Circle)
49998,8090,0345436911,-1.0,The Dress Lodger (Ballantine Reader's Circle)


In [12]:
# Wide user-by-book matrix: one row per `u_id`, one column per `isbn`.
# Repeated (user, book) pairs are averaged (pivot_table's default aggregation)
# and pairs with no rating are filled with 0.
b_ratings = ratings2.pivot_table(index='u_id', columns='isbn', values='rating', fill_value=0)
b_ratings

isbn,000225669X,0006379702,0006485294,0006543545,0020259700,0020847459,002542730X,0026217457,003008685X,0030615321,...,8838910987,884590184X,8880891766,950491036X,9508521481,9681500830,9722100718,9722509713,9726101794,9871138016
u_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100004,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0
100009,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0
10001,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0
100010,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0
100053,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99946,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0
99955,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0
99980,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0
99996,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Sparse (CSR) version of the dense rating matrix; cells equal to 0 are not stored.
b_ratings_csr = csr_matrix(b_ratings.to_numpy())

In [14]:
# ISBN -> title lookup built from the merged ratings, ordered by ISBN.
# A single zip over the two columns replaces the per-row `.loc` lookups in a
# Python loop; the resulting mapping is the same (for a repeated ISBN the last
# row in `df` still wins).
df = ratings2[['isbn', 'title']].sort_values('isbn').reset_index()
book_dict = dict(zip(df['isbn'], df['title']))

In [15]:
# Map every user id (row label of `b_ratings`) to its row position in the matrix.
user_ids = list(b_ratings.index)
user_dict = {user_id: position for position, user_id in enumerate(user_ids)}

4. Import LightFM package and initialize model. Split b_ratings matrix into train and validation sets. Before introducing user and item feature matrices, this model should perform as well as any Matrix Factorization Model.

In [16]:
# LightFM model trained with the logistic loss. A fixed seed makes the model's
# random initialisation repeatable, so a Restart & Run All can reproduce results.
model = LightFM(loss='logistic', random_state=42)

In [17]:
# Randomly hold out 25% of the stored interactions for evaluation. A seeded
# RandomState makes the split (and therefore the reported metrics) repeatable.
train, test = cross_validation.random_train_test_split(
    b_ratings_csr,
    test_percentage=0.25,
    random_state=np.random.RandomState(42),
)

In [18]:
# Train the model on the training interactions for 30 epochs.
model.fit(train, epochs = 30) #user and item feature matrices would be included here

<lightfm.lightfm.LightFM at 0x7f4e0bdd7250>

Create a function to get sample predictions (based on the sample recommendation function shown in [this article](https://towardsdatascience.com/recommendation-system-in-python-lightfm-61c85010ce17)).

In [19]:
def test_predictions(model, data, user_ids, rating_threshold, num_pred, book_dict, user_dict):
    num_users, num_items = data.shape
    
    for user_id in user_ids:
        u_id = user_dict[user_id]
        scores = pd.Series(model.predict(u_id, np.arange(num_items),item_features=None, user_features=None, num_threads=1))
        scores.index = data.columns
        scores = list(pd.Series(scores.sort_values(ascending=False).index))
        known_items = list((pd.Series(b_ratings.loc[user_id,:][b_ratings.loc[user_id,:] > (rating_threshold)]).index).sort_values(ascending=False))
        scores = [x for x in scores if x not in known_items]
        return_score_list = scores[0:num_pred]
        known_items = list(pd.Series(known_items).apply(lambda x: book_dict[x]))
        scores = list(pd.Series(return_score_list).apply(lambda x: book_dict[x]))
        known_likes = known_items[0:num_pred]
        print ("User: " + str(user_id))
        print("Known Likes:")
        counter = 1
        for i in known_likes:
            print(str(counter) + '- ' + i)
            counter+=1
        print("\n Recommended Items:")
        counter = 1
        for i in scores:
            print(str(counter) + '- ' + i)
            counter+=1

Test some user IDs.

In [20]:
# Users to spot-check; ids are strings because that is how they appear in `b_ratings.index`.
sample_user_ids = ['100904', '100906', '100925', '100984', '100995', '101027', '101029', '101035']
test_predictions(
    model,
    b_ratings,
    sample_user_ids,
    rating_threshold=0,
    num_pred=5,
    book_dict=book_dict,
    user_dict=user_dict,
)

User: 100904
Known Likes:

 Recommended Items:
1- To Kill a Mockingbird
2- The Lovely Bones: A Novel
3- The Hobbit : The Enchanting Prelude to The Lord of the Rings
4- The Joy Luck Club
5- Marching Through Culpeper : A Novel of Culpeper, Virginia, Crossroads of the Civil War
User: 100906
Known Likes:
1- The Stars My Destination
2- The Alienist
3- A Wizard of Earthsea (Earthsea Trilogy, Book 1)
4- Alice's Adventures in Wonderland and Through the Looking Glass
5- Harry Potter and the Order of the Phoenix (Book 5)

 Recommended Items:
1- A Painted House
2- The Red Tent (Bestselling Backlist)
3- Girl, Interrupted
4- She's Come Undone (Oprah's Book Club)
5- Jack & Jill (Alex Cross Novels)
User: 100925
Known Likes:

 Recommended Items:
1- Marching Through Culpeper : A Novel of Culpeper, Virginia, Crossroads of the Civil War
2- Harry Potter and the Chamber of Secrets (Book 2)
3- Bad Business
4- To Kill a Mockingbird
5- The Red Tent (Bestselling Backlist)
User: 100984
Known Likes:

 Recommende

  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':


5. I will measure the accuracy of the model as-is with LightFM's model evaluation functions.

In [21]:
# Per-user ranking metrics on the held-out interactions, averaged over users.
# The training interactions are passed so the evaluators can account for them.
model_precision_at_k = precision_at_k(model, test, train_interactions=train, k=100).mean()
model_recall_at_k = recall_at_k(model, test, train_interactions=train, k=100).mean()
model_auc_score = auc_score(model, test, train_interactions=train).mean()
model_reciprocal_rank = reciprocal_rank(model, test, train_interactions=train).mean()

In [22]:
# Report each averaged metric; `!s` formats the value exactly as str() would.
print(f'model precision at k = 100 : {model_precision_at_k!s}/1')
print(f'model recall at k = 100 : {model_recall_at_k!s}/1')
print(f'model AUC score : {model_auc_score!s}/1')
print(f'model average reciprocal rank : {model_reciprocal_rank!s}/1')

model precision at k = 100 : 0.0026067875/1
model recall at k = 100 : 0.12826275983228116/1
model AUC score : 0.42703477/1
model average reciprocal rank : 0.03904833/1


6. Create user feature matrix

7. Create item feature matrix

8. Make predictions using these features

9. Measure accuracy of current model