## Surprise Recommender

In this notebook we use the Surprise library to build a book recommendation system. An SVD model is first fit and evaluated on a hold-out split; the final recommendations are then produced with the Baseline Only algorithm. This model is fast in both training and testing, which is a desirable property for a recommendation system.

In [1]:
from collections import defaultdict

import numpy as np
import pandas as pd

from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import LeaveOneOut
from surprise.model_selection import train_test_split

In [2]:
# Single place to point the notebook at the cleaned data; edit DATA_DIR to run
# the notebook against a different checkout or machine.
DATA_DIR = '/Users/gregoryolson/Documents/Data Science CT/Capstone/Capstone_Books/Data'
ratings_file = f'{DATA_DIR}/ratings_cleaned.csv'
books_file = f'{DATA_DIR}/books_cleaned.csv'

# read in ratings and book details
ratings = pd.read_csv(ratings_file)
books = pd.read_csv(books_file)

In [3]:
# preview the first rows of the ratings table
ratings.head()

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4


In [4]:
# preview the first rows of the book details table
books.head()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,goodreads_book_id,genre1,genre2,genre3
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,2767052,young-adult,fiction,fantasy
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...,3,fantasy,young-adult,fiction
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...,41865,young-adult,fantasy,fiction
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...,2657,classics,historical-fiction,young-adult
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...,4671,classics,fiction,historical-fiction


In [5]:
# work on an independent copy that holds only the essential columns
essential_columns = [
    'id',
    'title',
    'authors',
    'original_publication_year',
    'genre1',
    'genre2',
    'genre3',
]
books_cf = books.loc[:, essential_columns].copy()
books_cf.head(10)

Unnamed: 0,id,title,authors,original_publication_year,genre1,genre2,genre3
0,1,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,2008.0,young-adult,fiction,fantasy
1,2,Harry Potter and the Sorcerer's Stone (Harry P...,"J.K. Rowling, Mary GrandPré",1997.0,fantasy,young-adult,fiction
2,3,"Twilight (Twilight, #1)",Stephenie Meyer,2005.0,young-adult,fantasy,fiction
3,4,To Kill a Mockingbird,Harper Lee,1960.0,classics,historical-fiction,young-adult
4,5,The Great Gatsby,F. Scott Fitzgerald,1925.0,classics,fiction,historical-fiction
5,6,The Fault in Our Stars,John Green,2012.0,young-adult,fiction,romance
6,7,The Hobbit,J.R.R. Tolkien,1937.0,fantasy,classics,fiction
7,8,The Catcher in the Rye,J.D. Salinger,1951.0,classics,fiction,young-adult
8,9,"Angels & Demons (Robert Langdon, #1)",Dan Brown,2000.0,fiction,mystery,thriller
9,10,Pride and Prejudice,Jane Austen,1813.0,classics,fiction,romance


In [6]:
# Rename original_publication_year to the shorter 'year' and store it as a
# nullable integer (Int64 keeps missing years as <NA>).
# rename() ignores labels that are absent, so re-running this cell after the
# column has already been renamed is a no-op instead of a KeyError.
books_cf = (
    books_cf
    .rename(columns={'original_publication_year': 'year'})
    .astype({'year': 'Int64'})
)

In [7]:
# give the ratings table the same book key name ('id') that books_cf uses
ratings = ratings.rename({'book_id': 'id'}, axis='columns')

In [8]:
# Inner-join ratings to books_cf on 'id', then wrap the (user, item, rating)
# triples in a Surprise dataset on a 1-5 rating scale.
df = books_cf.merge(ratings, on='id', how='inner')
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['user_id', 'id', 'rating']], reader)

# Hold out 25% of the ratings, fit SVD on the rest and predict the hold-out set.
trainSet, testSet = train_test_split(data, test_size=0.25, random_state=0)
algo = SVD(random_state=0)
predictions = algo.fit(trainSet).test(testSet)

def MAE(predictions):
    """Return ``accuracy.mae`` for ``predictions`` with verbose output disabled."""
    mae_value = accuracy.mae(predictions, verbose=False)
    return mae_value
def RMSE(predictions):
    """Return ``accuracy.rmse`` for ``predictions`` with verbose output disabled."""
    rmse_value = accuracy.rmse(predictions, verbose=False)
    return rmse_value
    
# print both error metrics computed on `predictions`
print("RMSE: ", RMSE(predictions))
print("MAE: ", MAE(predictions))

RMSE:  0.843738878541585
MAE:  0.6605207139927541


In [9]:
# tabulate the predictions and add the absolute error of every estimate
prediction_columns = ['user_id', 'book_id', 'rating', 'estimate', 'details']
df_pred = pd.DataFrame(predictions, columns=prediction_columns)
df_pred['error'] = (df_pred['estimate'] - df_pred['rating']).abs()
df_pred.head()

Unnamed: 0,user_id,book_id,rating,estimate,details,error
0,44766,6645,3.0,4.410316,{'was_impossible': False},1.410316
1,5303,1330,3.0,3.444029,{'was_impossible': False},0.444029
2,12089,8428,4.0,4.352092,{'was_impossible': False},0.352092
3,14062,3463,5.0,3.753643,{'was_impossible': False},1.246357
4,32484,6812,5.0,4.313155,{'was_impossible': False},0.686845


In [10]:
def GetTopN(predictions, n=10, minimumRating=3):
    """Collect the highest-estimated items for each user.

    Args:
        predictions: iterable of 5-tuples laid out as
            ``(user_id, book_id, rating, estimate, details)``.
        n: maximum number of items kept per user.
        minimumRating: estimates below this value are discarded.

    Returns:
        A ``defaultdict(list)`` mapping each user_id to a list of
        ``(book_id, estimate)`` tuples sorted by estimate, highest first.
        Users without any estimate >= ``minimumRating`` do not appear.
    """
    topN = defaultdict(list)
    # The true rating and the details entry are not needed for ranking.
    for user_id, book_id, _, estimate, _ in predictions:
        if estimate >= minimumRating:
            topN[user_id].append((book_id, estimate))

    # Only values of existing keys are rebound here, so iterating over
    # items() while assigning is safe (no keys are added or removed).
    for user_id, candidates in topN.items():
        candidates.sort(key=lambda pair: pair[1], reverse=True)
        topN[user_id] = candidates[:n]

    return topN
    
# Leave-one-out cross-validation iterator: a single split with a fixed seed.
LOOCV = LeaveOneOut(n_splits=1, random_state=1)

# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, i.e. the
# run was stopped before it finished. The anti-testset prediction step below is
# presumably the bottleneck (it covers the ratings absent from the training
# set) — confirm, and consider restricting it to a sample of users.
# NOTE(review): this loop rebinds the notebook-level `algo`, `trainSet` and
# `testSet`, and refits `algo` on the leave-one-out training set.
for trainSet, testSet in LOOCV.split(data):
    # Train model without left-out ratings
    algo.fit(trainSet)
    # Predicts ratings for left-out ratings only
    leftOutPredictions = algo.test(testSet)
    # Build predictions for all ratings not in the training set
    bigTestSet = trainSet.build_anti_testset()
    allPredictions = algo.test(bigTestSet)
    # Compute top 10 recs for each user
    topNPredicted = GetTopN(allPredictions, n=10)

KeyboardInterrupt: 

In [10]:
# (rows, columns) of the prediction table
df_pred.shape

(244038, 6)

In [11]:
# Aggregate only the 'error' column per book. Selecting the column before
# aggregating avoids summing every other column (including the dict-valued
# 'details'), which is wasted work and can raise a TypeError on pandas
# versions where groupby reductions no longer drop non-numeric columns.
error_by_book = df_pred.groupby('book_id')['error']

temp1 = error_by_book.sum().to_frame('sum_error')      # total absolute error per book
temp2 = error_by_book.count().to_frame('count_error')  # number of predictions per book

temp2.head()

Unnamed: 0_level_0,count_error
book_id,Unnamed: 1_level_1
1,20
2,25
3,29
4,23
5,26


In [12]:
# average absolute error per book = summed error / number of predictions
average_error = temp1['sum_error'].div(temp2['count_error'])
rec = average_error.to_frame(name='average_error')
# keep the book id as a regular column as well as the index
rec['id'] = temp1.index
rec.head(10)

Unnamed: 0_level_0,average_error,id
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.493627,1
2,0.724424,2
3,1.237949,3
4,0.511528,4
5,0.954722,5
6,0.567483,6
7,0.821942,7
8,0.737615,8
9,0.506936,9
10,0.721122,10


In [13]:
# order books from the smallest to the largest average error
rec = rec.sort_values('average_error', ascending=True)
rec.head(10)

Unnamed: 0_level_0,average_error,id
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1
7213,0.229049,7213
7976,0.230028,7976
7512,0.232739,7512
7150,0.253014,7150
6153,0.263174,6153
5176,0.263896,5176
8256,0.278344,8256
8694,0.279322,8694
6920,0.282959,6920
3275,0.28874,3275


In [14]:
# look up the details of the ten books at the top of `rec`
lowest_error_ids = rec['id'].head(10).tolist()
recommendation = books_cf[books_cf['id'].isin(lowest_error_ids)]
recommendation

Unnamed: 0,id,title,authors,year,genre1,genre2,genre3
3269,3275,"Harry Potter Boxed Set, Books 1-5 (Harry Potte...","J.K. Rowling, Mary GrandPré",2003,fantasy,young-adult,fiction
5156,5176,"D is for Deadbeat (Kinsey Millhone, #4)",Sue Grafton,1987,mystery,crime,fiction
6126,6153,"I is for Innocent (Kinsey Millhone, #9)",Sue Grafton,1992,mystery,fiction,crime
6889,6920,The Indispensable Calvin and Hobbes,Bill Watterson,1992,comedy,fiction,children
7116,7150,"Betrayal in Death (In Death, #12)","J.D. Robb, Nora Roberts",2001,mystery,romance,crime
7178,7213,"Shadow Prey (Lucas Davenport, #2)",John Sandford,1990,mystery,fiction,thriller
7473,7512,"Purity in Death (In Death, #15)",J.D. Robb,2002,mystery,romance,crime
7933,7976,"Reunion in Death (In Death, #14)",J.D. Robb,2002,mystery,romance,crime
8210,8256,"Creation in Death (In Death, #25)",J.D. Robb,2007,mystery,romance,crime
8647,8694,"Strangers in Death (In Death, #26)",J.D. Robb,2008,mystery,crime,fiction


In [13]:
# look at user_id = 4828
num = 4828

# Obtain this user's ratings. The merged frame `df` stores the book key as
# 'id' (there is no 'book_id' column in it), so select 'id' and expose it as
# 'book_id' for display. The result gets its own name so the notebook-level
# `ratings` table is not overwritten by a per-user subset.
user_ratings = (
    df.loc[df['user_id'] == num, ['user_id', 'id', 'rating']]
    .rename(columns={'id': 'book_id'})
)
user_ratings

Unnamed: 0,user_id,book_id,rating
51567,4828,9137,2.0
64833,4828,3260,3.0
115676,4828,5598,2.0


In [14]:
# get list of book ids (the merged frame `df` keys books by 'id', not 'book_id')
unique_ids = df['id'].unique()

# get list of ids that the user_id has rated
sample_ids = df.loc[df['user_id'] == num, 'id']

# remove the rated books for the recommendations
books_to_predict = np.setdiff1d(unique_ids, sample_ids)

In [15]:
# Baseline settings must be defined before use; these values are the ones kept
# in the commented-out configuration further down in this notebook.
bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}

# fit the baseline model on every available rating
algo = BaselineOnly(bsl_options=bsl_options)
algo.fit(data.build_full_trainset())

# estimate a rating for every book the user has not rated yet
my_recs = [(iid, algo.predict(uid=num, iid=iid).est) for iid in books_to_predict]

# highest predicted ratings first
recommendation_df = (
    pd.DataFrame(my_recs, columns=['book_id', 'predictions'])
    .sort_values('predictions', ascending=False)
)
recommendation_df.head(10)

Estimating biases using als...


Unnamed: 0,book_id,predictions
5187,5207,4.545732
6888,6920,4.516462
860,862,4.507351
6560,6590,4.494711
9508,9566,4.488837
3621,3628,4.481991
7903,7947,4.46165
1784,1788,4.451807
8896,8946,4.451507
4468,4483,4.447617


In [16]:
# fetch the books_cf rows for the ten highest-predicted book ids to make the
# final recommendation
top_book_ids = recommendation_df['book_id'].head(10).tolist()
recommendation = books_cf[books_cf['id'].isin(top_book_ids)]
recommendation

Unnamed: 0,id,title,authors,year
859,862,"Words of Radiance (The Stormlight Archive, #2)",Brandon Sanderson,2014
1783,1788,The Calvin and Hobbes Tenth Anniversary Book,Bill Watterson,1995
3621,3628,The Complete Calvin and Hobbes,Bill Watterson,2005
4468,4483,It's a Magical World: A Calvin and Hobbes Coll...,Bill Watterson,1996
5187,5207,The Days Are Just Packed: A Calvin and Hobbes ...,Bill Watterson,1993
6561,6590,The Authoritative Calvin and Hobbes: A Calvin ...,Bill Watterson,1990
6889,6920,The Indispensable Calvin and Hobbes,Bill Watterson,1992
7904,7947,ESV Study Bible,"Anonymous, Lane T. Dennis, Wayne A. Grudem",2002
8897,8946,The Divan,Hafez,1380
9510,9566,Attack of the Deranged Mutant Killer Monster S...,Bill Watterson,1992


In [None]:
# import Surprise libraries
#from surprise import BaselineOnly
#from surprise import KNNBasic
#from surprise.accuracy import rmse
#from surprise.model_selection import cross_validate

#set bsl_options parameters
#bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}

# split into train and test sets, instantiate model, fit and predict
#trainset, testset = train_test_split(data, test_size=0.25)
#algo = BaselineOnly(bsl_options=bsl_options)
#predictions = algo.fit(trainset).test(testset)
#accuracy.rmse(predictions)

#trainset = algo.trainset
#print(algo.__class__.__name__)

#def get_Iu(user_id):
    #""" return the number of items rated by given user
    #args: 
      #user_id: the id of the user
    #returns: 
      #the number of items rated by the user
    #"""
    #try:
        #return len(trainset.ur[trainset.to_inner_uid(user_id)])
    #except ValueError: # user was not part of the trainset
        #return 0
    
#def get_Ui(book_id):
    #""" return number of users that have rated given item
    #args:
      #book_id: the raw id of the item
    #returns:
      #the number of users that have rated the item.
    #"""
    #try: 
        #return len(trainset.ir[trainset.to_inner_iid(book_id)])
    #except ValueError:
        #return 0
    
#df = pd.DataFrame(predictions, columns=['user_id', 'book_id', 'rating', 'estimate', 'details'])
#df['Iu'] = df.user_id.apply(get_Iu)
#df['Ui'] = df.book_id.apply(get_Ui)
#df['error'] = abs(df.estimate - df.rating)
#df.head()

In [17]:
#from surprise import NormalPredictor
#from surprise import KNNBasic
#from surprise import KNNWithMeans
#from surprise import KNNWithZScore
#from surprise import KNNBaseline
#from surprise import SVD
#from surprise import SVDpp
#from surprise import NMF
#from surprise import SlopeOne
#from surprise import CoClustering

# Measures rmse for all algorithms

#benchmark = []
# Iterate over all algorithms
#for algorithm in [SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
    #SVD(), SVDpp(), 
    # Perform cross validation
    #results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    
    # Get results & append algorithm name
    #tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    #tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    #benchmark.append(tmp)
    
# Measures rmse for just 1 algorithm

#benchmark = []
#results = cross_validate(BaselineOnly(), data, measures=['RMSE'], cv=3, verbose=False)
#tmp = pd.DataFrame.from_dict(results).mean(axis=0)
#tmp = tmp.append(pd.Series([str(BaselineOnly()).split(' ')[0].split('.')[-1]], index=['Algorithm']))
#benchmark.append(tmp)

# outputs df of results

#surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
#surprise_results

#print('Using ALS')
#bsl_options = {'method': 'als',
               #'n_epochs': 5,
               #'reg_u': 12,
               #'reg_i': 5
               #}
#algo = BaselineOnly(bsl_options=bsl_options)
#cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)