## Surprise Recommender - Baseline Only

In this notebook we will use the Surprise library to build a recommendation system, specifically using the Baseline Only algorithm. This model is fast to both train and test, which makes it well suited to a recommendation system.

In [1]:
import pandas as pd
import numpy as np

from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import BaselineOnly
from surprise import CoClustering
from surprise.accuracy import rmse
from surprise import accuracy
from surprise.model_selection import train_test_split

In [2]:
# Locations of the cleaned CSV exports.
# NOTE(review): these are absolute paths on one machine; the notebook will not
# run elsewhere without editing them.
ratings_file = '/Users/gregoryolson/Documents/Data Science CT/Capstone/Capstone_Books/Data/ratings_cleaned.csv'
books_file = '/Users/gregoryolson/Documents/Data Science CT/Capstone/Capstone_Books/Data/books_cleaned.csv'

# load the ratings first, then the book details
ratings, books = (pd.read_csv(csv_path) for csv_path in (ratings_file, books_file))

In [3]:
# preview the first rows of the ratings table
ratings.head()

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4


In [4]:
# preview the first rows of the book-details table
books.head()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [5]:
# keep only the columns needed to describe a recommended book
essential_columns = ['id', 'title', 'authors', 'original_publication_year']
books_cf = books.loc[:, essential_columns].copy()
books_cf.head(n=10)

Unnamed: 0,id,title,authors,original_publication_year
0,1,"The Hunger Games (The Hunger Games, #1)",Suzanne Collins,2008.0
1,2,Harry Potter and the Sorcerer's Stone (Harry P...,"J.K. Rowling, Mary GrandPré",1997.0
2,3,"Twilight (Twilight, #1)",Stephenie Meyer,2005.0
3,4,To Kill a Mockingbird,Harper Lee,1960.0
4,5,The Great Gatsby,F. Scott Fitzgerald,1925.0
5,6,The Fault in Our Stars,John Green,2012.0
6,7,The Hobbit,J.R.R. Tolkien,1937.0
7,8,The Catcher in the Rye,J.D. Salinger,1951.0
8,9,"Angels & Demons (Robert Langdon, #1)",Dan Brown,2000.0
9,10,Pride and Prejudice,Jane Austen,1813.0


In [6]:
# Shorten `original_publication_year` to `year` and store it as a nullable
# integer. Renaming first and then casting `year` keeps the cell idempotent:
# on a re-run the rename is a no-op and the cast is Int64 -> Int64, whereas
# casting the old column name would raise KeyError once it has been renamed.
books_cf = (
    books_cf
    .rename(columns={'original_publication_year': 'year'})
    .astype({'year': 'Int64'})
)

We now have a ratings dataframe and a simplified books dataframe containing only the essential columns, so we are ready to move on to modeling with Surprise.

## Surprise

In [7]:
# Ratings are on a 1-5 scale; the frame handed to Surprise carries the
# user, item and rating columns, in that order.
rating_columns = ['user_id', 'book_id', 'rating']
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings[rating_columns], reader)

In [8]:
# Baseline-estimate settings (ALS method, 5 epochs, reg_u=12, reg_i=5).
# NOTE(review): no visible cell passes this dict to a model -- confirm whether
# it is still needed.
bsl_options = dict(method='als', n_epochs=5, reg_u=12, reg_i=5)

In [9]:
# Fixed seed so the split, the fitted model and the reported RMSE are
# reproducible across kernel restarts (both steps are otherwise random).
SEED = 42

# hold out 25% of the ratings for evaluation
trainset, testset = train_test_split(data, test_size=0.25, random_state=SEED)

# NOTE(review): the notebook intro describes BaselineOnly and `bsl_options` is
# defined above, yet CoClustering is what gets fitted here -- confirm intent.
algo = CoClustering(random_state=SEED)

# fit on the training split, predict the held-out ratings, report RMSE
predictions = algo.fit(trainset).test(testset)
accuracy.rmse(predictions)

RMSE: 0.8786


0.8785978718013283

In [10]:
# keep a handle on the trainset the fitted model holds; the helper functions
# defined below read this global
trainset = algo.trainset

# show which algorithm class is currently in use
print(type(algo).__name__)

CoClustering


In [11]:
def get_Iu(user_id):
    """Count the items a user rated in the global ``trainset``.

    Args:
        user_id: the raw id of the user.

    Returns:
        The number of ratings stored for that user, or 0 when the user is
        not part of the trainset.
    """
    try:
        inner_uid = trainset.to_inner_uid(user_id)
    except ValueError:  # user was not part of the trainset
        return 0
    return len(trainset.ur[inner_uid])
    
def get_Ui(book_id):
    """Count the users who rated an item in the global ``trainset``.

    Args:
        book_id: the raw id of the item.

    Returns:
        The number of ratings stored for that item, or 0 when the item is
        not part of the trainset.
    """
    try:
        inner_iid = trainset.to_inner_iid(book_id)
    except ValueError:  # item was not part of the trainset
        return 0
    return len(trainset.ir[inner_iid])
    
# Tabulate the test-set predictions and attach, per row, how many ratings the
# user (Iu) and the book (Ui) have in the trainset, plus the absolute error.
df = pd.DataFrame(
    predictions,
    columns=['user_id', 'book_id', 'rating', 'estimate', 'details'],
).assign(
    Iu=lambda frame: frame['user_id'].apply(get_Iu),
    Ui=lambda frame: frame['book_id'].apply(get_Ui),
    error=lambda frame: (frame['estimate'] - frame['rating']).abs(),
)
df.head()

Unnamed: 0,user_id,book_id,rating,estimate,details,Iu,Ui,error
0,5115,1132,3.0,3.295145,{'was_impossible': False},132,82,0.295145
1,41318,4965,5.0,3.074677,{'was_impossible': False},137,73,1.925323
2,12456,8111,4.0,3.389097,{'was_impossible': False},25,76,0.610903
3,52473,6552,4.0,3.187267,{'was_impossible': False},8,69,0.812733
4,40819,598,5.0,4.12569,{'was_impossible': False},66,71,0.87431


In [12]:
# (rows, columns) of the prediction table
df.shape

(244063, 8)

In [18]:
# raw user_id of the user we generate recommendations for
num = 4828

# `df` holds only the test-set predictions, so this shows the user's held-out
# ratings. The slice gets its own name so that the full `ratings` table loaded
# at the top of the notebook is not overwritten (re-running the Dataset cell
# would otherwise load only this user's rows).
user_ratings = df.loc[df['user_id'] == num, ['user_id', 'book_id', 'rating']]
user_ratings

Unnamed: 0,user_id,book_id,rating
85664,4828,3746,5.0
170630,4828,4596,4.0
200724,4828,9137,2.0


In [19]:
# get list of book_ids
unique_ids = df['book_id'].unique()

# get list of ids that the user_id has read
sample_ids = df.loc[df['user_id'] == num, 'book_id']

# remove the rated books for the recommendations
books_to_predict = np.setdiff1d(unique_ids, sample_ids)

In [20]:
# Refit on every available rating before scoring candidates. The model is
# seeded so that repeated runs produce the same recommendations.
algo = CoClustering(random_state=42)
algo.fit(data.build_full_trainset())

# predicted rating for each candidate book for user `num`
my_recs = [(iid, algo.predict(uid=num, iid=iid).est) for iid in books_to_predict]

# rank candidates from highest to lowest predicted rating
recommendation_df = (
    pd.DataFrame(my_recs, columns=['book_id', 'predictions'])
    .sort_values('predictions', ascending=False)
)
recommendation_df.head(10)

Unnamed: 0,book_id,predictions
7902,7947,4.380294
1304,1308,4.30007
3746,3753,4.28007
8064,8109,4.27007
5186,5207,4.238006
6888,6920,4.238006
9507,9566,4.235784
8895,8946,4.232199
6333,6361,4.228006
8520,8569,4.21632


In [21]:
# return rows from books_cf with above book_id's to make final recommendation
recommendation = books_cf.loc[books_cf['id'].isin(recommendation_df.head(10)['book_id'].tolist())]
#recommendation = recommendation.drop('id', axis=1)
#print(recommendation.to_string(index=False))
recommendation

Unnamed: 0,id,title,authors,year
1304,1308,A Court of Mist and Fury (A Court of Thorns an...,Sarah J. Maas,2016
3747,3753,"Harry Potter Collection (Harry Potter, #1-6)",J.K. Rowling,2005
5188,5207,The Days Are Just Packed: A Calvin and Hobbes ...,Bill Watterson,1993
6335,6361,There's Treasure Everywhere: A Calvin and Hobb...,Bill Watterson,1996
6890,6920,The Indispensable Calvin and Hobbes,Bill Watterson,1992
7905,7947,ESV Study Bible,"Anonymous, Lane T. Dennis, Wayne A. Grudem",2002
8067,8109,"The Absolute Sandman, Volume One","Neil Gaiman, Mike Dringenberg, Chris Bachalo, ...",2006
8523,8569,"Styxx (Dark-Hunter, #22)",Sherrilyn Kenyon,2013
8898,8946,The Divan,Hafez,1380
9511,9566,Attack of the Deranged Mutant Killer Monster S...,Bill Watterson,1992


In [17]:
#from surprise import NormalPredictor
#from surprise import KNNBasic
#from surprise import KNNWithMeans
#from surprise import KNNWithZScore
#from surprise import KNNBaseline
#from surprise import SVD
#from surprise import SVDpp
#from surprise import NMF
#from surprise import SlopeOne
#from surprise import CoClustering

# Measures rmse for all algorithms

#benchmark = []
# Iterate over all algorithms
#for algorithm in [SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
    #SVD(), SVDpp(), 
    # Perform cross validation
    #results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    
    # Get results & append algorithm name
    #tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    #tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    #benchmark.append(tmp)
    
# Measures rmse for just 1 algorithm

#benchmark = []
#results = cross_validate(BaselineOnly(), data, measures=['RMSE'], cv=3, verbose=False)
#tmp = pd.DataFrame.from_dict(results).mean(axis=0)
#tmp = tmp.append(pd.Series([str(BaselineOnly()).split(' ')[0].split('.')[-1]], index=['Algorithm']))
#benchmark.append(tmp)

# outputs df of results

#surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
#surprise_results

#print('Using ALS')
#bsl_options = {'method': 'als',
               #'n_epochs': 5,
               #'reg_u': 12,
               #'reg_i': 5
               #}
#algo = BaselineOnly(bsl_options=bsl_options)
#cross_validate(algo, data, measures=['RMSE'], cv=3, verbose=False)