In [1]:
from surprise import SVD, SVDpp
from surprise import Dataset
from surprise import accuracy, Reader
from surprise.model_selection import train_test_split, KFold
from time import perf_counter
import optuna

import pandas as pd
import numpy as np

# Shared seed: passed to train_test_split, to the SVD/SVDpp constructors and
# to KFold further down so that splits and model initialisation are repeatable.
random_state = 42

In [2]:
# Load the interim review table.
# The identifier columns are pinned to `str` instead of being left to dtype
# inference: ASINs such as '0001713353' carry leading zeros, and pandas'
# default chunked inference can otherwise yield a column that mixes integers
# and strings (reported as a DtypeWarning), which would corrupt the item ids.
books = pd.read_csv(
    '../data/interim/books_with_metadata.csv.zip',
    dtype={'reviewerID': str, 'asin': str},
)
books

  books = pd.read_csv('../data/interim/books_with_metadata.csv.zip')


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,...,Business_and_Money,Health__Fitness_and_Dieting,Religion_and_Spirituality,New__Used_and_Rental_Textbooks,Arts_and_Photography,Politics_and_Government,Humor_and_Entertainment,Arts_and_Literature,Science_and_Math,Regional_and_International
0,5.0,False,"03 30, 2005",A1REUF3A1YCPHM,0001713353,{'Format:': ' Hardcover'},TW Ervin II,"The King, the Mice and the Cheese by Nancy Gur...",A story children will love and learn from,1112140800,...,0,0,0,0,0,0,0,0,0,0
1,5.0,True,"06 20, 2016",AVP0HXC9FG790,0001713353,,Amazon Customer,The kids loved it!,Five Stars,1466380800,...,0,0,0,0,0,0,0,0,0,0
2,5.0,True,"01 24, 2016",A324TTUBKTN73A,0001713353,{'Format:': ' Paperback'},Tekla Borner,My students (3 & 4 year olds) loved this book!...,Five Stars,1453593600,...,0,0,0,0,0,0,0,0,0,0
3,5.0,False,"07 9, 2015",A2RE7WG349NV5D,0001713353,{'Format:': ' Paperback'},Deborah K Woroniecki,LOVE IT,Five Stars,1436400000,...,0,0,0,0,0,0,0,0,0,0
4,5.0,True,"01 18, 2015",A32B7QIUDQCD0E,0001713353,,E,Great!,Five Stars,1421539200,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,5.0,False,"08 30, 2009",A39GCZADC3WY9D,0061709565,{'Format:': ' Paperback'},A. M. McElfresh,This series is one of the best romantic young ...,Awesome Love Story!!!!!!!!!!!!!,1251590400,...,0,0,0,0,0,0,0,0,0,0
999996,3.0,False,"08 27, 2009",A3FFW203EECDXH,0061709565,{'Format:': ' Paperback'},rebelrebs,"Things I liked:\n\n-Nice, easy, short read. Th...",Good If You're Looking For A Shorter Book,1251331200,...,0,0,0,0,0,0,0,0,0,0
999997,4.0,False,"07 1, 2009",A1TN8WJNTMUIVB,0061709565,{'Format:': ' Paperback'},Nikky S. L.,Full Moon is the second book in the Dark Guar...,Worth ten bucks.,1246406400,...,0,0,0,0,0,0,0,0,0,0
999998,4.0,False,"01 20, 2009",A1AKQ1YUS4BT82,006170007X,{'Format:': ' Paperback'},Katie Babs,Ransom my Heart is the first Meg Cabot book I ...,Sexy Medieval Romance (B+ Grade),1232409600,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Build the Surprise dataset from the (user, item, rating) columns of `books`.
rating_columns = ['reviewerID', 'asin', 'overall']
reader = Reader(line_format='user item rating', rating_scale=(1, 5))
data = Dataset.load_from_df(books[rating_columns], reader)

In [4]:
# Peek at five raw ratings; each row is (user, item, rating, timestamp).
# A generator seeded with `random_state` keeps the sampled rows identical
# across kernel restarts (the global NumPy RNG is never seeded in this notebook).
rng = np.random.default_rng(random_state)
indices = rng.choice(len(data.raw_ratings), size=5)
# Index the ratings list directly instead of converting every rating to an
# object array just to keep five rows.
np.array([data.raw_ratings[i] for i in indices], dtype=object)

array([['AFKVVNIEZ8ZO1', '000711835X', 5.0, None],
       ['A2JBXUUO54M7X6', '0061067997', 2.0, None],
       ['A2X8Y10GTS5QGW', '0008134952', 5.0, None],
       ['ACLQQEP9ZK216', '0007327064', 4.0, None],
       ['A2W2HW7T8WBER1', '0030624266', 5.0, None]], dtype=object)

# Hyperparameter search (Optuna)

In [56]:
# Hold out 30% of the ratings; the hyperparameter-search trials are scored on it.
trainset, testset = train_test_split(
    data,
    test_size=0.3,
    random_state=random_state,
)

def objective(trial: optuna.Trial, trainset=trainset, testset=testset) -> float:
    """Optuna objective: fit one SVD / SVD++ configuration and return its RMSE.

    Args:
        trial: Optuna trial used to sample ``algo_name``, ``n_epochs``,
            ``lr_all`` and ``reg_all``.
        trainset: Split the model is fitted on. It defaults to the global
            ``trainset`` *as it is when this function is defined*, because the
            cross-validation loop later in the notebook rebinds the global
            names ``trainset``/``testset``; without the binding, re-running
            the study afterwards would silently tune on the last CV fold.
        testset: Split the predictions are scored on; bound at definition
            time for the same reason.

    Returns:
        RMSE of the fitted model's predictions on ``testset``.
    """
    # Dispatch table keeps the candidate names and their classes in one place.
    algorithms = {'SVDpp': SVDpp, 'SVD': SVD}

    algo_name = trial.suggest_categorical('algo_name', list(algorithms))
    n_epochs = trial.suggest_int('n_epochs', 5, 15)
    lr_all = trial.suggest_float('lr_all', 0.001, 0.005, log=True)
    reg_all = trial.suggest_float('reg_all', 0.4, 0.6, log=True)

    params = {
        'random_state': random_state,
        'n_epochs': n_epochs,
        'lr_all': lr_all,
        'reg_all': reg_all
    }
    algo = algorithms[algo_name](**params)

    algo.fit(trainset)
    predictions = algo.test(testset)
    return accuracy.rmse(predictions, verbose=False)

In [57]:
# RMSE is an error metric, so the study minimises it (stated explicitly rather
# than relying on the default direction).
# The sampler is seeded with the notebook-wide `random_state`; an unseeded
# sampler proposes different hyperparameters on every run, which makes the
# search impossible to reproduce.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=random_state),
)
study.optimize(objective, n_trials=100)

[32m[I 2022-02-26 17:04:27,239][0m A new study created in memory with name: no-name-05df0ba9-0dd9-470c-8a83-d3028335b6ba[0m
[32m[I 2022-02-26 17:05:35,602][0m Trial 0 finished with value: 1.005057609538849 and parameters: {'algo_name': 'SVDpp', 'n_epoch': 10, 'lr_all': 0.0016169670589367836, 'reg_all': 0.43369801051627177}. Best is trial 0 with value: 1.005057609538849.[0m
[32m[I 2022-02-26 17:05:50,815][0m Trial 1 finished with value: 1.016317756390582 and parameters: {'algo_name': 'SVD', 'n_epoch': 5, 'lr_all': 0.0013322477203739638, 'reg_all': 0.5221521824593691}. Best is trial 0 with value: 1.005057609538849.[0m
[32m[I 2022-02-26 17:06:09,092][0m Trial 2 finished with value: 1.0046387775201486 and parameters: {'algo_name': 'SVD', 'n_epoch': 6, 'lr_all': 0.0029258049209627598, 'reg_all': 0.4512386391269831}. Best is trial 2 with value: 1.0046387775201486.[0m
[32m[I 2022-02-26 17:06:42,062][0m Trial 3 finished with value: 0.999345315843417 and parameters: {'algo_name': 

In [None]:
# Take the winning configuration from the study and attach the seed the trials
# were run with, so a model rebuilt from `best_params` is seeded the same way
# (the study only records the sampled hyperparameters, not `random_state`).
best_params = {**study.best_params, 'random_state': random_state}

In [6]:
# Hard-coded configuration used for the cross-validation below, together with
# the shared seed.
best_params = {
    'algo_name': 'SVDpp',
    'n_epochs': 15,
    'lr_all': 0.004760245463611792,
    'reg_all': 0.40040712444861504,
    'random_state': random_state,
}

In [7]:
# 5-fold cross-validation of the configuration held in `best_params`.
kf = KFold(n_splits=5, random_state=random_state)
rmses: list[float] = []

# The algorithm class and its keyword arguments are identical for every fold:
# SVD only when explicitly requested, SVDpp otherwise; 'algo_name' itself is
# not a constructor argument and is filtered out.
recsys_algo = SVD if best_params.get('algo_name') == 'SVD' else SVDpp
params = {name: value for name, value in best_params.items() if name != 'algo_name'}

rule = '|--------|--------|---------|'
print('|  fold  |  rmse  |   sec   |')
print(rule)
for i, (trainset, testset) in enumerate(kf.split(data)):
    algo = recsys_algo(**params)

    # Only the fit is timed; prediction and scoring are excluded.
    start = perf_counter()
    algo.fit(trainset)
    stop = perf_counter()

    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=False)
    rmses.append(rmse)

    print(f'|   {i}    | {rmse:.4f} | {stop-start:.3f} |')
print(rule)

print(f'Mean RMSE over {kf.n_splits} folds: {np.mean(rmses):.3f}')

|  fold  |  rmse  |   sec   |
|--------|--------|---------|
|   0    | 0.9865 | 125.398 |
|   1    | 0.9840 | 130.457 |
|   2    | 0.9838 | 125.300 |
|   3    | 0.9861 | 122.027 |
|   4    | 0.9849 | 127.582 |
|--------|--------|---------|
Mean RMSE over 5 folds: 0.985


In [23]:
# Spot-check: ask the most recently fitted model for a prediction on the
# first entry of the current `testset` (its first two fields are passed on).
first_rating = testset[0]
algo.predict(first_rating[0], first_rating[1])

Prediction(uid='A4AEOJ2X6QGIT', iid='0001846590', r_ui=None, est=4.381889687240482, details={'was_impossible': False})