This notebook contains the code for the experiments as they are described in the paper.

In [1]:
import gc
import os.path
import time
from typing import Tuple

import numpy as np
import pandas as pd
from bayes_opt import BayesianOptimization
from dfmf.model import SVDpp, SVD
from dfmf.util import sort_by_user
from drsu.config import DRSUConfiguration
from drsu.datasets import ALL_DESCRIPTORS, as_numpy, download_and_transform_dataset

from strategy import *
from util.metrics import rmse, ndcg_at_k

# Point the DRSU configuration at a local data directory (relative to the notebook);
# presumably this is where download_and_transform_dataset keeps the datasets -- confirm in drsu.
DRSUConfiguration.local_dataset_dir = '../data'
# Directory that do_experiment_or_load_results joins with a file name to locate cached result CSVs.
RESULTS_DIR = '../results'

In [2]:
# Select the datasets to experiment on: everything in ALL_DESCRIPTORS except
# the Amazon ('amz_' prefixed) datasets that have more than 1,000,000 rows.
DATASETS = []
for dd in ALL_DESCRIPTORS:
    if not (dd.id.startswith('amz_') and dd.n_rows > 1000000):
        DATASETS.append(dd)

print('Chosen Datasets: ', [dd.name for dd in DATASETS])

Chosen Datasets:  ['Movielens 100k', 'Movielens 1M', 'Movielens 10M', 'epinions', 'LibraryThing', 'GoodRead Reviews (w/ spoilers)', 'Drug Recommendations', 'Amazon Ratings (Software)', 'Amazon Ratings (Amazon Fashion)', 'Amazon Ratings (All Beauty)', 'Amazon Ratings (Appliances)', 'Amazon Ratings (Gift Cards)', 'Amazon Ratings (Luxury Beauty)', 'Amazon Ratings (Magazine Subscriptions)', 'Amazon Ratings (Prime Pantry)']


In [3]:
# Prepare every chosen dataset up front, so the experiment cells below do not
# have to (presumably this downloads and converts the data on first use -- the
# exact behaviour is defined by drsu's download_and_transform_dataset).
for dd in DATASETS:
    download_and_transform_dataset(dd, verbose=False)
    # Progress line: one per dataset that has been processed.
    print(f'"{dd.name}" ready')

"Movielens 100k" ready
"Movielens 1M" ready
"Movielens 10M" ready
"epinions" ready
"LibraryThing" ready
"GoodRead Reviews (w/ spoilers)" ready
"Drug Recommendations" ready
"Amazon Ratings (Software)" ready
"Amazon Ratings (Amazon Fashion)" ready
"Amazon Ratings (All Beauty)" ready
"Amazon Ratings (Appliances)" ready
"Amazon Ratings (Gift Cards)" ready
"Amazon Ratings (Luxury Beauty)" ready
"Amazon Ratings (Magazine Subscriptions)" ready
"Amazon Ratings (Prime Pantry)" ready


In [4]:
def do_experiment(dataset_descriptor, n_time_splits, strategies, model='svdpp', metric='rmse', bo_init_points=1, bo_n_iter=5):
    """Compare data-splitting strategies for model selection on one dataset.

    The data returned by ``as_numpy`` is cut ``n_time_splits`` times with
    ``TimeBasedSplittingStrategy(test_size=0.1)``; each cut removes a hold-out
    chunk from the remaining data. The chunks are then replayed in reverse
    order of cutting (the last chunk cut off is evaluated first). For every
    chunk and every strategy:

    1. the strategy splits the data available so far into train/test parts;
    2. Bayesian optimisation tunes ``n_factors`` and ``reg`` on those parts;
    3. the best configuration is refit on all available data and scored on
       the hold-out chunk.

    After a chunk has been evaluated it is appended to the available data.

    Args:
        dataset_descriptor: descriptor handed to ``as_numpy``.
        n_time_splits: number of hold-out chunks (rows of the result).
        strategies: re-iterable collection (e.g. a list) of splitting
            strategies; ``str(strategy)`` is used as the column label.
        model: ``'svd'`` or ``'svdpp'``.
        metric: ``'rmse'`` or ``'ndcg'``.
        bo_init_points: forwarded to ``BayesianOptimization.maximize``.
        bo_n_iter: forwarded to ``BayesianOptimization.maximize``.

    Returns:
        pandas.DataFrame indexed by ``range(n_time_splits)`` whose columns are
        a two-level MultiIndex ``(strategy, metric)``; for each strategy there
        is one column named after ``metric`` and one named ``'time'``
        (seconds measured with ``time.perf_counter``).

    Raises:
        ValueError: if ``metric`` or ``model`` is not one of the known names.
    """
    # Fail fast: reject a bad metric before any data is loaded or model trained.
    if metric not in ('rmse', 'ndcg'):
        raise ValueError(f'Unknown metric: {metric}')

    if model == 'svd':
        model_class = SVD
    elif model == 'svdpp':
        model_class = SVDpp
    else:
        # Report the name the caller passed (the original reported the
        # still-unset class, i.e. always "None").
        raise ValueError(f'Unknown model: {model}')

    def compute_metric(X, y_expected, y_actual) -> float:
        """Evaluate the configured metric for one set of predictions."""
        if metric == 'rmse':
            return rmse(expected=y_expected, actual=y_actual)
        # Only 'ndcg' is left: the metric name was validated above.
        return ndcg_at_k(X=X, y_expected=y_expected, y_actual=y_actual)

    def validate_model(strategy: AbstractSplittingStrategy,
                       data: np.ndarray,
                       validation_data: np.ndarray
                       ) -> Tuple[float, float]:
        """Tune on ``strategy``'s splits of ``data``, then score on ``validation_data``.

        Returns the metric value on the validation data and the elapsed time
        (search + final fit + prediction + scoring) in seconds.
        """
        # Columns 0-1 are used as model inputs and column 2 as the target
        # (presumably user id, item id and rating -- confirm against as_numpy).
        data_X, data_y, val_X, val_y = sort_by_user(data[:, 0:2], data[:, 2], validation_data[:, 0:2],
                                                    validation_data[:, 2])

        # Normalise to a list of (train, test) pairs regardless of whether the
        # strategy yields one split or many.
        raw_splits = strategy.split(data)
        if strategy.generates_many_splits():
            raw_splits = list(raw_splits)
        else:
            raw_splits = [raw_splits]

        splits = []
        for train, test in raw_splits:
            X_train, y_train, X_test, y_test = sort_by_user(train[:, 0:2], train[:, 2], test[:, 0:2], test[:, 2])
            splits.append((X_train, y_train, X_test, y_test))

        def function_to_maximize(n_factors, reg):
            """Objective for the optimiser: mean score over all splits."""
            scores = []
            for X_train, y_train, X_test, y_test in splits:
                # The optimiser proposes a float; the model is given a rounded value.
                candidate = model_class(n_factors=round(n_factors), reg=reg)
                candidate.fit(X_train, y_train)
                y_pred = candidate.predict(X_test)

                value = compute_metric(X_test, y_test, y_pred)
                # The optimiser maximises, so an error metric is negated.
                scores.append(-value if metric == 'rmse' else value)

            return sum(scores) / len(scores)

        pbounds = {
            'n_factors': (10, 100),
            'reg': (0.005, 0.1)
        }
        optimizer = BayesianOptimization(
            f=function_to_maximize,
            pbounds=pbounds,
            verbose=0
        )

        start_time = time.perf_counter()
        optimizer.maximize(init_points=bo_init_points, n_iter=bo_n_iter)

        # Refit the best configuration on all available data and score it on
        # the hold-out chunk.
        best_params = optimizer.max['params']
        final_model = model_class(n_factors=round(best_params['n_factors']), reg=best_params['reg'])
        final_model.fit(data_X, data_y)
        y_pred = final_model.predict(val_X)
        metric_value = compute_metric(val_X, val_y, y_pred)

        end_time = time.perf_counter()
        return metric_value, end_time - start_time

    all_data = as_numpy(dataset_descriptor, only_ratings=False)

    # Repeatedly cut a hold-out chunk off the remaining data; `data` ends up
    # as what is left after all cuts.
    validation_data_chunks = []
    data = all_data
    for _ in range(n_time_splits):
        data, validation_data = TimeBasedSplittingStrategy(test_size=0.1).split(data)
        validation_data_chunks.append(validation_data)
    # Replay the chunks in reverse order of cutting.
    validation_data_chunks.reverse()

    res_columns = pd.MultiIndex.from_product([[str(s) for s in strategies], [metric, 'time']],
                                             names=['strategy', 'metric'])
    res = pd.DataFrame(columns=res_columns, index=range(n_time_splits))

    # Automatic garbage collection is switched off while experiments run and
    # triggered manually between evaluations (presumably to keep collector
    # pauses out of the measured times).
    gc.disable()
    try:
        for i, validation_data in enumerate(validation_data_chunks):
            for strategy in strategies:
                result_metric, evaluation_time = validate_model(strategy, data, validation_data)
                # Single-step .loc assignment: chained `res.loc[i][...] = x`
                # writes into a temporary Series and is lost under pandas
                # Copy-on-Write.
                res.loc[i, (str(strategy), metric)] = result_metric
                res.loc[i, (str(strategy), 'time')] = evaluation_time
                gc.collect()

            # The evaluated chunk becomes part of the available data.
            data = np.r_[data, validation_data]
    finally:
        gc.enable()

    return res


def do_experiment_or_load_results(results_file: str, *do_exp_args, **do_exp_kwargs):
    """Return cached experiment results, running the experiment if needed.

    Args:
        results_file: name of the CSV file, relative to ``RESULTS_DIR``.
            ``None`` disables caching: the experiment is always run and
            nothing is written.
        *do_exp_args: positional arguments forwarded to ``do_experiment``.
        **do_exp_kwargs: keyword arguments forwarded to ``do_experiment``.

    Returns:
        pandas.DataFrame with two-level columns: either the frame read back
        from the CSV file (first CSV column used as index) or the frame
        returned by ``do_experiment``.
    """
    if results_file is None:
        return do_experiment(*do_exp_args, **do_exp_kwargs)

    results_file_path = os.path.join(RESULTS_DIR, results_file)
    if os.path.exists(results_file_path):
        # The CSV has two header rows (one per column level); its first
        # column holds the row index.
        res = pd.read_csv(results_file_path, header=[0, 1])
        res = res.set_index(res.columns[0])
        res.columns = pd.MultiIndex.from_tuples(res.columns)
        return res

    # Create the target directory *before* the (potentially very long) run,
    # so that a missing or unwritable directory cannot cost us the results.
    results_dir = os.path.dirname(results_file_path)
    if results_dir:
        os.makedirs(results_dir, exist_ok=True)

    res = do_experiment(*do_exp_args, **do_exp_kwargs)
    res.to_csv(results_file_path)
    return res

In [5]:
# RMSE experiments: every chosen dataset x both models, five hold-out chunks
# each. Results are cached per (dataset, model) in RESULTS_DIR, so re-running
# this cell only computes what is missing.
for dd in DATASETS:
    for model in ['svd', 'svdpp']:
        results_file = f'{dd.id}_{model}_rmse_261021_1.csv'
        strategies = [
            RandomStrategy(test_size=0.2),
            CrossValidationRandomStrategy(n_folds=5),
            TimeBasedSplittingStrategy(test_size=0.2),
            TemporalUserSplittingStrategy(test_size=0.2),
        ]
        _ = do_experiment_or_load_results(
            results_file,
            dataset_descriptor=dd,
            n_time_splits=5,
            metric='rmse',
            model=model,
            strategies=strategies,
        )

        print(f'{results_file} ready')

ml100k_svd_rmse_261021_1.csv ready
ml100k_svdpp_rmse_261021_1.csv ready
ml1m_svd_rmse_261021_1.csv ready
ml1m_svdpp_rmse_261021_1.csv ready
ml10m_svd_rmse_261021_1.csv ready
ml10m_svdpp_rmse_261021_1.csv ready
ep_svd_rmse_261021_1.csv ready
ep_svdpp_rmse_261021_1.csv ready
libt_svd_rmse_261021_1.csv ready
libt_svdpp_rmse_261021_1.csv ready
gr_s_svd_rmse_261021_1.csv ready
gr_s_svdpp_rmse_261021_1.csv ready
drug_rec_svd_rmse_261021_1.csv ready
drug_rec_svdpp_rmse_261021_1.csv ready
amz_software_svd_rmse_261021_1.csv ready
amz_software_svdpp_rmse_261021_1.csv ready
amz_amazon_fashion_svd_rmse_261021_1.csv ready
amz_amazon_fashion_svdpp_rmse_261021_1.csv ready
amz_all_beauty_svd_rmse_261021_1.csv ready
amz_all_beauty_svdpp_rmse_261021_1.csv ready
amz_appliances_svd_rmse_261021_1.csv ready
amz_appliances_svdpp_rmse_261021_1.csv ready
amz_gift_cards_svd_rmse_261021_1.csv ready
amz_gift_cards_svdpp_rmse_261021_1.csv ready
amz_luxury_beauty_svd_rmse_261021_1.csv ready
amz_luxury_beauty_svdpp_

In [6]:
# NDCG experiments: same grid as the RMSE cell (every chosen dataset x both
# models, five hold-out chunks), cached per (dataset, model) in RESULTS_DIR.
for dd in DATASETS:
    for model in ['svd', 'svdpp']:
        results_file = f'{dd.id}_{model}_ndcg_271021_1.csv'
        strategies = [
            RandomStrategy(test_size=0.2),
            CrossValidationRandomStrategy(n_folds=5),
            TimeBasedSplittingStrategy(test_size=0.2),
            TemporalUserSplittingStrategy(test_size=0.2),
        ]
        _ = do_experiment_or_load_results(
            results_file,
            dataset_descriptor=dd,
            n_time_splits=5,
            metric='ndcg',
            model=model,
            strategies=strategies,
        )

        print(f'{results_file} ready')

ml100k_svd_ndcg_271021_1.csv ready
ml100k_svdpp_ndcg_271021_1.csv ready
ml1m_svd_ndcg_271021_1.csv ready
ml1m_svdpp_ndcg_271021_1.csv ready
ml10m_svd_ndcg_271021_1.csv ready
ml10m_svdpp_ndcg_271021_1.csv ready
ep_svd_ndcg_271021_1.csv ready
ep_svdpp_ndcg_271021_1.csv ready
libt_svd_ndcg_271021_1.csv ready
libt_svdpp_ndcg_271021_1.csv ready
gr_s_svd_ndcg_271021_1.csv ready
gr_s_svdpp_ndcg_271021_1.csv ready
drug_rec_svd_ndcg_271021_1.csv ready
drug_rec_svdpp_ndcg_271021_1.csv ready
amz_software_svd_ndcg_271021_1.csv ready
amz_software_svdpp_ndcg_271021_1.csv ready
amz_amazon_fashion_svd_ndcg_271021_1.csv ready
amz_amazon_fashion_svdpp_ndcg_271021_1.csv ready
amz_all_beauty_svd_ndcg_271021_1.csv ready
amz_all_beauty_svdpp_ndcg_271021_1.csv ready
amz_appliances_svd_ndcg_271021_1.csv ready
amz_appliances_svdpp_ndcg_271021_1.csv ready
amz_gift_cards_svd_ndcg_271021_1.csv ready
amz_gift_cards_svdpp_ndcg_271021_1.csv ready
amz_luxury_beauty_svd_ndcg_271021_1.csv ready
amz_luxury_beauty_svdpp_