This notebook generates the summary description table for the datasets.

In [1]:
import pandas as pd
from drsu.config import DRSUConfiguration
from drsu.datasets import ALL_DESCRIPTORS, as_pandas, download_and_transform_dataset

# Point the drsu configuration's local dataset directory at ../data
# (relative to the notebook's working directory). Presumably this is where
# drsu stores downloaded datasets -- TODO confirm against the drsu package.
DRSUConfiguration.local_dataset_dir = '../data'
# Directory for generated results; not referenced by the cells visible here.
RESULTS_DIR = '../results'

In [2]:
# Select the datasets to summarise: keep every descriptor except those whose
# id starts with 'amz_' AND whose declared row count exceeds one million.
DATASETS = [
    descriptor
    for descriptor in ALL_DESCRIPTORS
    if not (descriptor.id.startswith('amz_') and descriptor.n_rows > 1_000_000)
]

print('Chosen Datasets: ', [descriptor.name for descriptor in DATASETS])

Chosen Datasets:  ['Movielens 100k', 'Movielens 1M', 'Movielens 10M', 'epinions', 'LibraryThing', 'GoodRead Reviews (w/ spoilers)', 'Drug Recommendations', 'Amazon Ratings (Software)', 'Amazon Ratings (Amazon Fashion)', 'Amazon Ratings (All Beauty)', 'Amazon Ratings (Appliances)', 'Amazon Ratings (Gift Cards)', 'Amazon Ratings (Luxury Beauty)', 'Amazon Ratings (Magazine Subscriptions)', 'Amazon Ratings (Prime Pantry)']


In [3]:
# Prepare every selected dataset, one at a time, printing a confirmation line
# after each call returns so progress is visible in the cell output.
for dd in DATASETS:
    # As the helper's name suggests, this fetches and transforms the dataset;
    # verbose=False presumably silences its own progress output -- TODO confirm.
    download_and_transform_dataset(dd, verbose=False)
    print(f'"{dd.name}" ready')

"Movielens 100k" ready
"Movielens 1M" ready
"Movielens 10M" ready
"epinions" ready
"LibraryThing" ready
"GoodRead Reviews (w/ spoilers)" ready
"Drug Recommendations" ready
"Amazon Ratings (Software)" ready
"Amazon Ratings (Amazon Fashion)" ready
"Amazon Ratings (All Beauty)" ready
"Amazon Ratings (Appliances)" ready
"Amazon Ratings (Gift Cards)" ready
"Amazon Ratings (Luxury Beauty)" ready
"Amazon Ratings (Magazine Subscriptions)" ready
"Amazon Ratings (Prime Pantry)" ready


In [4]:
# Build the dataset summary table: one row per dataset with its interaction
# count, distinct user/item counts, and the average number of rows per user
# (Avg RPU) and per item (Avg RPI), both formatted to two decimals.
#
# The rows are collected first and the frame is constructed once. The previous
# version filled a pre-allocated frame through chained indexing
# (res[col][name] = value), which triggers chained-assignment warnings and,
# with pandas Copy-on-Write enabled, never writes into `res` at all.
summary_columns = ['Rows', '# of Users', '# of Items', 'Avg RPU', 'Avg RPI']
summary_index = []
summary_rows = []
for dd in DATASETS:
    df = as_pandas(dd)
    n_rows = len(df)
    n_users = df['user_id'].nunique()
    n_items = df['item_id'].nunique()

    summary_index.append(dd.name)
    summary_rows.append({
        'Rows': n_rows,
        '# of Users': n_users,
        '# of Items': n_items,
        # Averages are kept as pre-formatted strings, as in the original table.
        'Avg RPU': f"{n_rows / n_users:.2f}",
        'Avg RPI': f"{n_rows / n_items:.2f}",
    })

# dtype=object matches the original pre-allocated (all-object) frame, so any
# downstream export of `res` sees the same column types as before.
res = pd.DataFrame(summary_rows, index=summary_index, columns=summary_columns, dtype=object)

res

Unnamed: 0,Rows,# of Users,# of Items,Avg RPU,Avg RPI
Movielens 100k,100000,943,1682,106.04,59.45
Movielens 1M,1000209,6040,3706,165.6,269.89
Movielens 10M,10000054,69878,10677,143.11,936.6
epinions,188478,116260,41269,1.62,4.57
LibraryThing,1387125,70618,385251,19.64,3.6
GoodRead Reviews (w/ spoilers),1330981,18868,25469,70.54,52.26
Drug Recommendations,53471,708,2635,75.52,20.29
Amazon Ratings (Software),459436,21663,375147,21.21,1.22
Amazon Ratings (Amazon Fashion),883636,186189,749233,4.75,1.18
Amazon Ratings (All Beauty),371345,32586,324038,11.4,1.15
