This notebook demonstrates how to generate tabular user–item rating datasets with the `recsyslabs.datasetgen` generators.

In [1]:
import numpy as np
import pandas as pd
from recsyslabs.datasetgen.fixed_users import FixedUsers
from recsyslabs.datasetgen.fixed_observable_items import FixedObservableItems

# Control for the number of users

## Generating Single Interactions

In [2]:
# Toy problem size: 10 users choosing among 4 items.
n_users, n_items = 10, 4

# One exposure weight per item (n_items entries); the weights sum to 1.
item_exposure_bias_pmf = [0.1, 0.2, 0.3, 0.4]

# NOTE(review): no random seed is set in the visible cells, so the sampled
# rows may differ between runs — confirm whether the generator exposes a seed.
tab_data_generator = FixedUsers(
    n_users=n_users,
    n_items=n_items,
    item_exposure_bias_pmf=item_exposure_bias_pmf,
)

# Draw the dataset, report its type, then display it as the cell output.
dataset = tab_data_generator.generate_single_interaction()
print(type(dataset))
dataset

<class 'recsyslabs.datasetgen.dataset.Dataset'>


Unnamed: 0,item_id,rating,user_id
0,1,1,0
1,1,1,1
2,3,-1,2
3,2,1,3
4,1,1,4
5,3,1,5
6,3,1,6
7,0,-1,7
8,2,-1,8
9,3,-1,9


## Generating Multiple Interactions

### Using a lower and upper bound for rating values

In [3]:
# Problem size: 20 users, 5 items, 10 interactions requested per user.
n_users, n_items = 20, 5
num_interactions = 10

# Ratings are bounded by (lower, upper) = (1, 5); the pmf carries five
# weights that favour the two extremes.
ratings_domain = (1, 5)
user_rating_bias_pmf = (0.4, 0.1, 0.05, 0.05, 0.4)

tab_data_generator = FixedUsers(
    n_users=n_users,
    n_items=n_items,
    ratings_domain=ratings_domain,
    user_rating_bias_pmf=user_rating_bias_pmf,
)

# Generate the interactions and display the resulting dataset.
dataset = tab_data_generator.generate_multi_interaction(num_interactions=num_interactions)
dataset

Unnamed: 0,item_id,rating,user_id
0,1,5,0
1,2,5,0
2,0,1,0
3,3,3,0
4,1,1,0
...,...,...,...
195,0,1,19
196,0,4,19
197,0,5,19
198,2,5,19


In [4]:
# Pivot the interactions into a user_id x item_id table (see the output below).
# aggfunc='median' is passed to combine repeated (user, item) ratings —
# presumably with pandas pivot_table semantics; verify against Dataset.to_sparse.
dataset.to_sparse(aggfunc='median')

item_id,0,1,2,3,4
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,3.0,3.0,5.0,2.0,
1,1.0,5.0,1.0,,5.0
2,,4.0,4.0,5.0,
3,5.0,,3.0,5.0,1.0
4,3.0,2.0,3.0,2.0,
5,1.0,5.0,4.0,1.0,1.0
6,1.0,5.0,1.0,5.0,5.0
7,3.0,3.5,3.0,5.0,5.0
8,,3.0,2.0,1.0,1.0
9,4.0,1.0,5.0,2.0,1.0


In [5]:
# Summary statistics (count, mean, std, quartiles) of the generated ratings.
dataset['rating'].describe()

count    200.0000
mean       2.9700
std        1.8209
min        1.0000
25%        1.0000
50%        2.5000
75%        5.0000
max        5.0000
Name: rating, dtype: float64

In [6]:
# Empirical share of each rating value, ordered by rating, for comparison
# with the configured user_rating_bias_pmf.
(
    dataset['rating']
    .value_counts(normalize=True)
    .sort_index()
)

rating
1    0.380
2    0.120
3    0.055
4    0.040
5    0.405
Name: proportion, dtype: float64

In [7]:
# Per-item interaction count plus mean and standard deviation of the ratings.
(
    dataset
    .groupby(['item_id'])['rating']
    .agg(['count', 'mean', 'std'])
    .sort_index()
)

Unnamed: 0_level_0,count,mean,std
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,38,2.736842,1.840743
1,47,3.12766,1.837023
2,43,3.302326,1.819666
3,38,2.815789,1.798727
4,34,2.764706,1.826718


### Using a fixed ratings alphabet

In [8]:
# Problem size: 3 users, 5 items, 2 interactions requested per user.
n_users, n_items = 3, 5
num_interactions = 2

# item_exposure_bias_pmf decays exponentially with the item index and is
# normalised so that the weights sum to 1.
item_exposure_bias_pmf = np.exp(-np.arange(n_items))
item_exposure_bias_pmf /= item_exposure_bias_pmf.sum()

# Explicit set of allowed rating values and their bias weights
# (presumably aligned positionally with ratings_alphabet — TODO confirm).
ratings_alphabet = (-1, 1)
user_rating_bias_pmf = [0.9, 0.1]

tab_data_generator = FixedUsers(
    n_users=n_users,
    n_items=n_items,
    item_exposure_bias_pmf=item_exposure_bias_pmf,
    ratings_alphabet=ratings_alphabet,
    user_rating_bias_pmf=user_rating_bias_pmf,
)

# Generate the interactions and display the resulting dataset.
dataset = tab_data_generator.generate_multi_interaction(num_interactions=num_interactions)
dataset

Unnamed: 0,item_id,rating,user_id
0,1,-1,0
1,0,-1,0
2,0,-1,1
3,0,-1,1
4,1,-1,2
5,0,-1,2


In [9]:
# Pivot the interactions into a user_id x item_id table using to_sparse's
# default aggregation (the default is not visible here — TODO confirm).
dataset.to_sparse()

item_id,0,1
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,-1.0,-1.0
1,-1.0,
2,-1.0,-1.0


In [10]:
# Summary statistics of the generated ratings; this sample holds only
# n_users * num_interactions = 6 rows.
dataset['rating'].describe()

count    6.0
mean    -1.0
std      0.0
min     -1.0
25%     -1.0
50%     -1.0
75%     -1.0
max     -1.0
Name: rating, dtype: float64

In [11]:
# Empirical share of each rating value, ordered by rating.
(
    dataset['rating']
    .value_counts(normalize=True)
    .sort_index()
)

rating
-1    1.0
Name: proportion, dtype: float64

In [12]:
# Per-item interaction count plus mean and standard deviation of the ratings.
(
    dataset
    .groupby(['item_id'])['rating']
    .agg(['count', 'mean', 'std'])
    .sort_index()
)

Unnamed: 0_level_0,count,mean,std
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,4,-1.0,0.0
1,2,-1.0,0.0


# Control for the number of observed items

In [19]:
n_users = 100
n_items = 4

# minimum number of observed interactions required for an item
min_interactions = 10

# number of interactions per user to generate in each internal loop
num_interactions = 20

# maximum number of loops to generate the dataset
# NOTE(review): max_loops is assigned here but never passed to the generator
# call below, so it has no effect in this cell — confirm whether
# generate_minimum_interactions accepts it and pass it, or drop the variable.
max_loops = 100

# dataset size bounds as stated by the original author
# (TODO confirm against the generator implementation):
#   min size: n_users * min_interactions
#   max size: n_users * num_interactions * max_loops

# item_exposure_bias_pmf follows an exponentially decaying distribution with decay factor lambda_
lambda_ = 3   # decay factor, the larger the faster the decay
item_exposure_bias_pmf = np.exp(-lambda_ * np.arange(n_items))
item_exposure_bias_pmf = item_exposure_bias_pmf / item_exposure_bias_pmf.sum()  # normalise so the weights sum to 1
print(item_exposure_bias_pmf)

# ratings alphabet and corresponding rating bias
ratings_alphabet = (0, 1)
user_rating_bias_pmf = [0.9, 0.1]

tab_data_generator = FixedObservableItems(
    n_users=n_users,
    n_items=n_items,
    item_exposure_bias_pmf=item_exposure_bias_pmf,
    ratings_alphabet=ratings_alphabet,
    user_rating_bias_pmf=user_rating_bias_pmf)

# Generate interactions, requesting at least min_interactions per item
# (the rarest item in the output below ends up with exactly that many).
dataset = tab_data_generator.generate_minimum_interactions(
    min_interactions=min_interactions,
    num_interactions=num_interactions
)

dataset

[9.50218770e-01 4.73086069e-02 2.35535684e-03 1.17266312e-04]


Unnamed: 0,item_id,rating,user_id
0,0,0,0
1,1,0,0
2,0,0,0
3,0,0,0
4,1,0,0
...,...,...,...
91995,0,0,99
91996,0,0,99
91997,0,0,99
91998,0,0,99


In [20]:
# Interactions observed per item; every count should be at least
# min_interactions.
(
    dataset
    .groupby(['item_id'])['rating']
    .count()
    .sort_index()
)

item_id
0    87378
1     4385
2      227
3       10
Name: rating, dtype: int64

In [21]:
# Summary statistics of the ratings; with the (0, 1) alphabet the mean equals
# the fraction of 1-ratings.
dataset['rating'].describe()

count    92000.000000
mean         0.100196
std          0.300262
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: rating, dtype: float64

In [22]:
# Pivot the interactions into a user_id x item_id table with the default
# aggregation. NOTE(review): the fractional cells for 0/1 ratings suggest the
# default is the mean — confirm against Dataset.to_sparse.
dataset.to_sparse()

item_id,0,1,2,3
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.105626,0.085106,0.0,
1,0.102623,0.050000,0.0,
2,0.087302,0.108108,0.0,
3,0.104598,0.063830,0.0,0.0
4,0.115959,0.108696,0.0,
...,...,...,...,...
95,0.110599,0.104167,0.0,
96,0.098737,0.085106,0.0,0.0
97,0.089266,0.090909,0.0,0.0
98,0.085648,0.055556,0.0,


In [23]:
# Empirical share of each rating value, ordered by rating, for comparison
# with the configured user_rating_bias_pmf.
(
    dataset['rating']
    .value_counts(normalize=True)
    .sort_index()
)

rating
0    0.899804
1    0.100196
Name: proportion, dtype: float64

In [24]:
# Per-item interaction count plus mean and standard deviation of the ratings.
(
    dataset
    .groupby(['item_id'])['rating']
    .agg(['count', 'mean', 'std'])
    .sort_index()
)

Unnamed: 0_level_0,count,mean,std
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,87378,0.10046,0.300614
1,4385,0.095553,0.294011
2,227,0.092511,0.290386
3,10,0.0,0.0
