This is a demo on how to generate tabular datasets using this package.

In [1]:
import numpy as np
import pandas as pd
from recsyslabs.datasetgen.fixed_users import FixedUsers
from recsyslabs.datasetgen.fixed_observable_items import FixedObservableItems

# Control for the number of users

## Generating Single Interactions

In [2]:
# Single-interaction demo: a small population of users and items, with one
# exposure probability supplied per item.
n_users, n_items = 10, 4
item_exposure_bias_pmf = [0.1, 0.2, 0.3, 0.4]

tab_data_generator = FixedUsers(
    n_users=n_users,
    n_items=n_items,
    item_exposure_bias_pmf=item_exposure_bias_pmf,
)

# Print the type of the generated object first, then display its rows.
dataset = tab_data_generator.generate_single_interaction()
print(type(dataset))
dataset

<class 'recsyslabs.datasetgen.dataset.Dataset'>


Unnamed: 0,item_id,rating,user_id
0,2,0,0
1,3,-1,1
2,3,1,2
3,2,-1,3
4,3,1,4
5,2,0,5
6,2,1,6
7,3,-1,7
8,1,-1,8
9,0,0,9


## Generating multiple interactions with items

### using a lower and upper bound for rating values

In [3]:
# Multi-interaction demo where ratings are described by a (lower, upper)
# bound instead of an explicit alphabet.
n_users, n_items = 20, 5
num_interactions = 10
ratings_domain = (1, 5)
# presumably one probability per rating value in the domain (1..5) -- five entries
user_rating_bias_pmf = (0.4, 0.1, 0.05, 0.05, 0.4)

tab_data_generator = FixedUsers(
    n_users=n_users,
    n_items=n_items,
    ratings_domain=ratings_domain,
    user_rating_bias_pmf=user_rating_bias_pmf,
)
dataset = tab_data_generator.generate_multi_interaction(num_interactions=num_interactions)
dataset

Unnamed: 0,item_id,rating,user_id
0,0,1,0
1,2,5,0
2,4,1,0
3,0,2,0
4,0,5,0
...,...,...,...
195,4,1,19
196,3,5,19
197,4,5,19
198,3,1,19


In [4]:
# User x item view of the dataset, aggregated with the median.
median_rating_matrix = dataset.to_sparse(aggfunc='median')
median_rating_matrix

item_id,0,1,2,3,4
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,3.5,5.0,5.0,3.0,1.0
1,3.5,5.0,,2.0,3.0
2,1.0,1.5,1.0,5.0,5.0
3,1.0,2.5,3.5,4.0,
4,5.0,3.0,1.0,3.0,5.0
5,1.0,3.0,5.0,4.5,1.0
6,3.0,5.0,5.0,3.0,4.5
7,4.0,3.5,3.0,1.0,5.0
8,2.0,1.5,5.0,2.0,2.0
9,1.0,3.0,3.5,1.5,2.0


In [5]:
# Summary statistics of the generated ratings.
rating_column = dataset['rating']
rating_column.describe()

count    200.000000
mean       3.105000
std        1.800049
min        1.000000
25%        1.000000
50%        3.000000
75%        5.000000
max        5.000000
Name: rating, dtype: float64

In [6]:
# Empirical share of each rating value, ordered by rating.
rating_shares = dataset['rating'].value_counts(normalize=True)
rating_shares.sort_index()

rating
1    0.340
2    0.125
3    0.045
4    0.070
5    0.420
Name: proportion, dtype: float64

In [7]:
# Count, mean and standard deviation of the ratings received by each item.
ratings_by_item = dataset.groupby(['item_id'])['rating']
item_rating_stats = ratings_by_item.agg(['count', 'mean', 'std'])
item_rating_stats.sort_index()

Unnamed: 0_level_0,count,mean,std
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,45,3.022222,1.789983
1,41,2.95122,1.843247
2,42,3.333333,1.843468
3,37,3.0,1.699673
4,35,3.228571,1.880014


### using a fixed ratings alphabet

In [8]:
# Multi-interaction demo with an explicit ratings alphabet.
n_users, n_items = 3, 5

# Item exposure pmf: exponentially decaying weights over the item index,
# normalised so that they sum to one.
item_exposure_bias_pmf = np.exp(-np.arange(n_items))
item_exposure_bias_pmf = item_exposure_bias_pmf / item_exposure_bias_pmf.sum()

num_interactions = 2
ratings_alphabet = (-1, 1)
user_rating_bias_pmf = [0.9, 0.1]  # two entries, matching the two alphabet symbols

tab_data_generator = FixedUsers(
    n_users=n_users,
    n_items=n_items,
    item_exposure_bias_pmf=item_exposure_bias_pmf,
    ratings_alphabet=ratings_alphabet,
    user_rating_bias_pmf=user_rating_bias_pmf,
)
dataset = tab_data_generator.generate_multi_interaction(num_interactions=num_interactions)
dataset

Unnamed: 0,item_id,rating,user_id
0,1,-1,0
1,0,-1,0
2,1,-1,1
3,0,-1,1
4,2,-1,2
5,0,-1,2


In [9]:
# User x item view of the dataset, using to_sparse's default aggregation.
rating_matrix = dataset.to_sparse()
rating_matrix

item_id,0,1,2
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,-1.0,-1.0,
1,-1.0,-1.0,
2,-1.0,,-1.0


In [10]:
# Summary statistics of the generated ratings.
rating_column = dataset['rating']
rating_column.describe()

count    6.0
mean    -1.0
std      0.0
min     -1.0
25%     -1.0
50%     -1.0
75%     -1.0
max     -1.0
Name: rating, dtype: float64

In [11]:
# Empirical share of each rating symbol, ordered by symbol.
rating_shares = dataset['rating'].value_counts(normalize=True)
rating_shares.sort_index()

rating
-1    1.0
Name: proportion, dtype: float64

In [12]:
# Count, mean and standard deviation of the ratings received by each item.
ratings_by_item = dataset.groupby(['item_id'])['rating']
item_rating_stats = ratings_by_item.agg(['count', 'mean', 'std'])
item_rating_stats.sort_index()

Unnamed: 0_level_0,count,mean,std
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3,-1.0,0.0
1,2,-1.0,0.0
2,1,-1.0,


# Control for the number of observed items

In [13]:
n_users = 100
n_items = 4

# minimum number of observed interactions required for each item
min_interactions = 10

# number of interactions per user to generate in each internal loop
num_interactions = 20

# maximum number of loops used to generate the dataset
max_loops = 100

# the dataset will have
# min size n_users * min_interactions
# max size n_users * num_interactions * max_loops

# item_exposure_bias_pmf follows an exponentially decaying distribution
# controlled by the decay factor lambda_
lambda_ = 3   # decay factor: the larger the value, the faster the decay
item_exposure_bias_pmf = np.exp(-lambda_ * np.arange(n_items))
item_exposure_bias_pmf = item_exposure_bias_pmf / item_exposure_bias_pmf.sum()
print(item_exposure_bias_pmf)

# ratings alphabet and the corresponding rating bias
ratings_alphabet = (0, 1)
user_rating_bias_pmf = [0.9, 0.1]

tab_data_generator = FixedObservableItems(
    n_users=n_users,
    n_items=n_items,
    item_exposure_bias_pmf=item_exposure_bias_pmf,
    ratings_alphabet=ratings_alphabet,
    user_rating_bias_pmf=user_rating_bias_pmf)

# max_loops was previously defined but never handed to the generator, so the
# documented upper bound on the dataset size was not enforced by this cell.
# NOTE(review): assumes generate_minimum_interactions accepts max_loops, as
# generate_minimum_interactions_per_rating_symbol does -- confirm.
dataset = tab_data_generator.generate_minimum_interactions(
    min_interactions=min_interactions,
    num_interactions=num_interactions,
    max_loops=max_loops
)

dataset

[9.50218770e-01 4.73086069e-02 2.35535684e-03 1.17266312e-04]


Unnamed: 0,item_id,rating,user_id
0,0,0,0
1,0,0,0
2,0,0,0
3,0,0,0
4,0,0,0
...,...,...,...
119995,0,1,99
119996,0,0,99
119997,0,0,99
119998,1,0,99


In [14]:
# Verify the per-item interaction counts against the requested minimum.
ratings_by_item = dataset.groupby(['item_id'])['rating']
interactions_per_item = ratings_by_item.count()
interactions_per_item.sort_index()

item_id
0    114100
1      5640
2       250
3        10
Name: rating, dtype: int64

In [15]:
# Summary statistics of the generated ratings.
rating_column = dataset['rating']
rating_column.describe()

count    120000.000000
mean          0.100367
std           0.300490
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: rating, dtype: float64

In [16]:
# User x item view of the dataset, using to_sparse's default aggregation.
rating_matrix = dataset.to_sparse()
rating_matrix

item_id,0,1,2,3
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.103478,0.021277,0.000000,
1,0.105124,0.104478,0.000000,
2,0.094987,0.120690,0.000000,
3,0.091228,0.118644,0.000000,
4,0.102655,0.115942,0.000000,
...,...,...,...,...
95,0.114611,0.055556,0.000000,
96,0.087979,0.145833,0.000000,
97,0.094323,0.076923,0.333333,
98,0.099559,0.114754,0.250000,


In [17]:
# Empirical share of each rating symbol, ordered by symbol.
rating_shares = dataset['rating'].value_counts(normalize=True)
rating_shares.sort_index()

rating
0    0.899633
1    0.100367
Name: proportion, dtype: float64

In [18]:
# Count, mean and standard deviation of the ratings received by each item.
ratings_by_item = dataset.groupby(['item_id'])['rating']
item_rating_stats = ratings_by_item.agg(['count', 'mean', 'std'])
item_rating_stats.sort_index()

Unnamed: 0_level_0,count,mean,std
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,114100,0.10057,0.300759
1,5640,0.096277,0.294996
2,250,0.096,0.295182
3,10,0.2,0.421637


# Control for the minimum number of observations of each rating symbol per item

In [28]:
# Population size.
n_users, n_items = 100, 10

# Generation controls:
#   min_interactions -- minimum number of observed interactions required
#   num_interactions -- interactions generated per user in each internal loop
#   max_loops        -- upper limit on the number of generation loops
min_interactions = 5
num_interactions = 20
max_loops = 1000

# Resulting dataset size:
#   min: n_users * min_interactions
#   max: n_users * num_interactions * max_loops

# Item exposure pmf: exponentially decaying weights over the item index,
# normalised to sum to one. A larger lambda_ gives a faster decay.
lambda_ = 1
item_exposure_bias_pmf = np.exp(-lambda_ * np.arange(n_items))
item_exposure_bias_pmf = item_exposure_bias_pmf / item_exposure_bias_pmf.sum()
print(item_exposure_bias_pmf)

# Ratings alphabet together with the bias probability of each symbol.
ratings_alphabet = (1, 2, 3, 4, 5)
user_rating_bias_pmf = [0.4, 0.05, 0.1, 0.05, 0.4]

tab_data_generator = FixedObservableItems(
    n_users=n_users,
    n_items=n_items,
    item_exposure_bias_pmf=item_exposure_bias_pmf,
    ratings_alphabet=ratings_alphabet,
    user_rating_bias_pmf=user_rating_bias_pmf,
)

dataset = tab_data_generator.generate_minimum_interactions_per_rating_symbol(
    min_interactions=min_interactions,
    num_interactions=num_interactions,
    max_loops=max_loops,
)

dataset

[6.32149258e-01 2.32554716e-01 8.55520989e-02 3.14728583e-02
 1.15782175e-02 4.25938820e-03 1.56694135e-03 5.76445508e-04
 2.12062451e-04 7.80134161e-05]


Unnamed: 0,item_id,rating,user_id
0,1,1,0
1,1,5,0
2,0,5,0
3,2,4,0
4,0,3,0
...,...,...,...
997995,0,1,99
997996,3,1,99
997997,0,2,99
997998,0,3,99


In [33]:
# The ten least frequent (rating, item_id) combinations, shown in index order.
pair_counts = dataset[['rating', 'item_id']].value_counts()
rarest_pairs = pair_counts.sort_values().head(10)
rarest_pairs.sort_index()

rating  item_id
1       9          19
2       7          28
        8          14
        9           5
3       8          23
        9           9
4       7          32
        8           9
        9           6
5       9          30
Name: count, dtype: int64

In [35]:
# Number of missing entries in each item column of the user x item view.
rating_matrix = dataset.to_sparse()
missing_mask = rating_matrix.isna()
missing_mask.sum()

item_id
0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8    11
9    53
dtype: int64