This notebook demonstrates how to generate tabular datasets with the `recsyslabs.datasetgen` package.

In [1]:
import numpy as np
import pandas as pd
from recsyslabs.datasetgen.fixed_users import FixedUsers

# Control for the number of users

## Generating Single Interactions

In [2]:
# Ten users and four items; the exposure PMF lists one weight per item
# (presumably the probability that each item is the one a user interacts with).
# NOTE(review): no random seed is set in this notebook, so re-running will
# likely produce different rows -- confirm whether FixedUsers accepts a seed.
n_users, n_items = 10, 4
item_exposure_bias_pmf = [0.1, 0.2, 0.3, 0.4]

tab_data_generator = FixedUsers(
    n_users=n_users,
    n_items=n_items,
    item_exposure_bias_pmf=item_exposure_bias_pmf,
)
dataset = tab_data_generator.generate_single_interaction()

# Print the container class first, then let the notebook render the dataset.
print(type(dataset))
dataset

<class 'recsyslabs.datasetgen.dataset.Dataset'>


Unnamed: 0,item_id,rating,user_id
0,0,-1,0
1,3,-1,1
2,3,0,2
3,3,1,3
4,1,1,4
5,1,0,5
6,2,1,6
7,2,0,7
8,3,-1,8
9,3,1,9


## Generating Multiple Interactions with Items

### Using a lower and upper bound for rating values

In [3]:
# Twenty users, five items, ten interactions requested per generator call
# (presumably per user -- 20 x 10 matches the 200 ratings reported further down).
n_users, n_items = 20, 5
num_interactions = 10

# Ratings are described by a (lower, upper) bound pair; the rating PMF has five
# weights, presumably one for each integer rating from 1 to 5.
ratings_domain = (1, 5)
user_rating_bias_pmf = (0.4, 0.1, 0.05, 0.05, 0.4)

# No item_exposure_bias_pmf is passed here, so item exposure is left to
# whatever default FixedUsers applies.
tab_data_generator = FixedUsers(
    n_users=n_users,
    n_items=n_items,
    ratings_domain=ratings_domain,
    user_rating_bias_pmf=user_rating_bias_pmf,
)
dataset = tab_data_generator.generate_multi_interaction(num_interactions=num_interactions)
dataset

Unnamed: 0,item_id,rating,user_id
0,2,1,0
1,1,5,0
2,4,5,0
3,1,1,0
4,2,1,0
...,...,...,...
195,3,5,19
196,0,1,19
197,2,1,19
198,3,4,19


In [4]:
# Pivot the interactions into a user_id x item_id table. Repeated (user, item)
# pairs are combined with the median (presumably what aggfunc selects -- confirm
# against Dataset.to_sparse); pairs with no interaction show up as NaN.
dataset.to_sparse(aggfunc='median')

item_id,0,1,2,3,4
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,3.0,3.0,1.0,,3.5
1,1.5,3.0,3.0,5.0,1.0
2,1.0,1.0,5.0,3.0,
3,3.0,1.0,1.0,5.0,5.0
4,5.0,3.0,5.0,3.0,5.0
5,3.0,1.0,1.0,1.0,2.0
6,5.0,1.0,5.0,5.0,3.0
7,2.0,5.0,1.0,5.0,1.5
8,3.0,2.0,1.0,5.0,5.0
9,1.0,3.0,2.5,5.0,1.0


In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) of the rating column.
dataset['rating'].describe()

count    200.000000
mean       2.975000
std        1.868645
min        1.000000
25%        1.000000
50%        2.000000
75%        5.000000
max        5.000000
Name: rating, dtype: float64

In [6]:
# Empirical share of each rating value, ordered by rating, so it can be
# compared side by side with user_rating_bias_pmf.
(
    dataset['rating']
    .value_counts(normalize=True)
    .sort_index()
)

rating
1    0.405
2    0.105
3    0.030
4    0.030
5    0.430
Name: proportion, dtype: float64

In [7]:
# Per-item interaction count together with the mean and standard deviation
# of the ratings each item received.
(
    dataset
    .groupby(['item_id'])['rating']
    .agg(['count', 'mean', 'std'])
    .sort_index()
)

Unnamed: 0_level_0,count,mean,std
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,33,2.909091,1.89347
1,49,2.918367,1.945473
2,37,2.567568,1.8034
3,39,3.589744,1.681446
4,42,2.880952,1.928001


### Using a fixed ratings alphabet

In [8]:
n_users, n_items = 20, 5

# item_exposure_bias_pmf decays exponentially with the item index:
# weights exp(-0), exp(-1), ..., exp(-(n_items - 1)), rescaled to sum to 1.
item_exposure_bias_pmf = np.exp(-np.arange(n_items))
item_exposure_bias_pmf /= item_exposure_bias_pmf.sum()

num_interactions = 10

# Ratings come from an explicit alphabet instead of a (lower, upper) domain;
# the rating PMF has one weight per symbol, in the same order (presumably).
ratings_alphabet = (-1, 1)
user_rating_bias_pmf = [0.9, 0.1]

tab_data_generator = FixedUsers(
    n_users=n_users,
    n_items=n_items,
    item_exposure_bias_pmf=item_exposure_bias_pmf,
    ratings_alphabet=ratings_alphabet,
    user_rating_bias_pmf=user_rating_bias_pmf,
)
dataset = tab_data_generator.generate_multi_interaction(num_interactions=num_interactions)
dataset

Unnamed: 0,item_id,rating,user_id
0,0,-1,0
1,0,-1,0
2,0,-1,0
3,2,-1,0
4,1,-1,0
...,...,...,...
195,0,-1,19
196,2,-1,19
197,0,-1,19
198,1,-1,19


In [9]:
# Pivot the interactions into a user_id x item_id table using the default
# aggregation. NOTE(review): fractional cells such as -0.666667 in the output
# suggest the default is the mean -- confirm against Dataset.to_sparse.
dataset.to_sparse()

item_id,0,1,2,3,4
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,-1.0,-1.0,-1.0,0.0,
1,-0.5,-0.5,,-1.0,1.0
2,-1.0,-1.0,-1.0,,
3,-0.75,-1.0,,,
4,-1.0,,,-1.0,-1.0
5,-0.666667,-1.0,-1.0,,
6,-0.714286,-1.0,-1.0,,
7,-0.6,-1.0,-1.0,-1.0,
8,-1.0,-1.0,,,
9,-1.0,0.0,-1.0,,


In [10]:
# Summary statistics (count, mean, std, quartiles, min/max) of the rating column.
dataset['rating'].describe()

count    200.00000
mean      -0.78000
std        0.62735
min       -1.00000
25%       -1.00000
50%       -1.00000
75%       -1.00000
max        1.00000
Name: rating, dtype: float64

In [11]:
# Empirical share of each rating symbol, ordered by value, so it can be
# compared side by side with user_rating_bias_pmf.
(
    dataset['rating']
    .value_counts(normalize=True)
    .sort_index()
)

rating
-1    0.89
 1    0.11
Name: proportion, dtype: float64

In [12]:
# Per-item interaction count together with the mean and standard deviation
# of the ratings each item received; the counts show how exposure is spread
# across items.
(
    dataset
    .groupby(['item_id'])['rating']
    .agg(['count', 'mean', 'std'])
    .sort_index()
)

Unnamed: 0_level_0,count,mean,std
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,126,-0.825397,0.566807
1,47,-0.659574,0.759765
2,17,-0.882353,0.485071
3,8,-0.75,0.707107
4,2,0.0,1.414214
