# Session Based Datasets

when running `run_example/session_based_rec_example.py` we see it supports only 3 datasets : `tmall-session`, `diginetica-session` and `nowplaying-session` 
however, when looking at [RecBole Dataset List](https://recbole.io/dataset_list.html) we see many more datasets.  
moreover, it looks like the `-session` based versions are different than the files downloaded from the [Google Drive](https://drive.google.com/drive/folders/1so0lckI6N6_niVEYaBu-LIcpOdZf99kj?usp=sharing)  

in this notebook I'll try to explore the various datasets to understand what do we have in it

I'll follow the example to download the datasets

In [1]:
import os 
from recbole.config import Config
from recbole.data import create_dataset
from recbole.data.utils import get_dataloader
from recbole.utils import init_logger, init_seed, get_model, get_trainer, set_color

In [2]:
from dataclasses import dataclass

@dataclass
class Arguments:
    model:str = 'GRU4Rec'
    dataset:str = 'diginetica-session'
    validation: bool = 'False'
    valid_portion: float = 0.1

args=Arguments()


## DIGINETICA
there are 2 versions : the `diginetica-session` and the `diginetica` 

### diginetica-session

In [6]:
args=Arguments()
args

Arguments(model='GRU4Rec', dataset='diginetica-session', validation='False', valid_portion=0.1)

In [7]:
config_dict = {
    'USER_ID_FIELD': 'session_id',
    'load_col': None,
    'neg_sampling': None,
    'benchmark_filename': ['train', 'test'],
    'alias_of_item_id': ['item_id_list'],
    'topk': [20],
    'metrics': ['Recall', 'MRR'],
    'valid_metric': 'MRR@20'
}

In [8]:
config = Config(model=args.model, dataset=f'{args.dataset}', config_dict=config_dict)
config.final_config_dict

{'gpu_id': 0,
 'use_gpu': True,
 'seed': 2020,
 'state': 'INFO',
 'reproducibility': True,
 'data_path': 'dataset/diginetica-session',
 'checkpoint_dir': 'saved',
 'show_progress': True,
 'save_dataset': False,
 'dataset_save_path': None,
 'save_dataloaders': False,
 'dataloaders_save_path': None,
 'log_wandb': False,
 'wandb_project': 'recbole',
 'epochs': 300,
 'train_batch_size': 2048,
 'learner': 'adam',
 'learning_rate': 0.001,
 'neg_sampling': None,
 'eval_step': 1,
 'stopping_step': 10,
 'clip_grad_norm': None,
 'weight_decay': 0.0,
 'loss_decimal_place': 4,
 'require_pow': False,
 'eval_args': {'split': {'LS': 'valid_and_test'},
  'order': 'TO',
  'mode': 'full',
  'group_by': 'user'},
 'repeatable': True,
 'metrics': ['Recall', 'MRR'],
 'topk': [20],
 'valid_metric': 'MRR@20',
 'valid_metric_bigger': True,
 'eval_batch_size': 4096,
 'metric_decimal_place': 4,
 'embedding_size': 64,
 'hidden_size': 128,
 'num_layers': 1,
 'dropout_prob': 0.3,
 'loss_type': 'CE',
 'field_separat

In [9]:
dataset = create_dataset(config)

Downloaded 0.01 GB: 100%|██████████| 8/8 [00:03<00:00,  2.61it/s]


In [10]:
dataset

[1;35mdiginetica-session[0m
[1;34mThe number of users[0m: 719471
[1;34mAverage actions of users[0m: 1.0845872656260858
[1;34mThe number of items[0m: 43098
[1;34mAverage actions of items[0m: 18.110520574651286
[1;34mThe number of inters[0m: 780328
[1;34mThe sparsity of the dataset[0m: 99.9974834429483%
[1;34mRemain Fields[0m: ['session_id', 'item_id_list', 'item_id', 'item_length']

In [11]:
# the main interaction data is in the inter_feat data frame:
dataset.inter_feat.head()


Unnamed: 0,session_id,item_id_list,item_id,item_length
0,1,[24864],1,1
1,2,"[137, 3]",2,2
2,3,[137],3,1
3,4,[299],4,1
4,5,[1010],5,1


### diginetica

In [12]:
digi_args = Arguments(dataset='diginetica')
digi_args

Arguments(model='GRU4Rec', dataset='diginetica', validation='False', valid_portion=0.1)

In [13]:
digi_config_dict= {
        'USER_ID_FIELD': 'session_id',
        'load_col': None,       # load all columns. dont filter anything
        'neg_sampling': None,
        # 'benchmark_filename': ['train', 'test'],
        # 'alias_of_item_id': ['item_id_list'],
        'eval_args':{
            'group_by': 'user',
            'order': 'TO',
            'split':{'LS': 'test_only'},
            'mode': 'uni100'},
        'topk': [20],
        'metrics': ['Recall', 'MRR'],
        'valid_metric': 'MRR@20'
    }


In [14]:
digi_config = Config(model=digi_args.model, dataset=f'{digi_args.dataset}', config_dict=digi_config_dict)
digi_config.final_config_dict

{'gpu_id': 0,
 'use_gpu': True,
 'seed': 2020,
 'state': 'INFO',
 'reproducibility': True,
 'data_path': 'dataset/diginetica',
 'checkpoint_dir': 'saved',
 'show_progress': True,
 'save_dataset': False,
 'dataset_save_path': None,
 'save_dataloaders': False,
 'dataloaders_save_path': None,
 'log_wandb': False,
 'wandb_project': 'recbole',
 'epochs': 300,
 'train_batch_size': 2048,
 'learner': 'adam',
 'learning_rate': 0.001,
 'neg_sampling': None,
 'eval_step': 1,
 'stopping_step': 10,
 'clip_grad_norm': None,
 'weight_decay': 0.0,
 'loss_decimal_place': 4,
 'require_pow': False,
 'eval_args': {'group_by': 'user',
  'order': 'TO',
  'split': {'LS': 'test_only'},
  'mode': 'uni100'},
 'repeatable': True,
 'metrics': ['Recall', 'MRR'],
 'topk': [20],
 'valid_metric': 'MRR@20',
 'valid_metric_bigger': True,
 'eval_batch_size': 4096,
 'metric_decimal_place': 4,
 'embedding_size': 64,
 'hidden_size': 128,
 'num_layers': 1,
 'dropout_prob': 0.3,
 'loss_type': 'CE',
 'field_separator': '\t',


In [26]:
digi_config.final_config_dict['data_path'] = os.path.join(os.path.dirname(os.getcwd()),digi_config.final_config_dict['data_path'])
digi_config.final_config_dict['data_path']

'/home/gkoren2/study/git/guyk1971/RecBole/dataset/diginetica'

In [28]:
# diginetica = create_dataset(digi_config)
diginetica

[1;35mdiginetica[0m
[1;34mThe number of users[0m: 204790
[1;34mAverage actions of users[0m: 4.078212208663551
[1;34mThe number of items[0m: 184048
[1;34mAverage actions of items[0m: 19.3613918768546
[1;34mThe number of inters[0m: 835173
[1;34mThe sparsity of the dataset[0m: 99.99778416918709%
[1;34mRemain Fields[0m: ['session_id', 'item_id', 'timestamp', 'number of times', 'item_priceLog2', 'item_name', 'item_category']

In [35]:
# diginetica.inter_feat.session_id.value_counts()
len(diginetica)

835173

In [29]:
dataset

[1;35mdiginetica-session[0m
[1;34mThe number of users[0m: 719471
[1;34mAverage actions of users[0m: 1.0845872656260858
[1;34mThe number of items[0m: 43098
[1;34mAverage actions of items[0m: 18.110520574651286
[1;34mThe number of inters[0m: 780328
[1;34mThe sparsity of the dataset[0m: 99.9974834429483%
[1;34mRemain Fields[0m: ['session_id', 'item_id_list', 'item_id', 'item_length']

In [36]:
len(dataset)

780328

In [38]:
len(dataset.inter_feat.session_id.unique())

719470

In [39]:
print(dataset.inter_feat.head())
print(diginetica.inter_feat.head())

   session_id item_id_list  item_id  item_length
0           1      [24864]        1            1
1           2     [137, 3]        2            2
2           3        [137]        3            1
3           4        [299]        4            1
4           5       [1010]        5            1
   session_id  item_id     timestamp  number of times
0           1        1  1.463053e+09              1.0
1           1        2  1.463754e+09              1.0
2           1        3  1.462967e+09              1.0
3           1        4  1.463836e+09              1.0
4           1        5  1.462897e+09              1.0


## TMALL

### tmall-session
