In [1]:
import os
import numpy as np
import pandas as pd

import logging
from logging import getLogger
from recbole.config import Config
from recbole.data import create_dataset, data_preparation
import recbole.model.sequential_recommender as seq_models
import recbole.trainer as trainers
from recbole.utils import init_seed, init_logger

logger = None

In [2]:
dataset_name = 'coveo'
data_path = f'./data/{dataset_name}'
dataset = f'{dataset_name}_processed_view_train_full_test_stacked.inter'
dataset_train = f'{dataset_name}_processed_view_train_full.inter'
dataset_test = f'{dataset_name}_processed_view_test_augmented.inter'
# Model parameters
model_params = {
    'coveo': {
        "model": "Gru4Rec",
        "loss": "CE",
        "learner": "adagrad",
        "embedding_size": 512,
        "hidden_size": 512,
        "num_layers": 1,
        "train_batch_size": 32,
        "eval_batch_size": 2048,
        "dropout_prob": 0.4, #embedding dropout
        "lr": 0.03,
    },
}

In [3]:
def create_inter_files(parameter_dict:dict):
    for fn in os.listdir(parameter_dict['data_path']):
        if fn.endswith('.tsv'):
            file_path = os.path.join(parameter_dict['data_path'], fn)
            with open(file_path, 'r') as f:
                lines = f.readlines()
                headers = lines[0].split('\t')
                headers = [h.replace('\n', '') + parameter_dict['column_postfix'][h.replace('\n', '')] for h in headers]
                lines[0] = '\t'.join(headers) + '\n'
            with open(file_path, 'w') as f:
                f.writelines(lines)
            os.rename(file_path, file_path.replace('.tsv', '.inter'))

def stack_train_test_datasets(data_path:str, dataset_train:str, dataset_test:str):
    ds_stacked = []
    for ds_name in [dataset_train, dataset_test]:
        data = pd.read_csv(os.path.join(data_path, ds_name), sep='\t')
        print(data.shape)
        ds_stacked.append(data)
    ds_stacked = pd.concat(ds_stacked, axis=0)
    ds_stacked.to_csv(os.path.join(data_path, dataset), sep='\t', index=False)
    print("stacked_shape", ds_stacked.shape)

def augment_session_data(parameter_dict:dict, data_path:str, dataset_test:str):
    user_id = parameter_dict['USER_ID_FIELD']
    user_id += parameter_dict['column_postfix'][user_id] 
    item_id = parameter_dict['ITEM_ID_FIELD']
    item_id += parameter_dict['column_postfix'][item_id]
    timestamp = parameter_dict['TIME_FIELD']
    timestamp += parameter_dict['column_postfix'][timestamp]

    data = pd.read_csv(os.path.join(data_path, dataset_test.replace('_augmented', '')), sep='\t')
    print("original_shape", data.shape, "orignal_sessions", data[user_id].nunique())
    data = data.sort_values(by=[user_id, timestamp])
    new_data = []
    i = 0
    for session_id, session_data in data.groupby(user_id):
        for i in range(1, len(session_data)):
            aug_data = session_data.iloc[:i+1].copy()
            aug_data[user_id] = aug_data[user_id].apply(lambda x: f"{x}_aug_{i-1}")
            new_data.append(aug_data)
    new_data = pd.concat(new_data, axis=0)
    new_data.to_csv(os.path.join(data_path, dataset_test), sep='\t', index=False)
    print("augmented_shape", new_data.shape, "augmented_sessions", new_data[user_id].nunique())

In [4]:
epochs = 5
user_id = 'SessionId'
item_id = 'ItemId'
timestamp = 'Time'
parameter_dict = {
    'data_path': data_path,
    'dataset': dataset,
    'USER_ID_FIELD': user_id,
    'ITEM_ID_FIELD': item_id,
    'TIME_FIELD': timestamp,
    'column_postfix':{
        user_id: ':token',
        item_id: ':token',
        timestamp: ':float'
    },
    # Preproc
    'user_inter_num_interval': "[1,inf)",
    'item_inter_num_interval': "[1,inf)",
    'load_col': {'inter': [user_id, item_id, timestamp]},
    
    # Train
    'train_neg_sample_args': None, # XE does not support neg sampling
    'epochs': epochs,
    'eval_step': epochs,

    # Eval
    'neg_sampling': None,
    'shuffle': False,
    'eval_args': {
        'split': {
            'TS': {
                'train_path': os.path.join(data_path, dataset_train),
                'test_path': os.path.join(data_path, dataset_test),}
            },
        'group_by': 'user',
        'order': 'TO',
        'mode': 'full',
        },
    'topk': [1,5,10,20],
    'metrics': ['Recall', 'MRR', 'Hit'],
}
parameter_dict.update(model_params[dataset_name])

create_inter_files(parameter_dict)
augment_session_data(parameter_dict, data_path, dataset_test)
stack_train_test_datasets(data_path, dataset_train, dataset_test)

config = Config(model='GRU4Rec', config_dict=parameter_dict) #dataset="recbox_data"

# init random seed
init_seed(config['seed'], config['reproducibility'])

if logger is None:
    # logger initialization
    init_logger(config)
    logger = getLogger()
    # Create handlers
    c_handler = logging.StreamHandler()
    c_handler.setLevel(logging.INFO)
    logger.addHandler(c_handler)

    # write config info into log
logger.info(config)

original_shape (52501, 3) orignal_sessions 7748
augmented_shape (465904, 3) augmented_sessions 44753
(1411113, 3)
(465904, 3)


05 Sep 14:19    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = ./data/coveo/coveo_processed_view_train_full_test_stacked.inter
checkpoint_dir = saved
show_progress = True
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 5
train_batch_size = 32
learner = adagrad
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'none', 'sample_num': 'none', 'alpha': 'none', 'dynamic': False, 'candidate_num': 0}

stacked_shape (1877017, 3)



eval_step = 5
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'TS': {'train_path': './data/coveo/coveo_processed_view_train_full.inter', 'test_path': './data/coveo/coveo_processed_view_test_augmented.inter'}}, 'order': 'TO', 'group_by': 'user', 'mode': {'valid': 'full', 'test': 'full'}}
repeatable = True
metrics = ['Recall', 'MRR', 'Hit']
topk = [1, 5, 10, 20]
valid_metric = MRR@10
valid_metric_bigger = True
eval_batch_size = 2048
metric_decimal_place = 4

Dataset Hyper Parameters:
field_separator = 	
seq_separator =  
USER_ID_FIELD = SessionId
ITEM_ID_FIELD = ItemId
RATING_FIELD = rating
TIME_FIELD = Time
seq_len = None
LABEL_FIELD = label
threshold = None
NEG_PREFIX = neg_
load_col = {'inter': ['SessionId', 'ItemId', 'Time']}
unload_col = None
unused_col = None
additional_feat_suffix = None
rm_dup_inter = None
val_interval = None
filter_inter_by_user_or_item = True
user_inter_num_interval = [1,in

In [5]:
dataset = create_dataset(config)
logger.info(dataset)

05 Sep 14:20    INFO  coveo_processed_view_train_full_test_stacked.inter
The number of users: 210427
Average actions of users: 8.920081168676875
The number of items: 10869
Average actions of items: 172.71043430253957
The number of inters: 1877017
The sparsity of the dataset: 99.91793137567123%
Remain Fields: ['SessionId', 'ItemId', 'Time']
coveo_processed_view_train_full_test_stacked.inter
The number of users: 210427
Average actions of users: 8.920081168676875
The number of items: 10869
Average actions of items: 172.71043430253957
The number of inters: 1877017
The sparsity of the dataset: 99.91793137567123%
Remain Fields: ['SessionId', 'ItemId', 'Time']


In [6]:
dataset.inter_feat.head()

Unnamed: 0,SessionId,ItemId,Time
0,1,1,1544228000000.0
1,1,2,1544228000000.0
2,1,3,1544229000000.0
3,1,4,1544229000000.0
4,1,5,1544229000000.0


In [7]:
# dataset splitting
train_data, _, test_data = data_preparation(config, dataset)

changed feat format


05 Sep 14:21    INFO  time based split, from prepared file, train_path=[./data/coveo/coveo_processed_view_train_full.inter], test_path=[./data/coveo/coveo_processed_view_test_augmented.inter]
time based split, from prepared file, train_path=[./data/coveo/coveo_processed_view_train_full.inter], test_path=[./data/coveo/coveo_processed_view_test_augmented.inter]


(465904, 3) (1411113, 3)
(465904, 3) (1411113, 3)


05 Sep 14:21    INFO  [Training]: train_batch_size = [32] train_neg_sample_args: [{'distribution': 'none', 'sample_num': 'none', 'alpha': 'none', 'dynamic': False, 'candidate_num': 0}]
[Training]: train_batch_size = [32] train_neg_sample_args: [{'distribution': 'none', 'sample_num': 'none', 'alpha': 'none', 'dynamic': False, 'candidate_num': 0}]
05 Sep 14:21    INFO  [Evaluation]: eval_batch_size = [2048] eval_args: [{'split': {'TS': {'train_path': './data/coveo/coveo_processed_view_train_full.inter', 'test_path': './data/coveo/coveo_processed_view_test_augmented.inter'}}, 'order': 'TO', 'group_by': 'user', 'mode': {'valid': 'full', 'test': 'full'}}]
[Evaluation]: eval_batch_size = [2048] eval_args: [{'split': {'TS': {'train_path': './data/coveo/coveo_processed_view_train_full.inter', 'test_path': './data/coveo/coveo_processed_view_test_augmented.inter'}}, 'order': 'TO', 'group_by': 'user', 'mode': {'valid': 'full', 'test': 'full'}}]


In [8]:
# model loading and initialization
model_gru4rec = seq_models.GRU4Rec(config, train_data.dataset).to(config['device'])
logger.info(model_gru4rec)

# trainer loading and initialization
trainer = trainers.Trainer(config, model_gru4rec)

# model training
best_valid_score, best_valid_result = trainer.fit(train_data=train_data, valid_data=test_data, show_progress=False)

05 Sep 14:21    INFO  GRU4Rec(
  (item_embedding): Embedding(10869, 512, padding_idx=0)
  (emb_dropout): Dropout(p=0.4, inplace=False)
  (gru_layers): GRU(512, 512, bias=False, batch_first=True)
  (dense): Linear(in_features=512, out_features=512, bias=True)
  (loss_fct): CrossEntropyLoss()
)
Trainable parameters: 7400448
GRU4Rec(
  (item_embedding): Embedding(10869, 512, padding_idx=0)
  (emb_dropout): Dropout(p=0.4, inplace=False)
  (gru_layers): GRU(512, 512, bias=False, batch_first=True)
  (dense): Linear(in_features=512, out_features=512, bias=True)
  (loss_fct): CrossEntropyLoss()
)
Trainable parameters: 7400448
05 Sep 14:23    INFO  epoch 0 training [time: 97.39s, train loss: 8.8068]
epoch 0 training [time: 97.39s, train loss: 8.8068]
05 Sep 14:25    INFO  epoch 1 training [time: 98.58s, train loss: 8.5009]
epoch 1 training [time: 98.58s, train loss: 8.5009]
05 Sep 14:26    INFO  epoch 2 training [time: 98.40s, train loss: 8.3896]
epoch 2 training [time: 98.40s, train loss: 8.38

Avg. epoch time (s) 98.02
