# Import and setup

In [1]:
!which python

/home/guy/anaconda3/envs/difsr/bin/python


In [2]:
import os
import sys
difsr_root= os.path.dirname(os.getcwd())
sys.path.insert(1, difsr_root)
sys.path

['/home/guy/workspace/work/git/gkoren2/DIF-SR/notebooks',
 '/home/guy/workspace/work/git/gkoren2/DIF-SR',
 '/home/guy/anaconda3/envs/difsr/lib/python39.zip',
 '/home/guy/anaconda3/envs/difsr/lib/python3.9',
 '/home/guy/anaconda3/envs/difsr/lib/python3.9/lib-dynload',
 '',
 '/home/guy/anaconda3/envs/difsr/lib/python3.9/site-packages']

In [3]:
import os 
import pandas as pd
import numpy as np
from datetime import datetime
from recbole.config import Config
from recbole.data import create_dataset
from recbole.data.utils import get_dataloader
from recbole.utils import init_logger, init_seed, get_model, get_trainer, set_color
import matplotlib.pyplot as plt

In [4]:
dataset_root = os.path.abspath('../dataset')
# os.listdir(dataset_root)
os.listdir('../dataset/')

['Amazon_Sports_and_Outdoors',
 'Amazon_Beauty',
 'Amazon_Toys_and_Games',
 'Steam',
 'yelp']

In [5]:
from dataclasses import dataclass

@dataclass
class Arguments:
    model:str = 'SASRecD'
    dataset:str = 'yelp'
    config_files:str = None





# Yelp

In [6]:
dataset_path=os.path.join(dataset_root,'yelp')
os.listdir(dataset_path)

['README.md', 'yelp.inter', 'yelp.zip', 'yelp.user', 'yelp.item']

## EDA

### Reading the dataset using recbole

In [None]:
# reading the dataset through the recbole framework
args=Arguments(dataset="yelp",config_files=os.path.join(difsr_root,'configs/yelp_cat_L1.yaml'))
config_file_list = args.config_files.strip().split(' ') if args.config_files else None
config = Config(model=args.model, dataset=f'{args.dataset}', config_file_list=config_file_list)
config.final_config_dict['data_path'] = os.path.join(difsr_root,config.final_config_dict['data_path'])
config.final_config_dict

In [None]:
dataset = create_dataset(config)
dataset

In [None]:
dataset.item_feat.head()

In [None]:
# print the list of columns that can be considered as features
dataset.field2token_id.keys()

In [None]:
dataset.item_feat['categories'].loc[0].dtype

### read item data

In [None]:
ydf=pd.read_csv(os.path.join(dataset_path,'yelp.item'),sep='\t')
ydf.head()

In [None]:
# check how many non-Nan values are there in each column
ydf.count()/len(ydf)

### read original data
reading the data from the Yelp dataset, before it has been preprocessed by recbole. the dataset was downloaded from Kaggle (version 4)


In [None]:
import json
yelp_orig_path='/home/guy/sd1tb/datasets/yelp'
os.listdir(yelp_orig_path)


In [None]:
#read json file to dataframe
def read_json_to_df(path):
    with open(path) as f:
        data = [json.loads(line) for line in f]
    return pd.DataFrame.from_dict(data)

yelp_review=read_json_to_df(os.path.join(yelp_orig_path,'yelp_academic_dataset_review.json'))
yelp_review.head()

In [None]:
# group by reviews of user u by day of review and count the number of reviews
u.groupby(u['date'].dt.date).count()['review_id']

In [None]:
# find how many users have more than 100 reviews
len(yelp_review.groupby('user_id').count()['review_id'][yelp_review.groupby('user_id').count()['review_id']>100])

In [None]:
yelp_review['user_id'].value_counts()

## running predictions

In [7]:
import torch



In [8]:
# reading the dataset through the recbole framework
args=Arguments(dataset="yelp",config_files=os.path.join(difsr_root,'configs/yelp_cat_L1.yaml'))
config_file_list = args.config_files.strip().split(' ') if args.config_files else None
config = Config(model=args.model, dataset=f'{args.dataset}', config_file_list=config_file_list)
config.final_config_dict['data_path'] = os.path.join(difsr_root,config.final_config_dict['data_path'])
config.final_config_dict

{'gpu_id': 0,
 'use_gpu': True,
 'seed': 212,
 'state': 'INFO',
 'reproducibility': True,
 'data_path': '/home/guy/workspace/work/git/gkoren2/DIF-SR/dataset/yelp',
 'checkpoint_dir': 'saved/yelp_cat_L1',
 'show_progress': True,
 'save_dataset': False,
 'save_dataloaders': False,
 'epochs': 2,
 'train_batch_size': 1536,
 'learner': 'adam',
 'learning_rate': 0.0001,
 'neg_sampling': None,
 'eval_step': 2,
 'stopping_step': 10,
 'clip_grad_norm': None,
 'weight_decay': 0.0,
 'multi_gpus': False,
 'eval_args': {'split': {'LS': 'valid_and_test'},
  'group_by': 'user',
  'order': 'TO',
  'mode': 'full'},
 'repeatable': True,
 'metrics': ['Recall', 'NDCG'],
 'topk': [3, 5, 10, 20],
 'valid_metric': 'Recall@20',
 'valid_metric_bigger': True,
 'eval_batch_size': 256,
 'loss_decimal_place': 4,
 'metric_decimal_place': 4,
 'n_layers': 1,
 'n_heads': 8,
 'hidden_size': 256,
 'attribute_hidden_size': [64],
 'inner_size': 256,
 'hidden_dropout_prob': 0.5,
 'attn_dropout_prob': 0.3,
 'hidden_act': 'g

In [9]:
ITEM_ID = config['ITEM_ID_FIELD']
ITEM_SEQ = ITEM_ID + config['LIST_SUFFIX']
ITEM_SEQ_LEN = config['ITEM_LIST_LENGTH_FIELD']

In [10]:
# create a dataset of type SequentialDataset
dataset = create_dataset(config)
dataset

[1;35myelp[0m
[1;34mThe number of users[0m: 30500
[1;34mAverage actions of users[0m: 10.399750811502017
[1;34mThe number of items[0m: 20069
[1;34mAverage actions of items[0m: 15.805361769982062
[1;34mThe number of inters[0m: 317182
[1;34mThe sparsity of the dataset[0m: 99.94818172387231%
[1;34mRemain Fields[0m: ['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny', 'cool', 'date', 'item_name', 'address', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'item_stars', 'item_review_count', 'is_open', 'categories']

### Manual prediction

In [11]:
built_datasets = dataset.build()
train_dataset, valid_dataset, test_dataset = built_datasets
test_dataset

[1;35myelp[0m
[1;34mThe number of users[0m: 30500
[1;34mAverage actions of users[0m: 1.0
[1;34mThe number of items[0m: 20069
[1;34mAverage actions of items[0m: 2.3587780355761794
[1;34mThe number of inters[0m: 30499
[1;34mThe sparsity of the dataset[0m: 99.99501735406291%
[1;34mRemain Fields[0m: ['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny', 'cool', 'date', 'item_name', 'address', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'item_stars', 'item_review_count', 'is_open', 'categories', 'review_id_list', 'business_id_list', 'stars_list', 'useful_list', 'funny_list', 'cool_list', 'date_list', 'item_length']

In [12]:
#create an instance of the model
model = get_model(config['model'])(config, dataset).to(config['device'])
model 

SASRecD(
  (item_embedding): Embedding(20069, 256, padding_idx=0)
  (position_embedding): Embedding(50, 256)
  (feature_embed_layer_list): ModuleList(
    (0): FeatureSeqEmbLayer()
  )
  (trm_encoder): DIFTransformerEncoder(
    (layer): ModuleList(
      (0): DIFTransformerLayer(
        (multi_head_attention): DIFMultiHeadAttention(
          (query): Linear(in_features=256, out_features=256, bias=True)
          (key): Linear(in_features=256, out_features=256, bias=True)
          (value): Linear(in_features=256, out_features=256, bias=True)
          (query_p): Linear(in_features=256, out_features=256, bias=True)
          (key_p): Linear(in_features=256, out_features=256, bias=True)
          (query_layers): ModuleList(
            (0): Linear(in_features=64, out_features=64, bias=True)
          )
          (key_layers): ModuleList(
            (0): Linear(in_features=64, out_features=64, bias=True)
          )
          (fusion_layer): VanillaAttention(
            (projection):

In [15]:
# set model path
model_path=os.path.join(difsr_root,'saved/yelp_cat_L1')
os.listdir(model_path)


['SASRecD-Apr-09-2023_e150.pth',
 'SASRecD-Apr-09-2023_20-03-18.pth',
 'SASRecD-Apr-10-2023_10-51-26.pth',
 'SASRecD-Apr-09-2023_16-15-47.pth']

In [16]:
model_file_name=os.path.join(model_path,'SASRecD-Apr-09-2023_e150.pth')
# load the model
model_file = torch.load(model_file_name)
model.load_state_dict(model_file['state_dict'])
model.load_other_parameter(model_file.get('other_parameter'))
print('Loading model structure and parameters from {}'.format(model_file))
model.eval()


Loading model structure and parameters from {'config': 
[1;35mGeneral Hyper Parameters:
[0m[1;36mgpu_id[0m =[1;33m 0[0m
[1;36muse_gpu[0m =[1;33m True[0m
[1;36mseed[0m =[1;33m 5678[0m
[1;36mstate[0m =[1;33m INFO[0m
[1;36mreproducibility[0m =[1;33m True[0m
[1;36mdata_path[0m =[1;33m dataset/yelp[0m
[1;36mshow_progress[0m =[1;33m True[0m
[1;36msave_dataset[0m =[1;33m False[0m
[1;36msave_dataloaders[0m =[1;33m False[0m
[1;36mbenchmark_filename[0m =[1;33m None[0m

[1;35mTraining Hyper Parameters:
[0m[1;36mcheckpoint_dir[0m =[1;33m /export/work/gkoren2/temp/difsr/exp4/yelp_cat_L1[0m
[1;36mepochs[0m =[1;33m 150[0m
[1;36mtrain_batch_size[0m =[1;33m 2048[0m
[1;36mlearner[0m =[1;33m adam[0m
[1;36mlearning_rate[0m =[1;33m 0.0001[0m
[1;36meval_step[0m =[1;33m 2[0m
[1;36mstopping_step[0m =[1;33m 10[0m
[1;36mclip_grad_norm[0m =[1;33m None[0m
[1;36mweight_decay[0m =[1;33m 0.0[0m
[1;36mloss_decimal_place[0m =[1;33m 4[0

SASRecD(
  (item_embedding): Embedding(20069, 256, padding_idx=0)
  (position_embedding): Embedding(50, 256)
  (feature_embed_layer_list): ModuleList(
    (0): FeatureSeqEmbLayer()
  )
  (trm_encoder): DIFTransformerEncoder(
    (layer): ModuleList(
      (0): DIFTransformerLayer(
        (multi_head_attention): DIFMultiHeadAttention(
          (query): Linear(in_features=256, out_features=256, bias=True)
          (key): Linear(in_features=256, out_features=256, bias=True)
          (value): Linear(in_features=256, out_features=256, bias=True)
          (query_p): Linear(in_features=256, out_features=256, bias=True)
          (key_p): Linear(in_features=256, out_features=256, bias=True)
          (query_layers): ModuleList(
            (0): Linear(in_features=64, out_features=64, bias=True)
          )
          (key_layers): ModuleList(
            (0): Linear(in_features=64, out_features=64, bias=True)
          )
          (fusion_layer): VanillaAttention(
            (projection):

In [17]:
idx=13
n_inter=1 # number of interactions to sample
interaction = test_dataset[idx:idx+n_inter]
pos_i = interaction[ITEM_ID]
interaction

The batch_size of interaction: 1
    review_id, torch.Size([1]), cpu, torch.int64
    user_id, torch.Size([1]), cpu, torch.int64
    business_id, torch.Size([1]), cpu, torch.int64
    stars, torch.Size([1]), cpu, torch.float32
    useful, torch.Size([1]), cpu, torch.float32
    funny, torch.Size([1]), cpu, torch.float32
    cool, torch.Size([1]), cpu, torch.float32
    date, torch.Size([1]), cpu, torch.float32
    item_length, torch.Size([1]), cpu, torch.int64
    review_id_list, torch.Size([1, 50]), cpu, torch.int64
    business_id_list, torch.Size([1, 50]), cpu, torch.int64
    stars_list, torch.Size([1, 50]), cpu, torch.float64
    useful_list, torch.Size([1, 50]), cpu, torch.float64
    funny_list, torch.Size([1, 50]), cpu, torch.float64
    cool_list, torch.Size([1, 50]), cpu, torch.float64
    date_list, torch.Size([1, 50]), cpu, torch.float64
    item_name, torch.Size([1, 12]), cpu, torch.int64
    address, torch.Size([1, 20]), cpu, torch.int64
    city, torch.Size([1, 4]), cpu,

In [19]:
# to get the embedding of the predicted item
item_seq = interaction[ITEM_SEQ].to(config['device'])
item_seq_len = interaction[ITEM_SEQ_LEN].to(config['device'])
gt_item = interaction[ITEM_ID]
item_embeddings = model.item_embedding.weight
seq_output=model(item_seq, item_seq_len)
scores = torch.matmul(seq_output, item_embeddings.transpose(0, 1))  # [B, item_num]

In [20]:
topk=20
topk_scores, topk_items = torch.topk(scores, topk)
print(topk_items)


tensor([[ 4239,  9837,  7681,  1176,   252,  6442,  5399, 18876,  7034, 18496,
          1798,  6092, 14591, 15932, 16949, 15591,  5090, 16198, 12129, 15811]],
       device='cuda:0')


In [38]:
def analyze_interaction(interaction_idx, model, test_dataset, topk=100):
    interaction = test_dataset[interaction_idx:interaction_idx+1]
    item_seq = interaction[ITEM_SEQ].to(config['device'])
    item_seq_len = interaction[ITEM_SEQ_LEN].to(config['device'])
    gt_item = interaction[ITEM_ID]
    item_embeddings = model.item_embedding.weight
    seq_output=model(item_seq, item_seq_len)
    scores = torch.matmul(seq_output, item_embeddings.transpose(0, 1))  # [B, item_num]
    topk_scores, topk_items = torch.topk(scores, topk)
    
    return gt_item.detach().cpu().numpy(), topk_items.detach().cpu().numpy()




In [40]:
gt_item, topk_items = analyze_interaction(13, model, test_dataset, topk=20)
gt_item in topk_items

True

In [43]:
result=[gt in topk for gt, topk in [analyze_interaction(i, model, test_dataset, topk=10) for i in range(len(test_dataset))]]
# this should be equivalent to the recall@topk 
sum(result)/len(result)


1956

### Evaluate using trainer

In [11]:
from recbole.data import data_preparation

In [12]:
train_data, valid_data, test_data = data_preparation(config, dataset)
test_data

<recbole.data.dataloader.general_dataloader.FullSortEvalDataLoader at 0x7f2b5e2f1910>

In [13]:
test_data.dataset

[1;35myelp[0m
[1;34mThe number of users[0m: 30500
[1;34mAverage actions of users[0m: 1.0
[1;34mThe number of items[0m: 20069
[1;34mAverage actions of items[0m: 2.3587780355761794
[1;34mThe number of inters[0m: 30499
[1;34mThe sparsity of the dataset[0m: 99.99501735406291%
[1;34mRemain Fields[0m: ['review_id', 'user_id', 'business_id', 'stars', 'useful', 'funny', 'cool', 'date', 'item_name', 'address', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'item_stars', 'item_review_count', 'is_open', 'categories', 'review_id_list', 'business_id_list', 'stars_list', 'useful_list', 'funny_list', 'cool_list', 'date_list', 'item_length']

In [14]:
interaction=test_data.dataset[13:14]
interaction.business_id_list

tensor([[19574,  4309,  3359, 11125, 16113,  4239,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0]])

In [17]:
# interaction, scores, positive_u, positive_i = eval_func(batched_data)
batched_data = next(test_data)

In [18]:
batched_data[0].business_id_list[89]

tensor([10649,  6044, 12291,  9631, 15847, 19154, 19154, 13164,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0])

In [None]:
batched_data

In [15]:
#create an instance of the model
model = get_model(config['model'])(config, dataset).to(config['device'])
model 

SASRecD(
  (item_embedding): Embedding(20069, 256, padding_idx=0)
  (position_embedding): Embedding(50, 256)
  (feature_embed_layer_list): ModuleList(
    (0): FeatureSeqEmbLayer()
  )
  (trm_encoder): DIFTransformerEncoder(
    (layer): ModuleList(
      (0): DIFTransformerLayer(
        (multi_head_attention): DIFMultiHeadAttention(
          (query): Linear(in_features=256, out_features=256, bias=True)
          (key): Linear(in_features=256, out_features=256, bias=True)
          (value): Linear(in_features=256, out_features=256, bias=True)
          (query_p): Linear(in_features=256, out_features=256, bias=True)
          (key_p): Linear(in_features=256, out_features=256, bias=True)
          (query_layers): ModuleList(
            (0): Linear(in_features=64, out_features=64, bias=True)
          )
          (key_layers): ModuleList(
            (0): Linear(in_features=64, out_features=64, bias=True)
          )
          (fusion_layer): VanillaAttention(
            (projection):

In [15]:
# set model path
model_path=os.path.join(difsr_root,'saved/yelp_cat_L1')
os.listdir(model_path)

['SASRecD-Apr-09-2023_e150.pth',
 'SASRecD-Apr-09-2023_20-03-18.pth',
 'SASRecD-Apr-10-2023_10-51-26.pth',
 'SASRecD-Apr-09-2023_16-15-47.pth']

In [16]:
# get trainer
trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)


In [17]:
model_file_name=os.path.join(model_path,'SASRecD-Apr-09-2023_e150.pth')
test_result = trainer.evaluate(test_data,
                               load_best_model=True,
                               model_file = model_file_name,
                               show_progress=config['show_progress'])
test_result

[1;35mEvaluate   [0m: 100%|███████████████████████| 120/120 [07:45<00:00,  3.88s/it, [1;33mGPU RAM: 0.52 G/11.92 G[0m][0m


{'recall@3': 0.0356,
 'recall@5': 0.0446,
 'recall@10': 0.0641,
 'recall@20': 0.0924,
 'ndcg@3': 0.0297,
 'ndcg@5': 0.0334,
 'ndcg@10': 0.0396,
 'ndcg@20': 0.0468}

In [None]:
model_file_name=os.path.join(model_path,'SASRecD-Apr-09-2023_20-03-18.pth')
test_result = trainer.evaluate(test_data,
                               load_best_model=True,
                               model_file = model_file_name,
                               show_progress=config['show_progress'])
test_result