# Import and setup

In [None]:
!which python

In [None]:
import os
import sys
# The parent of the notebook's working directory is used as the DIF-SR root
# (configs and checkpoints are addressed relative to it further down). It is
# inserted at position 1 of the import path so its modules can be imported.
difsr_root = os.path.split(os.getcwd())[0]
sys.path.insert(1, difsr_root)
sys.path

In [None]:
import os 
import pandas as pd
import numpy as np
from datetime import datetime
from recbole.config import Config
from recbole.data import create_dataset
from recbole.data.utils import get_dataloader
from recbole.utils import init_logger, init_seed, get_model, get_trainer, set_color
import matplotlib.pyplot as plt

In [None]:
# Absolute path of the dataset directory located next to the notebook's
# working directory; the per-dataset paths below are built from it.
dataset_root = os.path.abspath('../dataset')
# List via the variable instead of repeating the relative path, so the listing
# always shows the directory the later cells actually read from.
os.listdir(dataset_root)

In [None]:
from dataclasses import dataclass
from typing import Optional


@dataclass
class Arguments:
    """Run settings consumed by the config-building cells of this notebook.

    ``model`` and ``dataset`` are handed to ``Config``; ``config_files`` is
    split on single spaces into the list of config files.
    """

    # Model name passed to Config (and later used to look up the model class).
    model: str = 'SASRecD'
    # Dataset name passed to Config.
    dataset: str = 'yelp'
    # Space-separated config file paths; None means no extra config files.
    config_files: Optional[str] = None





# Yelp

In [None]:
dataset_path=os.path.join(dataset_root,'yelp')
os.listdir(dataset_path)

## EDA

### Reading the dataset using recbole

In [None]:
# Build the RecBole configuration for the Yelp run from configs/yelp_cat_L1.yaml.
args = Arguments(
    dataset="yelp",
    config_files=os.path.join(difsr_root, 'configs/yelp_cat_L1.yaml'),
)
# config_files is a space-separated string; turn it into a list (or None when unset).
if args.config_files:
    config_file_list = args.config_files.strip().split(' ')
else:
    config_file_list = None
config = Config(
    model=args.model,
    dataset=f'{args.dataset}',
    config_file_list=config_file_list,
)
# Prefix the configured data path with the repo root so it does not depend on
# the notebook's working directory.
config.final_config_dict['data_path'] = os.path.join(
    difsr_root, config.final_config_dict['data_path']
)
config.final_config_dict

In [None]:
dataset = create_dataset(config)
dataset

In [None]:
dataset.item_feat.head()

In [None]:
# print the list of columns that can be considered as features
dataset.field2token_id.keys()

In [None]:
dataset.item_feat['categories'].loc[0].dtype

### read item data

In [None]:
ydf=pd.read_csv(os.path.join(dataset_path,'yelp.item'),sep='\t')
ydf.head()

In [None]:
# fraction of non-NaN values in each column (non-null count divided by the number of rows)
ydf.count()/len(ydf)

### read original data
Read the data directly from the original Yelp dataset, before it has been preprocessed by RecBole. The dataset was downloaded from Kaggle (version 4).


In [None]:
import json
# Location of the raw (pre-RecBole) Yelp dump. The default is the path this
# notebook was originally run with; set the YELP_ORIG_PATH environment variable
# to point at another copy without editing the notebook.
yelp_orig_path = os.environ.get('YELP_ORIG_PATH', '/home/guy/sd1tb/datasets/yelp')
os.listdir(yelp_orig_path)


In [None]:
#read json file to dataframe
def read_json_to_df(path):
    """Load a JSON Lines file (one JSON document per line) into a DataFrame.

    Args:
        path: Path of the file to read.

    Returns:
        A pandas DataFrame built from the parsed lines, one row per
        non-blank line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Pin UTF-8 instead of inheriting the platform's locale encoding, which
    # would mis-decode (or fail on) non-ASCII text on some systems.
    with open(path, encoding='utf-8') as f:
        # Whitespace-only lines (e.g. a trailing empty line) carry no record
        # and would make json.loads raise, so skip them.
        records = [json.loads(line) for line in f if line.strip()]
    return pd.DataFrame.from_dict(records)

yelp_review=read_json_to_df(os.path.join(yelp_orig_path,'yelp_academic_dataset_review.json'))
yelp_review.head()

In [None]:
# group the reviews of user `u` by the calendar day of the review and count the reviews per day
# NOTE(review): `u` is not assigned in any cell visible in this notebook — presumably it is
# one user's slice of yelp_review with 'date' converted to datetime (the `.dt` accessor
# needs a datetime column); confirm and restore the cell that defines it.
u.groupby(u['date'].dt.date).count()['review_id']

In [None]:
# find how many users have more than 100 reviews
# Group once, count only the 'review_id' column, and reuse the per-user counts;
# the previous form ran the full groupby/count over every column twice.
reviews_per_user = yelp_review.groupby('user_id')['review_id'].count()
len(reviews_per_user[reviews_per_user > 100])

In [None]:
yelp_review['user_id'].value_counts()

## Running predictions

In [None]:
import torch
torch.cuda.is_available()



In [None]:
# Build the RecBole configuration for the prediction run from
# configs/yelp_cat_city_L1.yaml.
# args=Arguments(dataset="yelp",config_files=os.path.join(difsr_root,'configs/yelp_cat_L1.yaml'))
args = Arguments(
    dataset="yelp",
    config_files=os.path.join(difsr_root, 'configs/yelp_cat_city_L1.yaml'),
)
# config_files is a space-separated string; turn it into a list (or None when unset).
if args.config_files:
    config_file_list = args.config_files.strip().split(' ')
else:
    config_file_list = None
config = Config(
    model=args.model,
    dataset=f'{args.dataset}',
    config_file_list=config_file_list,
)
# Prefix the configured data path with the repo root so it does not depend on
# the notebook's working directory.
config.final_config_dict['data_path'] = os.path.join(
    difsr_root, config.final_config_dict['data_path']
)
config.final_config_dict

In [None]:
# Field names, taken from the config, that the cells below use to index interactions.
ITEM_ID = config['ITEM_ID_FIELD']
ITEM_SEQ_LEN = config['ITEM_LIST_LENGTH_FIELD']
# The item-sequence field is the item-id field name plus the configured list suffix.
ITEM_SEQ = ITEM_ID + config['LIST_SUFFIX']

In [None]:
# create a dataset of type SequentialDataset
dataset = create_dataset(config)
dataset

### Manual prediction

In [None]:
built_datasets = dataset.build()
train_dataset, valid_dataset, test_dataset = built_datasets
test_dataset

In [None]:
#create an instance of the model
model = get_model(config['model'])(config, dataset).to(config['device'])
model 

In [None]:
# set model path
model_path=os.path.join(difsr_root,'saved/yelp_cat_city_L1')
os.listdir(model_path)


In [None]:
model_file_name = os.path.join(model_path, 'SASRecD-Apr-09-2023_e150.pth')
# Load the checkpoint onto the configured device, so a file saved from a GPU
# run can also be loaded on a machine without that GPU.
model_file = torch.load(model_file_name, map_location=config['device'])
model.load_state_dict(model_file['state_dict'])
model.load_other_parameter(model_file.get('other_parameter'))
# Report the checkpoint path; formatting the loaded checkpoint object itself
# would print its entire contents instead of where it came from.
print('Loading model structure and parameters from {}'.format(model_file_name))
# Switch to evaluation mode before running predictions.
model.eval()


In [None]:
idx=13
n_inter=1 # number of interactions to sample
interaction = test_dataset[idx:idx+n_inter]
pos_i = interaction[ITEM_ID]
interaction

In [None]:
# Run the model on the sampled interaction and score every item by the dot
# product between the sequence output and the item embedding table.
item_seq = interaction[ITEM_SEQ].to(config['device'])
item_seq_len = interaction[ITEM_SEQ_LEN].to(config['device'])
gt_item = interaction[ITEM_ID]
seq_output = model(item_seq, item_seq_len)
item_embeddings = model.item_embedding.weight
scores = seq_output @ item_embeddings.t()  # [B, item_num]

In [None]:
# attention mask stored on the model, moved to CPU with the leading (batch) axis squeezed out
# NOTE(review): these attributes are presumably populated by the forward pass run above — confirm
attn_mask = model.extended_attention_mask.detach().cpu().squeeze(0)  # [1,L,L]

# attribute attention scores of the first layer, stored as [b,h,L,p,L] where p is the number of attributes;
# permuted to [b,p,h,L,L] and then squeezed on the batch axis
attr_attn_scores = model.trm_encoder.layer[0].multi_head_attention.attribute_attention_table.detach().cpu().permute(0,3,1,2,4).squeeze(0)    #[p,h,L,L]

# item-ID attention scores of the first layer: [b,h,L,L]
item_attn_scores = model.trm_encoder.layer[0].multi_head_attention.item_attention_scores.detach().cpu().squeeze(0)     #[h,L,L]

# position attention scores of the first layer: [b,h,L,L]
pos_attn_scores = model.trm_encoder.layer[0].multi_head_attention.pos_scores.detach().cpu().squeeze(0)    #[h,L,L]

# attention probabilities of the first layer - after fusing the separate attention scores
# (attribute, item ID and, presumably, position) and applying softmax normalization
attn_probs = model.trm_encoder.layer[0].multi_head_attention.attention_probs.detach().cpu().squeeze(0)   #[h,L,L]



In [None]:
attn_mask.shape, attr_attn_scores.shape, item_attn_scores.shape, pos_attn_scores.shape, attn_probs.shape

In [None]:
def attn_view(attention_weights, attention_mask, item_seq_len, norm_to_probs=True, figsize=(10, 20)):
    """Plot one attention heat map per head for a single sequence.

    Args:
        attention_weights: CPU torch tensor whose first axis indexes the heads
            and whose last two axes are cropped to ``item_seq_len``
            (presumably [heads, L, L] — query position x key position).
        attention_mask: Tensor added to ``attention_weights`` before the
            softmax; only used when ``norm_to_probs`` is True.
        item_seq_len: Number of leading positions to keep on both of the last
            two axes.
        norm_to_probs: If True, add the mask and apply a softmax over the last
            axis first; pass False when the input already holds probabilities.
        figsize: Size of the whole figure, forwarded to ``plt.subplots``.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    if norm_to_probs:
        attention_weights = torch.softmax(attention_weights + attention_mask, dim=-1)

    # Crop to the real sequence length and move heads last: (L, L, num_heads).
    weights = attention_weights.numpy()[:, :item_seq_len, :item_seq_len]
    num_heads = weights.shape[0]
    weights = np.transpose(weights, (1, 2, 0))

    # Renormalize every row so it sums to 1 over the kept positions. This is an
    # out-of-place division on purpose: Tensor.numpy() shares memory with the
    # tensor, so normalizing in place would silently overwrite the caller's
    # tensor whenever norm_to_probs is False.
    weights = weights / weights.sum(axis=1, keepdims=True)

    # squeeze=False keeps `axs` two-dimensional even for a single head, where
    # plt.subplots would otherwise return a bare Axes that cannot be indexed.
    fig, axs = plt.subplots(nrows=num_heads, ncols=1, figsize=figsize, squeeze=False)
    for h, ax in enumerate(axs[:, 0]):
        im = ax.imshow(weights[:, :, h], cmap='viridis', interpolation='nearest')
        ax.set_title('Head {}'.format(h + 1))
        # add a colorbar to each plot
        fig.colorbar(im, ax=ax)
    plt.show()

In [None]:
attn_view(item_attn_scores,attn_mask,item_seq_len)


In [None]:
attn_view(attr_attn_scores[1],attn_mask,item_seq_len)

In [None]:
attn_view(pos_attn_scores,attn_mask,item_seq_len)

In [None]:
attn_probs.shape

In [None]:
attn_view(attn_probs,attn_mask,item_seq_len,norm_to_probs=False)

In [None]:
from bertviz import head_view

In [None]:
head_view([item_attn_scores.unsqueeze(0)],list(item_seq[0].detach().cpu().numpy()),prettify_tokens=False)
# head_view(attr_attn_scores[0],item_seq)

In [None]:
topk=20
topk_scores, topk_items = torch.topk(scores, topk)
print(topk_items)


In [None]:
def analyze_interaction(interaction_idx, model, test_dataset, topk=100):
    """Score all items for one test interaction and return the top-k item ids.

    Reads the notebook-level ``config``, ``ITEM_SEQ``, ``ITEM_SEQ_LEN`` and
    ``ITEM_ID`` to pick the device and the interaction fields.

    Args:
        interaction_idx: Index of the interaction in ``test_dataset``; a
            one-element slice starting at this index is taken.
        model: Model called as ``model(item_seq, item_seq_len)`` and exposing
            ``item_embedding.weight``.
        test_dataset: Dataset that supports slicing into an interaction.
        topk: Number of highest-scoring items to return.

    Returns:
        Tuple ``(gt_item, topk_items)`` of numpy arrays: the interaction's
        item-id field and the indices of the ``topk`` highest scores.
    """
    interaction = test_dataset[interaction_idx:interaction_idx + 1]
    device = config['device']
    item_seq = interaction[ITEM_SEQ].to(device)
    item_seq_len = interaction[ITEM_SEQ_LEN].to(device)
    gt_item = interaction[ITEM_ID]

    # Inference only: without no_grad every call would build an autograd graph,
    # which wastes memory and time when this runs over the whole test set.
    with torch.no_grad():
        seq_output = model(item_seq, item_seq_len)
        item_embeddings = model.item_embedding.weight
        scores = torch.matmul(seq_output, item_embeddings.transpose(0, 1))  # [B, item_num]
        _, topk_items = torch.topk(scores, topk)

    return gt_item.detach().cpu().numpy(), topk_items.detach().cpu().numpy()




In [None]:
gt_item, topk_items = analyze_interaction(13, model, test_dataset, topk=20)
gt_item in topk_items

In [None]:
# One flag per test interaction: True when its ground-truth item is among the
# top-10 predicted items. Kept as a comprehension so the loop variables stay
# local and do not overwrite notebook-level names.
result = [
    truth in predicted
    for truth, predicted in (
        analyze_interaction(idx, model, test_dataset, topk=10)
        for idx in range(len(test_dataset))
    )
]
# this should be equivalent to the recall@topk
sum(result) / len(result)


### Evaluate using trainer

In [None]:
from recbole.data import data_preparation

In [None]:
train_data, valid_data, test_data = data_preparation(config, dataset)
test_data

In [None]:
test_data.dataset

In [None]:
interaction=test_data.dataset[13:14]
interaction.business_id_list

In [None]:
# interaction, scores, positive_u, positive_i = eval_func(batched_data)
batched_data = next(test_data)

In [None]:
batched_data[0].business_id_list[89]

In [None]:
batched_data

In [None]:
#create an instance of the model
model = get_model(config['model'])(config, dataset).to(config['device'])
model 

In [None]:
# set model path
# NOTE(review): the most recent config cell in this notebook loads
# configs/yelp_cat_city_L1.yaml, while this directory is saved/yelp_cat_L1 —
# confirm that the checkpoints here match the model instantiated from that config.
model_path=os.path.join(difsr_root,'saved/yelp_cat_L1')
os.listdir(model_path)

In [None]:
# get trainer
trainer = get_trainer(config['MODEL_TYPE'], config['model'])(config, model)


In [None]:
# Evaluate the epoch-150 checkpoint on the test data; the trainer is asked to
# load the weights from `model_file` before evaluating.
model_file_name = os.path.join(model_path, 'SASRecD-Apr-09-2023_e150.pth')
test_result = trainer.evaluate(
    test_data,
    load_best_model=True,
    model_file=model_file_name,
    show_progress=config['show_progress'],
)
test_result

In [None]:
# Evaluate the 'SASRecD-Apr-09-2023_20-03-18.pth' checkpoint on the same test
# data; the trainer is asked to load the weights from `model_file` first.
model_file_name = os.path.join(model_path, 'SASRecD-Apr-09-2023_20-03-18.pth')
test_result = trainer.evaluate(
    test_data,
    load_best_model=True,
    model_file=model_file_name,
    show_progress=config['show_progress'],
)
test_result