# Import and setup

In [1]:
# Report the interpreter this kernel is actually running. `!which python` only
# shows the first `python` on the shell PATH, which may belong to another environment.
import sys
sys.executable

/home/guy/anaconda3/envs/difsr/bin/python


In [2]:
import os
import sys

# The repository root is taken to be the parent of the current working directory
# (assumes the kernel was started inside the repo's notebooks/ folder).
difsr_root = os.path.dirname(os.getcwd())

# Make the repository importable. The membership guard keeps the cell idempotent:
# re-running it no longer stacks duplicate entries onto sys.path.
if difsr_root not in sys.path:
    sys.path.insert(1, difsr_root)
sys.path

['/home/guy/workspace/work/git/gkoren2/DIF-SR/notebooks',
 '/home/guy/workspace/work/git/gkoren2/DIF-SR',
 '/home/guy/anaconda3/envs/difsr/lib/python39.zip',
 '/home/guy/anaconda3/envs/difsr/lib/python3.9',
 '/home/guy/anaconda3/envs/difsr/lib/python3.9/lib-dynload',
 '',
 '/home/guy/anaconda3/envs/difsr/lib/python3.9/site-packages']

In [16]:
import os 
import pandas as pd
import numpy as np
from datetime import datetime
from recbole.config import Config
from recbole.data import create_dataset
from recbole.data.utils import get_dataloader
from recbole.utils import init_logger, init_seed, get_model, get_trainer, set_color
import matplotlib.pyplot as plt

In [4]:
# Absolute path of the dataset folder, resolved relative to the current working directory.
dataset_root = os.path.abspath('../dataset')
# List through dataset_root so the directory shown is the one later cells build paths from.
os.listdir(dataset_root)

['Amazon_Sports_and_Outdoors',
 'Amazon_Beauty',
 'Amazon_Toys_and_Games',
 'Steam',
 'yelp']

In [5]:
from dataclasses import dataclass
from typing import Optional


@dataclass
class Arguments:
    """Run settings passed on to RecBole's ``Config`` in the cells below.

    Attributes:
        model: Model name handed to ``Config(model=...)``.
        dataset: Dataset name handed to ``Config(dataset=...)``.
        config_files: Space-separated YAML config file path(s), or ``None``
            to pass no config file list.
    """
    model: str = 'SASRecD'
    dataset: str = 'Amazon_Beauty'
    config_files: Optional[str] = None





# Amazon Beauty

In [6]:
# Folder of the Amazon Beauty dataset under dataset_root; list the files it contains.
dataset_path=os.path.join(dataset_root,'Amazon_Beauty')
os.listdir(dataset_path)

['Amazon_Beauty.zip', 'Amazon_Beauty.inter', 'Amazon_Beauty.item']

## Reading the dataset using recbole 

In [7]:
# Build the RecBole configuration for Amazon Beauty (model comes from the Arguments default).
args = Arguments(
    dataset="Amazon_Beauty",
    config_files=os.path.join(difsr_root, 'configs/Amazon_Beauty_opt.yaml'),
)
# config_files is a whitespace-separated list of YAML paths. split() with no argument
# tolerates repeated/leading/trailing whitespace instead of yielding empty entries.
# NOTE(review): a path that itself contains spaces would still be split apart here.
config_file_list = args.config_files.split() if args.config_files else None
config = Config(model=args.model, dataset=args.dataset, config_file_list=config_file_list)
# Anchor data_path at the repository root (presumably it is configured relative to
# the repo root, while this notebook runs from a subdirectory - confirm).
config.final_config_dict['data_path'] = os.path.join(difsr_root, config.final_config_dict['data_path'])
config.final_config_dict

{'gpu_id': 0,
 'use_gpu': True,
 'seed': 212,
 'state': 'INFO',
 'reproducibility': True,
 'data_path': '/home/guy/workspace/work/git/gkoren2/DIF-SR/dataset/Amazon_Beauty',
 'checkpoint_dir': 'saved',
 'show_progress': True,
 'save_dataset': False,
 'save_dataloaders': False,
 'epochs': 200,
 'train_batch_size': 1536,
 'learner': 'adam',
 'learning_rate': 0.0001,
 'neg_sampling': None,
 'eval_step': 2,
 'stopping_step': 10,
 'clip_grad_norm': None,
 'weight_decay': 0.0,
 'multi_gpus': False,
 'eval_args': {'split': {'LS': 'valid_and_test'},
  'group_by': 'user',
  'order': 'TO',
  'mode': 'full'},
 'repeatable': True,
 'metrics': ['Recall', 'NDCG'],
 'topk': [3, 5, 10, 20],
 'valid_metric': 'Recall@20',
 'valid_metric_bigger': True,
 'eval_batch_size': 128,
 'loss_decimal_place': 4,
 'metric_decimal_place': 4,
 'n_layers': 4,
 'n_heads': 8,
 'hidden_size': 256,
 'attribute_hidden_size': [64],
 'inner_size': 256,
 'hidden_dropout_prob': 0.5,
 'attn_dropout_prob': 0.3,
 'hidden_act': 'ge

In [8]:
# Build the RecBole dataset from `config`; displaying it shows the summary statistics.
dataset = create_dataset(config)
dataset

[1;35mAmazon_Beauty[0m
[1;34mThe number of users[0m: 22364
[1;34mAverage actions of users[0m: 8.876358270357287
[1;34mThe number of items[0m: 12102
[1;34mAverage actions of items[0m: 16.403768283612923
[1;34mThe number of inters[0m: 198502
[1;34mThe sparsity of the dataset[0m: 99.92665707018277%
[1;34mRemain Fields[0m: ['user_id', 'item_id', 'rating', 'timestamp', 'title', 'sales_type', 'sales_rank', 'categories', 'price', 'brand']

In [9]:
# Peek at the first rows of the item feature table as held by the RecBole dataset
# (token columns appear to be already mapped to numeric ids - confirm).
dataset.item_feat.head()

Unnamed: 0,item_id,title,sales_type,sales_rank,categories,price,brand
0,0,0.0,0.0,277296.568007,[],17.005026,0.0
1,1,1.0,1.0,10486.0,"[1, 2, 3, 4, 5, 6]",5.04,1.0
2,2,2.0,1.0,52254.0,"[1, 7, 8, 9, 10, 11, 12, 5, 13]",19.99,2.0
3,3,3.0,1.0,78916.0,"[1, 14, 15, 16, 17, 18]",65.86,3.0
4,4,4.0,1.0,764.0,"[1, 14, 15, 16, 17, 19]",52.33,4.0


In [38]:
# Count how many items carry each encoded brand value.
dataset.item_feat['brand'].value_counts()

0.0       2099
40.0       291
124.0      203
10.0       191
41.0       159
          ... 
1048.0       1
1045.0       1
1043.0       1
1039.0       1
2076.0       1
Name: brand, Length: 2077, dtype: int64

In [37]:
# Distribution of category-list lengths: for each length k, how many items have k categories.
dataset.item_feat['categories'].apply(len).value_counts()



7     2178
6     2149
4     2120
5     1801
8     1295
9      876
11     681
10     610
12     144
14     109
3       82
2       29
13      27
0        1
Name: categories, dtype: int64

In [39]:
# Count how many items fall under each encoded sales_type value.
dataset.item_feat['sales_type'].value_counts()

1.0     10169
2.0      1307
0.0       287
5.0        77
3.0        71
4.0        32
13.0       30
7.0        27
8.0        18
12.0       16
17.0       15
6.0        12
10.0       11
16.0       10
15.0        8
11.0        5
9.0         3
14.0        1
18.0        1
19.0        1
20.0        1
Name: sales_type, dtype: int64

In [27]:
# Count how often each sales_rank value occurs.
# NOTE(review): a single non-integer value repeated many times would suggest a
# fill-in for missing ranks - confirm against the raw item file.
dataset.item_feat['sales_rank'].value_counts()

277296.568007    287
2123.000000        3
120451.000000      3
14119.000000       2
21570.000000       2
                ... 
234978.000000      1
983834.000000      1
230650.000000      1
195874.000000      1
136798.000000      1
Name: sales_rank, Length: 11708, dtype: int64

In [40]:
# Count how often each encoded title value occurs across items.
dataset.item_feat['title'].value_counts()

0.0        8
11242.0    3
5300.0     2
10588.0    2
820.0      2
          ..
4042.0     1
4035.0     1
4045.0     1
4039.0     1
12066.0    1
Name: title, Length: 12067, dtype: int64

## read item data

In [22]:
# Load the raw tab-separated item file directly with pandas and preview its first rows.
item_file = os.path.join(dataset_path, 'Amazon_Beauty.item')
df = pd.read_csv(item_file, sep='\t')
df.head()

Unnamed: 0,item_id:token,title:token,sales_type:token,sales_rank:float,categories:token_seq,price:float,brand:token
0,205616461,Bio-Active Anti-Aging Serum (Firming Ultra-Hyd...,Health & Personal Care,461765.0,"'Beauty', 'Skin Care', 'Face', 'Creams & Moist...",,
1,558925278,Eco Friendly Ecotools Quality Natural Bamboo C...,Beauty,402875.0,"'Beauty', 'Tools & Accessories', 'Makeup Brush...",,
2,733001998,Mastiha Body Lotion,Beauty,540255.0,"'Beauty', 'Skin Care', 'Body', 'Moisturizers',...",,
3,737104473,Hello Kitty Lustre Lipstick (See sellers comme...,Beauty,931125.0,"'Beauty', 'Makeup', 'Lips', 'Lipstick'",,
4,762451459,Stephanie Johnson Mermaid Round Snap Mirror,,,"'Beauty', 'Tools & Accessories', 'Mirrors', 'M...",19.98,


In [25]:
# Fraction of non-NaN values in each column (1.0 means the column has no missing entries).
df.notna().mean()


item_id:token           1.000000
title:token             0.998287
sales_type:token        0.979865
sales_rank:float        0.979865
categories:token_seq    1.000000
price:float             0.732743
brand:token             0.490112
dtype: float64

It looks like the only feature that exists for every item is `categories` (although some entries might still be an empty string, right?)

In [None]:
# Count the missing (NaN) entries in the brand:token column.
df['brand:token'].isna().sum()

# Amazon Toys

In [None]:
# Folder of the Amazon Toys and Games dataset under dataset_root; list the files it contains.
dataset_path=os.path.join(dataset_root,'Amazon_Toys_and_Games')
os.listdir(dataset_path)

## Reading the dataset using recbole 

In [None]:
# Build the RecBole configuration for Amazon Toys and Games (model comes from the Arguments default).
args = Arguments(
    dataset="Amazon_Toys_and_Games",
    config_files=os.path.join(difsr_root, 'configs/Amazon_Toys_and_Games.yaml'),
)
# config_files is a whitespace-separated list of YAML paths. split() with no argument
# tolerates repeated/leading/trailing whitespace instead of yielding empty entries.
# NOTE(review): a path that itself contains spaces would still be split apart here.
config_file_list = args.config_files.split() if args.config_files else None
config = Config(model=args.model, dataset=args.dataset, config_file_list=config_file_list)
# Anchor data_path at the repository root (presumably it is configured relative to
# the repo root, while this notebook runs from a subdirectory - confirm).
config.final_config_dict['data_path'] = os.path.join(difsr_root, config.final_config_dict['data_path'])
config.final_config_dict

In [None]:
# Build the RecBole dataset from the current `config`; displaying it shows its summary.
dataset = create_dataset(config)
dataset

In [None]:
# Peek at the first rows of the item feature table of the dataset loaded above.
dataset.item_feat.head()