In [None]:
# !pip install recbole
# !pip install ray
# !pip install numpy==1.20.3

In [1]:
import os
import json
import argparse
import pandas as pd
import numpy as np
import time, datetime
from tqdm import tqdm
from logging import getLogger
import torch

from recbole.model.general_recommender.multivae import MultiVAE
from recbole.quick_start import run_recbole

from recbole.config import Config
from recbole.data import create_dataset, data_preparation, Interaction
from recbole.utils import init_logger, get_trainer, get_model, init_seed, set_color


# Random seed constant for this notebook.
# NOTE(review): in the visible cells it is only referenced by a commented-out
# sampling line, and the recorded RecBole log reports `seed = 2020`, so this
# value does not appear to reach training — confirm whether it should be
# forwarded to the RecBole config.
SEED=13

In [2]:
# Sanity check that the training data directory and ratings file are reachable.
# NOTE(review): the recorded output reports that '../../data/train' does not
# exist, while the CSV is later read from /opt/ml/input/data/train — confirm
# which location is the intended one.
!ls ../../data/train
!readlink -ef ../../data/train/train_ratings.csv

ls: cannot access '../../data/train': No such file or directory


# 데이터 불러오기

In [31]:
# Read the raw interaction log (recorded preview shows columns user, item, time).
train_csv_path = "/opt/ml/input/data/train/train_ratings.csv"
train = pd.read_csv(train_csv_path)

In [32]:
# Preview the raw interactions before any id remapping.
train

Unnamed: 0,user,item,time
0,11,4643,1230782529
1,11,170,1230782534
2,11,531,1230782539
3,11,616,1230782542
4,11,2140,1230782563
...,...,...,...
5154466,138493,44022,1260209449
5154467,138493,4958,1260209482
5154468,138493,68319,1260209720
5154469,138493,40819,1260209726


In [33]:
# Build lookup tables between raw ids and contiguous 0-based indices.
# The sorted unique ids are computed once per column (instead of once per
# dictionary) because deduplicating and sorting millions of rows is the
# expensive part; the position in the sorted list is the index.
unique_users = sorted(set(train.user))
unique_items = sorted(set(train.item))

# raw id -> index
user2idx = {raw_user: idx for idx, raw_user in enumerate(unique_users)}
item2idx = {raw_item: idx for idx, raw_item in enumerate(unique_items)}

# index -> raw id (used to translate predictions back)
uidx2user = dict(enumerate(unique_users))
iidx2item = dict(enumerate(unique_items))

In [34]:
# Replace raw user/item ids with their contiguous indices.
# Note: this overwrites the columns, so re-running the cell on already-mapped
# data will not reproduce the same values.
for column, mapping in (("user", user2idx), ("item", item2idx)):
    train[column] = train[column].map(mapping)

In [35]:
# Preview the interactions after remapping ids to contiguous indices.
train

Unnamed: 0,user,item,time
0,0,2505,1230782529
1,0,109,1230782534
2,0,319,1230782539
3,0,368,1230782542
4,0,1183,1230782563
...,...,...,...
5154466,31359,4882,1260209449
5154467,31359,2652,1260209482
5154468,31359,5768,1260209720
5154469,31359,4791,1260209726


In [36]:
# Rename the columns positionally (user, item, time order is assumed) to the
# `name:type` headers written to the .inter file; the field names match the
# USER_ID_FIELD / ITEM_ID_FIELD / TIME_FIELD entries in general.yaml.
# NOTE(review): the `:token` / `:float` suffixes are presumably RecBole's
# atomic-file type tags — confirm against the RecBole data format docs.
train.columns=['user_id:token','item_id:token','timestamp:float']

In [37]:
# Export the interactions as a tab-separated file at
# dataset/train_data/train_data.inter (no index column).
outpath = "dataset/train_data"
os.makedirs(outpath, exist_ok=True)

# Disabled experiment: train on a 10-interaction sample per user.
# sub_train=train.groupby("user").sample(n=10, random_state=SEED)
# sub_train.shape
inter_path = os.path.join(outpath, "train_data.inter")
train.to_csv(inter_path, sep='\t', index=False)

In [8]:
# RecBole configuration shared by every model trained in this notebook.
# It is written to general.yaml and passed to run_recbole via config_file_list.
#
# The previous `device : torch.device(...)` entry was removed: YAML does not
# evaluate Python, so it could only ever be the literal string. The recorded
# log lists `use_gpu` / `gpu_id` as the GPU-related settings (RecBole derives
# the device from those — see the RecBole config documentation).
#
# The split ratio [9, 1, 0] leaves no test partition, which is why the recorded
# training result has 'test_result': None.
yamldata="""
USER_ID_FIELD: user_id
ITEM_ID_FIELD: item_id
TIME_FIELD: timestamp

load_col:
    inter: [user_id, item_id, timestamp]

show_progress : False
epochs : 10
eval_args:
    split: {'RS': [9, 1, 0]}
    group_by: user
    order: RO
    mode: full
metrics: ['Recall', 'MRR', 'NDCG', 'Hit', 'Precision', 'MAP']
topk: 10
valid_metric: Recall@10


"""
# Explicit encoding keeps the file identical regardless of the platform locale.
with open("general.yaml", "w", encoding="utf-8") as f:
    f.write(yamldata)

In [9]:
def run(model_name):
    """Train and evaluate one RecBole model on the ``train_data`` dataset.

    Args:
        model_name: Name of the RecBole model, forwarded to ``run_recbole``.

    Returns:
        The value returned by ``run_recbole`` (the recorded run shows a dict
        with ``best_valid_score``, ``best_valid_result`` and ``test_result``).
    """
    # Auto-encoder style models are trained without negative sampling.
    non_sampling_models = {"MultiVAE", "MultiDAE", "RecVAE"}

    config_dict = None
    if model_name in non_sampling_models:
        config_dict = {
            # Legacy key, kept for older RecBole releases.
            "neg_sampling": None,
            # Key reported in the recorded logs of the installed RecBole
            # version; without it the logs still show uniform sampling.
            "train_neg_sample_args": None,
        }

    # config_dict=None matches run_recbole's default, so models outside the
    # set above behave exactly as when the argument was omitted.
    return run_recbole(
        model=model_name,
        dataset='train_data',
        config_file_list=['general.yaml'],
        config_dict=config_dict,
    )

In [58]:
# Put the names of the models you want to train in this list.
model_list = ['EASE']

for model_name in model_list:
    print(f"running {model_name}...")
    started_at = time.time()
    result = run(model_name)
    # Wall-clock training time, reported in minutes.
    elapsed_minutes = (time.time() - started_at) / 60
    print(f"It took {elapsed_minutes:.2f} mins")
    print(result)
    # wandb.run.finish()

running EASE...


06 Jun 07:26    INFO  ['/opt/conda/lib/python3.8/site-packages/ipykernel_launcher.py', '-f', '/opt/ml/.local/share/jupyter/runtime/kernel-f7665407-7991-4c30-9c8b-9f85f2c84c0c.json']
06 Jun 07:26    INFO  
General Hyper Parameters:
gpu_id = 0
use_gpu = True
seed = 2020
state = INFO
reproducibility = True
data_path = dataset/train_data
checkpoint_dir = saved
show_progress = False
save_dataset = False
dataset_save_path = None
save_dataloaders = False
dataloaders_save_path = None
log_wandb = False

Training Hyper Parameters:
epochs = 10
train_batch_size = 2048
learner = adam
learning_rate = 0.001
train_neg_sample_args = {'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}
eval_step = 1
stopping_step = 10
clip_grad_norm = None
weight_decay = 0.0
loss_decimal_place = 4

Evaluation Hyper Parameters:
eval_args = {'split': {'RS': [9, 1, 0]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}
repeatable = False
metrics = ['Recall', 'MRR', 'NDCG', 'Hit'

It took 3.11 mins
{'best_valid_score': 0.1966, 'valid_score_bigger': True, 'best_valid_result': OrderedDict([('recall@10', 0.1966), ('mrr@10', 0.5928), ('ndcg@10', 0.3203), ('hit@10', 0.8735), ('precision@10', 0.2572), ('map@10', 0.1883)]), 'test_result': None}


# inference

In [13]:
# Checkpoint used for inference.
# NOTE(review): this is a MultiVAE checkpoint dated Jun-04, whereas the
# training cell in this notebook runs EASE — confirm this is the intended model.
model_path='/opt/ml/input/Recbole/saved/MultiVAE-Jun-04-2023_12-03-05.pth'
# Top-K cutoff: number of items recommended per user.
K = 10

In [14]:
# Load the checkpoint and rebuild the config, dataset and model from it.
checkpoint = torch.load(model_path)
config = checkpoint['config']
config['dataset'] = 'train_data'

dataset = create_dataset(config)
train_data, valid_data, test_data = data_preparation(config, dataset)

model = get_model(config['model'])(config, test_data.dataset).to(config['device'])
model.load_state_dict(checkpoint['state_dict'])
model.load_other_parameter(checkpoint.get('other_parameter'))

# Device the inputs must be moved to.
device = config.final_config_dict['device']

# Arrays translating internal user/item ids back to the tokens in the .inter file.
user_id = config['USER_ID_FIELD']
item_id = config['ITEM_ID_FIELD']
user_id2token = dataset.field2id_token[user_id]
item_id2token = dataset.field2id_token[item_id]

# Batches of internal user ids, starting at 1 (index 0 is skipped here and item
# column 0 is masked below — presumably RecBole's padding id; confirm).
# `split` also handles a user count that is not a multiple of the batch size,
# which `.view(-1, 128)` could not.
batch_size = 128
all_user_list = torch.arange(1, len(user_id2token)).split(batch_size)

# Number of internal user / item ids.
user_len = len(user_id2token)
item_len = len(item_id2token)

# User-item interaction matrix, used to exclude items a user already interacted with.
matrix = dataset.inter_matrix(form='csr')

# Per-batch results; concatenated once after the loop instead of re-copying
# the growing arrays with np.append on every iteration.
batch_pred_lists = []
batch_user_lists = []

# Switch the model to evaluation mode.
model.eval()

# Progress bar over the user batches.
tbar = tqdm(all_user_list, desc=set_color("Inference", 'pink'))

# Gradients are never needed for inference.
with torch.no_grad():
    for data in tbar:
        # Build the interaction holding only the user ids of this batch.
        interaction = dict()
        interaction = Interaction(interaction)
        interaction[user_id] = data
        interaction = interaction.to(device)

        # Score every item for every user in the batch.
        score = model.full_sort_predict(interaction)
        score = score.view(-1, item_len)

        # Copy so the masking below never writes into the tensor's memory.
        rating_pred = score.detach().cpu().numpy().copy()

        user_index = data.numpy()

        # Mask items the user has already interacted with, and item column 0.
        idx = matrix[user_index].toarray() > 0
        rating_pred[idx] = -np.inf
        rating_pred[:, 0] = -np.inf

        # Unordered top-K item ids per user, then order them by descending score.
        ind = np.argpartition(rating_pred, -K)[:, -K:]
        rows = np.arange(len(rating_pred))
        arr_ind = rating_pred[rows[:, None], ind]
        arr_ind_argsort = np.argsort(arr_ind)[rows, ::-1]
        batch_pred_list = ind[rows[:, None], arr_ind_argsort]

        batch_pred_lists.append(batch_pred_list)
        batch_user_lists.append(user_index)

# Internal user ids and their predicted internal item ids, row-aligned.
pred_list = np.concatenate(batch_pred_lists, axis=0)
user_list = np.concatenate(batch_user_lists, axis=0)

# Translate internal ids back to the tokens from the .inter file.
result = []
for user, pred in zip(user_list, pred_list):
    for item in pred:
        result.append((int(user_id2token[user]), int(item_id2token[item])))

# Recommendations as a frame; later cells map the indices back to raw ids and save it.
sub = pd.DataFrame(result, columns=["user", "item"])
# Optional: save the index-based predictions directly.
# sub.to_csv("submission.csv", index=False)
print('inference done!')

04 Jun 12:24    INFO  [Training]: train_batch_size = [2048] train_neg_sample_args: [{'distribution': 'uniform', 'sample_num': 1, 'alpha': 1.0, 'dynamic': False, 'candidate_num': 0}]
04 Jun 12:24    INFO  [Evaluation]: eval_batch_size = [4096] eval_args: [{'split': {'RS': [9, 1, 0]}, 'group_by': 'user', 'order': 'RO', 'mode': 'full'}]
Inference:   0%|          | 0/245 [00:00<?, ?it/s]:   2%|▏         | 6/245 [00:00<00:04, 54.37it/s]:   5%|▍         | 12/245 [00:00<00:04, 54.43it/s]:   7%|▋         | 17/245 [00:00<00:04, 52.90it/s]:   9%|▉         | 22/245 [00:00<00:04, 51.70it/s]:  11%|█▏        | 28/245 [00:00<00:04, 51.72it/s]:  14%|█▍        | 34/245 [00:00<00:04, 51.91it/s]:  16%|█▋        | 40/245 [00:00<00:03, 51.81it/s]:  19%|█▉        | 46/245 [00:00<00:03, 52.04it/s]:  21%|██        | 52/245 [00:00<00:03, 52.40it/s]:  24%|██▎       | 58/245 [00:01<00:03, 52.72it/s]:  26%|██▌       | 64/245 [00:01<00:03, 52.94it/s]:  29%|██▊       | 70/245 [00:01<00:03, 52.89it/s]:  31%|███     

inference done!


In [16]:
# Preview the recommendations; user/item are still contiguous indices here.
sub

Unnamed: 0,user,item
0,0,455
1,0,43
2,0,41
3,0,4101
4,0,2619
...,...,...
313595,31359,455
313596,31359,43
313597,31359,41
313598,31359,1977


In [17]:
# Translate the contiguous indices back to the original user/item ids.
sub = sub.assign(
    user=sub["user"].map(uidx2user),
    item=sub["item"].map(iidx2item),
)

# 추천 결과 10개

In [30]:
# SAVE OUTPUT
# Write the recommendations as a two-column CSV with a "user,item" header.
output_dir = '/opt/ml/input/Recbole/output/'
write_path = os.path.join(output_dir, "MultiVAE.csv")
# exist_ok avoids the check-then-create race and matches how the dataset
# directory is created earlier in the notebook.
os.makedirs(output_dir, exist_ok=True)
print("writing prediction : {}".format(write_path))
# pandas writes the rows directly; this replaces the manual loop whose
# variable shadowed the builtin `id`.
sub.to_csv(write_path, header=["user", "item"], index=False, encoding='utf8')

writing prediction : /opt/ml/input/Recbole/output/MultiVAE.csv


In [38]:
# NOTE(review): the recorded output below shows the unscaled integer ids, which
# does not match dividing the frame by 0.06 and 0.13 — the numbers were
# presumably meant as an annotation (e.g. scores) rather than code; confirm.
sub / 0.06/ 0.13

Unnamed: 0,user,item
0,11,858
1,11,50
2,11,47
3,11,8961
4,11,4886
...,...,...
313595,138493,858
313596,138493,50
313597,138493,47
313598,138493,3578
