In [1]:
import os
import json
import argparse
import pandas as pd
import numpy as np
import time, datetime
from tqdm import tqdm
from logging import getLogger
import torch

from recbole.model.general_recommender.multivae import MultiVAE
from recbole.quick_start import run_recbole

from recbole.config import Config
from recbole.data import dataset
from recbole.data import create_dataset, data_preparation, Interaction
from recbole.utils import init_logger, get_trainer, get_model, init_seed, set_color


# Random seed constant.
# NOTE(review): no cell shown here reads SEED -- confirm it is actually applied somewhere.
SEED=13

# 데이터 불러오기

In [2]:
def run(model_name, dataset='train_data', config_file_list=None):
    """Train and evaluate one RecBole model through ``run_recbole``.

    Args:
        model_name: Name of the RecBole model to run (e.g. ``'DCN'``).
        dataset: Dataset name handed to RecBole. Defaults to ``'train_data'``,
            the value that used to be hard-coded here.
        config_file_list: List of YAML config paths. When ``None`` the
            notebook's original ``general.yaml`` path is used.

    Returns:
        Whatever ``run_recbole`` returns for this run.
    """
    if config_file_list is None:
        # Build a fresh list per call so callers can never mutate a shared default.
        config_file_list = ['/home/dhkim/server_front/winery_AI/winery/Recbole/general.yaml']

    return run_recbole(
        model=model_name,
        dataset=dataset,
        config_file_list=config_file_list,
    )

In [3]:
# Build the RecBole config and dataset by hand so a single batch can be inspected.
# (Config, create_dataset and init_seed are already imported in the first cell.)
config_file_list = ['/home/dhkim/server_front/winery_AI/winery/Recbole/general.yaml']
model = 'DCN'
# Keep the dataset *name* separate from the dataset *object* created below, so
# `dataset` never holds two different kinds of value.
dataset_name = 'train_data'
config = Config(model=model, dataset=dataset_name, config_file_list=config_file_list)
# Seed the RNGs from the config before any data is built, so the split and
# sampling done by the following cells are repeatable after a kernel restart.
init_seed(config['seed'], config['reproducibility'])
dataset = create_dataset(config)


In [4]:
# Build the train / validation / test data objects from the config and dataset.
train_data, valid_data, test_data = data_preparation(config, dataset)

In [5]:
# Grab only the first batch for inspection. next() raises StopIteration on an
# empty loader instead of silently leaving `interaction` undefined (or stale
# from an earlier run of this cell).
batch_idx, interaction = next(enumerate(train_data))

In [6]:
# Show the token -> integer id vocabulary of the 'grape' field.
dataset.field2token_id['grape']

{'[PAD]': 0,
 'Merlot': 1,
 'CabernetSauvignon': 2,
 'CabernetFranc': 3,
 'MoscatoBianco': 4,
 'Gamay': 5,
 'PinotGris': 6,
 'Chardonnay': 7,
 'PinotNoir': 8,
 'PinotMeunier': 9,
 'Shiraz/Syrah': 10,
 'CheninBlanc': 11,
 'Tempranillo': 12,
 'PetitManseng': 13,
 'Sylvaner': 14,
 'TintaBarroca': 15,
 'Mencia': 16,
 'Riesling': 17,
 'Tibouren': 18,
 'Empty': 19,
 'Sangiovese': 20,
 'TourigaNacional': 21,
 'Malbec': 22,
 'PetitVerdot': 23,
 'Dafni': 24,
 'TintaRoriz': 25,
 'TourigaFrancesa': 26,
 '588': 27,
 'Traminer': 28,
 'Corvina': 29,
 'Rondinella': 30,
 'Molinara': 31,
 'Grenache': 32,
 "Nerod'Avola": 33,
 'Mammolo': 34,
 'GrünerVeltliner': 35,
 'Mourvedre': 36,
 'Teroldego': 37,
 'Carignan': 38,
 'Cinsault': 39,
 'Aglianico': 40,
 'Turbiana': 41,
 'Friulano': 42,
 'Monastrell': 43,
 'Nebbiolo': 44,
 'SauvignonBlanc': 45,
 'Sémillon': 46,
 'Palomino': 47,
 'GrenacheBlanc': 48,
 'Clairette': 49,
 'PicpoulBlanc': 50,
 'Garnacha': 51,
 'Barbera': 52,
 'Alvarinho': 53,
 'Caprettone': 54,

In [7]:
# Display the batch captured from train_data (field names, shapes, devices, dtypes).
interaction

The batch_size of interaction: 1024
    user_id, torch.Size([1024]), cpu, torch.int64
    item_id, torch.Size([1024]), cpu, torch.int64
    label, torch.Size([1024]), cpu, torch.float32
    count, torch.Size([1024]), cpu, torch.float32
    mean, torch.Size([1024]), cpu, torch.float32
    country, torch.Size([1024]), cpu, torch.int64
    region, torch.Size([1024]), cpu, torch.int64
    winery, torch.Size([1024]), cpu, torch.int64
    winetype, torch.Size([1024]), cpu, torch.int64
    grape, torch.Size([1024, 15]), cpu, torch.int64
    vintage, torch.Size([1024]), cpu, torch.int64
    price, torch.Size([1024]), cpu, torch.float32
    rating, torch.Size([1024]), cpu, torch.float32
    num_votes, torch.Size([1024]), cpu, torch.float32
    wine_style, torch.Size([1024]), cpu, torch.int64
    Red Fruit_count, torch.Size([1024]), cpu, torch.float32
    Tropical_count, torch.Size([1024]), cpu, torch.float32
    Tree Fruit_count, torch.Size([1024]), cpu, torch.float32
    Oaky_count, torch.Size

In [8]:
# Put the chosen token-type columns of the batch side by side in one tensor.
_selected_fields = ('user_id', 'item_id', 'country')
# Each column is [batch_size] -> [batch_size, 1] so they can be joined on dim 1.
_token_columns = [interaction[name].unsqueeze(1) for name in _selected_fields]
# Result is [batch_size, num_token_field]; None when no field was selected.
token_fields = torch.cat(_token_columns, dim=1) if _token_columns else None

In [12]:
# On a fresh kernel `model` is still the plain string 'DCN' at this point, so the
# attribute lookups below would raise AttributeError. Build the network first,
# the same way the inference cell does (get_model(...)(config, <loader>.dataset)).
model = get_model(config['model'])(config, train_data.dataset).to(config['device'])

# Embedding tables of the model; only the last expression is rendered as output.
model.token_embedding_table
model.float_embedding_table
model.token_seq_embedding_table
model.float_seq_embedding_table


tensor([[378685,  15832,      2],
        [169517,  16090,      3],
        [140820,   9406,      2],
        ...,
        [ 99298,  12515,      4],
        [119760,    875,      3],
        [ 39587,  13678,      1]])

In [None]:
# Show the token -> id vocabularies, keyed by field name.
dataset.field2token_id

In [None]:
# Train and evaluate one model end to end and report the wall-clock time in minutes.
model_name = 'DCN'
print(f"running {model_name}...")
started_at = time.time()
result = run(model_name)
elapsed_minutes = (time.time() - started_at) / 60
print(f"It took {elapsed_minutes:.2f} mins")
print(result)
# wandb.run.finish()

# inference

In [None]:
# Checkpoint file loaded by the inference cell.
model_path='/opt/ml/input/Recbole/saved/MultiVAE-Jun-04-2023_12-03-05.pth'
# Number of top-ranked items kept per user (the K in top-K).
K = 10

In [None]:
# Load the config, model and dataset from the checkpoint.
# NOTE: torch.load unpickles arbitrary objects -- only load checkpoints you trust.
checkpoint = torch.load(model_path)
config = checkpoint['config']
config['dataset'] = 'train_data'

dataset = create_dataset(config)
train_data, valid_data, test_data = data_preparation(config, dataset)

model = get_model(config['model'])(config, test_data.dataset).to(config['device'])
model.load_state_dict(checkpoint['state_dict'])
model.load_other_parameter(checkpoint.get('other_parameter'))

# Device the model was moved to above.
device = config['device']

# Lookup arrays that turn internal user / item ids back into their original tokens.
user_id = config['USER_ID_FIELD']
item_id = config['ITEM_ID_FIELD']
user_id2token = dataset.field2id_token[user_id]
item_id2token = dataset.field2id_token[item_id]

# Number of user / item ids (including id 0).
user_len = len(user_id2token)
item_len = len(item_id2token)

# Internal user ids 1..user_len-1 in batches (id 0 is skipped -- presumably the
# [PAD] slot). split() keeps a shorter final batch, whereas view(-1, 128) raised
# whenever the number of users was not an exact multiple of 128.
USER_BATCH_SIZE = 128
all_user_list = torch.arange(1, user_len).split(USER_BATCH_SIZE)

# User-item interaction matrix, used to mask items a user already interacted with.
matrix = dataset.inter_matrix(form='csr')

# Switch the model to evaluation mode.
model.eval()

# Progress bar over the user batches.
tbar = tqdm(all_user_list, desc=set_color(f"Inference", 'pink'))

# Per-batch results; concatenated once after the loop instead of np.append per batch.
pred_batches = []
user_batches = []

# No gradients are needed for scoring, so skip autograd bookkeeping.
with torch.no_grad():
    for data in tbar:
        # Batch that carries only the user ids.
        interaction = Interaction({user_id: data}).to(device)

        # One score per (user, item) pair, reshaped to [users_in_batch, item_len].
        score = model.full_sort_predict(interaction).view(-1, item_len)
        rating_pred = score.cpu().numpy().copy()

        user_index = data.numpy()

        # Never recommend already-seen items or item id 0.
        seen = matrix[user_index].toarray() > 0
        rating_pred[seen] = -np.inf
        rating_pred[:, 0] = -np.inf

        # Top-K item ids per row (unordered), then ordered by descending score.
        ind = np.argpartition(rating_pred, -K)[:, -K:]
        top_scores = np.take_along_axis(rating_pred, ind, axis=1)
        order = np.argsort(top_scores, axis=1)[:, ::-1]
        batch_pred_list = np.take_along_axis(ind, order, axis=1)

        pred_batches.append(batch_pred_list)
        user_batches.append(user_index)

pred_list = np.concatenate(pred_batches, axis=0)
user_list = np.concatenate(user_batches, axis=0)

# One (user token, item token) row per recommendation.
result = [
    (int(user_id2token[user]), int(item_id2token[item]))
    for user, pred in zip(user_list, pred_list)
    for item in pred
]

# Later cells read `sub`; it was previously only created inside commented-out code.
sub = pd.DataFrame(result, columns=["user", "item"])

print('inference done!')

In [None]:
# Preview the recommendation table; `sub` has to be defined by an earlier cell.
sub

In [None]:
# Replace the values of the user / item columns through the uidx2user / iidx2item lookups.
# NOTE(review): neither lookup is defined in the cells shown here -- confirm where they
# come from, or whether this step is still needed.
sub.user = sub.user.map(uidx2user)
sub.item = sub.item.map(iidx2item)

# 추천 결과 10개

In [None]:
# SAVE OUTPUT
output_dir = '/opt/ml/input/Recbole/output/'
write_path = os.path.join(output_dir, "MultiVAE.csv")
# exist_ok=True creates the folder when missing and is a no-op otherwise,
# without a separate check-then-create step.
os.makedirs(output_dir, exist_ok=True)
with open(write_path, 'w', encoding='utf8') as out_file:
    print("writing prediction : {}".format(write_path))
    out_file.write("user,item\n")
    # Each row of `sub` is unpacked into two values. The loop variables are
    # named user / item so the builtin `id` is not shadowed for later cells.
    for user, item in sub.values:
        out_file.write('{},{}\n'.format(user, item))

In [None]:
# NOTE(review): as written this divides every value in `sub` by 0.06 and then by 0.13.
# It looks like a leftover scratch note rather than an intended computation -- confirm or remove.
sub / 0.06/ 0.13