In [1]:
import os
import json
import argparse
import pandas as pd
import numpy as np
import time, datetime
from tqdm import tqdm
from logging import getLogger
import torch

from recbole.model.general_recommender.multivae import MultiVAE
from recbole.quick_start import run_recbole

from recbole.config import Config
from recbole.data import dataset
from recbole.data import create_dataset, data_preparation, Interaction
from recbole.utils import init_logger, get_trainer, get_model, init_seed, set_color


SEED=13

# 데이터 불러오기

In [2]:
def run(model_name):

    return run_recbole(
        model=model_name,
        dataset='train_data',
        config_file_list=['/opt/ml/backend/Recbole/general.yaml'],
    )

In [3]:
from recbole.config import Config
config_file_list=['/opt/ml/backend/Recbole/general.yaml']
model='DCN'
dataset='train_data'
config = Config(model=model, dataset=dataset, config_file_list=config_file_list)
dataset = create_dataset(config)



KeyboardInterrupt



In [None]:
train_data, valid_data, test_data = data_preparation(config, dataset)

In [None]:
for batch_idx, interaction in enumerate(train_data):    
    break

In [None]:
len(set(dataset.field2token_id['user_id'].values()))

In [None]:
interaction

In [None]:
token_fields = []
for field_name in ['user_id','item_id','region']:
    token_fields.append(interaction[field_name].unsqueeze(1))
if len(token_fields) > 0:
    token_fields = torch.cat(
        token_fields, dim=1
    )  # [batch_size, num_token_field, 2]
else:
    token_fields = None

In [None]:
model.token_embedding_table
model.float_embedding_table
model.token_seq_embedding_table
model.float_seq_embedding_table


In [None]:
# dataset.field2token_id

In [None]:
# pip install --upgrade numpy

In [None]:
# xpip install numpy==1.19.5

In [None]:
model_name = 'DCN'
print(f"running {model_name}...")
start = time.time()
result = run(model_name)
t = time.time() - start
print(f"It took {t/60:.2f} mins")
print(result)
# wandb.run.finish()

# inference

In [3]:
model_path='/opt/ml/backend/Recbole/saved/DCN-Jul-19-2023_10-30-59.pth'
# rank K 설정
K = 10

In [4]:
import pdb
# config, model, dataset 불러오기
checkpoint = torch.load(model_path)
config = checkpoint['config']
config['dataset'] = 'train_data'

print("create dataset start!")
dataset = create_dataset(config)
train_data, valid_data, test_data = data_preparation(config, dataset)
print("create dataset done!")

model = get_model(config['model'])(config, test_data.dataset).to(config['device'])
model.load_state_dict(checkpoint['state_dict'])
model.load_other_parameter(checkpoint.get('other_parameter'))

# device 설정
device = config.final_config_dict['device']

# user, item id -> token 변환 array
user_id = config['USER_ID_FIELD']
item_id = config['ITEM_ID_FIELD']
user_id2token = dataset.field2id_token[user_id]
item_id2token = dataset.field2id_token[item_id]

# user id list
user_count = len(user_id2token)
remainder = user_count % 128
padding_size = 128 - remainder if remainder != 0 else 0
all_user_list = torch.arange(0, user_count + padding_size).reshape(-1, 128)

# # user id list
# item_count = len(item_id2token)
# remainder = item_count % 128
# padding_size = 128 - remainder if remainder != 0 else 0
# all_item_list = torch.arange(0, item_count + padding_size).reshape(-1, 128)

# user, item 길이
user_len = len(user_id2token)
item_len = len(item_id2token)

# user-item sparse matrix
matrix = dataset.inter_matrix(form='csr')

# user id, predict item id 저장 변수
pred_list = None
user_list = None

# model 평가모드 전환
model.eval()


# progress bar 설정
tbar = tqdm(all_user_list, desc=set_color(f"Inference", 'pink'))
for data in tbar:
    batch_pred_list = full_sort_topk(data, model, test_data, K, device=device)[1]
    batch_pred_list = batch_pred_list.clone().detach().cpu().numpy()
    if pred_list is None:
        pred_list = batch_pred_list
        user_list = data.numpy()
    else:
        pred_list = np.append(pred_list, batch_pred_list, axis=0)
        user_list = np.append(
            user_list, data.numpy(), axis=0
        )
tbar.close()


result = []
for user, pred in zip(user_list, pred_list):
    for item in pred:
        result.append((int(user_id2token[user]), int(item_id2token[item])))

#데이터 저장
sub = pd.DataFrame(result, columns=["user", "item"])

# # 데이터 저장
# sub = pd.DataFrame(result, columns=["user", "item"])
# sub.to_csv(
#     "submission.csv", index=False

print('inference done!')

create dataset start!


KeyboardInterrupt: 

In [None]:
# config, model, dataset 불러오기
checkpoint = torch.load(model_path)
config = checkpoint['config']
config['dataset'] = 'train_data'

init_seed(config['seed'], config['reproducibility'])

print("create dataset start!")
dataset = create_dataset(config)
train_data, valid_data, test_data = data_preparation(config, dataset)
print("create dataset done!")
model = get_model(config['model'])(config, test_data.dataset).to(config['device'])
model.load_state_dict(checkpoint['state_dict'])
model.load_other_parameter(checkpoint.get('other_parameter'))

# device 설정
device = config.final_config_dict['device']

# user, item id -> token 변환 array
user_id = config['USER_ID_FIELD']
item_id = config['ITEM_ID_FIELD']
user_id2token = dataset.field2id_token[user_id]
item_id2token = dataset.field2id_token[item_id]

# user id list

user_count = len(user_id2token)
remainder = user_count % 128
padding_size = 128 - remainder if remainder != 0 else 0
all_user_list = torch.arange(1, user_count + padding_size+1).reshape(-1, 128)
user_id2token = user_id2token[1:]
# user, item 길이
user_len = len(user_id2token)
item_len = len(item_id2token)

# user-item sparse matrix
# matrix = dataset.inter_matrix(form='csr')

# user id, predict item id 저장 변수
pred_list = None
user_list = []

# model 평가모드 전환
model.eval()

# progress bar 설정
tbar = tqdm(range(user_len), desc=set_color(f"Inference", 'pink'))
for user in tbar:
    # 모델에 입력할 사용자 ID를 토큰으로 변환하여 추론
    user_id_token = user_id2token[user]
    if user_id_token != '[PAD]':
        user_id_token = int(user_id_token)
        user_input = torch.tensor([user_id_token], dtype=torch.long).to(device)

    # 모델로부터 아이템 예측 점수를 얻습니다.
    # 이 때, 아이템 예측 점수를 오름차순으로 정렬하고 상위 K개를 선택합니다.
    pdb.set_trace()
    with torch.no_grad():
        item_scores = model(user_input)
        _, topk_items = torch.topk(item_scores, k=K)

    # 예측 결과를 리스트에 저장합니다.
    for item in topk_items[0]:
        result.append((user, int(item_id2token[item])))

tbar.close()


result = []
for user, pred in zip(user_list, pred_list):
    for item in pred:
        result.append((int(user_id2token[user]), int(item_id2token[item])))

#데이터 저장
sub = pd.DataFrame(result, columns=["user", "item"])
print(len(sub))
print('inference done!')

In [8]:
model_path='/opt/ml/backend/Recbole/saved/FM-Jul-21-2023_14-59-29.pth'
# rank K 설정
K = 10

In [9]:
# config, model, dataset 불러오기
checkpoint = torch.load(model_path)
config = checkpoint['config']
config['dataset'] = 'train_data'

init_seed(config['seed'], config['reproducibility'])

print("create dataset start!")
dataset = create_dataset(config)
train_data, valid_data, test_data = data_preparation(config, dataset)
print("create dataset done!")
model = get_model(config['model'])(config, test_data.dataset).to(config['device'])

create dataset start!
create dataset done!


In [10]:
print("Type of train_data:", type(train_data))
print("Type of valid_data:", type(valid_data))
print("Type of test_data:", type(test_data))

Type of train_data: <class 'recbole.data.dataloader.general_dataloader.TrainDataLoader'>
Type of valid_data: <class 'recbole.data.dataloader.general_dataloader.NegSampleEvalDataLoader'>
Type of test_data: <class 'recbole.data.dataloader.general_dataloader.NegSampleEvalDataLoader'>


In [11]:
from recbole.utils.case_study import full_sort_topk

In [12]:
user_id = config['USER_ID_FIELD']
item_id = config['ITEM_ID_FIELD']
user_id2token = dataset.field2id_token[user_id]
item_id2token = dataset.field2id_token[item_id]

# user id list
batch_size = 64
user_count = len(user_id2token)
remainder = user_count % batch_size
padding_size = batch_size - remainder if remainder != 0 else 0
all_user_list = torch.arange(0, user_count + padding_size).reshape(-1, batch_size)
user_id2token = user_id2token[1:]
# user, item 길이
user_len = len(user_id2token)
item_len = len(item_id2token)

# user-item sparse matrix
# matrix = dataset.inter_matrix(form='csr')

# user id, predict item id 저장 변수
pred_list = None
user_list = []

# model 평가모드 전환
model.eval()
matrix = dataset.inter_matrix(form='csr')
# device 설정
device = config.final_config_dict['device']

In [13]:
all_user_list

tensor([[      0,       1,       2,  ...,      61,      62,      63],
        [     64,      65,      66,  ...,     125,     126,     127],
        [    128,     129,     130,  ...,     189,     190,     191],
        ...,
        [1154624, 1154625, 1154626,  ..., 1154685, 1154686, 1154687],
        [1154688, 1154689, 1154690,  ..., 1154749, 1154750, 1154751],
        [1154752, 1154753, 1154754,  ..., 1154813, 1154814, 1154815]])

In [None]:
tbar = tqdm(all_user_list, desc=set_color(f"Inference", 'pink'))
import gc
with torch.no_grad():
    for data in tbar:
        if data.max() > len(user_id2token):
            data = [item for item in data if item < 1154808]
            data = torch.tensor(data)
        # interaction 생성
        interaction = Interaction({})
        interaction[user_id] = data.to(device)
        interaction = interaction.repeat_interleave(dataset.item_num)
        interaction.update(
            test_data.dataset.get_item_feature().to(device).repeat(len(data))
        )

        # user item별 score 예측
        score = model.predict(interaction)
        score = score.view(-1, item_len)
        rating_pred = score.cpu().data.numpy().copy()

        user_index = data.numpy()

        idx = matrix[user_index].toarray() > 0

        rating_pred[idx] = -np.inf
        rating_pred[:, 0] = -np.inf
        ind = np.argpartition(rating_pred, -K)[:, -K:]

        arr_ind = rating_pred[np.arange(len(rating_pred))[:, None], ind]

        arr_ind_argsort = np.argsort(arr_ind)[np.arange(len(rating_pred)), ::-1]

        batch_pred_list = ind[
            np.arange(len(rating_pred))[:, None], arr_ind_argsort
        ]

        if pred_list is None:
            pred_list = batch_pred_list
            user_list = user_index
        else:
            pred_list = np.append(pred_list, batch_pred_list, axis=0)
            user_list = np.append(
                user_list, user_index, axis=0
            )


[1;35mInference[0m:  45%|████▍     | 8075/18044 [21:30<29:42,  5.59it/s]

In [26]:
import json

# user_list를 key로, pred_list를 value로 갖는 dictionary 생성
data_dict = {str(user_id): pred_list[i].tolist() for i, user_id in enumerate(user_list)}

# dictionary를 JSON 형태로 변환
json_data = json.dumps(data_dict)

# JSON 문자열을 다시 딕셔너리로 디코딩
decoded_data = json.loads(json_data)

# JSON 데이터를 파일에 저장
file_path = "inference.json"  # 원하는 파일 경로와 이름 설정
with open(file_path, 'w') as f:
    f.write(json_data)
# 디코딩된 데이터 사용
# print(decoded_data)


In [30]:
len(decoded_data)

1154808

In [7]:
decoded_data

NameError: name 'decoded_data' is not defined

In [None]:
result = []
for user, pred in zip(user_list, pred_list):
    for item in pred:
        result.append((int(user_id2token[user]), int(item_id2token[item])))

In [None]:
user_list

In [None]:
user_id2token.sort()

In [None]:
sorted(user_id2token , lambda key = 

In [None]:
sorted_indices = np.argsort(user_id2token)
sorted_user_id2token = {k: v for k, v in enumerate(sorted_indices)}

In [None]:
len(user_id2token)

In [None]:
len(user_list)

In [None]:
user_list

In [None]:
len(pred_list)

In [None]:
for data in tbar:
    print(data)
    batch_pred_list = full_sort_topk(data, model, train_data, K, device=device)[1]
    batch_pred_list = batch_pred_list.clone().detach().cpu().numpy()
    if pred_list is None:
        pred_list = batch_pred_list
        user_list = data.numpy()
    else:
        pred_list = np.append(pred_list, batch_pred_list, axis=0)
        user_list = np.append(
            user_list, data.numpy(), axis=0
        )

print(user_list, pred_list)
result = []
for user, pred in zip(user_list, pred_list):
    for item in pred:
        result.append((int(user_id2token[user]), int(item_id2token[item])))

#데이터 저장
sub = pd.DataFrame(result, columns=["user", "item"])

In [None]:
# tbar = tqdm(all_user_list, desc=set_color(f"Inference", 'pink'))
user_id = config['USER_ID_FIELD']
item_id = config['ITEM_ID_FIELD']
user_id2token = dataset.field2id_token[user_id]
item_id2token = dataset.field2id_token[item_id]
# device 설정
print(user_id)
device = config.final_config_dict['device']
for data in test_data:
    interaction = data[0].to(device)
    print(interaction)
    score = model.full_sort_predict(interaction)
    break
    
    interaction = interaction.to(device)

    # user item별 score 예측
    score = model.full_sort_predict(interaction)

In [None]:
sub.user = sub.user.map(uidx2user)
sub.item = sub.item.map(iidx2item)

# 추천 결과 10개

In [None]:
# SAVE OUTPUT
output_dir = '/opt/ml/input/Recbole/output/'
write_path = os.path.join(output_dir, "MultiVAE.csv")
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
with open(write_path, 'w', encoding='utf8') as w:
    print("writing prediction : {}".format(write_path))
    w.write("user,item\n")
    for id, p in sub.values:
        w.write('{},{}\n'.format(id,p))

In [None]:
sub / 0.06/ 0.13