In [1]:
import time
import datetime

# Elapsed-time display helper
def format_time(elapsed):
    """Render a duration in seconds as the string form of a ``timedelta``.

    Args:
        elapsed: Duration in seconds (int or float).

    Returns:
        ``str(datetime.timedelta(...))`` of the duration rounded to the
        nearest whole second, e.g. ``"0:00:20"``.
    """
    # Round first so the result never carries fractional seconds.
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)


"""Training GCMC model on the MovieLens data set.
The script loads the full graph to the training device.
"""
import os, time
import argparse
import logging
import random
import string
import numpy as np
import pandas as pd
import torch as th
import torch.nn as nn
from data_rotten_v2_amazon import RottenMovie
from utils import get_activation, get_optimizer, torch_total_param_num, torch_net_info, MetricLogger

Using backend: pytorch


In [2]:
import easydict

# Run configuration. EasyDict exposes every entry as an attribute
# (args.<name>); entries are passed as keyword arguments in the same order
# as before.
args = easydict.EasyDict(
    # data
    data_name="rotten",
    use_one_hot_fea=True,
    gpu=0,
    seed=123,
    data_test_ratio=0.1,
    data_valid_ratio=0.1,
    # model
    model_activation="leaky",
    gcn_dropout=0.5,
    gcn_agg_norm_symm=True,
    gcn_agg_units=32,
    gcn_agg_accum="sum",
    gcn_out_units=32,  # alternatives noted by the author: 64, 128
    gen_r_num_basis_func=2,
    # training
    train_max_epoch=300,
    train_log_interval=5,
    train_valid_interval=5,
    train_optimizer="adam",
    train_grad_clip=1.0,
    train_lr=0.01,
    train_min_lr=0.0008,
    train_lr_decay_factor=0.5,
    train_decay_patience=25,
    train_early_stopping_patience=50,
    # runtime / output (note: `gpu` above and `device` here both hold 0)
    share_param=False,
    mix_cpu_gpu=False,
    minibatch_size=40000,
    num_workers_per_gpu=8,
    device=0,
    save_dir="./save/",
    save_id=1,
    train_max_iter=1000,
)

# Seed every random number generator this notebook has in scope so that
# repeated runs start from the same state.
# Python's built-in `random` is imported in the first cell but was previously
# left unseeded; seed it too in case the data loading / training code uses it.
random.seed(args.seed)
np.random.seed(args.seed)
th.manual_seed(args.seed)

# Seed the CUDA generators on all devices as well when a GPU is present.
if th.cuda.is_available():
    th.cuda.manual_seed_all(args.seed)

In [3]:
from train import train

# Build the dataset object from the Amazon CSV files.
# Settings that also live in `args` are read from it instead of being repeated
# as literals, so the loader and the training configuration cannot drift
# apart. The values are the same as the literals used before
# ('rotten', 0, False, True, True, 0.1).
# Earlier inputs, kept for reference:
#   ./data/trainset_filtered.csv, ./data/testset_filtered.csv
dataset = RottenMovie(
    train_data='./data/amazon_trainset.csv',
    test_data='./data/amazon_testset.csv',
    movie_data='./data/amazon_movie_info.csv',
    user_data='./data/amazon_user_info.csv',
    emotion=False,
    sentiment=False,
    name=args.data_name,
    device=args.device,
    mix_cpu_gpu=args.mix_cpu_gpu,
    use_one_hot_fea=args.use_one_hot_fea,
    symm=args.gcn_agg_norm_symm,
    valid_ratio=args.data_valid_ratio,
)

......1: 데이터 로드
......3: Train/Valid 분리
All rating pairs : 195947
	All train rating pairs : 160000
		Train rating pairs : 144000
		Valid rating pairs : 16000
	Test rating pairs  : 35947
......4: User/Movie를 Global id에 매핑
Total user number = 3659, movie number = 33898
......5: features 생성
Feature dim: 
user: (3659, 3659)
movie: (33898, 33898)
......6: Graph Encoder/Decoder 생성
rating_values :  [1.0, 2.0, 3.0, 4.0, 5.0]
......7: Graph 결과 출력
Train enc graph: 	#user:3659	#movie:33898	#pairs:144000
Train dec graph: 	#user:3659	#movie:33898	#pairs:144000
Valid enc graph: 	#user:3659	#movie:33898	#pairs:144000
Valid dec graph: 	#user:3659	#movie:33898	#pairs:16000
Test enc graph: 	#user:3659	#movie:33898	#pairs:144000
Test dec graph: 	#user:3659	#movie:33898	#pairs:35947


In [4]:
# Copy the rating values found by the dataset loader onto the config object;
# they are only known once the data has been loaded.
args.rating_vals = dataset.rating_values
# Same value as the gcn_dropout entry in the initial config (0.5); acts as a
# per-run override point.
args.gcn_dropout = 0.50

In [5]:
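# NOTE: disabled learning-rate sweep, kept for reference only. It relies on
# `dataset_es`, which is not defined in the visible cells, and was written for
# a train() that returned a single value (the active cells unpack three).
# The sweep `0.006*i for i in range(10)` also starts at a learning rate of 0.
# bests=100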
# bests_es=100
# start_time = time.time()

# for dim in [256]:
#     args.gcn_out_units = dim
#     for agg in [128]:
#         args.gcn_agg_units = agg
#         for lr in [0.006*i for i in range(10)]:
#             args.train_lr = lr
#             args.save_dir = f'./test/test'
#             args.save_id = 'new_feature'
#             best = train(args, dataset)
#             print("****************************")
#             args.save_dir = f'./test/test_es'
#             args.save_id = 'new_feature_es'
#             best_es = train(args, dataset_es)

# #             print(best,'  VS  ', best_es)
#             if bests>best:
#                 bests = best
#             if bests_es>best_es:
#                 bests_es=best_es
                
# print("  Training epoch took: {:}".format(format_time(time.time() - start_time)))
# print(bests,'  VS  ', bests_es)

In [12]:
# Sentinel "best so far" trackers; nothing in this cell updates them.
bests, bests_es = 100, 100
start_time = time.time()

# One (gcn_out_units, gcn_agg_units, train_lr) combination per entry.
for dim, agg, lr in [(256, 128, 0.006)]:
    args.gcn_out_units = dim
    args.gcn_agg_units = agg
    args.train_lr = lr
    args.save_dir, args.save_id = './test/test', 'new_feature'
    best_rmse, test_df_1, test_real = train(args, dataset)
    # A second run on `dataset_es` (which would produce best_es_rmse and
    # test_df_2, saved under ./test/test_es) is disabled here.

# Wall-clock time for the whole loop, formatted by format_time().
print("  Training epoch took: {:}".format(format_time(time.time() - start_time)))

training...
Best Iter Idx=40, Best Valid RMSE=1.0088, Best Test RMSE=0.9995
  Training epoch took: 0:00:20


In [14]:
# Sentinel "best so far" trackers; nothing in this cell updates them.
bests, bests_es = 100, 100
start_time = time.time()

# Same run as the previous training cell, but with gcn_agg_units = 256.
# One (gcn_out_units, gcn_agg_units, train_lr) combination per entry.
for dim, agg, lr in [(256, 256, 0.006)]:
    args.gcn_out_units = dim
    args.gcn_agg_units = agg
    args.train_lr = lr
    args.save_dir, args.save_id = './test/test', 'new_feature'
    best_rmse, test_df_1, test_real = train(args, dataset)
    # A second run on `dataset_es` (which would produce best_es_rmse and
    # test_df_2, saved under ./test/test_es) is disabled here.

# Wall-clock time for the whole loop, formatted by format_time().
print("  Training epoch took: {:}".format(format_time(time.time() - start_time)))

training...
Best Iter Idx=55, Best Valid RMSE=1.0066, Best Test RMSE=0.9985
  Training epoch took: 0:00:26


In [8]:
# Best RMSE reported by the most recent train() call.
print(best_rmse)
# `best_es_rmse` is only assigned by the `dataset_es` run, which is currently
# commented out in the training cells. Print it only when it exists so this
# cell does not raise NameError on a fresh kernel.
if "best_es_rmse" in globals():
    print(best_es_rmse)

0.8199276339186489
0.810794434027629


In [10]:
# Preview the first predictions. `test_df_2` is only produced by the disabled
# `dataset_es` run, so fall back to the active run's frame (`test_df_1`) when
# it is missing; otherwise this cell raises NameError on a fresh kernel.
(test_df_2 if "test_df_2" in globals() else test_df_1).head()

Unnamed: 0,test_pred
0,3.14906
1,3.620616
2,3.130883
3,3.307628
4,3.7788


In [23]:
# Third value returned by train(), converted to a plain Python list.
# Presumably the ground-truth test ratings -- confirm against train().
real_value = test_real.tolist()

In [36]:
# Predictions: the `test_pred` column of the frame returned by train().
pred_value = list(test_df_1.test_pred)

In [37]:
# Manual sanity check: this count should match len(pred_value).
len(real_value)

28766

In [38]:
# Manual sanity check: this count should match len(real_value).
len(pred_value)

28766

In [39]:
# Element-wise error (real - predicted). The shapes are checked first because
# NumPy would otherwise broadcast mismatched but compatible shapes (for
# example (N, 1) against (N,)) into a wrong-sized result instead of failing.
real_arr, pred_arr = np.array(real_value), np.array(pred_value)
assert real_arr.shape == pred_arr.shape, (
    f"shape mismatch: real {real_arr.shape} vs pred {pred_arr.shape}"
)
diff = real_arr - pred_arr

In [40]:
# Mean absolute error: average magnitude of the per-sample differences.
mae = np.abs(diff).mean()

In [41]:
# Display the mean absolute error as the cell output.
mae

0.630816800216139