In [1]:
# Load IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to the local modules imported below
# are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import time
import datetime

# 시간 표시 함수
def format_time(elapsed):
    """Render a duration given in seconds as a ``datetime.timedelta`` string.

    Args:
        elapsed: Duration in seconds (int or float).

    Returns:
        ``str(datetime.timedelta(...))`` of the duration rounded to a whole
        number of seconds with the built-in ``round``, e.g. ``'0:04:18'``.
    """
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)


"""Training GCMC model on the MovieLens data set.
The script loads the full graph to the training device.
"""
import os, time
import logging
import numpy as np
import pandas as pd
import torch as th
import torch.nn as nn
from data_rotten import RottenMovie
from utils import get_activation, get_optimizer, torch_total_param_num, torch_net_info, MetricLogger

Using backend: pytorch


In [3]:
import easydict

# Run configuration, exposed as attributes (args.<key>) via EasyDict.
#
# "gcn_agg_accum" is listed exactly once. The dict literal previously also
# carried an earlier "gcn_agg_accum": "stack" entry, but a later duplicate key
# in a dict display overrides the earlier one, so 'sum' was always the value
# actually in effect. Only that effective value is kept here.
args = easydict.EasyDict({
    "data_name":                      "rotten",
    "use_one_hot_fea":                False,
    "gpu":                            0,
    "seed":                           123,
    "data_test_ratio":                0.1,
    "data_valid_ratio":               0.1,
    "model_activation":               'leaky',
    "gcn_dropout":                    0.7,
    "gcn_agg_norm_symm":              True,
    "gcn_agg_units":                  256,
    "gcn_agg_accum":                  'sum',
    "gcn_out_units":                  32,
    "gen_r_num_basis_func":           2,
    "train_max_epoch":                1000,
    "train_log_interval":             1,
    "train_valid_interval":           1,
    "train_optimizer":                'adam',
    "train_grad_clip":                1.0,
    "train_lr":                       0.01,
    "train_min_lr":                   0.001,
    "train_lr_decay_factor":          0.5,
    "train_decay_patience":           25,
    "train_early_stopping_patience":  50,
    "share_param":                    False,
    "mix_cpu_gpu":                    False,
    "minibatch_size":                 20000,
    "num_workers_per_gpu":            8,
    "device":                         0,
    "save_dir":                       './save/',
    "save_id":                        1,
    "train_max_iter":                 2000
})

In [4]:
# Seed the NumPy and PyTorch random number generators from the configured
# seed so that runs are repeatable.
for _seed_fn in (np.random.seed, th.manual_seed):
    _seed_fn(args.seed)

# When CUDA is present, explicitly seed every CUDA device as well.
if th.cuda.is_available():
    th.cuda.manual_seed_all(args.seed)

In [5]:
# Build the Rotten Tomatoes dataset object from the filtered train/test
# rating files plus the movie and user tables.
#
# Settings that also exist in `args` are read from it instead of being
# repeated as literals, so the dataset and the training configuration cannot
# drift apart. With the current config the values are unchanged:
# 'rotten', 0, False, False, True and 0.1.
dataset = RottenMovie(
    train_data='./data/trainset_filtered.csv',
    test_data='./data/testset_filtered.csv',
    movie_data='./data/rotten_tomatoes_movies.csv',
    user_data='./data/rotten_user_table.csv',
    # NOTE(review): presumably these toggle emotion / sentiment inputs inside
    # RottenMovie; their exact effect is not visible here. Confirm in
    # data_rotten.py.
    emotion=True,
    sentiment=True,
    name=args.data_name,
    device=args.device,
    mix_cpu_gpu=args.mix_cpu_gpu,
    use_one_hot_fea=args.use_one_hot_fea,
    # Assumes `symm` is the symmetric-normalisation flag that the config
    # stores as gcn_agg_norm_symm. TODO confirm against data_rotten.py.
    symm=args.gcn_agg_norm_symm,
    valid_ratio=args.data_valid_ratio,
)

......1: 데이터 로드
......3: Train/Valid 분리
All rating pairs : 245094
	All train rating pairs : 216328
		Train rating pairs : 194695
		Valid rating pairs : 21633
	Test rating pairs  : 28766
......4: User/Movie를 Global id에 매핑
Total user number = 9821, movie number = 17712
......5: features 생성
Feature dim: 
user: torch.Size([9821, 1627])
movie: torch.Size([17712, 339])
......6: Graph Encoder/Decoder 생성
......7: Graph 결과 출력
Train enc graph: 	#user:9821	#movie:17712	#pairs:194695
Train dec graph: 	#user:9821	#movie:17712	#pairs:194695
Valid enc graph: 	#user:9821	#movie:17712	#pairs:21633
Valid dec graph: 	#user:9821	#movie:17712	#pairs:21633
Test enc graph: 	#user:9821	#movie:17712	#pairs:28766
Test dec graph: 	#user:9821	#movie:17712	#pairs:28766


In [6]:
# Display the training encoder graph; its repr lists the node counts per node
# type and the edge counts per relation.
dataset.train_enc_graph

Graph(num_nodes={'movie': 17712, 'user': 9821},
      num_edges={('movie', 'rev-0_5', 'user'): 3032, ('movie', 'rev-11', 'user'): 20134, ('movie', 'rev-12', 'user'): 43945, ('movie', 'rev-13', 'user'): 32352, ('movie', 'rev-14', 'user'): 58307, ('movie', 'rev-15', 'user'): 39957, ('movie', 'rev-16', 'user'): 27766, ('movie', 'rev-17', 'user'): 5480, ('movie', 'rev-18', 'user'): 133882, ('movie', 'rev-19', 'user'): 3928, ('movie', 'rev-1_0', 'user'): 5648, ('movie', 'rev-1_5', 'user'): 11377, ('movie', 'rev-20', 'user'): 18423, ('movie', 'rev-21', 'user'): 5216, ('movie', 'rev-2_0', 'user'): 17699, ('movie', 'rev-2_5', 'user'): 31603, ('movie', 'rev-3_0', 'user'): 30080, ('movie', 'rev-3_5', 'user'): 46477, ('movie', 'rev-4_0', 'user'): 26858, ('movie', 'rev-4_5', 'user'): 4074, ('movie', 'rev-5_0', 'user'): 17847, ('user', '0_5', 'movie'): 3032, ('user', '11', 'movie'): 20134, ('user', '12', 'movie'): 43945, ('user', '13', 'movie'): 32352, ('user', '14', 'movie'): 58307, ('user', '15',

In [7]:
# Show the edge count reported by the training encoder graph.
dataset.train_enc_graph.number_of_edges()

1168170

In [11]:
# Training entry point from the project-local `train` module.
from train import train

# Copy the dataset's rating values onto the config object; `args` is what gets
# passed to train() alongside the dataset.
args.rating_vals = dataset.rating_values

In [12]:
# Sweep over GCN output dimensions. For every dimension the value returned by
# train() is collected per run, and the wall-clock time is printed both per
# dimension and for the sweep as a whole.
start_time = time.time()

best_test_rmse = []  # one inner list of train() results per dimension

for dim in (75, 150):
    rmse_list = []  # results of the runs for this dimension
    args.gcn_out_units = dim
    print(f"start dimension: {dim}")

    mid_time = time.time()
    for i in range(1):
        # Point this run at its own save directory before training.
        args.save_dir = f'./save/feature_es_{dim}/run_{i}'
        args.save_id = 'feature_es'
        test_rmse = train(args, dataset)
        rmse_list.append(test_rmse)

    best_test_rmse.append(rmse_list)
    dim_elapsed = format_time(time.time() - mid_time)
    print(f"  1 model time epoch took: {dim_elapsed}")

total_elapsed = format_time(time.time() - start_time)
print(f"  Training epoch took: {total_elapsed}")

training...
Best Iter Idx=563, Best Valid RMSE=0.7582, Best Test RMSE=0.7389
training...
Best Iter Idx=871, Best Valid RMSE=0.7560, Best Test RMSE=0.7312
  Training epoch took: 0:04:18


In [13]:
best_test_rmse

[[0.738899617866152], [0.7312486534432296]]

In [None]:
# Full sweep: 10 training runs for each GCN output dimension.
#
# The value returned by train() (stored as the test RMSE by the earlier sweep
# cell) is collected per dimension in `sweep_test_rmse`, so the outcome of the
# 30 runs is available in the notebook instead of being discarded.
#
# NOTE(review): for dims 75 and 150, run_0 uses the same save directory as the
# earlier sweep cell; anything train() stores there may be overwritten.
# Confirm this is intended.
sweep_test_rmse = {}
for dim in [75, 150, 225]:
    args.gcn_out_units = dim
    sweep_test_rmse[dim] = []
    for i in range(10):
        args.save_dir = f'./save/feature_es_{dim}/run_{i}'
        args.save_id = 'feature_es'
        sweep_test_rmse[dim].append(train(args, dataset))