# Stacking Ensemble

In [1]:
import time
import datetime
# Helper for displaying elapsed time.
def format_time(elapsed):
    """Render a duration given in seconds as an ``h:mm:ss`` string.

    Args:
        elapsed: Duration in seconds (int or float).

    Returns:
        ``str(datetime.timedelta)`` of the duration after rounding it to a
        whole number of seconds.
    """
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds=whole_seconds)
    return str(delta)

# Smoke test: time an (empty) interval and print it through format_time.
start_time = time.time()
print("  Training epoch took: {:}".format(format_time(time.time() - start_time)))

  Training epoch took: 0:00:00


In [35]:
"""Training IGMC model on the MovieLens dataset."""

import os
import sys
import time
import glob
import random
import argparse
from shutil import copy

import numpy as np
import torch as th
import torch.nn as nn
import torch.optim as optim

from model import IGMC
from explicit_data_rotten import RottenTomato
from dataset_rotten import RottenTomatoDataset, collate_rotten_tomato
from utils import MetricLogger

In [173]:
def evaluate(label_type, model, loader, device):
    """Score ``model`` on ``loader`` and return RMSE plus the raw outputs.

    Predictions and labels are rescaled before scoring: ``(x + 1) / 2`` when
    ``label_type`` is ``'rating'``, ``x + 1`` for any other label type.

    Args:
        label_type: ``'rating'`` selects the halved rescaling; every other
            value selects the plain ``+ 1`` shift.
        model: Callable model; it is switched to eval mode.
        loader: Iterable of batches where ``batch[0]`` is the model input and
            ``batch[1]`` the labels. Must expose ``loader.dataset`` with a
            length, which is used as the RMSE denominator.
        device: Device the inputs and labels are moved to.

    Returns:
        Tuple ``(rmse, predict_list, label_list)`` where the two lists hold
        the rescaled predictions and labels of every batch, in loader order.
    """
    model.eval()

    halve = label_type == 'rating'
    sq_err_total = 0.
    predict_list = []
    label_list = []

    for batch in loader:
        # Only the forward pass needs gradient tracking disabled.
        with th.no_grad():
            outputs = model(batch[0].to(device))
        targets = batch[1].to(device)

        if halve:
            outputs, targets = (outputs + 1)/2, (targets + 1)/2
        else:
            outputs, targets = outputs + 1, targets + 1

        sq_err_total += ((outputs - targets) ** 2).sum().item()

        # Extending per batch yields the same flat list as collecting the
        # per-batch lists and flattening them one level afterwards.
        predict_list.extend(outputs.tolist())
        label_list.extend(targets.tolist())

    rmse = np.sqrt(sq_err_total / len(loader.dataset))
    return rmse, predict_list, label_list


def adj_rating_reg(model):
    """Sum the squared differences between weights of consecutive relations.

    For every layer in ``model.convs`` the per-relation weights are rebuilt
    as ``w_comp @ weight`` (bases flattened to ``in_feat * out_feat``) and
    reshaped to ``(num_rels, in_feat, out_feat)``; the penalty is the squared
    difference between each relation's weights and the previous relation's.

    Args:
        model: Object exposing ``convs``, an iterable of layers with
            ``weight``, ``w_comp``, ``num_bases``, ``num_rels``, ``in_feat``
            and ``out_feat`` attributes.

    Returns:
        The accumulated penalty (``0`` when ``model.convs`` is empty).
    """
    def _layer_penalty(layer):
        # Flatten each basis matrix so the relation coefficients can mix them.
        flat_bases = layer.weight.view(layer.num_bases, layer.in_feat * layer.out_feat)
        per_relation = th.matmul(layer.w_comp, flat_bases).view(
            layer.num_rels, layer.in_feat, layer.out_feat)
        gaps = per_relation[1:, :, :] - per_relation[:-1, :, :]
        return th.sum(gaps**2)

    return sum(_layer_penalty(layer) for layer in model.convs)

# @profile
def train_epoch(label_type, model, loss_fn, optimizer, arr_lambda, loader, device, log_interval):
    """Run one optimisation pass over ``loader`` and collect the outputs.

    Predictions and labels are rescaled before the loss is computed:
    ``(x + 1) / 2`` for ``'rating'``, ``x + 1`` otherwise. The adjacent-rating
    regulariser (``arr_lambda * adj_rating_reg(model)``) is added to the loss
    for every label type except ``'emotion'``.

    Args:
        label_type: ``'rating'``, ``'emotion'`` or any other label name.
        model: Model to train; switched to train mode.
        loss_fn: Callable ``loss_fn(preds, labels)`` returning a tensor.
        optimizer: Optimizer stepping ``model``'s parameters.
        arr_lambda: Weight of the adjacent-rating regulariser.
        loader: Iterable of batches (``batch[0]`` inputs, ``batch[1]`` labels)
            exposing ``loader.dataset`` with a length.
        device: Device the inputs and labels are moved to.
        log_interval: Progress is printed every ``log_interval`` iterations.

    Returns:
        Tuple ``(train_epoch_loss, predict_list, label_list)``: the
        sample-weighted mean loss over the dataset and the rescaled
        predictions / labels of every batch in iteration order.
    """
    model.train()

    halve = label_type == 'rating'
    skip_arr = label_type == 'emotion'

    running_total = 0.   # sample-weighted loss summed over the whole epoch
    window_loss = 0.     # same, but reset after every progress line
    window_sq_err = 0.
    window_count = 0
    step_seconds = []

    # Storage for predictions and ground-truth labels.
    predict_list = []
    label_list = []

    # One optimisation step per batch of subgraphs.
    for iter_idx, batch in enumerate(loader, start=1):
        tick = time.time()

        graphs = batch[0].to(device)
        targets = batch[1].to(device)
        outputs = model(graphs)

        if halve:
            outputs, targets = (outputs + 1)/2, (targets + 1)/2
        else:
            outputs, targets = outputs + 1, targets + 1

        loss = loss_fn(outputs, targets).mean()
        if not skip_arr:
            loss = loss + arr_lambda * adj_rating_reg(model)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        n_samples = outputs.shape[0]
        weighted_loss = loss.item() * n_samples
        running_total += weighted_loss
        window_loss += weighted_loss
        window_sq_err += ((outputs - targets) ** 2).sum().item()
        window_count += n_samples
        step_seconds.append(time.time() - tick)

        # Extending per batch gives the same flat list as a later flatten.
        predict_list.extend(outputs.tolist())
        label_list.extend(targets.tolist())

        if iter_idx % log_interval == 0:
            print("Iter={}, loss={:.4f}, mse={:.4f}, time={:.4f}".format(
                iter_idx, window_loss/window_count, window_sq_err/window_count,
                np.average(step_seconds)))
            window_loss = 0.
            window_sq_err = 0.
            window_count = 0

    train_epoch_loss = running_total / len(loader.dataset)
    return train_epoch_loss, predict_list, label_list


def train(args, label_type='rating', data_path='./raw_data/rotten_tomato/'):
    """Train one IGMC model on Rotten Tomatoes data and log per-epoch metrics.

    The data is loaded with ``RottenTomato`` / ``RottenTomatoDataset`` /
    ``collate_rotten_tomato`` (the loaders this file imports), and
    ``label_type`` is forwarded to ``train_epoch`` and ``evaluate``, whose
    ``(metric, predictions, labels)`` results are unpacked here.

    Args:
        args: Namespace with the fields produced by ``config()`` (or the
            notebook's ``args`` dict), including ``save_dir`` and ``device``.
        label_type: ``'rating'``, ``'sentiment'`` or ``'emotion'``.
        data_path: Directory passed to ``RottenTomato``.

    Raises:
        ValueError: If ``label_type`` is not one of the supported values.
    """
    # Relation counts mirror the per-label model setup used in this notebook.
    relations_by_label = {'rating': 10, 'sentiment': 5, 'emotion': 6}
    if label_type not in relations_by_label:
        raise ValueError("Unsupported label_type: {!r}".format(label_type))

    rotten_tomato = RottenTomato(label_type, data_path, testing=args.testing,
                                 test_ratio=args.data_test_ratio,
                                 valid_ratio=args.data_valid_ratio)

    # Testing mode evaluates on the test split, otherwise on the validation split.
    if args.testing:
        eval_pairs = rotten_tomato.test_rating_pairs
        eval_values = rotten_tomato.test_rating_values
    else:
        eval_pairs = rotten_tomato.valid_rating_pairs
        eval_values = rotten_tomato.valid_rating_values
    test_dataset = RottenTomatoDataset(
        eval_pairs, eval_values, rotten_tomato.train_graph,
        args.hop, args.sample_ratio, args.max_nodes_per_hop)
    train_dataset = RottenTomatoDataset(
        rotten_tomato.train_rating_pairs, rotten_tomato.train_rating_values,
        rotten_tomato.train_graph,
        args.hop, args.sample_ratio, args.max_nodes_per_hop)

    train_loader = th.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=True,
        num_workers=args.num_workers, collate_fn=collate_rotten_tomato)
    test_loader = th.utils.data.DataLoader(
        test_dataset, batch_size=args.batch_size, shuffle=False,
        num_workers=args.num_workers, collate_fn=collate_rotten_tomato)

    in_feats = (args.hop+1)*2
    model = IGMC(in_feats=in_feats,
                 latent_dim=[32, 32, 32, 32],
                 num_relations=relations_by_label[label_type],
                 num_bases=4,
                 regression=True,
                 edge_dropout=args.edge_dropout,
            ).to(args.device)
    loss_fn = nn.MSELoss().to(args.device)
    optimizer = optim.Adam(model.parameters(), lr=args.train_lr, weight_decay=0)
    print("Loading network finished ...\n")

    ### prepare the logger
    logger = MetricLogger(args.save_dir, args.valid_log_interval)

    best_epoch = 0
    best_rmse = np.inf
    ### declare the loss information
    print("Start training ...")
    for epoch_idx in range(1, args.train_epochs+1):
        print ('Epoch', epoch_idx)

        # Both helpers also return prediction/label lists; only the scalar
        # metric is needed here.
        train_loss, _, _ = train_epoch(label_type, model, loss_fn, optimizer, args.arr_lambda,
                                       train_loader, args.device, args.train_log_interval)
        test_rmse, _, _ = evaluate(label_type, model, test_loader, args.device)
        eval_info = {
            'epoch': epoch_idx,
            'train_loss': train_loss,
            'test_rmse': test_rmse,
        }
        print('=== Epoch {}, train loss {:.6f}, test rmse {:.6f} ==='.format(*eval_info.values()))

        # Step-wise learning-rate decay.
        if epoch_idx % args.train_lr_decay_step == 0:
            for param in optimizer.param_groups:
                param['lr'] = args.train_lr_decay_factor * param['lr']

        logger.log(eval_info, model, optimizer)
        if best_rmse > test_rmse:
            best_rmse = test_rmse
            best_epoch = epoch_idx
    eval_info = "Training ends. The best testing rmse is {:.6f} at epoch {}".format(best_rmse, best_epoch)
    print(eval_info)
    with open(os.path.join(args.save_dir, 'log.txt'), 'a') as f:
        f.write(eval_info)

In [149]:
def config():
    """Parse command-line options and prepare the run's output directory.

    Besides parsing, this resolves ``args.device`` to a ``torch.device``,
    creates ``args.save_dir`` (named after the dataset, the save appendix,
    the test/validation mode and the local time), copies the ``*.py`` files
    of the current directory into it and appends the command line to
    ``cmd_input.txt`` there.

    Returns:
        The parsed ``argparse.Namespace`` with ``device`` and ``save_dir`` set.
    """
    parser = argparse.ArgumentParser(description='IGMC')
    # general settings
    parser.add_argument('--testing', action='store_true', default=False,
                        help='if set, use testing mode which splits all ratings into train/test;\
                        otherwise, use validation model which splits all ratings into \
                        train/val/test and evaluate on val only')
    parser.add_argument('--device', default=0, type=int,
                        help='Running device. E.g `--device 0`, if using cpu, set `--device -1`')
    parser.add_argument('--seed', type=int, default=1234, metavar='S',
                        help='random seed (default: 1234)')
    parser.add_argument('--data_name', default='ml-100k', type=str,
                        help='The dataset name: ml-100k, ml-1m')
    parser.add_argument('--data_test_ratio', type=float, default=0.1) # for ml-100k the test ration is 0.2
    parser.add_argument('--num_workers', type=int, default=8)
    parser.add_argument('--data_valid_ratio', type=float, default=0.2)
    # parser.add_argument('--ensemble', action='store_true', default=False,
    #                     help='if True, load a series of model checkpoints and ensemble the results')
    parser.add_argument('--train_log_interval', type=int, default=100)
    parser.add_argument('--valid_log_interval', type=int, default=10)
    parser.add_argument('--save_appendix', type=str, default='debug',
                        help='what to append to save-names when saving results')
    # subgraph extraction settings
    # type=int is required: without it a value given on the command line
    # stays a string and arithmetic such as (args.hop+1)*2 fails.
    parser.add_argument('--hop', type=int, default=1, metavar='S',
                        help='enclosing subgraph hop number')
    parser.add_argument('--sample_ratio', type=float, default=1.0,
                        help='if < 1, subsample nodes per hop according to the ratio')
    parser.add_argument('--max_nodes_per_hop', type=int, default=200,
                        help='if > 0, upper bound the # nodes per hop by another subsampling')
    # parser.add_argument('--use_features', action='store_true', default=False,
    #                     help='whether to use node features (side information)')
    # edge dropout settings
    parser.add_argument('--edge_dropout', type=float, default=0.2,
                        help='if not 0, random drops edges from adjacency matrix with this prob')
    parser.add_argument('--force_undirected', action='store_true', default=False,
                        help='in edge dropout, force (x, y) and (y, x) to be dropped together')
    # optimization settings
    parser.add_argument('--train_lr', type=float, default=1e-3)
    parser.add_argument('--train_min_lr', type=float, default=1e-6)
    parser.add_argument('--train_lr_decay_factor', type=float, default=0.1)
    parser.add_argument('--train_lr_decay_step', type=int, default=50)
    parser.add_argument('--train_epochs', type=int, default=80)
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--arr_lambda', type=float, default=0.001)
    parser.add_argument('--num_rgcn_bases', type=int, default=4)

    args = parser.parse_args()
    # A negative index, or no CUDA at all, falls back to the CPU.
    args.device = th.device(args.device) if args.device >= 0 and th.cuda.is_available() else th.device('cpu')

    ### set save_dir according to localtime and test mode
    # '__file__' is a string literal here, so the path resolves against the
    # current working directory.
    file_dir = os.path.dirname(os.path.realpath('__file__'))
    val_test_appendix = 'testmode' if args.testing else 'valmode'
    local_time = time.strftime('%y%m%d%H%M', time.localtime())
    args.save_dir = os.path.join(
        file_dir, 'log/{}_{}_{}_{}'.format(
            args.data_name, args.save_appendix, val_test_appendix, local_time
        )
    )
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(args.save_dir, exist_ok=True)
    print(args)

    # backup current .py files
    for f in glob.glob(r"*.py"):
        copy(f, args.save_dir)

    # save command line input
    cmd_input = 'python3 ' + ' '.join(sys.argv)
    with open(os.path.join(args.save_dir, 'cmd_input.txt'), 'a') as f:
        f.write(cmd_input)
        f.write("\n")
    print('Command line input: ' + cmd_input + ' is saved.')

    return args

## 1. Config

In [150]:
import easydict

# Notebook stand-in for config(): the same option names, set by hand.
# 'train_epochs' used to be listed twice (10, then 1); a dict literal keeps
# only the last value, so the effective setting of 1 is stated once here.
args = easydict.EasyDict({
    'data_name':            'rotten',
    'testing':              True,
    'device':               0,
    'seed':                 1234,
    'data_test_ratio':      0.1,
    'num_workers':          8,
    'data_valid_ratio':     0.2,
    'train_log_interval':   200,
    'valid_log_interval':   10,
    'save_appendix':        'debug',
    'hop':                  1,
    'sample_ratio':         1.0,
    'max_nodes_per_hop':    100,
    'edge_dropout':         0.2,
    'force_undirected':     False,
    'train_lr':             1e-3,
    'train_min_lr':         1e-6,
    'train_lr_decay_factor':0.1,
    'train_lr_decay_step':  50,
    'train_epochs':         1,
    'batch_size':           32,
    'arr_lambda':           0.001,
    'num_rgcn_bases':       4,
})

In [151]:
### set save_dir according to localtime and test mode
# Same output-directory setup as the tail of config(), applied to the
# notebook-defined args.
# '__file__' is a string literal, so the path resolves against the current
# working directory.
file_dir = os.path.dirname(os.path.realpath('__file__'))
val_test_appendix = 'testmode' if args.testing else 'valmode'
local_time = time.strftime('%y%m%d%H%M', time.localtime())
args.save_dir = os.path.join(
    file_dir, 'log/{}_{}_{}_{}'.format(
        args.data_name, args.save_appendix, val_test_appendix, local_time
    )
)
if not os.path.exists(args.save_dir):
    os.makedirs(args.save_dir) 
print(args)

# backup current .py files
for f in glob.glob(r"*.py"):
    copy(f, args.save_dir)

# save command line input
# (inside a notebook sys.argv holds the kernel launcher's arguments)
cmd_input = 'python3 ' + ' '.join(sys.argv)
with open(os.path.join(args.save_dir, 'cmd_input.txt'), 'a') as f:
    f.write(cmd_input)
    f.write("\n")
print('Command line input: ' + cmd_input + ' is saved.')

{'data_name': 'rotten', 'testing': True, 'device': 0, 'seed': 1234, 'data_test_ratio': 0.1, 'num_workers': 8, 'data_valid_ratio': 0.2, 'train_log_interval': 200, 'valid_log_interval': 10, 'save_appendix': 'debug', 'hop': 1, 'sample_ratio': 1.0, 'max_nodes_per_hop': 100, 'edge_dropout': 0.2, 'force_undirected': False, 'train_lr': 0.001, 'train_min_lr': 1e-06, 'train_lr_decay_factor': 0.1, 'train_lr_decay_step': 50, 'train_epochs': 1, 'batch_size': 32, 'arr_lambda': 0.001, 'num_rgcn_bases': 4, 'save_dir': 'C:\\Users\\user\\Jupyter_project\\keejun\\IGMC_CX\\log/rotten_debug_testmode_2111162042'}
Command line input: python3 C:\Users\user\anaconda3\envs\graph\lib\site-packages\ipykernel_launcher.py -f C:\Users\user\AppData\Roaming\jupyter\runtime\kernel-0ac5dae0-92ba-4f16-b89e-9354de532105.json is saved.


In [125]:
# Seed the Python, NumPy and PyTorch (CPU and, when present, all CUDA
# devices) random number generators from args.seed.
random.seed(args.seed)
np.random.seed(args.seed)
th.manual_seed(args.seed)
if th.cuda.is_available():
    th.cuda.manual_seed_all(args.seed)    

## 2. Model Setting

In [126]:
### prepare data and set model
# One RottenTomato loader per label type (rating / sentiment / emotion), all
# built from the same directory with the same split settings.
path = './raw_data/rotten_tomato/'
rotten_tomato_r = RottenTomato('rating',    path, testing=args.testing,test_ratio=args.data_test_ratio, valid_ratio=args.data_valid_ratio)
rotten_tomato_s = RottenTomato('sentiment', path, testing=args.testing,test_ratio=args.data_test_ratio, valid_ratio=args.data_valid_ratio)
rotten_tomato_e = RottenTomato('emotion',   path, testing=args.testing,test_ratio=args.data_test_ratio, valid_ratio=args.data_valid_ratio)

Label_type: rating
	Train rating pairs : 216328
	Valid rating pairs : 43266
	Test rating pairs  : 28766
Label_type: sentiment
	Train rating pairs : 216328
	Valid rating pairs : 43266
	Test rating pairs  : 28766
Label_type: emotion
	Train rating pairs : 216328
	Valid rating pairs : 43266
	Test rating pairs  : 28766


In [209]:

# Build the test and validation datasets for each label type
# (_r rating, _s sentiment, _e emotion). Every dataset is built from the
# corresponding loader's train_graph.
print('testing')
test_dataset_r = RottenTomatoDataset(
    rotten_tomato_r.test_rating_pairs, rotten_tomato_r.test_rating_values, rotten_tomato_r.train_graph, 
    args.hop, args.sample_ratio, args.max_nodes_per_hop) 
test_dataset_s = RottenTomatoDataset(
    rotten_tomato_s.test_rating_pairs, rotten_tomato_s.test_rating_values, rotten_tomato_s.train_graph, 
    args.hop, args.sample_ratio, args.max_nodes_per_hop) 
test_dataset_e = RottenTomatoDataset(
    rotten_tomato_e.test_rating_pairs, rotten_tomato_e.test_rating_values, rotten_tomato_e.train_graph, 
    args.hop, args.sample_ratio, args.max_nodes_per_hop) 

print('valid')
valid_dataset_r = RottenTomatoDataset(
    rotten_tomato_r.valid_rating_pairs, rotten_tomato_r.valid_rating_values, rotten_tomato_r.train_graph, 
    args.hop, args.sample_ratio, args.max_nodes_per_hop)
valid_dataset_s = RottenTomatoDataset(
    rotten_tomato_s.valid_rating_pairs, rotten_tomato_s.valid_rating_values, rotten_tomato_s.train_graph, 
    args.hop, args.sample_ratio, args.max_nodes_per_hop)
valid_dataset_e = RottenTomatoDataset(
    rotten_tomato_e.valid_rating_pairs, rotten_tomato_e.valid_rating_values, rotten_tomato_e.train_graph, 
    args.hop, args.sample_ratio, args.max_nodes_per_hop)

testing
valid


In [212]:
print(len(valid_dataset_r))
print(len(test_dataset_r))

43266
28766


In [128]:
# Training datasets for the three label types, built with the same hop and
# sampling settings as the validation/test datasets.
train_dataset_r = RottenTomatoDataset(
    rotten_tomato_r.train_rating_pairs, rotten_tomato_r.train_rating_values, rotten_tomato_r.train_graph, 
    args.hop, args.sample_ratio, args.max_nodes_per_hop)

train_dataset_s = RottenTomatoDataset(
    rotten_tomato_s.train_rating_pairs, rotten_tomato_s.train_rating_values, rotten_tomato_s.train_graph, 
    args.hop, args.sample_ratio, args.max_nodes_per_hop)

train_dataset_e = RottenTomatoDataset(
    rotten_tomato_e.train_rating_pairs, rotten_tomato_e.train_rating_values, rotten_tomato_e.train_graph, 
    args.hop, args.sample_ratio, args.max_nodes_per_hop)

In [216]:
# DataLoaders for every (label type, split) pair. Only the training loaders
# shuffle; validation and test loaders keep dataset order, so the prediction
# lists collected from them line up with the dataset rows.
train_loader_r = th.utils.data.DataLoader(train_dataset_r, batch_size=args.batch_size, shuffle=True, 
                        num_workers=args.num_workers, collate_fn=collate_rotten_tomato)
valid_loader_r = th.utils.data.DataLoader(valid_dataset_r, batch_size=args.batch_size, shuffle=False, 
                        num_workers=args.num_workers, collate_fn=collate_rotten_tomato)
test_loader_r = th.utils.data.DataLoader(test_dataset_r, batch_size=args.batch_size, shuffle=False, 
                        num_workers=args.num_workers, collate_fn=collate_rotten_tomato)

train_loader_s = th.utils.data.DataLoader(train_dataset_s, batch_size=args.batch_size, shuffle=True, 
                        num_workers=args.num_workers, collate_fn=collate_rotten_tomato)
valid_loader_s = th.utils.data.DataLoader(valid_dataset_s, batch_size=args.batch_size, shuffle=False, 
                        num_workers=args.num_workers, collate_fn=collate_rotten_tomato)
test_loader_s = th.utils.data.DataLoader(test_dataset_s, batch_size=args.batch_size, shuffle=False, 
                        num_workers=args.num_workers, collate_fn=collate_rotten_tomato)

train_loader_e = th.utils.data.DataLoader(train_dataset_e, batch_size=args.batch_size, shuffle=True, 
                        num_workers=args.num_workers, collate_fn=collate_rotten_tomato)
valid_loader_e = th.utils.data.DataLoader(valid_dataset_e, batch_size=args.batch_size, shuffle=False, 
                        num_workers=args.num_workers, collate_fn=collate_rotten_tomato)
test_loader_e = th.utils.data.DataLoader(test_dataset_e, batch_size=args.batch_size, shuffle=False, 
                        num_workers=args.num_workers, collate_fn=collate_rotten_tomato)

In [174]:
# One IGMC model and one Adam optimizer per label type. The models differ
# only in num_relations (10 / 5 / 6); a single MSE loss object is shared.
in_feats = (args.hop+1)*2 #+ rotten_tomato.train_graph.ndata['refex'].shape[1]

# rating
model_r = IGMC(in_feats=in_feats, 
             latent_dim=[32, 32, 32, 32],
             num_relations=10, # rotten_tomato.num_rating, 
             num_bases=4, 
             regression=True, 
             edge_dropout=args.edge_dropout,
        ).to(args.device)

# sentiment
model_s = IGMC(in_feats=in_feats, 
             latent_dim=[32, 32, 32, 32],
             num_relations=5, # rotten_tomato.num_rating, 
             num_bases=4, 
             regression=True, 
             edge_dropout=args.edge_dropout,
        ).to(args.device)

# emotion
model_e = IGMC(in_feats=in_feats, 
             latent_dim=[32, 32, 32, 32],
             num_relations=6, # rotten_tomato.num_rating, 
             num_bases=4, 
             regression=True, 
             edge_dropout=args.edge_dropout,
        ).to(args.device)


loss_fn = nn.MSELoss().to(args.device)
optimizer_r = optim.Adam(model_r.parameters(), lr=args.train_lr, weight_decay=0)
optimizer_s = optim.Adam(model_s.parameters(), lr=args.train_lr, weight_decay=0)
optimizer_e = optim.Adam(model_e.parameters(), lr=args.train_lr, weight_decay=0)

## 3. Training

### 3-1. rating

In [232]:
# Train the rating model, evaluating on the validation and test splits after
# every epoch and remembering the outputs of the best epoch.
label_type = 'rating'

### prepare the logger
logger = MetricLogger(args.save_dir, args.valid_log_interval)

best_epoch = 0
best_rmse = np.inf
### declare the loss information
print("Start training ...")

# The plain lists hold the outputs of the most recent epoch; the best_* lists
# hold the outputs of the epoch with the lowest test RMSE so far.
predict_train_list = list()
label_train_list = list()

predict_valid_list = list()
label_valid_list = list()
best_predict_valid_list = list()
best_label_valid_list = list()

predict_test_list = list()
label_test_list = list()
best_predict_test_list = list()
best_label_test_list = list()

start_time = time.time()
# NOTE(review): runs a fixed 79 epochs and ignores args.train_epochs.
for epoch_idx in range(1, 80):
    print ('Epoch', epoch_idx)

    train_loss, predict_train_list, label_train_list = train_epoch(
        label_type, model_r, loss_fn, optimizer_r, args.arr_lambda,
        train_loader_r, args.device, args.train_log_interval)
    # NOTE(review): valid_rmse is computed but neither logged nor used for
    # model selection; the best epoch is chosen on the test RMSE.
    valid_rmse, predict_valid_list, label_valid_list = evaluate(label_type, model_r, valid_loader_r, args.device)
    test_rmse, predict_test_list, label_test_list = evaluate(label_type, model_r, test_loader_r, args.device)

    eval_info = {
        'epoch': epoch_idx,
        'train_loss': train_loss,
        'test_rmse': test_rmse,
    }
    print('=== Epoch {}, train loss {:.6f}, test rmse {:.6f} ==='.format(*eval_info.values()))

    if epoch_idx % args.train_lr_decay_step == 0:
        # Decay the optimizer that trains model_r. The bare name `optimizer`
        # is only bound by an unrelated test cell further down the notebook.
        for param in optimizer_r.param_groups:
            param['lr'] = args.train_lr_decay_factor * param['lr']

    logger.log(eval_info, model_r, optimizer_r)
    if best_rmse > test_rmse:
        best_rmse = test_rmse
        best_epoch = epoch_idx

        best_predict_valid_list = predict_valid_list
        best_label_valid_list = label_valid_list

        best_predict_test_list = predict_test_list
        best_label_test_list = label_test_list

eval_info = "Training ends. The best testing rmse is {:.6f} at epoch {}".format(best_rmse, best_epoch)
print(eval_info)
print("  Training epoch took: {:}".format(format_time(time.time() - start_time)))

Start training ...
Epoch 1
Iter=200, loss=2.4773, mse=2.4714, time=0.0436
Iter=400, loss=2.5149, mse=2.5088, time=0.0421
Iter=600, loss=2.5328, mse=2.5267, time=0.0417
Iter=800, loss=2.5247, mse=2.5188, time=0.0420
Iter=1000, loss=2.4946, mse=2.4886, time=0.0423
Iter=1200, loss=2.4882, mse=2.4822, time=0.0422
Iter=1400, loss=2.4536, mse=2.4476, time=0.0421
Iter=1600, loss=2.5476, mse=2.5415, time=0.0421
Iter=1800, loss=2.5210, mse=2.5149, time=0.0421
Iter=2000, loss=2.5425, mse=2.5364, time=0.0422
Iter=2200, loss=2.5566, mse=2.5505, time=0.0426
Iter=2400, loss=2.4756, mse=2.4696, time=0.0430
Iter=2600, loss=2.5307, mse=2.5249, time=0.0430
Iter=2800, loss=2.5427, mse=2.5370, time=0.0429
Iter=3000, loss=2.5281, mse=2.5225, time=0.0429
Iter=3200, loss=2.4675, mse=2.4618, time=0.0429
Iter=3400, loss=2.5806, mse=2.5747, time=0.0431
Iter=3600, loss=2.5870, mse=2.5808, time=0.0434
Iter=3800, loss=2.4086, mse=2.4024, time=0.0433
Iter=4000, loss=2.4483, mse=2.4423, time=0.0432
Iter=4200, loss=2

- 결과 리스트

In [233]:
print(len(predict_train_list))
print(len(label_train_list))

216328
216328


In [234]:
print(len(best_predict_valid_list))
print(len(best_label_valid_list))

43266
43266


In [235]:
print(len(best_predict_test_list))
print(len(best_label_test_list))

28766
28766


In [245]:
import pandas as pd
# Pair every prediction with its label, one row per sample.
# NOTE(review): the train and valid frames use the outputs of the last epoch
# run, whereas test_rating_df uses the best-epoch outputs; test_rating_df2
# is the last-epoch counterpart of the test frame.
train_rating_df = pd.DataFrame([x for x in zip(predict_train_list, label_train_list)])
train_rating_df.rename(columns={0:'predict', 1:'label'}, inplace=True)

valid_rating_df = pd.DataFrame([x for x in zip(predict_valid_list, label_valid_list)])
valid_rating_df.rename(columns={0:'predict', 1:'label'}, inplace=True)

test_rating_df = pd.DataFrame([x for x in zip(best_predict_test_list, best_label_test_list)])
test_rating_df.rename(columns={0:'predict', 1:'label'}, inplace=True)

test_rating_df2 = pd.DataFrame([x for x in zip(predict_test_list, label_test_list)])
test_rating_df2.rename(columns={0:'predict', 1:'label'}, inplace=True)

In [246]:
# Write the rating-model outputs to CSV (without the index column).
# Rebinds the global `path` to the ensemble directory.
path = './raw_data/rotten_tomato/ensemble/'
train_rating_df.to_csv(path + 'train_rating.csv', index=False)
valid_rating_df.to_csv(path + 'valid_rating.csv', index=False)
test_rating_df.to_csv(path + 'test_rating.csv', index=False)
test_rating_df2.to_csv(path + 'test_rating2.csv', index=False)

### 3-2. sentiment

In [247]:
# Train the sentiment model, evaluating on the validation and test splits
# after every epoch and remembering the outputs of the best epoch.
label_type = 'sentiment'

### prepare the logger
logger = MetricLogger(args.save_dir, args.valid_log_interval)

best_epoch = 0
best_rmse = np.inf
### declare the loss information
print("Start training ...")

# The plain lists hold the outputs of the most recent epoch; the best_* lists
# hold the outputs of the epoch with the lowest test RMSE so far.
predict_train_list = list()
label_train_list = list()

predict_valid_list = list()
label_valid_list = list()
best_predict_valid_list = list()
best_label_valid_list = list()

predict_test_list = list()
label_test_list = list()
best_predict_test_list = list()
best_label_test_list = list()

start_time = time.time()
# NOTE(review): runs a fixed 79 epochs and ignores args.train_epochs.
for epoch_idx in range(1, 80):
    print ('Epoch', epoch_idx)

    train_loss, predict_train_list, label_train_list = train_epoch(
        label_type, model_s, loss_fn, optimizer_s, args.arr_lambda,
        train_loader_s, args.device, args.train_log_interval)
    # NOTE(review): valid_rmse is computed but neither logged nor used for
    # model selection; the best epoch is chosen on the test RMSE.
    valid_rmse, predict_valid_list, label_valid_list = evaluate(label_type, model_s, valid_loader_s, args.device)
    test_rmse, predict_test_list, label_test_list = evaluate(label_type, model_s, test_loader_s, args.device)

    eval_info = {
        'epoch': epoch_idx,
        'train_loss': train_loss,
        'test_rmse': test_rmse,
    }
    print('=== Epoch {}, train loss {:.6f}, test rmse {:.6f} ==='.format(*eval_info.values()))

    if epoch_idx % args.train_lr_decay_step == 0:
        # Decay the optimizer that trains model_s. The bare name `optimizer`
        # refers to a different optimizer (or to nothing in a fresh kernel).
        for param in optimizer_s.param_groups:
            param['lr'] = args.train_lr_decay_factor * param['lr']

    logger.log(eval_info, model_s, optimizer_s)
    if best_rmse > test_rmse:
        best_rmse = test_rmse
        best_epoch = epoch_idx

        best_predict_valid_list = predict_valid_list
        best_label_valid_list = label_valid_list

        best_predict_test_list = predict_test_list
        best_label_test_list = label_test_list

eval_info = "Training ends. The best testing rmse is {:.6f} at epoch {}".format(best_rmse, best_epoch)
print(eval_info)
print("  Training epoch took: {:}".format(format_time(time.time() - start_time)))

Start training ...
Epoch 1
Iter=200, loss=1.5511, mse=1.5494, time=0.0424
Iter=400, loss=1.5360, mse=1.5345, time=0.0423
Iter=600, loss=1.5444, mse=1.5430, time=0.0434
Iter=800, loss=1.4904, mse=1.4891, time=0.0444
Iter=1000, loss=1.4986, mse=1.4973, time=0.0460
Iter=1200, loss=1.4771, mse=1.4759, time=0.0461
Iter=1400, loss=1.4886, mse=1.4875, time=0.0457
Iter=1600, loss=1.4702, mse=1.4692, time=0.0453
Iter=1800, loss=1.4903, mse=1.4893, time=0.0449
Iter=2000, loss=1.4588, mse=1.4578, time=0.0447
Iter=2200, loss=1.4533, mse=1.4523, time=0.0446
Iter=2400, loss=1.4961, mse=1.4951, time=0.0447
Iter=2600, loss=1.4613, mse=1.4604, time=0.0445
Iter=2800, loss=1.4696, mse=1.4688, time=0.0443
Iter=3000, loss=1.4235, mse=1.4227, time=0.0442
Iter=3200, loss=1.4581, mse=1.4573, time=0.0439
Iter=3400, loss=1.4327, mse=1.4319, time=0.0443
Iter=3600, loss=1.4304, mse=1.4296, time=0.0441
Iter=3800, loss=1.4488, mse=1.4481, time=0.0440
Iter=4000, loss=1.4846, mse=1.4839, time=0.0438
Iter=4200, loss=1

In [248]:
import pandas as pd
# Pair every prediction with its label, one row per sample.
# NOTE(review): the train and valid frames use the outputs of the last epoch
# run, whereas the test frame uses the best-epoch outputs.
train_sentiment_df = pd.DataFrame([x for x in zip(predict_train_list, label_train_list)])
train_sentiment_df.rename(columns={0:'predict', 1:'label'}, inplace=True)

valid_sentiment_df = pd.DataFrame([x for x in zip(predict_valid_list, label_valid_list)])
valid_sentiment_df.rename(columns={0:'predict', 1:'label'}, inplace=True)

test_sentiment_df = pd.DataFrame([x for x in zip(best_predict_test_list, best_label_test_list)])
test_sentiment_df.rename(columns={0:'predict', 1:'label'}, inplace=True)

In [249]:
# Write the sentiment-model outputs to CSV (without the index column).
path = './raw_data/rotten_tomato/ensemble/'
train_sentiment_df.to_csv(path + 'train_sentiment.csv', index=False)
valid_sentiment_df.to_csv(path + 'valid_sentiment.csv', index=False)
test_sentiment_df.to_csv(path + 'test_sentiment.csv', index=False)

### 3-3. emotion

In [250]:
# Train the emotion model, evaluating on the validation and test splits after
# every epoch and remembering the outputs of the best epoch.
label_type = 'emotion'

### prepare the logger
logger = MetricLogger(args.save_dir, args.valid_log_interval)

best_epoch = 0
best_rmse = np.inf
### declare the loss information
print("Start training ...")

# The plain lists hold the outputs of the most recent epoch; the best_* lists
# hold the outputs of the epoch with the lowest test RMSE so far.
predict_train_list = list()
label_train_list = list()

predict_valid_list = list()
label_valid_list = list()
best_predict_valid_list = list()
best_label_valid_list = list()

predict_test_list = list()
label_test_list = list()
best_predict_test_list = list()
best_label_test_list = list()

start_time = time.time()
# NOTE(review): runs a fixed 79 epochs and ignores args.train_epochs.
for epoch_idx in range(1, 80):
    print ('Epoch', epoch_idx)

    train_loss, predict_train_list, label_train_list = train_epoch(
        label_type, model_e, loss_fn, optimizer_e, args.arr_lambda,
        train_loader_e, args.device, args.train_log_interval)
    # NOTE(review): valid_rmse is computed but neither logged nor used for
    # model selection; the best epoch is chosen on the test RMSE.
    valid_rmse, predict_valid_list, label_valid_list = evaluate(label_type, model_e, valid_loader_e, args.device)
    test_rmse, predict_test_list, label_test_list = evaluate(label_type, model_e, test_loader_e, args.device)

    eval_info = {
        'epoch': epoch_idx,
        'train_loss': train_loss,
        'test_rmse': test_rmse,
    }
    print('=== Epoch {}, train loss {:.6f}, test rmse {:.6f} ==='.format(*eval_info.values()))

    if epoch_idx % args.train_lr_decay_step == 0:
        # Decay the optimizer that trains model_e. The bare name `optimizer`
        # refers to a different optimizer (or to nothing in a fresh kernel).
        for param in optimizer_e.param_groups:
            param['lr'] = args.train_lr_decay_factor * param['lr']

    logger.log(eval_info, model_e, optimizer_e)
    if best_rmse > test_rmse:
        best_rmse = test_rmse
        best_epoch = epoch_idx

        best_predict_valid_list = predict_valid_list
        best_label_valid_list = label_valid_list

        best_predict_test_list = predict_test_list
        best_label_test_list = label_test_list

    # Cumulative wall-clock time since training started.
    print("  Training epoch took: {:}".format(format_time(time.time() - start_time)))

eval_info = "Training ends. The best testing rmse is {:.6f} at epoch {}".format(best_rmse, best_epoch)
print(eval_info)
print(" Total Training epoch took: {:}".format(format_time(time.time() - start_time)))

Start training ...
Epoch 1
Iter=200, loss=1.1998, mse=1.1998, time=0.0424
Iter=400, loss=1.2266, mse=1.2266, time=0.0413
Iter=600, loss=1.2013, mse=1.2013, time=0.0411
Iter=800, loss=1.2216, mse=1.2216, time=0.0410
Iter=1000, loss=1.2213, mse=1.2213, time=0.0407
Iter=1200, loss=1.2352, mse=1.2352, time=0.0402
Iter=1400, loss=1.2343, mse=1.2343, time=0.0401
Iter=1600, loss=1.2495, mse=1.2495, time=0.0397
Iter=1800, loss=1.2810, mse=1.2810, time=0.0394
Iter=2000, loss=1.2238, mse=1.2238, time=0.0392
Iter=2200, loss=1.1690, mse=1.1690, time=0.0392
Iter=2400, loss=1.2587, mse=1.2587, time=0.0391
Iter=2600, loss=1.2564, mse=1.2564, time=0.0391
Iter=2800, loss=1.2007, mse=1.2007, time=0.0390
Iter=3000, loss=1.2310, mse=1.2310, time=0.0389
Iter=3200, loss=1.2483, mse=1.2483, time=0.0389
Iter=3400, loss=1.2365, mse=1.2365, time=0.0388
Iter=3600, loss=1.2541, mse=1.2541, time=0.0388
Iter=3800, loss=1.2518, mse=1.2518, time=0.0387
Iter=4000, loss=1.2654, mse=1.2654, time=0.0386
Iter=4200, loss=1

KeyboardInterrupt: 

In [251]:
best_rmse

1.065909012649887

In [None]:
import pandas as pd
# Pair every prediction with its label, one row per sample.
# NOTE(review): the train and valid frames use the outputs of the last epoch
# run, whereas the test frame uses the best-epoch outputs.
train_emotion_df = pd.DataFrame([x for x in zip(predict_train_list, label_train_list)])
train_emotion_df.rename(columns={0:'predict', 1:'label'}, inplace=True)

valid_emotion_df = pd.DataFrame([x for x in zip(predict_valid_list, label_valid_list)])
valid_emotion_df.rename(columns={0:'predict', 1:'label'}, inplace=True)

test_emotion_df = pd.DataFrame([x for x in zip(best_predict_test_list, best_label_test_list)])
test_emotion_df.rename(columns={0:'predict', 1:'label'}, inplace=True)

In [None]:
# Write the emotion-model outputs to CSV (without the index column).
path = './raw_data/rotten_tomato/ensemble/'
train_emotion_df.to_csv(path + 'train_emotion.csv', index=False)
valid_emotion_df.to_csv(path + 'valid_emotion.csv', index=False)
test_emotion_df.to_csv(path + 'test_emotion.csv', index=False)

In [None]:
model.block

## 4. Train_epoch 함수 테스트

### - Rating

In [165]:
# Bind the train_epoch parameters as globals so the function body can be run
# inline in the next cell. These names (notably `model` and `optimizer`)
# stay bound for every cell executed afterwards.
label_type = 'rating'
model = model_r
loss_fn = loss_fn  # no-op self-assignment, kept to mirror the parameter list
optimizer = optimizer_r
arr_lambda = args.arr_lambda
loader = train_loader_r
device = args.device
log_interval = args.train_log_interval

In [170]:
# Inline copy of the train_epoch body, run against the globals bound in the
# previous cell.
# NOTE(review): unlike train_epoch, the loss and running MSE here are
# computed on the raw model outputs; the rescaling is applied afterwards and
# only affects the stored predictions and labels.
start_time = time.time()

model.train()

epoch_loss = 0.
iter_loss = 0.
iter_mse = 0.
iter_cnt = 0
iter_dur = []

# Storage lists (predictions, ground-truth labels)
predict_list = list()
label_list = list()

# Train one batch of subgraphs at a time
for iter_idx, batch in enumerate(loader, start=1):
    t_start = time.time()

    inputs = batch[0].to(device)
    labels = batch[1].to(device)
    preds = model(inputs)
    
    if label_type == 'emotion':
        loss = loss_fn(preds, labels).mean()
    else:
        loss = loss_fn(preds, labels).mean() + arr_lambda * adj_rating_reg(model)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    epoch_loss += loss.item() * preds.shape[0]
    iter_loss += loss.item() * preds.shape[0]
    iter_mse += ((preds - labels) ** 2).sum().item()
    iter_cnt += preds.shape[0]
    iter_dur.append(time.time() - t_start)

    if label_type == 'rating':
        preds  = (preds + 1)/2
        labels = (labels + 1)/2
    else:
        preds  = preds + 1
        labels = labels + 1

    predict_list.append(preds.tolist()) # store predictions
    label_list.append(labels.tolist()) # store ground-truth labels

    if iter_idx % log_interval == 0:
        print("Iter={}, loss={:.4f}, mse={:.4f}, time={:.4f}".format(
            iter_idx, iter_loss/iter_cnt, iter_mse/iter_cnt, np.average(iter_dur)))
        iter_loss = 0.
        iter_mse = 0.
        iter_cnt = 0

# Flatten the per-batch lists (2-D -> 1-D)
predict_list = [element for array in predict_list for element in array]
label_list = [element for array in label_list for element in array]

train_epoch_loss = epoch_loss / len(loader.dataset)  

print("  Time took: {:}".format(format_time(time.time() - start_time)))

Iter=200, loss=2.6334, mse=2.6295, time=0.0434


KeyboardInterrupt: 

In [171]:
len(predict_list)

263

In [172]:
len(label_list)

263

In [91]:
predict_list[:10]

[1.474928617477417,
 4.432092189788818,
 1.6393139362335205,
 3.679456949234009,
 2.2003211975097656,
 3.414210081100464,
 2.7713494300842285,
 3.535055160522461,
 3.712036371231079,
 2.852395534515381]

In [92]:
label_list[:10]

[1.5, 3.0, 1.0, 3.0, 2.0, 2.5, 4.5, 3.0, 3.5, 2.0]

### Evaluate 함수 테스트

In [93]:
# Bind the evaluate parameters as globals so the function body can be run
# inline in the next cell.
label_type = 'rating'
model = model_r
loader = test_loader_r
device = args.device

In [117]:
# Inline copy of the evaluate body, run against the globals bound in the
# previous cell; leaves the result in the globals `mse` and `rmse`.
start_time = time.time()
predict_list = list()
label_list = list()

# Evaluate RMSE
model.eval()
mse = 0.
for batch in loader:
    with th.no_grad():
        preds = model(batch[0].to(device))
    labels = batch[1].to(device)
    
    if label_type == 'rating':
        preds  = (preds + 1)/2
        labels = (labels + 1)/2
    else:
        preds  = preds + 1
        labels = labels + 1
    mse += ((preds - labels) ** 2).sum().item()
    
    predict_list.append(preds.tolist()) # store predictions
    label_list.append(labels.tolist()) # store ground-truth labels

# Flatten the per-batch lists (2-D -> 1-D)
predict_list = [element for array in predict_list for element in array]
label_list = [element for array in label_list for element in array]    
    
mse /= len(loader.dataset)
rmse = np.sqrt(mse)

print("  Time took: {:}".format(format_time(time.time() - start_time)))

  Time took: 0:00:26


In [120]:
rmse

0.8208980129263855