In [1]:
from feature_extractors import ScoreExpressionExtractor

In [1]:
import argparse
import os
import numpy as np

import torch
from torch.autograd import Variable
from torch import nn, optim
from torch.utils.data import DataLoader

from tokenizers import DummyTokenizer
from feature_extractors import LengthFeatureExtractor
from dictionaries import RandomWordDictionary
from dataloader import load_data, collate_fn, Preprocessor, MovieReviewDataset
from trainers import Trainer
import utils

import nsml
from nsml import DATASET_PATH, HAS_DATASET, GPU_NUM, IS_ON_NSML

from models.WordCNN import WordCNN


# DONOTCHANGE: They are reserved for nsml
# This is for nsml leaderboard
def bind_model(model, config):
    """Register save / load / infer callbacks for ``model`` with nsml.

    :param model: torch module whose ``state_dict`` is checkpointed and which
        is called as ``model(reviews, features)`` during inference.
    :param config: parsed options; kept for interface compatibility and not
        read inside this function.
    """
    # Saves the trained model.
    def save(filename, *args):
        checkpoint = {
            'model': model.state_dict()
        }
        torch.save(checkpoint, filename)

    # Restores a previously saved model.
    def load(filename, *args):
        # Deserialize storages onto the CPU so a checkpoint written from a GPU
        # run can still be read on a CPU-only host; load_state_dict then copies
        # the values into the model's existing parameters.
        checkpoint = torch.load(filename, map_location=lambda storage, loc: storage)
        model.load_state_dict(checkpoint['model'])
        print('Model loaded')

    def infer(raw_data, **kwargs):
        """
        :param raw_data: raw input (strings here)
        :param kwargs: unused
        :return: list of (confidence interval, point) tuples
        """
        # `preprocessor` is resolved as a module-level global at call time, so
        # it must have been created before infer is invoked.
        reviews, features = preprocessor.preprocess(raw_data)
        model.eval()
        # Feed the inputs to the bound model and collect its predictions.
        output_prediction = model(reviews, features)
        # Flatten instead of squeeze(dim=1): squeeze(dim=1) raises
        # "dimension out of range" when the model already returns a 1-D
        # tensor, whereas view(-1) accepts both (N,) and (N, 1) outputs.
        point = output_prediction.data.contiguous().view(-1).tolist()
        # DONOTCHANGE: They are reserved for nsml
        # The result must be a list of (confidence interval, point) tuples to
        # be accepted by the leaderboard. The confidence interval value does
        # not affect the leaderboard score.
        return list(zip(np.zeros(len(point)), point))

    # DONOTCHANGE: They are reserved for nsml
    # Exposes the functions above to nsml.
    nsml.bind(save=save, load=load, infer=infer)

def _str2bool(value):
    """Convert a command-line string into a real boolean.

    argparse's ``type=bool`` calls ``bool()`` on the raw string, so every
    non-empty value (including 'False' and '0') would become True. This
    converter accepts the usual spellings and rejects anything else.

    :param value: raw option value (a string, or an actual bool)
    :return: the parsed boolean
    :raises argparse.ArgumentTypeError: if the value is not a known spelling
    """
    if isinstance(value, bool):
        return value
    lowered = str(value).strip().lower()
    if lowered in ('1', 'true', 't', 'yes', 'y', 'on'):
        return True
    if lowered in ('0', 'false', 'f', 'no', 'n', 'off'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected, got {!r}'.format(value))


args = argparse.ArgumentParser()
# DONOTCHANGE: They are reserved for nsml
args.add_argument('--mode', type=str, default='train')
args.add_argument('--pause', type=int, default=0)
args.add_argument('--iteration', type=str, default='0')

# User options
# The default is coerced to bool so config.use_gpu is always True/False,
# never the raw GPU_NUM value.
args.add_argument('--use_gpu', type=_str2bool, default=bool(torch.cuda.is_available() or GPU_NUM))
args.add_argument('--output', type=int, default=1)
args.add_argument('--epochs', type=int, default=10)
args.add_argument('--batch_size', type=int, default=64)
args.add_argument('--max_vocab_size', type=int, default=10000)
args.add_argument('--min_count', type=int, default=3)
args.add_argument('--sentence_length', type=int, default=20)
args.add_argument('--embedding_size', type=int, default=100)
args.add_argument('--learning_rate', type=float, default=0.01)
args.add_argument('--print_every', type=int, default=1)
args.add_argument('--save_every', type=int, default=1)

_StoreAction(option_strings=['--save_every'], dest='save_every', nargs=None, const=None, default=1, type=<class 'int'>, choices=None, help=None, metavar=None)

In [2]:
# Parse an explicitly empty argument list, so every option takes its default.
config = args.parse_args(args=[])

In [3]:
# Create the logger used by the rest of the notebook and record the options.
logger = utils.get_logger('MovieReview')
logger.info('Arguments: {}'.format(config))

# Neither an nsml dataset nor the nsml runtime is available: use the local
# copy of the data instead.
if not (HAS_DATASET or IS_ON_NSML):
    DATASET_PATH = 'data/movie_review_phase1/'

# DONOTCHANGE: They are reserved for nsml
if config.pause:
    nsml.paused(scope=locals())

[INFO] 04-04 00:20:27 > Arguments: Namespace(batch_size=64, embedding_size=100, epochs=10, iteration='0', learning_rate=0.01, max_vocab_size=10000, min_count=3, mode='train', output=1, pause=0, print_every=1, save_every=1, sentence_length=20, use_gpu=True)


In [4]:
if config.mode == 'train':
    # Load the data and hold out 30% of it for validation.
    logger.info("Loading data...")
    train_data, val_data = load_data(DATASET_PATH, val_size=0.3)

    logger.info("Building preprocessor...")
    tokenizer = DummyTokenizer(config)
    feature_extractor1 = LengthFeatureExtractor(config)
    feature_extractors = [feature_extractor1]
    dictionary = RandomWordDictionary(tokenizer, config)
    # The dictionary is built from the training split only.
    dictionary.build_dictionary(train_data)

    # `preprocessor` is also read as a global by the infer() callback that
    # bind_model registers below.
    preprocessor = Preprocessor(tokenizer, feature_extractors, dictionary)

    logger.info("Making dataset & dataloader...")
    train_dataset = MovieReviewDataset(train_data, preprocessor, sort=False,
                                       min_length=config.sentence_length,
                                       max_length=config.sentence_length)
    val_dataset = MovieReviewDataset(val_data, preprocessor, sort=False,
                                     min_length=config.sentence_length,
                                     max_length=config.sentence_length)

    train_dataloader = DataLoader(dataset=train_dataset, batch_size=config.batch_size, shuffle=True,
                                  collate_fn=collate_fn, num_workers=2)
    # Validation does not need a random order; a fixed order keeps the
    # evaluation pass repeatable.
    val_dataloader = DataLoader(dataset=val_dataset, batch_size=config.batch_size, shuffle=False,
                                collate_fn=collate_fn, num_workers=2)

    model = WordCNN(dictionary, config)
    if config.use_gpu:
        model = model.cuda()

    # DONOTCHANGE: Reserved for nsml use
    bind_model(model, config)

    criterion = nn.MSELoss(size_average=False)
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    # Use the configured learning rate (default 0.01) rather than a
    # hard-coded value, so --learning_rate takes effect.
    optimizer = optim.Adam(params=trainable_params, lr=config.learning_rate)

    # NOTE(review): the recorded run of this cell failed inside
    # Trainer.accuracy, which calls outputs.max(dim=1) on the model output and
    # raised "dimension out of range". That has to be fixed in trainers.py;
    # nothing in this cell can work around it.
    trainer = Trainer(model, train_dataloader, val_dataloader, criterion=criterion, optimizer=optimizer,
                      lr_schedule=False, lr_scheduler=None, use_gpu=config.use_gpu, logger=logger)
    trainer.run(epochs=config.epochs)

# Used for local test mode.
# If the result looks like the line below, it can be submitted with nsml submit.
# [(0.0, 9.045), (0.0, 5.91), ... ]
elif config.mode == 'test_local':
    # NOTE(review): this cell calls bind_model only in the 'train' branch, so
    # in this branch no model or preprocessor has been set up here before
    # nsml.infer is called - confirm they are bound elsewhere.
    with open(os.path.join(DATASET_PATH, 'train/train_data'), 'rt', encoding='utf-8') as f:
        reviews = f.readlines()
    res = nsml.infer(reviews)
    print(res)

[INFO] 04-04 00:20:28 > Loading data...
[INFO] 04-04 00:20:28 > Building preprocessor...
[INFO] 04-04 00:20:29 > Making dataset & dataloader...


  0%|          | 0/5692 [00:00<?, ?it/s]


RuntimeError: dimension out of range (expected to be in range of [-1, 0], but got 1)

In [5]:
%debug

> [0;32m/home/dreamgonfly/ToMuchInfo/code/trainers.py[0m(128)[0;36maccuracy[0;34m()[0m
[0;32m    126 [0;31m    [0;32mdef[0m [0maccuracy[0m[0;34m([0m[0mself[0m[0;34m,[0m [0moutputs[0m[0;34m,[0m [0mlabels[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0m
[0m[0;32m    127 [0;31m[0;34m[0m[0m
[0m[0;32m--> 128 [0;31m        [0mmaximum[0m[0;34m,[0m [0margmax[0m [0;34m=[0m [0moutputs[0m[0;34m.[0m[0mmax[0m[0;34m([0m[0mdim[0m[0;34m=[0m[0;36m1[0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m    129 [0;31m        [0mcorrects[0m [0;34m=[0m [0margmax[0m [0;34m==[0m [0mlabels[0m  [0;31m# ByteTensor[0m[0;34m[0m[0m
[0m[0;32m    130 [0;31m        [0mn_corrects[0m [0;34m=[0m [0mcorrects[0m[0;34m.[0m[0mfloat[0m[0;34m([0m[0;34m)[0m[0;34m.[0m[0msum[0m[0;34m([0m[0;34m)[0m  [0;31m# FloatTensor[0m[0;34m[0m[0m
[0m
ipdb> outputs
Variable containing:
 0.1273
-0.5819
-0.6532
-0.1320
-0.5961
-0.7171
-0.2539
-0.3998
 0.2566
-0.1505