In [1]:
import torch
from torch import nn
from torch.autograd import Variable

In [9]:
import argparse
import os
from os.path import dirname, abspath, join, exists
import numpy as np

import torch
from torch.autograd import Variable
from torch import nn, optim
from torch.utils.data import DataLoader

import normalizers
import tokenizers
from tokenizers import TwitterTokenizer
import feature_extractors
import dictionaries
from dataloader import load_data, collate_fn, Preprocessor, MovieReviewDataset
from trainers import Trainer
import utils

import nsml
from nsml import DATASET_PATH, HAS_DATASET, GPU_NUM, IS_ON_NSML

from models.WordCNN import WordCNN
from models.VDCNN import VDCNN
from models.VDCNN_feat import VDCNN_feat
from models.WordCNN_feat import WordCNN_feat
from models.LSTMText import LSTMText

# Random seed
np.random.seed(0)
torch.manual_seed(0)

args = argparse.ArgumentParser()
# DONOTCHANGE: They are reserved for nsml
args.add_argument('--mode', type=str, default='train')
args.add_argument('--pause', type=int, default=0)
args.add_argument('--iteration', type=str, default='0')

# User options
args.add_argument('--model', type=str, default='LSTMText', choices=['WordCNN', 'WordCNN_feat', 'VDCNN', 'VDCNN_feat', 'VDCNN_feat_dropout', 'LSTMText'])
args.add_argument('--normalizer', type=str, default='BasicNormalizer')
args.add_argument('--tokenizer', type=str, default='TwitterTokenizer')
args.add_argument('--features', type=str, default='MovieActorFeaturesExtractor_ScoreExpressionExtractor')  # LengthFeatureExtractor_MovieActorFeaturesExtractor ...
args.add_argument('--dictionary', type=str, default='FastTextVectorizer')
args.add_argument('--use_gpu', type=bool, default=torch.cuda.is_available() or GPU_NUM)
args.add_argument('--output', type=int, default=1)
args.add_argument('--epochs', type=int, default=10)
args.add_argument('--batch_size', type=int, default=128)
args.add_argument('--vocabulary_size', type=int, default=50000)
args.add_argument('--embedding_size', type=int, default=256)
args.add_argument('--min_length', type=int, default=5)
args.add_argument('--max_length', type=int, default=100)
args.add_argument('--sort_dataset', action='store_true')
args.add_argument('--shuffle_dataset', action='store_true')
args.add_argument('--learning_rate', type=float, default=0.01)
args.add_argument('--lr_schedule', action='store_true')
args.add_argument('--print_every', type=int, default=1)
args.add_argument('--save_every', type=int, default=1)
config = args.parse_args()

logger = utils.get_logger('MovieReview')
logger.info('Arguments: {}'.format(config))

if config.model == 'WordCNN': Model = WordCNN
elif config.model == 'WordCNN_feat': Model = WordCNN_feat
elif config.model == 'VDCNN': Model = VDCNN
elif config.model == 'VDCNN_feat': Model = VDCNN_feat
elif config.model == 'LSTMText': Model = LSTMText

Normalizer = getattr(normalizers, config.normalizer)
normalizer = Normalizer(config)

Tokenizer = getattr(tokenizers, config.tokenizer)
tokenizer = Tokenizer(config)

Dictionary = getattr(dictionaries, config.dictionary)
dictionary = Dictionary(tokenizer, config)

feature_extractor_list = []
for feature_name in config.features.split('_'):
    FeatureExtractor = getattr(feature_extractors, feature_name)
    feature_extractor = FeatureExtractor(config)
    feature_extractor_list.append((feature_name, feature_extractor))

preprocessor = Preprocessor(config, normalizer, tokenizer, feature_extractor_list, dictionary)
print("Number of features:", preprocessor.n_features)

model = Model(config, n_features=preprocessor.n_features)

if config.use_gpu:
    model = model.cuda()

if not HAS_DATASET and not IS_ON_NSML:  # It is not running on nsml
    DATASET_PATH = 'data/small/'

# DONOTCHANGE: They are reserved for nsml
# This is for nsml leaderboard
def bind_model(model, config):
    # 학습한 모델을 저장하는 함수입니다.
    def save(filename, *args):
        checkpoint = {
            'model': model.state_dict(),
            'preprocessor': preprocessor.state_dict()
        }
        torch.save(checkpoint, filename)

    # 저장한 모델을 불러올 수 있는 함수입니다.
    def load(filename, *args):
        checkpoint = torch.load(filename)
        model.load_state_dict(checkpoint['model'])
        preprocessor.load_state_dict(checkpoint['preprocessor'])

        print('Model loaded')

    def infer(raw_data, **kwargs):
        """
        :param raw_data: raw input (여기서는 문자열)을 입력받습니다
        :param kwargs:
        :return:
        """
        # dataset.py에서 작성한 preprocess 함수를 호출하여, 문자열을 벡터로 변환합니다
        
        reviews, features = preprocessor.preprocess_all(raw_data)
        reviews, features = Variable(reviews), Variable(features)
        if config.use_gpu:
            reviews, features = reviews.cuda(), features.cuda()

        model.eval()
        # 저장한 모델에 입력값을 넣고 prediction 결과를 리턴받습니다
        output_prediction = model(reviews, features)
        prediction_clipped = torch.clamp(output_prediction, min=1, max=10)
        point = prediction_clipped.data.tolist()
        # DONOTCHANGE: They are reserved for nsml
        # 리턴 결과는 [(confidence interval, 포인트)] 의 형태로 보내야만 리더보드에 올릴 수 있습니다. 리더보드 결과에 confidence interval의 값은 영향을 미치지 않습니다
        return list(zip(np.zeros(len(point)), point))

    # DONOTCHANGE: They are reserved for nsml
    # nsml에서 지정한 함수에 접근할 수 있도록 하는 함수입니다.
    nsml.bind(save=save, load=load, infer=infer)

# DONOTCHANGE: Reserved for nsml use
bind_model(model, config)

# DONOTCHANGE: They are reserved for nsml
if config.pause:
    nsml.paused(scope=locals())

# 학습 모드일 때 사용합니다. (기본값)
if config.mode == 'train':
    # 데이터를 로드합니다.
    logger.info("Loading data...")
    train_data, val_data = load_data(DATASET_PATH, val_size=0.1)

    logger.info("Building preprocessor...")
    for feature_name, feature_extractor in preprocessor.feature_extractors:
        feature_extractor.fit(train_data)

    preprocessor.dictionary.build_dictionary(train_data)

    logger.info("Making dataset & dataloader...")
    train_dataset = MovieReviewDataset(train_data, preprocessor, sort=config.sort_dataset, min_length=config.min_length, max_length=config.max_length)
    val_dataset = MovieReviewDataset(val_data, preprocessor, sort=config.sort_dataset, min_length=config.min_length, max_length=config.max_length)

    train_dataloader = DataLoader(dataset=train_dataset, batch_size=config.batch_size, shuffle=config.shuffle_dataset, collate_fn=collate_fn,
                              num_workers=2)
    val_dataloader = DataLoader(dataset=val_dataset, batch_size=config.batch_size, shuffle=True,
                                  collate_fn=collate_fn, num_workers=2)

    if preprocessor.dictionary.embedding is not None:
        embedding_weights = torch.FloatTensor(dictionary.embedding)
        if config.use_gpu:
            embedding_weights = embedding_weights.cuda()
        model.embedding.weight = nn.Parameter(embedding_weights, requires_grad=False)

    criterion = nn.MSELoss(size_average=False)
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.Adam(params=trainable_params, lr=config.learning_rate)
    lr_scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.8)  # .ReduceLROnPlateau(optimizer, factor=0.7, patience=5, min_lr=0.00005)

    trainer = Trainer(model, train_dataloader, val_dataloader, criterion=criterion, optimizer=optimizer,
                      lr_schedule=config.lr_schedule, lr_scheduler=lr_scheduler, use_gpu=config.use_gpu, logger=logger)
    trainer.run(epochs=config.epochs)

# 로컬 테스트 모드일때 사용합니다
# 결과가 아래와 같이 나온다면, nsml submit을 통해서 제출할 수 있습니다.
# [(0.0, 9.045), (0.0, 5.91), ... ]

usage: ipykernel_launcher.py [-h] [--mode MODE] [--pause PAUSE]
                             [--iteration ITERATION]
                             [--model {WordCNN,WordCNN_feat,VDCNN,VDCNN_feat,VDCNN_feat_dropout,LSTMText}]
                             [--normalizer NORMALIZER] [--tokenizer TOKENIZER]
                             [--features FEATURES] [--dictionary DICTIONARY]
                             [--use_gpu USE_GPU] [--output OUTPUT]
                             [--epochs EPOCHS] [--batch_size BATCH_SIZE]
                             [--vocabulary_size VOCABULARY_SIZE]
                             [--embedding_size EMBEDDING_SIZE]
                             [--min_length MIN_LENGTH]
                             [--max_length MAX_LENGTH] [--sort_dataset]
                             [--shuffle_dataset]
                             [--learning_rate LEARNING_RATE] [--lr_schedule]
                             [--print_every PRINT_EVERY]
                             [--save_every

SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [10]:
config = args.parse_args([])

In [18]:
logger = utils.get_logger('MovieReview')
logger.info('Arguments: {}'.format(config))

if not HAS_DATASET and not IS_ON_NSML:  # It is not running on nsml
    DATASET_PATH = 'data/small'

# DONOTCHANGE: They are reserved for nsml
if config.pause:
    nsml.paused(scope=locals())

[INFO] 04-10 21:03:12 > Arguments: Namespace(batch_size=128, dictionary='FastTextVectorizer', embedding_size=256, epochs=10, features='MovieActorFeaturesExtractor_ScoreExpressionExtractor', iteration='0', learning_rate=0.01, lr_schedule=False, max_length=100, min_length=5, mode='train', model='LSTMText', normalizer='BasicNormalizer', output=1, pause=0, print_every=1, save_every=1, shuffle_dataset=False, sort_dataset=False, tokenizer='TwitterTokenizer', use_gpu=0, vocabulary_size=50000)


In [19]:
from feature_extractors import MovieActorFeaturesExtractor, ScoreExpressionExtractor
from dictionaries import FastTextVectorizer
from models.LSTMText import LSTMText

In [21]:
if config.mode == 'train':
    # 데이터를 로드합니다.
    logger.info("Loading data...")
    train_data, val_data = load_data(DATASET_PATH, val_size=0.3)

    logger.info("Building preprocessor...")
    tokenizer = TwitterTokenizer(config)
    feature_extractors = [MovieActorFeaturesExtractor, ScoreExpressionExtractor]
    dictionary = FastTextVectorizer(tokenizer, config)
    dictionary.build_dictionary(train_data)

    preprocessor = Preprocessor(tokenizer, feature_extractors, dictionary)

    logger.info("Making dataset & dataloader...")
    train_dataset = MovieReviewDataset(train_data, preprocessor, sort=False, min_length=config.sentence_length, max_length=config.sentence_length)
    val_dataset = MovieReviewDataset(val_data, preprocessor, sort=False, min_length=config.sentence_length, max_length=config.sentence_length)

    train_dataloader = DataLoader(dataset=train_dataset, batch_size=config.batch_size, shuffle=True, collate_fn=collate_fn,
                              num_workers=2)
    val_dataloader = DataLoader(dataset=val_dataset, batch_size=config.batch_size, shuffle=True,
                                  collate_fn=collate_fn, num_workers=2)

    model = LSTMText(dictionary, config)
    if config.use_gpu:
        model = model.cuda()

    # DONOTCHANGE: Reserved for nsml use
    bind_model(model, config)

    criterion = nn.MSELoss(size_average=False)
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = optim.Adam(params=trainable_params, lr=0.01)

    trainer = Trainer(model, train_dataloader, val_dataloader, criterion=criterion, optimizer=optimizer,
                      lr_schedule=False, lr_scheduler=None, use_gpu=config.use_gpu, logger=logger)
    trainer.run(epochs=config.epochs)

# 로컬 테스트 모드일때 사용합니다
# 결과가 아래와 같이 나온다면, nsml submit을 통해서 제출할 수 있습니다.
# [(0.0, 9.045), (0.0, 5.91), ... ]
elif config.mode == 'test_local':
    with open(os.path.join(DATASET_PATH, 'train/train_data'), 'rt', encoding='utf-8') as f:
        reviews = f.readlines()
    res = nsml.infer(reviews)
    print(res)

[INFO] 04-10 21:03:56 > Loading data...
[INFO] 04-10 21:03:56 > Building preprocessor...
(50000, 256)


TypeError: __init__() missing 2 required positional arguments: 'feature_extractors' and 'dictionary'

In [5]:
%debug

> [0;32m/home/dreamgonfly/ToMuchInfo/code/trainers.py[0m(128)[0;36maccuracy[0;34m()[0m
[0;32m    126 [0;31m    [0;32mdef[0m [0maccuracy[0m[0;34m([0m[0mself[0m[0;34m,[0m [0moutputs[0m[0;34m,[0m [0mlabels[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0m
[0m[0;32m    127 [0;31m[0;34m[0m[0m
[0m[0;32m--> 128 [0;31m        [0mmaximum[0m[0;34m,[0m [0margmax[0m [0;34m=[0m [0moutputs[0m[0;34m.[0m[0mmax[0m[0;34m([0m[0mdim[0m[0;34m=[0m[0;36m1[0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m    129 [0;31m        [0mcorrects[0m [0;34m=[0m [0margmax[0m [0;34m==[0m [0mlabels[0m  [0;31m# ByteTensor[0m[0;34m[0m[0m
[0m[0;32m    130 [0;31m        [0mn_corrects[0m [0;34m=[0m [0mcorrects[0m[0;34m.[0m[0mfloat[0m[0;34m([0m[0;34m)[0m[0;34m.[0m[0msum[0m[0;34m([0m[0;34m)[0m  [0;31m# FloatTensor[0m[0;34m[0m[0m
[0m
ipdb> outputs
Variable containing:
 0.1273
-0.5819
-0.6532
-0.1320
-0.5961
-0.7171
-0.2539
-0.3998
 0.2566
-0.1505