# 사전 확인사항

- Runtime type 변경 후 GPU가 제대로 할당됐는지 확인하기
- 체크포인트 저장을 위한 Google Drive 연동
- 필요 모듈 설치
- 기존 코드에 남아있는 `logger`를 그대로 사용하기 위한 세팅

In [1]:
!nvidia-smi

Sun Sep  6 06:19:28 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.66       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0    36W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import drive
# Mount Google Drive under /content/drive; the dataset and the training
# logs/checkpoints used later in this notebook live below this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!ls '/content/drive/My Drive'

'Colab Notebooks'   KoGPT2-personachat	 korquad_2.1


In [4]:
!pip install gluonnlp mxnet pytorch-lightning sentencepiece transformers



In [5]:
import logging

# Module-wide logger referenced as `logger` by the functions defined below.
logger = logging.getLogger('cm_kogpt2')
# Configure the root handler so INFO-level records are actually emitted.
logging.basicConfig(level=logging.INFO)

# KoGPT2 모델 다운받기

In [6]:
import hashlib
import os
import requests
import sys

import gluonnlp as nlp
from gluonnlp.data import SentencepieceTokenizer, SentencepieceDetokenizer
from transformers import GPT2Config, GPT2LMHeadModel

In [7]:
def _download(url, filename, chksum, cachedir='~/kogpt2/'):
    f_cachedir = os.path.expanduser(cachedir)
    os.makedirs(f_cachedir, exist_ok=True)
    file_path = os.path.join(f_cachedir, filename)
    if os.path.isfile(file_path):
        if hashlib.md5(open(file_path,
                            'rb').read()).hexdigest()[:10] == chksum:
            print('using cached model')
            return file_path
    with open(file_path, 'wb') as f:
        response = requests.get(url, stream=True)
        total = response.headers.get('content-length')

        if total is None:
            f.write(response.content)
        else:
            downloaded = 0
            total = int(total)
            for data in response.iter_content(
                    chunk_size=max(int(total / 1000), 1024 * 1024)):
                downloaded += len(data)
                f.write(data)
                done = int(50 * downloaded / total)
                sys.stdout.write('\r[{}{}]'.format('█' * done,
                                                   '.' * (50 - done)))
                sys.stdout.flush()
    sys.stdout.write('\n')
    assert chksum == hashlib.md5(open(
        file_path, 'rb').read()).hexdigest()[:10], 'corrupted file!'
    return file_path


def get_kogpt2_model(filepath, cachedir='/content/kogpt2/'):
    """Build a GPT-2 LM-head model and load KoGPT2 weights into it.

    Args:
        filepath: Path of a weight file to load. When falsy, the published
            KoGPT2 parameters are fetched through ``_download`` instead.
        cachedir: Cache directory handed to ``_download``.

    Returns:
        The ``GPT2LMHeadModel`` instance, switched to eval mode.
    """
    remote_url = 'https://kobert.blob.core.windows.net/models/kogpt2/pytorch/pytorch_kogpt2_676e9bcfa7.params'
    remote_fname = 'pytorch_kogpt2_676e9bcfa7.params'
    remote_chksum = '676e9bcfa7'

    # Architecture settings passed to GPT2Config.
    kogpt2_config = dict(
        initializer_range=0.02,
        layer_norm_epsilon=1e-05,
        n_ctx=1024,
        n_embd=768,
        n_head=12,
        n_layer=12,
        n_positions=1024,
        vocab_size=50000,
        activation_function="gelu",
        bos_id=0,
        eos_id=1,
    )

    if not filepath:
        # No explicit weights given: use (or populate) the local cache.
        model_path = _download(remote_url,
                               remote_fname,
                               remote_chksum,
                               cachedir=cachedir)
    else:
        logger.info("Loading {}".format(filepath))
        model_path = filepath

    config = GPT2Config.from_dict(kogpt2_config)
    weights = torch.load(model_path)
    # With no model name, the weights come solely from `state_dict`.
    model = GPT2LMHeadModel.from_pretrained(
        pretrained_model_name_or_path=None,
        config=config,
        state_dict=weights,
    )
    model.eval()
    return model


def get_kogpt2_tokenizer(cachedir='/content/kogpt2/'):
    """Fetch the KoGPT2 SentencePiece model and build tokenizer helpers.

    Args:
        cachedir: Cache directory handed to ``_download``.

    Returns:
        A ``(tokenizer, detokenizer, vocab)`` tuple, all three built from the
        same downloaded SentencePiece file.
    """
    spiece_name = 'kogpt2_news_wiki_ko_cased_818bfa919d.spiece'
    spiece_url = 'https://kobert.blob.core.windows.net/models/kogpt2/tokenizer/kogpt2_news_wiki_ko_cased_818bfa919d.spiece'
    spiece_chksum = '818bfa919d'

    vocab_path = _download(spiece_url,
                           spiece_name,
                           spiece_chksum,
                           cachedir=cachedir)

    # Special-token names assigned to the vocabulary; mask/sep/cls are unused.
    special_tokens = dict(
        mask_token=None,
        sep_token=None,
        cls_token=None,
        unknown_token='<unk>',
        padding_token='<pad>',
        bos_token='<s>',
        eos_token='</s>',
    )

    tokenizer = SentencepieceTokenizer(vocab_path)
    detokenizer = SentencepieceDetokenizer(vocab_path)
    vocab = nlp.vocab.BERTVocab.from_sentencepiece(vocab_path, **special_tokens)

    return tokenizer, detokenizer, vocab

# 데이터셋 로드 및 체크포인트 저장을 위한 함수

In [8]:
import json
import os
from datetime import datetime
from itertools import chain

import torch
from torch.utils.data import DataLoader, TensorDataset

In [9]:
# Order matters: build_input_from_segments unpacks the first four entries as
# (bos, eos, speaker1, speaker2) and get_data_loaders pads with the last one.
SPECIAL_TOKENS = [
    "<s>",
    "</s>",
    "<usr>",
    "<sys>",
    "<pad>",
]
# Special tokens keyed by attribute name (not referenced by the code shown in
# this notebook).
ATTR_TO_SPECIAL_TOKEN = {
    "bos_token": "<s>",
    "eos_token": "</s>",
    "pad_token": "<pad>",
    "additional_special_tokens": ["<usr>", "<sys>"],
}
# Tensor order of every batch; CMPersonaChat unpacks batches in this order.
MODEL_INPUTS = [
    "input_ids",
    "labels",
    "token_type_ids",
]
# Fields that pad_dataset extends to a common length.
PADDED_INPUTS = [
    "input_ids",
    "labels",
    "token_type_ids",
]

In [10]:
def get_dataset(tokenizer, vocab, dataset_path, dataset_cache):
    """Read a PersonaChat JSON file and return it tokenized.

    The tokenized result is cached on disk and reused on later calls.

    Args:
        tokenizer: Callable applied to every string found in the JSON data.
        vocab: Object indexed with the tokenizer output to obtain token ids.
        dataset_path: Path of the JSON dataset.
        dataset_cache: Prefix of the cache file; the dataset's base name is
            appended to it. A falsy value falls back to ``"dataset_cache"``
            in the current directory.

    Returns:
        The dataset with the same nesting as the JSON file, every string
        replaced by ``vocab[tokenizer(string)]``.
    """
    dataset_basename = os.path.basename(dataset_path).split(".")[0]
    # Honour the caller-supplied prefix; it used to be overwritten
    # unconditionally, which made the argument a no-op.
    cache_prefix = dataset_cache if dataset_cache else "dataset_cache"
    dataset_cache = "{}_{}".format(cache_prefix, dataset_basename)

    if os.path.isfile(dataset_cache):
        logger.info("Load tokenized dataset from cache at %s", dataset_cache)
        # NOTE: torch.load unpickles the file; only point the cache at
        # locations you trust.
        return torch.load(dataset_cache)

    logger.info("Reading %s", dataset_path)
    with open(dataset_path, "r", encoding="utf-8") as f:
        dataset = json.load(f)

    logger.info("Tokenize and encode the dataset")

    def tokenize(obj):
        # Strings are encoded; dicts and other iterables are walked
        # recursively, preserving their structure.
        if isinstance(obj, str):
            return vocab[tokenizer(obj)]
        if isinstance(obj, dict):
            return {n: tokenize(o) for n, o in obj.items()}
        return [tokenize(o) for o in obj]

    dataset = tokenize(dataset)
    torch.save(dataset, dataset_cache)

    return dataset


def make_logdir(model_name: str):
    """Return a unique run directory name, e.g. runs/Sep22_19-45-59_gpu-7_gpt2.

    The name combines the current local time, the host name and
    ``model_name``. Nothing is created on disk.

    Args:
        model_name: Suffix identifying the model.

    Returns:
        Relative path ``runs/<time>_<hostname>_<model_name>``.
    """
    # Imported locally: the notebook's import cells never bring `socket` into
    # scope, so the original body raised NameError on every call.
    import socket

    # Code copied from ignite repo
    current_time = datetime.now().strftime('%b%d_%H-%M-%S')
    logdir = os.path.join(
        'runs', current_time + '_' + socket.gethostname() + '_' + model_name)
    return logdir


def pad_dataset(args, dataset, padding=0):
    """Right-pad every sequence of the padded fields to ``args.max_len``.

    ``dataset`` is modified in place and also returned. The "labels" field is
    filled with -100, every other field with ``padding``. Sequences already
    longer than ``args.max_len`` are left unchanged.
    """
    target_len = args.max_len

    for field in PADDED_INPUTS:
        fill = -100 if field == "labels" else padding
        padded = []
        for seq in dataset[field]:
            padded.append(seq + [fill] * (target_len - len(seq)))
        dataset[field] = padded

    return dataset


def build_input_from_segments(persona, history, reply, vocab, labels=False, with_eos=True):
    """Assemble one model input from persona, dialogue history and reply.

    Args:
        persona: List of token-id lists, concatenated after the BOS token.
        history: List of token-id lists, one per previous utterance.
        reply: Token-id list of the reply that closes the sequence.
        vocab: Indexed with the special-token strings to obtain their ids.
        labels: When true, the reply tokens become the training targets;
            otherwise every label is -100.
        with_eos: Whether to append the EOS token to the reply.

    Returns:
        Dict with "input_ids", "token_type_ids" and "labels", all lists of
        the same length.
    """
    bos, eos, speaker1, speaker2 = vocab[SPECIAL_TOKENS[:-1]]

    persona_segment = [bos] + list(chain.from_iterable(persona))
    closing = [eos] if with_eos else []
    turns = history + [reply + closing]

    # Speaker tags alternate, counted from the end of the dialogue; the
    # parity is taken against the segment count including the persona.
    n_segments = len(turns) + 1
    segments = [persona_segment]
    for idx, turn in enumerate(turns):
        tag = speaker2 if (n_segments - idx) % 2 else speaker1
        segments.append([tag] + turn)

    input_ids = list(chain.from_iterable(segments))

    token_type_ids = []
    for idx, segment in enumerate(segments):
        owner = speaker2 if idx % 2 else speaker1
        token_type_ids.extend([owner] * len(segment))

    if labels:
        # Mask the whole context and the reply's speaker tag; keep the rest
        # of the reply as targets.
        context_len = sum(len(segment) for segment in segments[:-1])
        targets = ([-100] * context_len) + [-100] + segments[-1][1:]
    else:
        targets = [-100] * len(input_ids)

    return {
        "input_ids": input_ids,
        "token_type_ids": token_type_ids,
        "labels": targets,
    }


def get_data_loaders(args, tokenizer, vocab):
    """Build the training and validation DataLoaders.

    Loads the tokenized dataset, turns every (persona, history, candidate)
    combination into model inputs, pads them and wraps them in tensors.

    Args:
        args: Namespace providing dataset_path, dataset_cache, num_candidates,
            personality_permutations, max_history, max_len, train_batch_size,
            valid_batch_size and num_workers.
        tokenizer: Passed through to ``get_dataset``.
        vocab: Used for tokenization and special-token lookup.

    Returns:
        ``(train_loader, valid_loader)``; only the training loader shuffles.
    """
    personachat = get_dataset(tokenizer, vocab, args.dataset_path, args.dataset_cache)

    logger.info("Build inputs and labels")
    datasets = {"train": defaultdict(list), "valid": defaultdict(list)}
    for split, dialogs in personachat.items():
        # Candidate count is read from the first utterance of the split and
        # optionally capped for the training split only.
        num_candidates = len(dialogs[0]["utterances"][0]["candidates"])
        if args.num_candidates > 0 and split == 'train':
            num_candidates = min(args.num_candidates, num_candidates)

        for dialog in dialogs:
            persona = dialog["personality"].copy()
            for _ in range(args.personality_permutations):
                for utterance in dialog["utterances"]:
                    window = 2 * args.max_history + 1
                    history = utterance["history"][-window:]
                    kept = utterance["candidates"][-num_candidates:]
                    for position, candidate in enumerate(kept):
                        # Only the last kept candidate gets real labels.
                        is_last = (position == num_candidates - 1)
                        instance = build_input_from_segments(
                            persona, history, candidate, vocab, is_last)
                        for field, values in instance.items():
                            datasets[split][field].append(values)
                    datasets[split]["n_candidates"] = num_candidates
                # Rotate the persona sentences for the next permutation.
                persona = [persona[-1]] + persona[:-1]

    logger.info("Pad inputs and convert to Tensor")
    tensor_datasets = {"train": [], "valid": []}
    for split, collected in datasets.items():
        padded = pad_dataset(
            args, collected, padding=vocab[SPECIAL_TOKENS[-1]])
        for field in MODEL_INPUTS:
            flat = torch.tensor(padded[field])
            # Regroup the rows so that dimension 1 indexes the candidates.
            grouped_shape = (-1, datasets[split]["n_candidates"]) + flat.shape[1:]
            tensor_datasets[split].append(flat.view(grouped_shape))

    logger.info("Build train and validation dataloaders")
    train_dataset = TensorDataset(*tensor_datasets["train"])
    valid_dataset = TensorDataset(*tensor_datasets["valid"])
    train_loader = DataLoader(
        train_dataset,
        batch_size=args.train_batch_size,
        num_workers=args.num_workers,
        shuffle=True,
    )
    valid_loader = DataLoader(
        valid_dataset,
        batch_size=args.valid_batch_size,
        num_workers=args.num_workers,
        shuffle=False,
    )

    return train_loader, valid_loader

# PersonaChat model defined for PyTorch Lightning

In [11]:
from pytorch_lightning.core.lightning import LightningModule
from transformers.optimization import AdamW, get_cosine_schedule_with_warmup

In [12]:
class CMPersonaChat(LightningModule):
    """LightningModule that fine-tunes the KoGPT2 LM on PersonaChat batches.

    Batches are expected in ``MODEL_INPUTS`` order:
    (input ids, labels, token type ids).
    """

    def __init__(self, hparams, *args):
        """Store the hyper-parameters and load the KoGPT2 model."""
        super().__init__()
        self.hparams = hparams
        self.kogpt2 = get_kogpt2_model(hparams.model_params)

    def _lm_loss(self, batch):
        """Move ``batch`` to the configured device and return the LM loss."""
        on_device = tuple(tensor.to(self.hparams.device) for tensor in batch)
        token_ids, label, mask = on_device
        loss, *_ = self.kogpt2(token_ids, token_type_ids=mask, labels=label)
        return loss

    def forward(self, inputs, token_type_ids):
        """Return the first output of the wrapped model for ``inputs``."""
        outputs = self.kogpt2(inputs, token_type_ids=token_type_ids)
        output, *_ = outputs
        return output

    def training_step(self, batch, batch_idx):
        """Compute the training loss and expose it as ``train_loss``."""
        loss = self._lm_loss(batch)
        return {'loss': loss, 'log': {'train_loss': loss}}

    def validation_step(self, batch, batch_idx):
        """Compute the loss of one validation batch."""
        return {'val_loss': self._lm_loss(batch)}

    def validation_epoch_end(self, outputs):
        """Average the per-batch validation losses of the epoch."""
        per_batch = [step['val_loss'] for step in outputs]
        avg_loss = torch.stack(per_batch).mean()
        return {'avg_val_loss': avg_loss, 'log': {'val_loss': avg_loss}}

    def configure_optimizers(self):
        """Create AdamW plus a per-step cosine schedule with warm-up."""
        # Parameters whose name contains one of these markers get no weight
        # decay. NOTE(review): whether the 'LayerNorm.*' markers match this
        # model's parameter names is not visible here - confirm.
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        decayed, undecayed = [], []
        for name, param in self.named_parameters():
            if any(marker in name for marker in no_decay):
                undecayed.append(param)
            else:
                decayed.append(param)
        optimizer_grouped_parameters = [
            {'params': decayed, 'weight_decay': 0.01},
            {'params': undecayed, 'weight_decay': 0.0},
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=self.hparams.lr, correct_bias=False)

        # Schedule length: batches per epoch times the number of epochs.
        num_train_steps = len(self.train_dataloader()) * self.hparams.max_epochs
        num_warmup_steps = int(num_train_steps * self.hparams.warmup_ratio)
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_train_steps)
        lr_scheduler = {
            'scheduler': scheduler,
            'name': 'cosine_schedule_with_warmup',
            'monitor': 'loss',
            'interval': 'step',
            'frequency': 1,
        }
        return [optimizer], [lr_scheduler]

# Main function

In [13]:
import argparse
from collections import defaultdict

import torch
import torch.nn.functional as F
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

## argparser

In [14]:
parser = argparse.ArgumentParser()

# Device, data and optimisation settings
parser.add_argument(
    "--device",
    type=str,
    default="cuda" if torch.cuda.is_available() else "cpu",
    help="Device (cuda or cpu)",
)
parser.add_argument(
    "--dataset_path",
    type=str,
    default="dataset/personachat_manual_translated.json",
    help="Path of the dataset.",
)
parser.add_argument(
    "--dataset_cache",
    type=str,
    default="./dataset_cache",
    help="Path or url of the dataset cache",
)
parser.add_argument(
    "--num_candidates",
    type=int,
    default=1,
    help="Number of candidates for training",
)
parser.add_argument(
    "--personality_permutations",
    type=int,
    default=1,
    help="Number of permutations of personality sentences",
)
parser.add_argument(
    "--max_history",
    type=int,
    default=2,
    help="Number of previous exchanges to keep in history",
)
parser.add_argument(
    "--name",
    type=str,
    default="cm_kogpt2",
    help="Model name for logging",
)
parser.add_argument(
    "--lr",
    type=float,
    default=5e-5,
    help="The initial learning rate",
)
parser.add_argument(
    "--warmup_ratio",
    type=float,
    default=0.1,
    help="warmup ratio",
)

# Shared arguments for dataloader and training
parser.add_argument(
    "--max_len",
    type=int,
    default=768,
    help="max sentence length on input (default: 768)",
)
parser.add_argument(
    "--train_batch_size",
    type=int,
    default=2,
    help="Batch size for training",
)
parser.add_argument(
    "--valid_batch_size",
    type=int,
    default=1,
    help="Batch size for validation",
)
parser.add_argument(
    "--num_workers",
    type=int,
    default=16,
    help="Number of workers for DataLoader",
)

# Select train/inference
parser.add_argument(
    "--train",
    action="store_true",
    default=False,
    help="eval train set (default: False)",
)
parser.add_argument(
    "--restore",
    action="store_true",
    default=False,
    help="train using saved checkpoint (default: False)",
)
parser.add_argument(
    "--chat",
    action="store_true",
    default=False,
    help="response generation on given user input",
)
parser.add_argument(
    "--model_params",
    type=str,
    help="model binary for starting chat",
)

# Additional arguments for chatting
parser.add_argument(
    "--temperature",
    type=float,
    default=0.7,
    help="Sampling softmax temperature",
)
parser.add_argument(
    "--top_k",
    type=int,
    default=0,
    help="Filter top-k tokens before sampling (<=0: no filtering)",
)
parser.add_argument(
    "--top_p",
    type=float,
    default=0.9,
    help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)",
)
parser.add_argument(
    "--no_sample",
    action="store_true",
    help="Set to use greedy decoding instead of sampling",
)
parser.add_argument(
    "--min_length",
    type=int,
    default=1,
    help="Minimum length of the output utterances",
)

# Evaluation
parser.add_argument(
    "--chat_test",
    action="store_true",
    default=False,
    help="response generation on given user input",
)
parser.add_argument(
    "--eval_dataset_path",
    type=str,
    default="eval/eval_merge.json",
    help="Path of the evaluation dataset.",
)
parser.add_argument(
    "--num_eval_pp",
    type=int,
    default=10,
    help="The number of dialogue steps for the ping-pong test ",
)

_StoreAction(option_strings=['--num_eval_pp'], dest='num_eval_pp', nargs=None, const=None, default=10, type=<class 'int'>, choices=None, help='The number of dialogue steps for the ping-pong test ', metavar=None)

## main function

이때 Colab에서 argument가 정상적으로 전달되게 하려면, 아래와 같이 `parse_args` 함수에 `args` 리스트를 인자로 넘겨줘야 함
```
args = parser.parse_args(args=['--train', '--dataset_path', '/content/drive/My Drive/KoGPT2-personachat/dataset/sample.json'])
```

In [None]:
# Model configuration arguments: extend the parser with the Trainer options.
parser = Trainer.add_argparse_args(parser)
# An explicit argument list is passed because this runs inside Colab rather
# than from a command line.
args = parser.parse_args(args=['--train', '--dataset_path', '/content/drive/My Drive/KoGPT2-personachat/dataset/sample.json'])

tokenizer, detokenizer, vocab = get_kogpt2_tokenizer()
model = CMPersonaChat(args)
model.to(args.device)

# Fine-tuning KoGPT2 for the PersonaChat
train_loader, val_loader = get_data_loaders(args, tokenizer, vocab)
tb_logger = TensorBoardLogger("/content/drive/My Drive/KoGPT2-personachat/logs", name=args.name)

# Checkpoints are written next to the TensorBoard logs; the ten checkpoints
# with the lowest val_loss are kept, plus the most recent one.
checkpoint_callback = ModelCheckpoint(
    filepath='{}/checkpoints/{}'.format(tb_logger.log_dir, '{epoch:02d}-{val_loss:.4f}'),
    verbose=True,
    save_last=True,
    save_top_k=10,
    monitor='val_loss',
    mode='min',
    prefix='model_'
)

if args.restore:
    # NOTE(review): this branch builds the Trainer from its defaults, so
    # Trainer options parsed into `args` are not applied when resuming -
    # confirm this is intended.
    trainer = Trainer(resume_from_checkpoint=args.model_params,
                        checkpoint_callback=checkpoint_callback,
                        gradient_clip_val=1.0,
                        logger=tb_logger)
else:
    trainer = Trainer.from_argparse_args(
        args,
        checkpoint_callback=checkpoint_callback,
        weights_save_path=os.getcwd(),
        gradient_clip_val=1.0,
        logger=tb_logger)
model.train()
trainer.fit(model, train_loader, val_loader)
# Use the notebook's named logger (not the root logger) so this record is
# tagged like every other message emitted by this code.
logger.info('best model path {}'.format(checkpoint_callback.best_model_path))

using cached model
using cached model


INFO:cm_kogpt2:Load tokenized dataset from cache at dataset_cache_sample
INFO:cm_kogpt2:Build inputs and labels
INFO:cm_kogpt2:Pad inputs and convert to Tensor
INFO:cm_kogpt2:Build train and validation dataloaders
GPU available: True, used: False
INFO:lightning:GPU available: True, used: False
TPU available: False, using: 0 TPU cores
INFO:lightning:TPU available: False, using: 0 TPU cores

  | Name   | Type            | Params
-------------------------------------------
0 | kogpt2 | GPT2LMHeadModel | 124 M 
INFO:lightning:
  | Name   | Type            | Params
-------------------------------------------
0 | kogpt2 | GPT2LMHeadModel | 124 M 


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…


Epoch 00000: val_loss reached 4.03494 (best 4.03494), saving model to /content/drive/My Drive/KoGPT2-personachat/logs/cm_kogpt2/version_1/checkpoints/model_epoch=00-val_loss=4.0349.ckpt as top 10
INFO:lightning:
Epoch 00000: val_loss reached 4.03494 (best 4.03494), saving model to /content/drive/My Drive/KoGPT2-personachat/logs/cm_kogpt2/version_1/checkpoints/model_epoch=00-val_loss=4.0349.ckpt as top 10


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…