<a href="https://colab.research.google.com/github/dreamingjudith/KoGPT2-personachat/blob/dev/personachat_kogpt2_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 사전 확인사항

- Runtime type 변경 후 GPU가 제대로 할당됐는지 확인하기
- 체크포인트 저장을 위한 Google Drive 연동
- 필요 모듈 설치
- 기존 코드에 남아있는 `logger`를 그대로 사용하기 위한 세팅

In [1]:
!pip install transformers==4.10.3 tokenizers==0.10.3 pytorch-lightning==1.5.10



In [2]:
!nvidia-smi

Tue Apr  5 09:41:11 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   64C    P0    71W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
!ls '/content/drive/My Drive'

'Colab Notebooks'   KoGPT2-personachat	 korquad_2.1


In [5]:
import logging

logger = logging.getLogger('cm_kogpt2')
logging.basicConfig(level=logging.INFO)

In [6]:
import torch

# 모델, 토크나이저
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast

# 데이터셋
import json
import os
from torch.utils.data import DataLoader, TensorDataset
from itertools import chain

# PyTorch-Lightning을 이용한 모델 정의
from pytorch_lightning.core.lightning import LightningModule
from transformers.optimization import AdamW, get_cosine_schedule_with_warmup

# 기타 스크립트 실행을 위한 모듈
import argparse
from collections import defaultdict

import torch.nn.functional as F
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger

INFO:numexpr.utils:NumExpr defaulting to 2 threads.


# KoGPT2 모델, 토크나이저를 불러오기 위한 함수

In [7]:
def get_kogpt2_model():
    """Get KoGPT2 model after downloading"""

    model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')
    model.eval()

    return model


def get_kogpt2_tokenizer():
    """Get KoGPT2 Tokenizer after downloading"""

    tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2",
                                                        bos_token='</s>',
                                                        eos_token='</s>',
                                                        unk_token='<unk>',
                                                        pad_token='<pad>',
                                                        mask_token='<mask>')

    return tokenizer

# 데이터셋 구성을 위한 함수

In [8]:
SPECIAL_TOKENS = ["<s>", "</s>", "<usr>", "<sys>", "<pad>"]
ATTR_TO_SPECIAL_TOKEN = {'bos_token': '<s>', 'eos_token': '</s>', 'pad_token': '<pad>',
                         'additional_special_tokens': ['<usr>', '<sys>']}
MODEL_INPUTS = ["input_ids", "labels", "token_type_ids"]
PADDED_INPUTS = ["input_ids", "labels", "token_type_ids"]

In [9]:
def get_dataset(tokenizer, dataset_path, dataset_cache):
    """Read PersonaChat json file and return tokenized dataset"""
    dataset_basename = os.path.basename(dataset_path).split(".")[0]
    dataset_cache = "/content/drive/My Drive/KoGPT2-personachat/dataset/dataset_cache_{}".format(dataset_basename)

    if dataset_cache and os.path.isfile(dataset_cache):
        logger.info("Load tokenized dataset from cache at %s", dataset_cache)
        dataset = torch.load(dataset_cache)

    else:
        logger.info("Reading {}".format(dataset_path))
        with open(dataset_path, "r", encoding="utf-8") as f:
            dataset = json.loads(f.read())

        logger.info("Tokenize and encode the dataset")

        def tokenize(obj):
            if isinstance(obj, str):
                return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
            if isinstance(obj, dict):
                return dict((n, tokenize(o)) for n, o in obj.items())
            return list(tokenize(o) for o in obj)
        dataset = tokenize(dataset)
        torch.save(dataset, dataset_cache)

    return dataset


def pad_dataset(dataset, padding=0):
    """ Pad the dataset.
    This could be optimized by defining a Dataset class and padding at the batch level,
    but this is simpler. """
    max_l = max(len(x) for x in dataset["input_ids"])

    for name in PADDED_INPUTS:
        dataset[name] = [x + [padding if name != "labels" else -100] * (max_l - len(x)) for x in dataset[name]]

    return dataset


def build_input_from_segments(persona, history, reply, tokenizer, labels=False, with_eos=True):
    """ Build a sequence of input from 3 segments: persona, history and last reply. """
    bos, eos, speaker1, speaker2 = tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[:-1])
    sequence = [[bos] + list(chain(*persona))] + \
        history + [reply + ([eos] if with_eos else [])]
    sequence = [sequence[0]] + [[speaker2 if (len(sequence)-i) %
                                 2 else speaker1] + s for i, s in enumerate(sequence[1:])]
    instance = {}
    instance["input_ids"] = list(chain(*sequence))
    instance["token_type_ids"] = [speaker2 if i %
                                  2 else speaker1 for i, s in enumerate(sequence) for _ in s]
    instance["labels"] = [-100] * len(instance["input_ids"])
    if labels:
        instance["labels"] = ([-100] * sum(len(s) for s in sequence[:-1])) + [-100] + sequence[-1][1:]

    return instance


def get_data_loaders(args, tokenizer):
    """ Prepare the dataset for training and evaluation """
    personachat = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)

    logger.info("Build inputs and labels")
    datasets = {"train": defaultdict(list), "valid": defaultdict(list)}
    for dataset_name, dataset in personachat.items():
        num_candidates = len(dataset[0]["utterances"][0]["candidates"])
        if args.num_candidates > 0 and dataset_name == 'train':
            num_candidates = min(args.num_candidates, num_candidates)
        for dialog in dataset:
            persona = dialog["personality"].copy()
            for _ in range(args.personality_permutations):
                for utterance in dialog["utterances"]:
                    history = utterance["history"][-(2 * args.max_history + 1):]
                    for j, candidate in enumerate(utterance["candidates"][-num_candidates:]):
                        labels = bool(j == num_candidates - 1)
                        instance = build_input_from_segments(
                            persona, history, candidate, tokenizer, labels)
                        for input_name, input_array in instance.items():
                            datasets[dataset_name][input_name].append(
                                input_array)
                    datasets[dataset_name]["n_candidates"] = num_candidates
                # permuted personalities
                persona = [persona[-1]] + persona[:-1]

    logger.info("Pad inputs and convert to Tensor")
    tensor_datasets = {"train": [], "valid": []}
    for dataset_name, dataset in datasets.items():
        dataset = pad_dataset(
            dataset, padding=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]))
        for input_name in MODEL_INPUTS:
            tensor = torch.tensor(dataset[input_name])
            tensor = tensor.view(
                (-1, datasets[dataset_name]["n_candidates"]) + tensor.shape[1:])
            tensor_datasets[dataset_name].append(tensor)

    logger.info("Build train and validation dataloaders")
    train_dataset, valid_dataset = TensorDataset(*tensor_datasets["train"]), TensorDataset(*tensor_datasets["valid"])
    train_loader = DataLoader(train_dataset,
                              batch_size=args.train_batch_size,
                              num_workers=args.num_workers,
                              shuffle=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=args.valid_batch_size,
                              num_workers=args.num_workers,
                              shuffle=False)

    return train_loader, valid_loader

# PersonaChat model defined for PyTorch Lightning

In [10]:
class CMPersonaChat(LightningModule):
    def __init__(self, **hparams):  # should get hparams with ** if you want pass args
    # def __init__(self, hparams):  # not like this
        super(CMPersonaChat, self).__init__()
        self.save_hyperparameters()
        self.kogpt2 = get_kogpt2_model()

    @staticmethod
    def add_model_specific_args(parent_parser):
        # add model specific args
        parser = argparse.ArgumentParser(parents=[parent_parser], add_help=False)
        parser.add_argument('--lr',
                            type=float,
                            default=5e-5,
                            help='The initial learning rate')
        parser.add_argument('--warmup_ratio',
                            type=float,
                            default=0.1,
                            help='warmup ratio')
        return parser

    @property
    def num_training_steps(self) -> int:
        """Total training steps inferred from datamodule and devices.
        https://github.com/PyTorchLightning/pytorch-lightning/issues/5449#issuecomment-757863689
        https://github.com/Zasder3/train-CLIP/issues/29#issuecomment-1056339940
        """
        dataset = self.trainer._data_connector._train_dataloader_source.dataloader()

        if self.trainer.max_steps:
            return self.trainer.max_steps

        dataset_size = (
            self.trainer.limit_train_batches
            if self.trainer.limit_train_batches != 0
            else len(dataset)
        )

        num_devices = max(1, self.trainer.num_gpus, self.trainer.num_processes)
        if self.trainer.tpu_cores:
            num_devices = max(num_devices, self.trainer.tpu_cores)

        effective_batch_size = dataset.batch_size * self.trainer.accumulate_grad_batches * num_devices
        return (dataset_size // effective_batch_size) * self.trainer.max_epochs

    def forward(self, inputs, token_type_ids):
        output, *_ = self.kogpt2(inputs, token_type_ids=token_type_ids)
        return output

    def training_step(self, batch, batch_idx):
        token_ids, label, mask = batch
        # forward: input(batch,max_sentence_length) -> output(batch_size, max_sentence_length,vocab)
        # e.g. (4,768) -> (4,768,50000)
        outputs = self.kogpt2(token_ids, token_type_ids=mask, labels=label)
        self.log("loss/train_loss", outputs.loss)

        return outputs.loss

    def validation_step(self, batch, batch_idx):
        token_ids, label, mask = batch
        outputs = self.kogpt2(token_ids, token_type_ids=mask, labels=label)
        self.log("loss/val_loss", outputs.loss)

        return outputs.loss

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack(outputs).mean()
        self.log("loss/avg_val_loss", avg_loss)

    def configure_optimizers(self):
        # TODO: num_training_step을 구하기 위해 dataloder 없이 manual optimization을 이용해 warmup 하게 고치기
        # Prepare optimizer
        param_optimizer = list(self.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=self.hparams.lr, correct_bias=False)

        # Prepare learning rate scheduler
        num_train_steps = self.num_training_steps
        num_warmup_steps = int(num_train_steps * self.hparams.warmup_ratio)
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps, num_training_steps=num_train_steps)
        lr_scheduler = {'scheduler': scheduler, 'name': 'cosine_schedule_with_warmup',
                        'monitor': 'loss', 'interval': 'step',
                        'frequency': 1}
        return [optimizer], [lr_scheduler]

# argparser

In [11]:
parser = argparse.ArgumentParser()
parser.add_argument("--dataset_path", type=str,
                    default="dataset/personachat_google_translated.json",
                    help="Path of the dataset.")
parser.add_argument("--dataset_cache", type=str,
                    default='./dataset_cache',
                    help="Path or url of the dataset cache")
parser.add_argument("--num_candidates", type=int, default=1,
                    help="Number of candidates for training")
parser.add_argument("--personality_permutations", type=int, default=1,
                    help="Number of permutations of personality sentences")
parser.add_argument("--max_history", type=int, default=2,
                    help="Number of previous exchanges to keep in history")
parser.add_argument("--name", type=str,
                    default="cm_kogpt2",
                    help="Model name for logging")
parser.add_argument("--ckpt_path", type=str,
                    help="Checkpoint path for training or evaluation")

# Shared arguments for dataloader and training
parser.add_argument('--max_len',
                    type=int,
                    default=768,
                    help='max sentence length on input (default: 768)')
parser.add_argument("--train_batch_size", type=int,
                    default=4, help="Batch size for training")
parser.add_argument("--valid_batch_size", type=int,
                    default=4, help="Batch size for validation")
parser.add_argument("--num_workers", type=int,
                    default=min(os.cpu_count(), 8), help="Number of workers for DataLoader")

# Select train/inference
parser.add_argument('--mode', type=str, choices=['train', 'eval', 'chat'],
                    required=True,
                    help='Script mode to execute (train, eval, chat)')

# Model configuration arguments
parser = CMPersonaChat.add_model_specific_args(parser)
parser = Trainer.add_argparse_args(parser)

# Fine-tune KoGPT2 for PersonaChat

이 때 Colab에서 argument가 정상적으로 들어가게 하기 위해 아래와 같은 방식으로 `parse_args` 함수에 인자로 `args` 리스트를 줘야 힘
```
args = parser.parse_args(args=['--mode', 'train', '--dataset_path', '/content/drive/My Drive/KoGPT2-personachat/dataset/sample.json', '--gpus', '1'])
```

In [14]:
args = parser.parse_args(args=['--mode', 'train',
                               '--dataset_path', '/content/drive/My Drive/KoGPT2-personachat/dataset/personachat_manual_translated.json',
                               '--gpus', '1',
                               '--max_epochs', '10',
                               '--accumulate_grad_batches', '8',
                               '--valid_batch_size', '2'])

tokenizer = get_kogpt2_tokenizer()
train_loader, val_loader = get_data_loaders(args, tokenizer)

# TensorBoard logger settings
tb_logger = TensorBoardLogger("/content/drive/My Drive/KoGPT2-personachat/logs", name=args.name, default_hp_metric=False)
checkpoint_callback = ModelCheckpoint(
    dirpath=f'{tb_logger.log_dir}/checkpoints',
    filename='model_{epoch:02d}-{loss/avg_val_loss:.4f}',
    auto_insert_metric_name=True,
    verbose=True,
    save_top_k=10,
    mode='min',
    monitor='loss/avg_val_loss'
)

if args.ckpt_path is None:
    trainer = Trainer.from_argparse_args(
        args,
        callbacks=[checkpoint_callback],
        gradient_clip_val=1.0,
        logger=tb_logger)

    model = CMPersonaChat(**vars(args))

# Fine-tune from saved checkpoint
else:
    trainer = Trainer(
        resume_from_checkpoint=args.ckpt_path,
        callbacks=[checkpoint_callback],
        gradient_clip_val=1.0,
        logger=tb_logger)

    model = CMPersonaChat.load_from_checkpoint(args.ckpt_path)

model.train()
trainer.fit(model, train_loader, val_loader)
logging.info('best model path {}'.format(checkpoint_callback.best_model_path))

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.
INFO:cm_kogpt2:Load tokenized dataset from cache at /content/drive/My Drive/KoGPT2-personachat/dataset/dataset_cache_personachat_manual_translated
INFO:cm_kogpt2:Build inputs and labels
INFO:cm_kogpt2:Pad inputs and convert to Tensor
INFO:cm_kogpt2:Build train and validation dataloaders
INFO:pytorch_lightning.utilities.distributed:GPU available: True, used: True
INFO:pytorch_lightning.utilities.distributed:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.distributed:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.accelerators.gpu:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:pytorch_lightning.callbacks.model_summary:
  | Name   | Type            | Params
---

Validation sanity check: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.distributed:Epoch 0, global step 15: loss/avg_val_loss reached 4.33955 (best 4.33955), saving model to "/content/drive/My Drive/KoGPT2-personachat/logs/cm_kogpt2/version_3/checkpoints/model_epoch=00-loss/avg_val_loss=4.3395.ckpt" as top 10


Validating: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.distributed:Epoch 1, global step 31: loss/avg_val_loss reached 3.87913 (best 3.87913), saving model to "/content/drive/My Drive/KoGPT2-personachat/logs/cm_kogpt2/version_3/checkpoints/model_epoch=01-loss/avg_val_loss=3.8791.ckpt" as top 10


Validating: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.distributed:Epoch 2, global step 47: loss/avg_val_loss reached 4.06858 (best 3.87913), saving model to "/content/drive/My Drive/KoGPT2-personachat/logs/cm_kogpt2/version_3/checkpoints/model_epoch=02-loss/avg_val_loss=4.0686.ckpt" as top 10


Validating: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.distributed:Epoch 3, global step 63: loss/avg_val_loss reached 4.19910 (best 3.87913), saving model to "/content/drive/My Drive/KoGPT2-personachat/logs/cm_kogpt2/version_3/checkpoints/model_epoch=03-loss/avg_val_loss=4.1991.ckpt" as top 10


Validating: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.distributed:Epoch 4, global step 79: loss/avg_val_loss reached 4.33714 (best 3.87913), saving model to "/content/drive/My Drive/KoGPT2-personachat/logs/cm_kogpt2/version_3/checkpoints/model_epoch=04-loss/avg_val_loss=4.3371.ckpt" as top 10


Validating: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.distributed:Epoch 5, global step 95: loss/avg_val_loss reached 4.58556 (best 3.87913), saving model to "/content/drive/My Drive/KoGPT2-personachat/logs/cm_kogpt2/version_3/checkpoints/model_epoch=05-loss/avg_val_loss=4.5856.ckpt" as top 10


Validating: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.distributed:Epoch 6, global step 111: loss/avg_val_loss reached 4.73005 (best 3.87913), saving model to "/content/drive/My Drive/KoGPT2-personachat/logs/cm_kogpt2/version_3/checkpoints/model_epoch=06-loss/avg_val_loss=4.7301.ckpt" as top 10


Validating: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.distributed:Epoch 7, global step 127: loss/avg_val_loss reached 4.87372 (best 3.87913), saving model to "/content/drive/My Drive/KoGPT2-personachat/logs/cm_kogpt2/version_3/checkpoints/model_epoch=07-loss/avg_val_loss=4.8737.ckpt" as top 10


Validating: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.distributed:Epoch 8, global step 143: loss/avg_val_loss reached 4.92275 (best 3.87913), saving model to "/content/drive/My Drive/KoGPT2-personachat/logs/cm_kogpt2/version_3/checkpoints/model_epoch=08-loss/avg_val_loss=4.9227.ckpt" as top 10


Validating: 0it [00:00, ?it/s]

INFO:pytorch_lightning.utilities.distributed:Epoch 9, global step 159: loss/avg_val_loss reached 4.98591 (best 3.87913), saving model to "/content/drive/My Drive/KoGPT2-personachat/logs/cm_kogpt2/version_3/checkpoints/model_epoch=09-loss/avg_val_loss=4.9859.ckpt" as top 10
INFO:root:best model path /content/drive/My Drive/KoGPT2-personachat/logs/cm_kogpt2/version_3/checkpoints/model_epoch=01-loss/avg_val_loss=3.8791.ckpt
