# GPT-2 Model Fine-tuning for Korean Community Website Posts

**DISCLAIMER**: *I primarily used a paid Google Colab instance (https://colab.research.google.com/signup) to train this model and have not tested the notebook on a local system, so I cannot guarantee that it will run everywhere. I did, however, modify the weight-import code to account for running locally. On Colab, the project files have to be loaded from Google Drive so that they persist between sessions.*

## Import and Use Data from Google Drive

In [2]:
# Mount Google Drive at /content/drive; later cells read the project files
# and write checkpoints under 'drive/My Drive/...'.
from google.colab import drive
drive.mount('/content/drive')




In [0]:
import os

import sys
# Make Python modules stored in the Drive "Colab Notebooks" folder importable.
# NOTE(review): the path is relative, so this presumably relies on the working
# directory being /content (the parent of the Drive mount point) -- confirm.
sys.path.append('drive/My Drive/Colab Notebooks/')

In [0]:
!pip install -r drive/'My Drive'/'Colab Notebooks'/KoGPT2/requirements.txt

Collecting gluonnlp>=0.8.3
[?25l  Downloading https://files.pythonhosted.org/packages/c6/27/07b57d22496ed6c98b247e578712122402487f5c265ec70a747900f97060/gluonnlp-0.9.1.tar.gz (252kB)
[K     |████████████████████████████████| 256kB 3.4MB/s 
[?25hCollecting mxnet
[?25l  Downloading https://files.pythonhosted.org/packages/81/f5/d79b5b40735086ff1100c680703e0f3efc830fa455e268e9e96f3c857e93/mxnet-1.6.0-py2.py3-none-any.whl (68.7MB)
[K     |████████████████████████████████| 68.7MB 49kB/s 
[?25hCollecting sentencepiece>=0.1.6
[?25l  Downloading https://files.pythonhosted.org/packages/3b/88/49e772d686088e1278766ad68a463513642a2a877487decbd691dec02955/sentencepiece-0.1.90-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 50.5MB/s 
Collecting transformers>=2.4.1
[?25l  Downloading https://files.pythonhosted.org/packages/22/97/7db72a0beef1825f82188a4b923e62a146271ac2ced7928baa4d47ef2467/transformers-2.9.1-py3-none-any.whl (641kB)
[K     |███████████

In [0]:
!pip install drive/'My Drive'/'Colab Notebooks'/KoGPT2

Processing ./drive/My Drive/Colab Notebooks/KoGPT2
Building wheels for collected packages: kogpt2
  Building wheel for kogpt2 (setup.py) ... [?25l[?25hdone
  Created wheel for kogpt2: filename=kogpt2-0.1.0-cp36-none-any.whl size=22259 sha256=ca65d740256b3ec1fcb19d9dec42aeef94dd7af7c71cae9e3948336399cdda5f
  Stored in directory: /tmp/pip-ephem-wheel-cache-olv18w6m/wheels/13/fe/32/c11ad824f0076b67cb83a05ac071ca46ad18d8105c9ce3b1d1
Successfully built kogpt2
Installing collected packages: kogpt2
Successfully installed kogpt2-0.1.0


In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import TensorDataset # 텐서데이터셋
from torch.utils.data import Dataset,DataLoader # 데이터로더

from kogpt2.utils import download, tokenizer, get_tokenizer
from kogpt2.model.torch_gpt2 import GPT2Config, GPT2LMHeadModel

from gluonnlp.data import SentencepieceTokenizer
import gluonnlp
import numpy as np
import os
import random

In [0]:
from urllib.request import urlopen
import pandas as pd
import json

In [0]:
def cleanup_data(data, data_type):
    """Strip boilerplate from scraped posts and return one field as an array.

    For every post, each ``content`` fragment that contains a link
    (``http``), a non-breaking space, a newline, or the
    ``- dc official App`` signature is dropped. The surviving fragments are
    concatenated without a separator and stripped of surrounding whitespace.

    Args:
        data: Iterable of posts, each a mapping with ``'title'`` and
            ``'content'`` keys. ``'content'`` is iterated fragment by
            fragment (presumably a list of strings -- confirm against the
            scraper output).
        data_type: ``"content"`` selects the cleaned bodies; any other value
            selects the titles.

    Returns:
        A NumPy object array with one entry per post, in input order.
    """
    noise_markers = ('http', '\xa0', '\n', '- dc official App')

    titles = []
    contents = []
    for post in data:
        titles.append(post['title'])
        kept = [
            fragment for fragment in post['content']
            if not any(marker in fragment for marker in noise_markers)
        ]
        contents.append(''.join(map(str, kept)).strip())

    selected = contents if data_type == "content" else titles

    # Build the array directly instead of round-tripping through
    # json.dumps + pd.read_json: passing a literal JSON string to read_json is
    # deprecated in recent pandas, and its type inference can silently turn
    # numeric-looking text (e.g. a column of titles like "007") into numbers.
    return pd.Series(selected, dtype=object).to_numpy()

In [0]:
class PostDataset(Dataset):
    """Community post dataset of token-id sequences.

    The JSON document at ``file_path`` is downloaded, cleaned with
    ``cleanup_data(..., "content")``, tokenized, and converted to vocabulary
    ids. Every sample is a plain Python list of ids wrapped in the
    vocabulary's BOS and EOS ids; samples are NOT padded, so their lengths
    differ.

    Args:
        file_path: URL understood by ``urllib.request.urlopen`` that serves
            the UTF-8 encoded JSON dump of posts.
        vocab: Vocabulary object supporting ``vocab[token]``,
            ``vocab[list_of_tokens]`` and the ``bos_token`` / ``eos_token``
            attributes.
        tokenizer: Callable mapping a string to a list of tokens.
    """

    def __init__(self, file_path, vocab, tokenizer):
        self.file_path = file_path
        self.data = []
        self.vocab = vocab
        self.tokenizer = tokenizer

        # Close the HTTP response deterministically once the body is read.
        # Decoding as UTF-8 is what lets the Korean text load properly.
        with urlopen(self.file_path) as response:
            posts = json.loads(response.read().decode('utf-8'))

        bos_id = vocab[vocab.bos_token]
        eos_id = vocab[vocab.eos_token]

        for text in cleanup_data(posts, "content"):
            # cleanup_data() already strips each post, so the whole string is
            # tokenized. The former `text[:-1]` discarded the final real
            # character of every post.
            token_ids = vocab[tokenizer(text)]
            self.data.append([bos_id] + token_ids + [eos_id])

        # Report the number of samples in the same "(N,)" form as before.
        # np.shape() is avoided because recent NumPy refuses to inspect
        # ragged nested lists.
        print((len(self.data),))

    def __len__(self):
        """Return the number of posts in the dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """Return the list of token ids for the post at ``index``."""
        return self.data[index]

In [0]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

## KoGPT2 Configuration (Code derived from Official KoGPT2 code [tweaked for fine-tuning])

#### Basic Configuration

In [0]:
# Device string used for the model and for every training batch. It is
# detected instead of being hard-coded to 'cuda' so the notebook also runs on
# machines without a GPU.
ctx = 'cuda' if torch.cuda.is_available() else 'cpu'
# Cache directory handed to download() for the weights and the vocabulary.
cachedir = '~/kogpt2/'
# Directory prefix (note the trailing slash) prepended to the checkpoint name.
save_path = 'drive/My Drive/Colab Notebooks/korean-post-generator/checkpoint/'

# Location, file name and checksum of the pretrained PyTorch KoGPT2 weights;
# the three values are passed to download().
pytorch_kogpt2 = {
    'url':
    'https://kobert.blob.core.windows.net/models/kogpt2/pytorch/pytorch_kogpt2_676e9bcfa7.params',
    'fname': 'pytorch_kogpt2_676e9bcfa7.params',
    'chksum': '676e9bcfa7'
}
# Architecture settings fed to GPT2Config.from_dict() when the model is built.
kogpt2_config = {
    "initializer_range": 0.02,
    "layer_norm_epsilon": 1e-05,
    "n_ctx": 1024,
    "n_embd": 768,
    "n_head": 12,
    "n_layer": 12,
    "n_positions": 1024,
    "vocab_size": 50000
}

#### Download the Model and Vocabulary

In [0]:
# Fetch the pretrained weights; the value returned by download() is later
# handed to torch.load().
model_info = pytorch_kogpt2
model_path = download(
    *(model_info[key] for key in ('url', 'fname', 'chksum')),
    cachedir=cachedir,
)

# Fetch the SentencePiece vocabulary described by the `tokenizer` dict that
# kogpt2.utils exports; the result is later used to build the vocabulary.
vocab_info = tokenizer
vocab_path = download(
    *(vocab_info[key] for key in ('url', 'fname', 'chksum')),
    cachedir=cachedir,
)

[██████████████████████████████████████████████████]
[██████████████████████████████████████████████████]


#### Model configuration

In [0]:
# Resolve the target device first so the checkpoint can be mapped onto it.
device = torch.device(ctx)

# Build the network from the configuration dict and load the pretrained
# weights. map_location remaps the stored tensors onto `device`, so a
# checkpoint that contains CUDA tensors can still be loaded on a CPU-only host.
kogpt2model = GPT2LMHeadModel(config=GPT2Config.from_dict(kogpt2_config))
kogpt2model.load_state_dict(torch.load(model_path, map_location=device))
kogpt2model.to(device)

# Training mode: the model is about to be fine-tuned.
kogpt2model.train()

# Vocabulary built from the downloaded SentencePiece model, with the special
# tokens named explicitly; mask/sep/cls tokens are disabled.
vocab_b_obj = gluonnlp.vocab.BERTVocab.from_sentencepiece(vocab_path,
                                                     mask_token=None,
                                                     sep_token=None,
                                                     cls_token=None,
                                                     unknown_token='<unk>',
                                                     padding_token='<pad>',
                                                     bos_token='<s>',
                                                     eos_token='</s>')

#### Hyperparameters

In [0]:
batch_size: int = 8            # samples per DataLoader batch
learning_rate: float = 1e-5    # step size given to the Adam optimizer
epoch: int = 200               # number of passes requested from the training loop

#### Load Batch Data using DataLoader

In [0]:
# SentencePiece tokenizer backed by the file that get_tokenizer() resolves.
tok_path = get_tokenizer()
sentencepieceTokenizer = SentencepieceTokenizer(tok_path)

# Short aliases used by the rest of the notebook.
model = kogpt2model
vocab = vocab_b_obj

# Raw JSON dump of the scraped posts.
data_path = 'https://raw.githubusercontent.com/duckonomy/cs344/master/project/api/dcinside.json'

post_dataset = PostDataset(data_path, vocab, sentencepieceTokenizer)
# NOTE(review): the dataset yields variable-length id lists and no collate_fn
# is supplied -- confirm how the installed PyTorch version batches ragged
# samples before relying on this loader.
post_data_loader = DataLoader(
    post_dataset,
    batch_size=batch_size,
    shuffle=True,
    pin_memory=True,
)

using cached model
(14040,)


#### Compile Options

In [0]:
# Loss object and optimizer. The training loop takes the loss returned by the
# model itself (it passes `labels=`), so `criterion` is not used there.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

tmp_post_tens = None
models_folder = "trained_models"
# exist_ok=True replaces the separate exists()/mkdir() pair: one call, and no
# failure if the folder appears between the check and the creation.
os.makedirs(models_folder, exist_ok=True)

# Bookkeeping counters.
proc_seq_count = 0
sum_loss = 0.0
batch_count = 0

# (decayed sum of losses, decayed count); their ratio is the running average
# printed by the training loop.
avg_loss = (0.0, 0.0)

## Fine-tuning

In [0]:
import torch
import torch.nn.functional as F


def top_k_logits(logits, k):
    """Mask every logit that is not among the ``k`` largest of its row.

    Args:
        logits: Tensor whose last dimension indexes the vocabulary, e.g.
            ``(batch, vocab)``.
        k: Number of candidates to keep per row. ``0`` disables filtering;
            values above the vocabulary size keep every logit.

    Returns:
        ``logits`` itself when ``k == 0``; otherwise a new tensor in which
        entries below the row's k-th largest value are replaced by ``-1e10``.
        Ties with the k-th value are kept.
    """
    if k == 0:
        return logits
    # topk() raises if asked for more entries than exist.
    k = min(k, logits.size(-1))
    values, _ = torch.topk(logits, k)
    # Keep the trailing dimension so each row is compared with its OWN
    # threshold. The former `values[:, -1]` had shape (batch,) and was
    # broadcast along the vocabulary axis, which is only right for batch == 1.
    min_values = values[..., -1:]
    return torch.where(logits < min_values, torch.full_like(logits, -1e10), logits)


def top_p_logits(logits, top_p=0.0, filter_value=-float('Inf')):
    """Apply nucleus (top-p) filtering along the last dimension.

    Per row, tokens are sorted by descending logit and kept until the
    cumulative softmax probability reaches ``top_p``; the token that crosses
    the threshold is kept as well, so at least one token always survives.

    Args:
        logits: Tensor whose last dimension indexes the vocabulary. It is
            modified IN PLACE, as before.
        top_p: Cumulative probability threshold. ``0.0`` (or less) disables
            filtering.
        filter_value: Value written over the removed logits.

    Returns:
        The same ``logits`` tensor, with removed entries set to
        ``filter_value``.
    """
    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        sorted_indices_to_remove = cumulative_probs >= top_p
        # Shift right by one so the first token that reaches the threshold is
        # still kept, and never remove the most likely token.
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = False

        # Map the mask from sorted order back to vocabulary order row by row.
        # The former `logits[:, indices_to_remove]` applied the union of all
        # rows' indices to every row, which is wrong for batch > 1.
        indices_to_remove = sorted_indices_to_remove.scatter(
            -1, sorted_indices, sorted_indices_to_remove)
        logits[indices_to_remove] = filter_value
    return logits


def sample_sequence(model, tok, vocab, sent, text_size, temperature, top_p, top_k):
    """Extend ``sent`` by sampling one token at a time from ``model``.

    On every step the whole sentence is re-tokenized, prefixed with the BOS
    id and fed to the model. The logits of the last position are divided by
    ``temperature``, filtered with ``top_k_logits`` and ``top_p_logits``, and
    one token is sampled from the resulting distribution. Sampling stops
    after a ``'</s>'`` or ``'|'`` token is produced, or once more than
    ``text_size`` tokens have been generated.

    Args:
        model: Callable returning a sequence whose first element is the
            logits tensor of shape ``(batch, seq, vocab)``.
        tok: Callable mapping a string to a list of tokens.
        vocab: Vocabulary supporting ``vocab[token]``, ``vocab[tokens]``,
            ``to_tokens`` and ``bos_token``.
        sent: Prompt text.
        text_size: Soft cap on the number of generated tokens.
        temperature: Divisor applied to the logits before filtering.
        top_p: Nucleus threshold forwarded to ``top_p_logits``.
        top_k: Candidate count forwarded to ``top_k_logits``.

    Returns:
        The prompt with the generated text (including the stop token) and a
        trailing newline appended, or the integer ``0`` when the prompt is
        longer than 1022 tokens (kept for backward compatibility).
    """
    toked = tok(sent)
    count = 0

    # NOTE(review): 1022 presumably leaves room for the BOS id within the
    # model's position limit -- confirm against the model configuration.
    if len(toked) > 1022:
        return 0

    # Generation must not be perturbed by dropout: switch a training-mode
    # model to eval for the duration of sampling and restore it afterwards.
    was_training = getattr(model, 'training', False)
    if was_training:
        model.eval()
    try:
        # No gradients are needed here; skipping autograd saves memory.
        with torch.no_grad():
            while True:
                input_ids = torch.tensor([vocab[vocab.bos_token], ] + vocab[toked]).unsqueeze(0)
                pred = model(input_ids)[0]

                # Only the distribution for the next token (last position).
                logits = pred[:, -1, :] / temperature
                logits = top_k_logits(logits, top_k)
                logits = top_p_logits(logits, top_p=top_p)

                probs = F.softmax(logits, dim=-1)
                prev = torch.multinomial(probs, num_samples=1)

                gen = vocab.to_tokens(prev.squeeze().tolist())
                # SentencePiece marks word starts with '▁'; show it as a space.
                sent += gen.replace('▁', ' ')

                if gen == '</s>' or gen == '|' or count > text_size:
                    print('to_tokens:', vocab.to_tokens(torch.argmax(pred, dim=-1).squeeze().tolist()))
                    sent += '\n'
                    break

                toked = tok(sent)
                count += 1
    finally:
        if was_training:
            model.train()
    return sent

In [0]:
def auto_enter(text):
    """Reflow ``text`` into paragraphs separated by blank lines.

    Every run of three spaces is treated as a line break. The text is then
    split on newlines, empty lines are discarded, leading whitespace is
    removed from the remaining lines, and the lines are joined with a blank
    line between them.
    """
    lines = text.replace("   ", "\n").split("\n")
    # The emptiness test runs BEFORE stripping, so a whitespace-only line
    # survives as an empty paragraph.
    non_empty = filter(None, lines)
    return "\n\n".join(map(str.lstrip, non_empty))

In [0]:
    # Fine-tuning loop. Every 10 batches the current and the running-average
    # loss are printed; every 1000 batches (and on the short final batch of an
    # epoch) a sample is generated; every 10000 batches (and on the short
    # final batch) a checkpoint is written.
    model = model.to(ctx)
    # A separate loop variable keeps the `epoch` hyperparameter intact, so
    # re-running this cell still trains for the configured number of epochs.
    for epoch_idx in range(epoch):
        count = 0
        for data in post_data_loader:
            optimizer.zero_grad()
            # The loader yields a list of per-position tensors; stack them and
            # swap the axes so that len(data) is the batch size.
            data = torch.stack(data)
            data = data.transpose(1, 0)
            data = data.to(ctx)
            # True only for a batch smaller than batch_size (end of an epoch).
            is_short_batch = len(data) < batch_size

            # With labels supplied, the first output of the model is the loss.
            outputs = model(data, labels=data)
            loss = outputs[0]
            loss.backward()
            optimizer.step()

            # Accumulate a plain float: adding the loss tensor itself would
            # keep its autograd history alive for the whole run.
            loss_value = loss.item()
            avg_loss = (avg_loss[0] * 0.99 + loss_value, avg_loss[1] * 0.99 + 1.0)
            if count % 10 == 0:
                print('epoch no.{0} train no.{1}  loss = {2:.5f} avg_loss = {3:.5f}' . format(epoch_idx, count, loss_value, avg_loss[0] / avg_loss[1]))

            if (count > 0 and count % 1000 == 0) or is_short_batch:
                # Sampling runs on the CPU; move the model back afterwards.
                sent = sample_sequence(model.to("cpu"), sentencepieceTokenizer, vocab, sent="뭐함", text_size=100, temperature=0.7, top_p=0.8, top_k=40)
                model = model.to(ctx)
                sent = sent.replace("<unused0>", "\n")
                sent = auto_enter(sent)
                print(sent)

                if count > 500000:
                    # NOTE(review): `samples` is not defined in the visible
                    # part of the notebook -- confirm it is set elsewhere.
                    # Files are numbered; an empty directory starts at 1.
                    now = max((int(n) for n in os.listdir(samples)), default=0)
                    with open(samples + str(now + 1), 'w', encoding="utf-8") as f:
                        f.write(sent)
            count += 1

            if (count > 0 and count % 10000 == 0) or is_short_batch:
                # Checkpointing stays best-effort, but a failure is reported
                # instead of being swallowed, and the target directory is
                # created when it is missing.
                try:
                    os.makedirs(save_path, exist_ok=True)
                    torch.save({
                        'epoch': epoch_idx,
                        'train_no': count,
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'loss': loss
                    }, save_path+'korean_post_generator_checkpoint.tar')
                except (OSError, RuntimeError) as err:
                    print('checkpoint save failed:', err)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
epoch no.7 train no.860  loss = 5.99233 avg_loss = 5.71201
epoch no.7 train no.870  loss = 5.31178 avg_loss = 5.73766
epoch no.7 train no.880  loss = 5.30597 avg_loss = 5.71235
epoch no.7 train no.890  loss = 5.94274 avg_loss = 5.74801
epoch no.7 train no.900  loss = 5.68206 avg_loss = 5.77670
epoch no.7 train no.910  loss = 5.66289 avg_loss = 5.76023
epoch no.7 train no.920  loss = 7.31908 avg_loss = 5.75084
epoch no.7 train no.930  loss = 4.66240 avg_loss = 5.73119
epoch no.7 train no.940  loss = 4.58455 avg_loss = 5.74912
epoch no.7 train no.950  loss = 5.36464 avg_loss = 5.73247
epoch no.7 train no.960  loss = 5.59571 avg_loss = 5.75186
epoch no.7 train no.970  loss = 5.85894 avg_loss = 5.73124
epoch no.7 train no.980  loss = 6.44532 avg_loss = 5.75419
epoch no.7 train no.990  loss = 5.43130 avg_loss = 5.76800
epoch no.7 train no.1000  loss = 4.95532 avg_loss = 5.76362
to_tokens: ['</s>', '</s>', '</s>']
뭐함</s>
epoch 