In [1]:
# Mount Google Drive so that the Drive-based paths used by later cells
# (import roots, dataset file, checkpoint directory) can be resolved.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install kogpt2-transformers==0.3.0
!pip install transformers==3.0.2
!pip install torch
!pip install tokenizers==0.8.1rc1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import sys

# Register the Drive notebook folder and its `model` sub-folder as import
# roots. Order is preserved: notebook folder first, then the model folder.
for extra_import_root in (
    'drive/My Drive/Colab Notebooks/',
    'drive/My Drive/Colab Notebooks/model',
):
  sys.path.append(extra_import_root)

In [4]:
import os
import numpy as np
import tqdm as tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, dataloader
from transformers import GPT2Config, AdamW
from kogpt2_transformers import get_kogpt2_tokenizer, get_kogpt2_model

In [5]:
class WellnessAutoRegressiveDataset(Dataset):
  """Text pairs encoded as fixed-length KoGPT2 token-id sequences.

  Every non-blank line of the input file must contain two text segments
  separated by four spaces. Each pair is encoded as
  ``bos + first + eos + bos + second + eos`` and then right-padded with the
  tokenizer's pad id, or truncated, so that every stored sequence holds
  exactly ``n_ctx`` ids.

  Args:
    path: UTF-8 text file with one pair per line.
    n_ctx: Length of every stored sequence.

  Raises:
    ValueError: If a non-blank line does not contain the four-space separator.
  """

  def __init__(self,
               path = "drive/MyDrive/Colab Notebooks/KoBERT_Wellness/dataset/wellness_autoregressive.txt",
               n_ctx = 1024
               ):
    self.path = path
    self.data = []
    self.tokenizer = get_kogpt2_tokenizer()

    bos = [self.tokenizer.bos_token_id]
    eos = [self.tokenizer.eos_token_id]
    pad = [self.tokenizer.pad_token_id]

    # The context manager closes the file even if encoding a line raises.
    with open(self.path, 'r', encoding='utf-8') as f:
      for line_no, line in enumerate(f, start=1):
        # Remove only the line terminator. Slicing off the last character
        # unconditionally would eat real text when the final line of the file
        # has no trailing newline.
        line = line.rstrip("\n")
        if not line.strip():
          continue  # tolerate blank lines instead of failing on them

        datas = line.split("    ")
        if len(datas) < 2:
          raise ValueError(
              f"{self.path}:{line_no}: expected two fields separated by four spaces"
          )

        index_of_words = (
            bos + self.tokenizer.encode(datas[0]) + eos
            + bos + self.tokenizer.encode(datas[1]) + eos
        )
        # Clip over-long pairs; otherwise the sample would be longer than
        # n_ctx and samples could no longer be batched at a common length.
        index_of_words = index_of_words[:n_ctx]
        index_of_words += pad * (n_ctx - len(index_of_words))
        self.data.append(index_of_words)

  def __len__(self):
    """Return the number of encoded pairs."""
    return len(self.data)

  def __getitem__(self, index):
    """Return the ``n_ctx``-long list of token ids stored at ``index``."""
    return self.data[index]

In [6]:
def get_kogpt2_config():
  """Return a GPT2Config built from the hyper-parameters hard-coded below."""
  return GPT2Config.from_dict(dict(
      initializer_range=0.02,
      layer_norm_epsilon=1e-05,
      n_ctx=1024,
      n_embd=768,
      n_head=12,
      n_layer=12,
      n_positions=1024,
      vocab_size=50000,
      activation_function="gelu",
  ))

In [7]:
class KoGPT2Dialogue(nn.Module):
  """nn.Module wrapper that delegates to the model from get_kogpt2_model()."""

  def __init__(self):
    super().__init__()
    self.kogpt2 = get_kogpt2_model()

  def generate(self,
               input_ids,
               do_sample=True,
               max_length=60,
               top_p=0.92,
               top_k=50,
               temperature=0.6,
               no_repeat_ngram_size=None,
               num_return_sequences=3,
               early_stopping=False,
               ):
    """Forward ``input_ids`` and the decoding options to the wrapped model's ``generate``.

    All options are passed through unchanged as keyword arguments; the
    wrapped model's return value is returned as-is.
    """
    decoding_options = dict(
        do_sample=do_sample,
        max_length=max_length,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        no_repeat_ngram_size=no_repeat_ngram_size,
        num_return_sequences=num_return_sequences,
        early_stopping=early_stopping,
    )
    return self.kogpt2.generate(input_ids, **decoding_options)

  def forward(self, input, labels = None):
    """Call the wrapped model, passing ``labels`` only when it is provided."""
    if labels is None:
      return self.kogpt2(input)
    return self.kogpt2(input, labels=labels)

In [8]:
# --- Checkpoint location ---------------------------------------------------
checkpoint_path ="drive/My Drive/Colab Notebooks/checkpoint"
save_ckpt_path = f"{checkpoint_path}/kobert-wellnesee-classification.pth"
# torch.save does not create missing directories; create it up front so the
# first checkpoint write cannot fail part-way through training.
os.makedirs(checkpoint_path, exist_ok=True)

# --- Hyper-parameters ------------------------------------------------------
lr = 5e-5
epochs = 5
batch_size = 8
save_step = 100  # write a checkpoint every `save_step` batches
ctx = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(ctx)

# --- Data ------------------------------------------------------------------
dataset= WellnessAutoRegressiveDataset()
train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

# --- Model -----------------------------------------------------------------
model = KoGPT2Dialogue()
model.to(device)

# Padding positions must not contribute to the loss. Use the very pad id the
# dataset pads with instead of a hard-coded number so the two cannot drift.
loss_function = torch.nn.CrossEntropyLoss(ignore_index=dataset.tokenizer.pad_token_id)

# --- Optimizer (no learning-rate schedule is created here) -------------------
# Biases and LayerNorm weights are excluded from weight decay.
# NOTE(review): the 'ln_*' patterns assume the wrapped model uses Hugging Face
# GPT-2 parameter names (ln_1 / ln_2 / ln_f) — confirm with
# model.named_parameters(). Patterns that match nothing are harmless.
no_decay = ['bias', 'LayerNorm.weight', 'ln_1.weight', 'ln_2.weight', 'ln_f.weight']
optimizer_grouped_parameters = [
  {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
  {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]
# Pass the parameter groups (not model.parameters()) so the per-group
# weight_decay settings above actually take effect.
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=lr)

Downloading:   0%|          | 0.00/577 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/510M [00:00<?, ?B/s]

In [None]:
# Fine-tuning loop: next-token prediction over the padded sequences, with
# periodic and end-of-epoch checkpoints written to `save_ckpt_path`.
losses = []  # per-batch loss values across all epochs (drives the running mean)
for epoch in range(epochs):
    # NOTE(review): pretrained loaders commonly hand back the model in eval
    # mode (dropout disabled) — presumably true for get_kogpt2_model() as well.
    # Switch to train mode explicitly so fine-tuning does not depend on that.
    model.train()
    num_batches = len(train_loader)
    # The notebook imports the tqdm *module* (`import tqdm as tqdm`), so the
    # progress-bar class has to be referenced as `tqdm.tqdm`.
    with tqdm.tqdm(total=num_batches, desc=f"Train({epoch})") as pbar:
        for step, batch in enumerate(train_loader):
            optimizer.zero_grad()
            # The dataset yields plain Python lists, so the loader delivers a
            # list of tensors; stack them and swap the first two axes so the
            # batch dimension comes first.
            input_ids = torch.stack(batch).transpose(1, 0).to(device)

            # The loss is computed below with padding ignored, so no labels
            # are passed: the model's own (unmasked) loss would be discarded.
            logits = model(input_ids)[0]

            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = input_ids[..., 1:].contiguous()

            loss = loss_function(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
            loss.backward()
            optimizer.step()

            losses.append(loss.item())

            # Checkpoint every `save_step` batches and always on the last batch
            # of the epoch, so end-of-epoch weights are kept even when the
            # dataset size is an exact multiple of the batch size.
            is_last_batch = step == num_batches - 1
            if (step > 0 and step % save_step == 0) or is_last_batch:
                torch.save({
                    'epoch': epoch,
                    'train_no': step,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': loss
                }, save_ckpt_path)
            pbar.update(1)
            pbar.set_postfix_str(f"Loss: {loss.item():.3f} ({np.mean(losses):.3f})")