In [1]:
# Mount Google Drive so the notebook can read and write files under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive/data/project03/gpt

/content/drive/MyDrive/data/project03/gpt


In [3]:
!pip install transformers
!pip install datasets

Collecting transformers
  Downloading transformers-4.12.3-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 3.5 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.0-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 6.3 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 54.9 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 47.4 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 43.2 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
 

In [4]:
import pandas as pd
import numpy as np

In [5]:
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
from transformers import PreTrainedTokenizerFast

def load_dataset(file_path, tokenizer, block_size=128):
    """Build a `TextDataset` over the text file at `file_path`.

    Args:
        file_path: Path of the text file handed to `TextDataset`.
        tokenizer: Tokenizer handed to `TextDataset`.
        block_size: Forwarded as `TextDataset`'s `block_size` (default 128).

    Returns:
        The constructed `TextDataset` instance.
    """
    dataset_kwargs = {
        'tokenizer': tokenizer,
        'file_path': file_path,
        'block_size': block_size,
    }
    return TextDataset(**dataset_kwargs)

def load_data_collator(tokenizer, mlm=False):
    """Create the `DataCollatorForLanguageModeling` used to batch examples.

    Args:
        tokenizer: Tokenizer handed to the collator.
        mlm: Forwarded as the collator's `mlm` flag (default False).

    Returns:
        The constructed `DataCollatorForLanguageModeling` instance.
    """
    return DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=mlm)

def train(train_file_path, model_name, output_dir, overwrite_output_dir,
          per_device_train_batch_size, num_train_epochs, save_steps):
    """Fine-tune a pretrained GPT-2 checkpoint on a text file and save the result.

    Args:
        train_file_path: Text file passed to `load_dataset` as training data.
        model_name: Checkpoint name or path given to both
            `PreTrainedTokenizerFast.from_pretrained` and
            `GPT2LMHeadModel.from_pretrained`.
        output_dir: Directory that receives the tokenizer files, the model
            weights and the `Trainer` output.
        overwrite_output_dir: Forwarded to `TrainingArguments`.
        per_device_train_batch_size: Forwarded to `TrainingArguments` as the
            per-device training batch size.
        num_train_epochs: Forwarded to `TrainingArguments`.
        save_steps: Forwarded to `TrainingArguments` as the checkpoint
            interval.

    Returns:
        None. The tokenizer and model are written to `output_dir`.
    """
    tokenizer = PreTrainedTokenizerFast.from_pretrained(model_name,
                bos_token = '</s>', eos_token = '</s>', unk_token = '<unk>',
                pad_token = '<pad>', mask_token = '<mask>')
    train_dataset = load_dataset(train_file_path, tokenizer)
    data_collator = load_data_collator(tokenizer)

    # Store the tokenizer next to the model so output_dir can be loaded on its own.
    tokenizer.save_pretrained(output_dir, legacy_format = False)
    model = GPT2LMHeadModel.from_pretrained(model_name)
    # Writes the not-yet-trained weights; trainer.save_model() below replaces them.
    model.save_pretrained(output_dir)

    training_args = TrainingArguments(
        output_dir = output_dir,
        overwrite_output_dir = overwrite_output_dir,
        # Must be the *train* batch size: passing it as the eval batch size
        # leaves training at the library default regardless of the argument.
        per_device_train_batch_size = per_device_train_batch_size,
        num_train_epochs = num_train_epochs,
        # Previously accepted by this function but never forwarded.
        save_steps = save_steps,
    )

    trainer = Trainer(
        model = model,
        args = training_args,
        data_collator = data_collator,
        train_dataset = train_dataset,
    )

    trainer.train()
    trainer.save_model()

# --- Fine-tuning configuration ---
# Where the training text lives and where the results are written.
train_file_path = '/content/drive/MyDrive/data/project03/gpt/명언/동화.txt'
output_dir = '/content/drive/MyDrive/data/project03/gpt/results'
overwrite_output_dir = False

# Base checkpoint to fine-tune.
model_name = 'skt/kogpt2-base-v2'

# Training schedule.
num_train_epochs = 5.0
per_device_train_batch_size = 8
save_steps = 500

# Run fine-tuning with the settings above.
train(
    train_file_path=train_file_path,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    save_steps=save_steps,
)


Downloading:   0%|          | 0.00/2.69M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.98k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


Downloading:   0%|          | 0.00/490M [00:00<?, ?B/s]

***** Running training *****
  Num examples = 264
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 165


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to /content/drive/MyDrive/data/project03/gpt/results
Configuration saved in /content/drive/MyDrive/data/project03/gpt/results/config.json
Model weights saved in /content/drive/MyDrive/data/project03/gpt/results/pytorch_model.bin


In [7]:
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel

def load_model(model_path):
  """Return the `GPT2LMHeadModel` loaded from `model_path`."""
  return GPT2LMHeadModel.from_pretrained(model_path)

def load_tokenizer(tokenizer_path):
  """Return the `PreTrainedTokenizerFast` loaded from `tokenizer_path`."""
  return PreTrainedTokenizerFast.from_pretrained(tokenizer_path)

def generate_text(sequence, max_lenth):
  """Sample a continuation of `sequence` from the fine-tuned model and print it.

  Args:
    sequence: Prompt text. A trailing comma is appended before encoding.
    max_lenth: Forwarded to `model.generate` as `max_length`. The parameter
      keeps its original (misspelled) name so existing callers still work.

  Returns:
    None. The decoded text is printed.
  """
  # Read the function's own argument; relying on a notebook-level `max_length`
  # variable fails on a fresh kernel and ignores the value passed in.
  max_length = max_lenth
  model_path = '/content/drive/MyDrive/data/project03/gpt/results'
  model = load_model(model_path)
  # The tokenizer was saved into the same directory as the model weights.
  tokenizer = load_tokenizer(model_path)
  ids = tokenizer.encode(f'{sequence},', return_tensors = 'pt')
  final_outputs = model.generate(
      ids,
      do_sample = True,
      max_length = max_length,
      pad_token_id = model.config.pad_token_id,
      # Was misspelled `tok_k`, so the top-k setting never took effect.
      top_k = 50,
      top_p = 0.95
  )
  print(tokenizer.decode(final_outputs[0], skip_special_tokens = True))

# Prompt keywords the fine-tuned model should continue from.
# Deliberately not stored in a variable named `input`: that would shadow the
# builtin `input()` for every later cell in this kernel.
sequence = '할머니 오리 즐거움'
max_length = 128
print('input :' + sequence)
generate_text(sequence, max_length)

loading configuration file /content/drive/MyDrive/data/project03/gpt/results/config.json
Model config GPT2Config {
  "_name_or_path": "skt/kogpt2-base-v2",
  "_num_labels": 1,
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "author": "Heewon Jeon(madjakarta@gmail.com)",
  "bos_token_id": 0,
  "created_date": "2021-04-28",
  "embd_pdrop": 0.1,
  "eos_token_id": 1,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_epsilon": 1e-05,
  "license": "CC-BY-NC-SA 4.0",
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": 3,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_la

input :할머니 오리 즐거움


All model checkpoint weights were used when initializing GPT2LMHeadModel.

All the weights of GPT2LMHeadModel were initialized from the model checkpoint at /content/drive/MyDrive/data/project03/gpt/results.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.
Didn't find file /content/drive/MyDrive/data/project03/gpt/results/added_tokens.json. We won't load it.
loading file None
loading file /content/drive/MyDrive/data/project03/gpt/results/special_tokens_map.json
loading file /content/drive/MyDrive/data/project03/gpt/results/tokenizer_config.json
loading file /content/drive/MyDrive/data/project03/gpt/results/tokenizer.json


할머니 오리 즐거움, 이 세상의 모든 것 즐거움도 사라졌다. 할머니가 아주 오랫동안 보아 온 모습이었다.
“저런, 아름다운 이 세상 이야기 좀 해줘요.”
이윽고 오리들은 서둘러 날아갔다.
이렇게 해서 인어들은 처음으로 세상에 왔다. 오리들은 이제껏 보지 못했던 이 세상의 모습이었다. 하지만 이제껏 보지 못했던 놀라운 세상이 펼쳐졌다. 인어들은 그렇게나 아름다운 그 이상한 모습들을 볼 수 있어서 무척 좋았다. 이제 이 세상 사람들과 더불어 사는 것이 얼마나 지겨운지 깨닫게 되었다. 
오리들은 자기들 집보다 훨씬 더 큰 땅을 갖고 있었다. 사람들은 그 넓은 집에 살고 있었는데, 그 집은 무척이나 따뜻하고 차분하고 사랑스러운
