# Train skt Kogpt trinity with Ahje set

## 1. Load Model

In [1]:
#!pip install ipywidgets

In [110]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
)

In [111]:
model_name = "skt/ko-gpt-trinity-1.2B-v0.5"

In [112]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
#model

## 2. Load & Prepare Dataset

In [113]:
import pandas as pd
from pandas import read_json
from datasets import Dataset

In [114]:
train_path = "./data/train.json"
valid_path = "./data/valid.json"

In [115]:
with open(train_path, 'r') as f:
    raw_train_data = read_json(f, orient='id')
with open(valid_path, 'r') as f:
    raw_valid_data = read_json(f, orient='id')

In [116]:
train_data = pd.DataFrame({"question": [q['question'] for q in raw_train_data['qna']], "gag" : [q['gag'] for q in raw_train_data['qna']]})
valid_data = pd.DataFrame({"question": [q['question'] for q in raw_valid_data['qna']], "gag" : [q['gag'] for q in raw_valid_data['qna']]})
train_dataset = Dataset.from_pandas(train_data)
valid_dataset = Dataset.from_pandas(valid_data)

In [118]:
tokenizer.model_max_length = 100

In [140]:
def tokenize_function(examples):
        result = tokenizer(examples['question'], padding="max_length")
        label_result = tokenizer(examples['gag'], padding="max_length")
        result.update({'labels': label_result['input_ids']})
        return result

In [142]:
tokenized_train_datasets = train_dataset.map(
            tokenize_function,
            batched=True,
            num_proc=4,
            remove_columns=["question", "gag"],
        )
tokenized_valid_datasets = valid_dataset.map(
            tokenize_function,
            batched=True,
            num_proc=4,
            remove_columns=["question", "gag"],
        )

## 3. Train

In [130]:
from transformers import (
    Trainer,
    TrainingArguments,
)

In [131]:
training_args = TrainingArguments(
    "./outputs/",
    evaluation_strategy="epoch",
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    save_strategy='epoch',
    fp16=True,
    fp16_opt_level='O1',
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [132]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_datasets,
    eval_dataset=tokenized_valid_datasets,
)

Using amp fp16 backend


In [133]:
trainer.train()

***** Running training *****
  Num examples = 2141
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1340


ValueError: expected sequence of length 5 at dim 1 (got 14)

In [18]:
# import torch
# torch.cuda.empty_cache()

## 4. Inference

In [144]:
from transformers import AutoModelForCausalLM, AutoTokenizer

In [145]:
trained_model_dir = "./outputs/checkpoint-500"
model_name = "skt/ko-gpt-trinity-1.2B-v0.5"

In [146]:
trained_model = AutoModelForCausalLM.from_pretrained(trained_model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_name)

loading configuration file ./outputs/checkpoint-500/config.json
Model config GPT2Config {
  "_name_or_path": "skt/ko-gpt-trinity-1.2B-v0.5",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 0,
  "embd_pdrop": 0.1,
  "eos_token_id": 8,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1920,
  "n_head": 16,
  "n_inner": 7680,
  "n_layer": 24,
  "n_positions": 1024,
  "pad_token_id": 8,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "torch_dtype": "float32",
  "transformers_version": "4.12.5",
  "use_cache": true,
  "vocab_size": 51200
}

loading weights file ./outputs/checkpoint-5

In [147]:
from pandas import read_json
import pandas as pd

valid_path = "./data/valid.json"
with open(valid_path, 'r') as f:
    raw_valid_data = read_json(f, orient='id')
valid_data = pd.DataFrame({"question": [q['question'] for q in raw_valid_data['qna']], "gag" : [q['gag'] for q in raw_valid_data['qna']]})

In [148]:
import random

r_idx = random.sample(range(len(valid_data["question"])), 20)
check = {'question':[], 'model':[], 'ans':[]}

In [149]:
for i in r_idx:
    check['question'].append(valid_data.iloc[i]['question'])
    check['ans'].append(valid_data.iloc[i]['gag'])
    model_output = trained_model.generate(
        tokenizer(valid_data.iloc[i]['question'], return_tensors='pt').input_ids,
        max_length=15,
        do_sample=True,
        top_p=0.95,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        bos_token_id=tokenizer.bos_token_id,
        bad_words_ids=[[30080], [3]],
        early_stopping=True,
        use_cache=True,
    )
    model_ans = tokenizer.decode(model_output[0])
    print(model_ans)
    check['model'].append(model_ans)

신이 가장 싫어하는 빵은?) 팔노다라~동짜 때 팔 때보)니까))기해라)도짜다다
고래는 포유류 개구리는 파충류, 그럼 오징어는? 때문이 때문?~짜') 때 팔~, ))
못생긴 여자가 얼굴에 오이 마사지를 하는 것을 뭐라고 하나? 때문자지 팔~다)보?이짜다이~라짜보
아동학대범이 중고장터에 가서 하는 말은?동노 때문짜약)기보)~) 일해서라 개 때문다)니까
꼴등을 추월하면 몇 등?네 팔기 일라라) 일리 때다서'~)서다라? 말?라
변호사가 더러운 이유? )니까 팔 때)기다니까,짜라! 개다~?)다 팔니까짜다니까 개
오징어를 땅에 묻으면?보해라 없으니까 팔~ 동자니까)니까이?라짜 팔래 동이짜는지다다
불쌍한 사람들이 타고 다니는 자동차는?다,일! 일)라서노라짜해서)~? 때보)해서?)'짜
열역학 나라의 신하 a, b, c 중 반란을 일으킨 사람은?)?기 때문서기) 열보)해서))다노
가면을 벗으면 얼굴이 두 개인 것은?이노~노~ 때문다?))~서니까다 일'))라서))
절대 말을 안하는 형제는?)?다짜해)?! 말다?해서)는))보해짜짜!)는
사자마자 후회하는 의자는? 팔)~도~)보니까다 때)해서 일서 때)약~해서짜)해서)
은하 철도는 왜 999인가?비니(~~이?! 때문 때문이라서~ 팔)짜니까~)~~'니까는
와이프가 배탈이 나면?보해서))라)이 때문?)~)해서!기 때짜)짜다~도)
금이 울면?)다다)) 때문 때문라이!금 금은 때문해서짜노~)보해)해서도이
낭떠러지 나무에 매달려 있는 사람이 싸는 똥은?보해서)해서~이기서라다라짜노자보
깨끗한 친구를 사귀려면 어디로 가야 할까?'해서~도지)지~이짜)다))도짜~자 때문이라서니까다
사람들이 가장 싫어하는 색은?)이) 때문) 때문?다?다보해기~는지이약)해 때문노기서
건배를 영어로 원샷, 불어로는?,)이다? 일 때지)다?,노보자는이짜 일니까 때문
안나가 엘사보다 노래를 잘 부르게 되는 이유는?)보라다)기)짜라는 때문지이다라)니까 나
