In [1]:
# Pretrained model identifiers: `model_name` is also what the tokenizer is
# loaded from; `rm_model_name` is the base of the reward-model checkpoint.
model_name = "skt/ko-gpt-trinity-1.2B-v0.5"
rm_model_name = "skt/kogpt2-base-v2"

# Directory-safe variants of the ids ("/" becomes "__"); these are interpolated
# into the checkpoint paths under model/ further down.
save_dir = "__".join(model_name.split("/"))
rm_save_dir = "__".join(rm_model_name.split("/"))

### PPO 데이터셋 확인

In [2]:
import json

# Location of the PPO prompt dataset.
data_path_3_PPO = '/aiffel/KoChatGPT/data_kochatgpt/kochatgpt_3_PPO.jsonl'

# 'utf-8-sig' drops a leading byte-order mark if the file has one.
# The whole file is parsed as ONE JSON document (not line by line), even though
# the extension is .jsonl.
with open(data_path_3_PPO, mode="r", encoding='utf-8-sig') as fp:
    list_data_dict = json.loads(fp.read())

print(len(list_data_dict))  # 12000
# Peek at the first three records; each one is a dict with a 'prompt' key.
list_data_dict[:3]

12000


[{'prompt': '번디는 자신이 탐정잡지, 범죄소설 그리고 성범죄 관련 실제 범죄 다큐멘터리들을 탐독했다고 누구에게 말했나?'},
 {'prompt': '개포주공아파트는 몇 단지로 이루어져 있나?'},
 {'prompt': '김영삼의 후보 시절 지역표심을 겨냥한 발언을 문제삼은 후보는?'}]

### 필요한 라이브러리 추가

In [3]:
from copy import deepcopy

import torch
from torch.optim import Adam
from chatgpt.models.base import RewardModel
from chatgpt.models.gpt import GPTActor, GPTCritic
from chatgpt.trainer import PPOTrainer
from chatgpt.trainer.strategies import NaiveStrategy
from transformers import AutoTokenizer, Adafactor

In [4]:
# Display the installed PyTorch version (the recorded run shows '1.12.1').
torch.__version__

'1.12.1'

### 모델 학습에 사용할 옵티마이저와 모델을 준비

In [5]:
# Everything below is created inside the strategy's model-initialisation context.
with NaiveStrategy().model_init_context():
    # Actor: loaded from the SFT checkpoint directory, then moved to the current GPU.
    actor = GPTActor(
        pretrained=f"model/{save_dir}/output_1_SFT-e3",
        lora_rank=0,
    ).to(torch.cuda.current_device())

    # Critic: loaded from the reward-model checkpoint directory.
    critic = GPTCritic(
        pretrained=f"model/{rm_save_dir}/output_2_RM",
        lora_rank=0,
    ).to(torch.cuda.current_device())

    # All four special tokens (bos/eos/unk/pad) are mapped to the same '</s>' string.
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        padding_side="right",
        model_max_length=128,
        **dict.fromkeys(("bos_token", "eos_token", "unk_token", "pad_token"), '</s>'),
    )

    # Independent copy of the actor taken before any PPO update
    # (presumably the frozen reference policy — confirm against PPOTrainer).
    initial_model = deepcopy(actor)

    # Reward model assembled from copies of the critic's backbone and value head,
    # so later updates to the critic do not touch it.
    reward_model = RewardModel(
        deepcopy(critic.model),
        deepcopy(critic.value_head),
    ).to(torch.cuda.current_device())

In [6]:
# One Adafactor optimizer per trainable network, created in actor -> critic order.
# relative_step=False is passed together with an explicit lr of 5e-6.
actor_optim, critic_optim = (
    Adafactor(network.parameters(), lr=5e-6, relative_step=False)
    for network in (actor, critic)
)

# Hand every model (and the optimizer paired with it) to the strategy and rebind
# the notebook names to whatever `prepare` returns.
(
    (actor, actor_optim),
    (critic, critic_optim),
    reward_model,
    initial_model,
) = NaiveStrategy().prepare(
    (actor, actor_optim),
    (critic, critic_optim),
    reward_model,
    initial_model,
)

### PPO 학습에 쓸 데이터를 불러와 토크나이징

In [7]:
# Reload the PPO dataset and keep only the prompt strings for training.
# The path comes from `data_path_3_PPO` (set in the dataset-inspection cell)
# so the file location is declared in exactly one place.
with open(data_path_3_PPO, "r", encoding='utf-8-sig') as json_file:
    list_data_dict = json.load(json_file)

# Built after the file is closed; it only needs the parsed records.
list_prompt = [record['prompt'] for record in list_data_dict]

def tokenize_fn(texts):
    """Tokenize `texts` with the notebook-level `tokenizer` and move the result to the GPU.

    The tokenizer is called with ``return_tensors='pt'``, ``max_length=96``,
    ``padding=True`` and ``truncation=True``.

    Args:
        texts: Whatever the tokenizer accepts as input (the notebook passes a
            single string here).

    Returns:
        A plain dict with the same keys as the tokenizer output, where each
        value is the result of calling ``.cuda()`` on the original value.
    """
    encoded = tokenizer(
        texts,
        return_tensors='pt',
        max_length=96,
        padding=True,
        truncation=True,
    )
    on_gpu = {}
    for field, tensor in encoded.items():
        on_gpu[field] = tensor.cuda()
    return on_gpu

# Smoke test: encode one sample sentence and print the resulting tensors.
sample_encoding = tokenize_fn('It takes something more than intelligence to act intelligently.')
print(sample_encoding)

# Number of training prompts (shown as the cell's output).
len(list_prompt)

{'input_ids': tensor([[46390, 31369, 33712, 30541, 31338, 41607, 30586, 31024, 31482, 37404,
         31035, 30316, 32131,   460, 34763, 32017, 37762, 33441,   565, 37205,
         32131,   460, 34763, 32017, 31561, 36271,   390]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1]], device='cuda:0')}


12000

### PPO Trainer 선언

In [8]:
trainer = PPOTrainer(
    NaiveStrategy(),
    # Models, in the positional order the trainer is called with here.
    actor,
    critic,
    reward_model,
    initial_model,
    # Optimizers for the two trainable networks.
    actor_optim,
    critic_optim,
    # Training-loop settings.
    max_epochs=1,
    train_batch_size=4,
    # Note: the "tokenizer" argument receives the tokenize_fn callable,
    # not the Hugging Face tokenizer object.
    tokenizer=tokenize_fn,
    # Remaining keyword arguments: sampling / generation settings.
    **dict(
        max_length=128,
        do_sample=True,
        temperature=1.0,
        top_k=50,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    ),
)

### PPO 학습 진행

In [None]:
# Schedule passed to PPOTrainer.fit as keyword arguments.
fit_schedule = {
    "num_episodes": 10,
    "max_timesteps": 3,
    "update_timesteps": 3,
}
trainer.fit(list_prompt, **fit_schedule)

# Persist the trained actor's underlying model next to the earlier checkpoints.
ppo_output_dir = f'./model/{save_dir}/output_3_PPO'
actor.model.save_pretrained(ppo_output_dir)

Episode [1/10]:   0%|          | 0/3 [00:00<?, ?it/s]

### RLHF가 적용된 custom ChatGPT의 생성 능력을 확인

In [None]:
def generation(input_text):
    """Sample one continuation of `input_text` from `actor`, print it and return it.

    The prompt is encoded with the notebook-level `tokenizer`, moved to the
    current CUDA device and passed to ``actor.generate`` with sampling enabled
    (``top_k=50``, ``top_p=0.95``, ``max_length=250``, one returned sequence).

    Args:
        input_text: Prompt string to continue.

    Returns:
        The first decoded string, with special tokens stripped.
    """
    encoded_prompt = tokenizer.encode(input_text, return_tensors='pt')
    input_ids = encoded_prompt.to(torch.cuda.current_device())

    sampling_options = dict(
        max_length=250,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        num_return_sequences=1,
    )
    outputs = actor.generate(input_ids, **sampling_options)

    # NOTE(review): outputs[0] is what gets decoded, so `actor.generate` presumably
    # returns a tuple whose first item holds the token-id sequences — confirm.
    decoded_batch = tokenizer.batch_decode(outputs[0], skip_special_tokens=True)
    output = decoded_batch[0]

    print()
    print(output)
    return output

# Instruction/response template that each raw question is wrapped in.
PROMPT_DICT = {
    "prompt_input": (
        "### Instruction(명령어):\n{prompt}\n\n### Response(응답):"
    )
}

# The demo questions get their own names instead of reusing `list_prompt`.
# `list_prompt` holds the PPO training prompts that `trainer.fit` consumes;
# overwriting it here would make a later re-run of the training cell silently
# train on these four templated demo prompts.
eval_questions = [
    '불고기용 고기 한우에요?', 
    '리처드 닉슨이 43대 부통령직을 수행한 년도는?', 
    '시카고 오헤어 국제공항은 어디에 있어',
    '오늘 미세먼지 어때?']

eval_prompts = [
    PROMPT_DICT['prompt_input'].format_map({'prompt': question})
    for question in eval_questions
]

# `generation` prints each completion; the last one is also left in `output`.
for input_text in eval_prompts:
    output = generation(input_text)

#### 메모리 관리를 위해 캐시를 비우기

In [None]:
# Ask PyTorch's CUDA caching allocator to release the cached GPU memory it is
# not currently using.
# NOTE(review): tensors that are still referenced (e.g. the models created
# earlier in this notebook) presumably stay allocated — confirm if more memory
# has to be reclaimed.
torch.cuda.empty_cache()