# Prompt generation with the PPO algorithm

- `PPO`: Proximal policy optimization
- 강화학습 기반 시스템 프롬프트 생성

In [None]:
# Hyperparameter and path settings
# Root directory on the mounted Google Drive; join_path() resolves names against it.
BASE_DIR = "/content/drive/MyDrive/강화학습"
# Sub-directories of BASE_DIR holding the reward-model and policy-model files.
REWARD_MODEL_DIR = "llama3"
POLICY_MODEL_DIR = "exaone"
# Sub-directory of BASE_DIR where the trained policy and its tokenizer are saved.
TRAINED_OUTPUT = "ppo_exaone"
# Training CSV and the CSV that receives the sampled evaluation results.
TRAIN_DATA = "train.csv"
SAMPLE_RESULT_CSV = "result_prompt.csv"
# Seed applied to the torch CPU/CUDA generators.
RANDOM_SEED = 42
# Values forwarded to PPOConfig.
LEARNING_RATE = 1e-5
# Number of passes over the training data.
MAX_ITER = 10
# Rollouts collected before each PPO step.
BATCH_SIZE = 8
GRADIENT_STEP = 1
# Interval (in dataframe index units) for printing the average reward;
# also passed to PPOConfig as save_steps.
LOG_STEP = 100
# Truncation length used when tokenizing inputs for both models.
TOKENIZER_MAX_LENGTH = 1024
# Generation settings for the policy model.
POLICY_MAX_NEW_TOKENS = 1024
POLICY_DO_SAMPLE = True
POLICY_TEMPERATURE = 0.1
# Generation settings for the reward model.
REWARD_MAX_NEW_TOKENS = 32
REWARD_DO_SAMPLE = False
REWARD_TEMPERATURE = 0.1

In [None]:
import os

# CUDA debugging settings; applied before torch is imported in the next cell.
os.environ.update(
    {
        "CUDA_LAUNCH_BLOCKING": "1",
        "CUDA_VISIBLE_DEVICES": "0",
    }
)

In [None]:
import torch

# Stop immediately when no CUDA device is available.
assert torch.cuda.is_available(), "GPU를 사용하세요!"
# Device string handed to model loading and tensor placement in later cells.
device = "cuda"

In [None]:
# Install the runtime dependencies quietly; only fsspec is version-pinned.
# NOTE(review): trl is unpinned although the training cells call the legacy
# PPOTrainer.step API -- confirm the intended trl version and consider pinning it.
!pip install -qq \
  fsspec==2025.3.2 \
  accelerate bitsandbytes \
  transformers trl

In [None]:
import gc
import ast
import random

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)
import pandas as pd
from trl import PPOConfig, PPOTrainer, AutoModelForCausalLMWithValueHead
from google.colab import drive

# Mount Google Drive (without forcing a remount) so paths under BASE_DIR resolve.
drive.mount("/content/drive", force_remount=False)


def join_path(*args):
    """Return the path formed by joining ``BASE_DIR`` with the given components."""
    components = (BASE_DIR,) + args
    return os.path.join(*components)

In [None]:
# CUDA optimisation flags.
torch.backends.cudnn.benchmark = True
# Older torch builds may not expose the TF32 switch, hence the attribute checks.
if hasattr(torch.backends.cuda, "matmul") and hasattr(
    torch.backends.cuda.matmul, "allow_tf32"
):
    torch.backends.cuda.matmul.allow_tf32 = True

# Fix the random seeds.
# Python's `random` module is seeded as well because random_sample() draws
# row indices with random.randint; without this the sampled evaluation rows
# differ from run to run.
random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
torch.cuda.manual_seed_all(RANDOM_SEED)

## 프롬프트 생성

In [None]:
def generate_policy_prompt() -> list:
    """Build the chat messages that ask the policy model (EXAONE) for a system prompt.

    Returns:
        A two-element list of ``{"role": ..., "content": ...}`` dicts: the
        system message first, then the user message listing the constraints.
    """
    system_message = "당신은 LLM이 주어진 질문과 선택지를 감정 없이 중립적이고 논리적으로 판단할 수 있도록 유도하는 시스템 프롬프트를 작성하는 전문가입니다."
    # One entry per output line; joined with newlines below.
    condition_lines = [
        "조건:",
        "- 프롬프트는 한 문장이어야 하며, 시스템 메시지로 사용될 수 있어야 합니다.",
        "- 편향, 감정 표현 없이 다양한 관점에서 사고를 유도해야 합니다.",
        "- [맥락], [질문], [선택지]가 사용자 입력으로 주어질 예정이므로, 이를 고려해 일반화된 형태로 작성하세요.",
        "- 마크다운 형식을 사용하지 마세요.",
        "- 프롬프트만 출력하고, 그 외 설명은 하지 마세요.",
    ]
    user_message = "\n".join(condition_lines)
    turns = (("system", system_message), ("user", user_message))
    return [{"role": role, "content": content} for role, content in turns]


def generate_reward_prompt(instruction, context, question, choices) -> str:
    """Render the Llama 3 style prompt that is sent to the reward model.

    Args:
        instruction: Text placed in the system turn (the generated system prompt).
        context: Context passage; surrounding whitespace is stripped.
        question: Question text; surrounding whitespace is stripped.
        choices: Iterable of answer options, numbered from 1 in the prompt.

    Returns:
        The full prompt string, ending with the assistant header.
    """
    numbered_choices = []
    for number, choice in enumerate(choices, start=1):
        numbered_choices.append(f"{number}. {choice}")

    template = (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>"
        "{instruction}<|eot_id|>\n"
        "<|start_header_id|>user<|end_header_id|>\n"
        "맥락: {context}\n"
        "질문: {question}\n"
        "선택지:\n"
        "{choices}\n"
        "최종 답변은 설명 없이 1, 2, 3 중 하나로만 작성하시오.\n"
        "최종 답변:<|eot_id|>\n"
        "<|start_header_id|>assistant<|end_header_id|>"
    )
    return template.format(
        instruction=instruction,
        context=context.strip(),
        question=question.strip(),
        choices="\n".join(numbered_choices),
    )


def extract_last_choice(raw_answer, choices) -> str:
    """Map the reward model's raw reply onto one of ``choices``.

    The first decimal digit found in ``raw_answer`` is read as a 1-based
    choice number. When it names an existing choice, that choice's text is
    returned; otherwise the reply is returned with surrounding whitespace
    stripped and newlines removed.

    Args:
        raw_answer: Decoded text produced by the reward model.
        choices: Sequence of answer options.

    Returns:
        The selected choice text, or the cleaned raw reply when no valid
        choice number can be read from it.
    """
    # Only the first digit is considered. ``isdecimal`` (rather than
    # ``isdigit``) keeps out characters such as "²" that int() cannot parse.
    first_digit = next((char for char in raw_answer if char.isdecimal()), None)

    # A reply without any digit falls through to the cleaned-text fallback.
    if first_digit is not None:
        choice_number = int(first_digit)
        # Bound by the real number of choices so indexing can never overflow.
        if 1 <= choice_number <= len(choices):
            return choices[choice_number - 1]

    return raw_answer.strip().replace("\n", "")

## 모델 로드

In [None]:
# Model loading
# Shared 4-bit quantisation settings for both models.
# NOTE(review): the policy is loaded 4-bit quantised and no PEFT/LoRA adapter
# is visible in this notebook -- confirm which parameters PPO actually updates.
quat_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)
# Keyword arguments common to both from_pretrained() model calls.
_model_load_kwargs = {
    "quantization_config": quat_config,
    "device_map": device,
    "torch_dtype": torch.float16,
}
_policy_path = join_path(POLICY_MODEL_DIR)
_reward_path = join_path(REWARD_MODEL_DIR)

# Policy: LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct
policy_tokenizer = AutoTokenizer.from_pretrained(_policy_path)
# Fall back to EOS for padding when the tokenizer defines no pad token.
if policy_tokenizer.pad_token_id is None:
    policy_tokenizer.pad_token_id = policy_tokenizer.eos_token_id

policy_model = AutoModelForCausalLMWithValueHead.from_pretrained(
    _policy_path,
    trust_remote_code=True,
    **_model_load_kwargs,
)

# Reward model: meta-llama/Llama-3.1-8B-Instruct
reward_tokenizer = AutoTokenizer.from_pretrained(_reward_path, padding_side="left")
if reward_tokenizer.pad_token_id is None:
    reward_tokenizer.pad_token_id = reward_tokenizer.eos_token_id

reward_model = AutoModelForCausalLM.from_pretrained(
    _reward_path,
    **_model_load_kwargs,
)

In [None]:
def generate_prompt_with_policy(messages, policy_model, policy_tokenizer, device):
    """Generate one system prompt (the PPO action) with the policy model.

    Args:
        messages: Chat messages passed to the tokenizer's chat template.
        policy_model: Model whose ``generate`` method produces the output.
        policy_tokenizer: Tokenizer used for templating and decoding.
        device: Device the encoded input is moved to.

    Returns:
        A tuple ``(input_ids, response_id, action_result)``: the encoded
        input, the first sequence returned by ``generate``, and the decoded
        text between the last ``[|assistant|]`` marker and the following
        ``[|endofturn|]`` marker, stripped.
    """
    template_options = {
        "tokenize": True,
        "add_generation_prompt": True,
        "return_tensors": "pt",
        "padding": True,
        "truncation": True,
        "max_length": TOKENIZER_MAX_LENGTH,
    }
    input_ids = policy_tokenizer.apply_chat_template(messages, **template_options)
    input_ids = input_ids.to(device)

    generated = policy_model.generate(
        input_ids,
        max_new_tokens=POLICY_MAX_NEW_TOKENS,
        tokenizer=policy_tokenizer,
        do_sample=POLICY_DO_SAMPLE,
        temperature=POLICY_TEMPERATURE,
        eos_token_id=policy_tokenizer.eos_token_id,
        pad_token_id=policy_tokenizer.pad_token_id,
        use_cache=True,
    )
    response_id = generated[0]

    # Special tokens are kept while decoding so the turn markers can be used
    # to cut out the assistant's reply.
    decoded = policy_tokenizer.decode(response_id)
    after_assistant = decoded.split("[|assistant|]")[-1]
    action_result = after_assistant.split("[|endofturn|]")[0].strip()
    return input_ids, response_id, action_result


@torch.no_grad()
def interact_with_reward_model(
    prompt, choices, true_answer, reward_model, reward_tokenizer, device
):
    """Ask the reward model to answer the question and score the result.

    Args:
        prompt: Full prompt string for the reward model.
        choices: Sequence of answer options used to interpret the reply.
        true_answer: Expected answer text.
        reward_model: Model whose ``generate`` method produces the reply.
        reward_tokenizer: Tokenizer used for encoding and decoding.
        device: Device the encoded prompt is moved to.

    Returns:
        A tuple ``(result_state, reward)``: the interpreted answer and
        ``1.0`` when it equals ``true_answer``, otherwise ``-1.0``.
    """
    encoded = reward_tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=TOKENIZER_MAX_LENGTH,
    ).to(device)

    generation_kwargs = {
        "max_new_tokens": REWARD_MAX_NEW_TOKENS,
        "do_sample": REWARD_DO_SAMPLE,
        "eos_token_id": reward_tokenizer.eos_token_id,
        "pad_token_id": reward_tokenizer.pad_token_id,
        "repetition_penalty": 1.2,
        "use_cache": True,
    }
    # Temperature only applies when sampling. The earlier code passed the
    # boolean REWARD_DO_SAMPLE here instead of REWARD_TEMPERATURE.
    if REWARD_DO_SAMPLE:
        generation_kwargs["temperature"] = REWARD_TEMPERATURE

    output_ids = reward_model.generate(**encoded, **generation_kwargs)[0]

    # The generated sequence starts with the prompt tokens. Drop them by
    # position: removing the prompt by string replacement cannot work because
    # decoding with skip_special_tokens=True strips the special-token text
    # that the prompt string contains.
    prompt_length = encoded["input_ids"].shape[-1]
    reply_ids = output_ids[prompt_length:]
    reply_text = reward_tokenizer.decode(reply_ids, skip_special_tokens=True).strip()

    if any(char.isdigit() for char in reply_text):
        result_state = extract_last_choice(reply_text, choices)
    else:
        # A reply without any digit cannot name a choice; keep the cleaned
        # text so it simply fails the comparison below.
        result_state = reply_text.replace("\n", "")

    # Reward is +1 for the correct answer, -1 otherwise.
    reward = 1.0 if result_state == true_answer else -1.0
    return result_state, reward

## 데이터 로드

In [None]:
def get_train_set(row):
    """Unpack one training record.

    Args:
        row: Mapping with "context", "question", "choices" and "answer"
            entries; "choices" holds a Python-literal string.

    Returns:
        ``(context, question, choices, true_answer)`` with the text fields
        stripped and ``choices`` parsed via ``ast.literal_eval``.
    """
    return (
        row["context"].strip(),
        row["question"].strip(),
        ast.literal_eval(row["choices"]),
        row["answer"].strip(),
    )


# Load the training data. get_train_set() reads the "context", "question",
# "choices" and "answer" columns from each row.
df_train = pd.read_csv(join_path(TRAIN_DATA), encoding="utf-8-sig")

## 학습

In [None]:
class PPOBatch:
    """Fixed-capacity buffer of (query, response, reward) rollouts.

    Args:
        max_batch_size: Maximum number of rollouts the buffer may hold.
        device: Device on which reward tensors are created by ``get_all``.
    """

    def __init__(self, max_batch_size, device="cuda"):
        self.queries = []
        self.responses = []
        self.rewards = []
        self.max_batch_size = max_batch_size
        self.device = device

    @staticmethod
    def _drop_batch_dim(tensor):
        """Remove a leading batch dimension without collapsing 1-D tensors."""
        # squeeze(0) on a one-element 1-D tensor would yield a 0-dim tensor,
        # so only tensors that have a batch dimension are squeezed.
        return tensor.squeeze(0) if tensor.dim() > 1 else tensor

    def clear(self):
        """Remove every stored rollout."""
        self.queries.clear()
        self.responses.clear()
        self.rewards.clear()

    def append(self, query, response, reward):
        """Store one rollout.

        Raises:
            AssertionError: If the buffer already holds ``max_batch_size`` items.
        """
        # The check runs before insertion, so a free slot must still exist.
        # Raised explicitly so the guard also works under ``python -O``.
        if len(self) >= self.max_batch_size:
            raise AssertionError("Batch size limit exceeded.")
        self.queries.append(self._drop_batch_dim(query))
        self.responses.append(self._drop_batch_dim(response))
        self.rewards.append(reward)

    def get_all(self):
        """Return ``(queries, responses, reward_tensors)`` as three lists.

        Rewards are converted to float32 scalar tensors on ``self.device``.
        The query and response lists are copies, so a later ``clear()`` does
        not empty what the caller received.
        """
        reward_tensors = [
            torch.tensor(reward, dtype=torch.float32, device=self.device)
            for reward in self.rewards
        ]
        return list(self.queries), list(self.responses), reward_tensors

    def __len__(self):
        return len(self.queries)

In [None]:
# PPO training configuration.
# PPOConfig and PPOTrainer are known to be the deprecated (v1) API, but v2
# conflicts with the models used here, so v1 is kept.
# NOTE(review): `model_name` receives the model object itself rather than a
# name string -- confirm this is what the installed trl version expects.
ppo_config = PPOConfig(
    model_name=policy_model,
    learning_rate=LEARNING_RATE,
    batch_size=BATCH_SIZE,
    mini_batch_size=1,
    gradient_accumulation_steps=GRADIENT_STEP,
    output_dir=join_path("checkpoint"),
    save_strategy="steps",
    save_steps=LOG_STEP,
    save_total_limit=3,
    log_with=None,
)
# No dataset is attached; the training loop feeds rollouts to the trainer itself.
ppo_trainer = PPOTrainer(
    config=ppo_config,
    model=policy_model,
    tokenizer=policy_tokenizer,
    dataset=None,
)

In [None]:
os.makedirs(join_path("checkpoint"), exist_ok=True)
gc.collect()
torch.cuda.empty_cache()

# PPO training
# Running reward statistics; reset after every progress report.
total_reward = 0
num_total_reward = 0

for epoch in range(1, MAX_ITER + 1):
    # A fresh buffer per epoch: rollouts that did not fill a whole batch by
    # the end of the previous epoch are discarded.
    ppo_batch = PPOBatch(BATCH_SIZE, device)

    for idx, row in df_train.iterrows():
        context, question, choices, true_answer = get_train_set(row)

        # Generate a system prompt (the action) with the policy.
        train_prompt = generate_policy_prompt()
        input_tokens, result_prompt_tokens, result_prompt = generate_prompt_with_policy(
            train_prompt, policy_model, policy_tokenizer, device
        )
        # print("===== Action =====\n", result_prompt)

        # The generated sequence starts with the query tokens, and the PPO
        # step joins query and response itself, so only the continuation is
        # kept as the response. The leading batch dimension mirrors the
        # query's layout, which PPOBatch.append squeezes away.
        prompt_length = input_tokens.shape[-1]
        response_tokens = result_prompt_tokens[prompt_length:].unsqueeze(0)

        # Obtain the reward for the action from the reward model.
        reward_prompt = generate_reward_prompt(
            result_prompt, context, question, choices
        )
        llm_answer, reward = interact_with_reward_model(
            reward_prompt, choices, true_answer, reward_model, reward_tokenizer, device
        )
        # print("===== State =====\n", llm_answer)
        # print("===== Reward =====\n", reward, "\n")

        # Record the rollout.
        total_reward += reward
        num_total_reward += 1
        ppo_batch.append(input_tokens, response_tokens, reward)

        if len(ppo_batch) == BATCH_SIZE:
            # Update the parameters once a full batch has been collected.
            queries, responses, rewards = ppo_batch.get_all()
            ppo_trainer.step(queries, responses, rewards)
            ppo_batch.clear()

        if idx % LOG_STEP == 0:
            # Report progress; at least one reward has been recorded by now,
            # so the division is safe.
            avg_reward = total_reward / num_total_reward
            print(f"[{epoch}_{idx}] Average reward: {avg_reward:.3f}")

            total_reward = 0
            num_total_reward = 0

In [None]:
# Save the trainer's model and the policy tokenizer into the same directory.
_trained_output_dir = join_path(TRAINED_OUTPUT)
ppo_trainer.model.save_pretrained(_trained_output_dir)
policy_tokenizer.save_pretrained(_trained_output_dir)
print("🫠학습을 완료했습니다!")

# Run garbage collection and release cached CUDA memory.
gc.collect()
torch.cuda.empty_cache()

## 학습 결과 확인

In [None]:
def random_sample(df):
    """Pick one row of ``df`` at random and unpack it.

    Args:
        df: DataFrame with "context", "question", "choices" and "answer"
            columns; "choices" holds a Python-literal string.

    Returns:
        ``(context, question, choices, true_answer)`` with the text fields
        stripped and ``choices`` parsed via ``ast.literal_eval``.
    """
    last_position = len(df) - 1
    picked = df.iloc[random.randint(0, last_position)]
    return (
        picked["context"].strip(),
        picked["question"].strip(),
        ast.literal_eval(picked["choices"]),
        picked["answer"].strip(),
    )


@torch.no_grad()
def sampled_result_as_dataframe(
    df,
    policy_model,
    policy_tokenizer,
    reward_model,
    reward_tokenizer,
    save_path=None,
    num_sample=10,
    device="cuda",
):
    """Run the policy/reward pipeline on random rows and tabulate the results.

    Args:
        df: Source DataFrame that rows are sampled from.
        policy_model: Model that generates the system prompt.
        policy_tokenizer: Tokenizer for the policy model.
        reward_model: Model that answers the question.
        reward_tokenizer: Tokenizer for the reward model.
        save_path: Optional CSV destination; ".csv" is appended when missing.
        num_sample: Number of rows to sample.
        device: Device passed through to the generation helpers.

    Returns:
        A DataFrame with "prompt" (the full prompt sent to the reward model),
        "response" and "correct" columns, one row per sample.
    """
    collected = {"prompt": [], "response": [], "correct": []}

    for _ in range(num_sample):
        context, question, choices, true_answer = random_sample(df)
        policy_messages = generate_policy_prompt()
        _, _, system_prompt = generate_prompt_with_policy(
            policy_messages, policy_model, policy_tokenizer, device
        )
        reward_prompt = generate_reward_prompt(
            system_prompt, context, question, choices
        )
        llm_answer, _ = interact_with_reward_model(
            reward_prompt, choices, true_answer, reward_model, reward_tokenizer, device
        )
        collected["prompt"].append(reward_prompt)
        collected["response"].append(llm_answer)
        collected["correct"].append(true_answer)

    df_sampled = pd.DataFrame(collected)
    if save_path is None:
        return df_sampled

    # Write to file only when a destination was given.
    csv_path = save_path if save_path.endswith(".csv") else save_path + ".csv"
    df_sampled.to_csv(csv_path, index=True)
    return df_sampled

In [None]:
# Evaluate 30 randomly sampled training rows and write the table to
# SAMPLE_RESULT_CSV under BASE_DIR.
sampled_result_path = join_path(SAMPLE_RESULT_CSV)
df_sampled = sampled_result_as_dataframe(
    df_train,
    policy_model,
    policy_tokenizer,
    reward_model,
    reward_tokenizer,
    save_path=sampled_result_path,
    num_sample=30,
    device=device,
)
print(f"🤔저장한 결과를 확인해 보세요: {sampled_result_path}")
# Last expression of the cell: the notebook displays the first rows.
df_sampled.head()