### Dependencies

In [1]:
%%capture
!pip install datasets trl torch transformers
!pip install sentencepiece
!pip install transformers[sentencepiece]

In [2]:
import torch
from datasets import load_dataset
from tqdm import tqdm

from transformers import AutoTokenizer, pipeline

from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer




### Implementation

In [5]:
# Load the training split, expose the prompt text under the name `query`, and
# drop the columns that are not used below.
dataset = (
    load_dataset("Dahoas/full-hh-rlhf", split="train")
    .rename_column("prompt", "query")
    .remove_columns(["response", "chosen", "rejected"])
)

In [3]:
# PPO run configuration. `config.model_name` is reused below to load both the
# policy model and its tokenizer.
config = PPOConfig(learning_rate=1.41e-5, model_name="agi-css/hh-rlhf-sft")

In [6]:
# Load the policy model (causal LM wrapped with a value head) from the
# checkpoint named in the PPO config.
# NOTE(review): the recorded outputs show this model's first parameter is not
# on CUDA after loading, and a CUDA out-of-memory error when PPOTrainer is
# constructed later. A lower-memory loading strategy is probably needed on
# this GPU -- TODO confirm and choose one.
policy_checkpoint = config.model_name
model = AutoModelForCausalLMWithValueHead.from_pretrained(policy_checkpoint)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
def _load_policy_tokenizer(checkpoint):
    """Load the tokenizer for `checkpoint` and use its EOS token as padding."""
    loaded = AutoTokenizer.from_pretrained(checkpoint)
    loaded.pad_token = loaded.eos_token
    return loaded


tokenizer = _load_policy_tokenizer(config.model_name)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [8]:
# Sanity check: report whether the policy model's first parameter is on CUDA.
next(iter(model.parameters())).is_cuda

False

In [9]:
# The reward model must be fed text encoded with the tokenizer published with
# its own checkpoint. The policy tokenizer comes from a different checkpoint
# and is not guaranteed to share its vocabulary, which would make the scores
# meaningless.
rm_checkpoint = "weqweasdas/hh_rlhf_rm_open_llama_3b"
rm_tokenizer = AutoTokenizer.from_pretrained(rm_checkpoint)

# Reward model exposed through a text-classification pipeline, loaded in
# bfloat16 on the GPU.
rm_pipe = pipeline(
    "sentiment-analysis",
    model=rm_checkpoint,
    device="cuda",
    tokenizer=rm_tokenizer,
    model_kwargs={"torch_dtype": torch.bfloat16},
)

In [11]:
def tokenize(sample):
    """Encode a sample's ``query`` text and store the ids under ``input_ids``.

    Encoding uses the notebook-level ``tokenizer``. ``sample`` is updated in
    place and the same object is returned, as expected by ``Dataset.map``.
    """
    token_ids = tokenizer.encode(sample["query"])
    sample["input_ids"] = token_ids
    return sample


In [10]:
# Tokenize every prompt once up front; this adds an `input_ids` column.
dataset = dataset.map(tokenize, batched=False)
# Hand `input_ids` out as torch tensors instead of Python lists: the PPO
# trainer's `generate` and `step` operate on tensors.
dataset.set_format(type="torch")

Map:   0%|          | 0/112052 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (520 > 512). Running this sequence through the model will result in indexing errors


In [None]:
# Call-time options for the reward pipeline: return the score of every label,
# apply no post-processing function to the outputs, and score one text at a time.
pipe_kwargs = dict(
    return_all_scores=True,
    function_to_apply="none",
    batch_size=1,
)

In [13]:
def collator(data):
    """Collate samples into a dict mapping each key to a list of per-sample values.

    Nothing is stacked or padded, so prompts of different lengths can share a
    batch; the default torch collation cannot batch variable-length
    `input_ids`.
    """
    return {key: [sample[key] for sample in data] for key in data[0]}


# NOTE(review): the recorded run of this cell ended in a CUDA out-of-memory
# error; the memory footprint of the models needs to be reduced before this
# can succeed on that GPU -- TODO.
ppo_trainer = PPOTrainer(
    model=model,
    config=config,
    dataset=dataset,  # no train_dataset param
    tokenizer=tokenizer,
    data_collator=collator,
)
# Reward-pipeline call options (same values as the earlier `pipe_kwargs` cell):
# return the score of every label, apply no post-processing function, and
# score one text at a time.
pipe_kwargs = dict(
    return_all_scores=True,
    function_to_apply="none",
    batch_size=1,
)

OutOfMemoryError: CUDA out of memory. Tried to allocate 172.00 MiB. GPU 0 has a total capacty of 22.19 GiB of which 147.50 MiB is free. Including non-PyTorch memory, this process has 22.04 GiB memory in use. Of the allocated memory 21.70 GiB is allocated by PyTorch, and 101.43 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Two hand-written dialogues used to smoke-test the reward pipeline.
test_texts = [
    "###Human: My daughter wants to know how to convert fractions to decimals, but I'm not sure how to explain it. Can you help? ###Assistant: Sure. So one way of converting fractions to decimals is to ask “how many halves are there?” and then write this as a decimal number. But that's a little tricky. Here's a simpler way:  if a fraction is expressed as a/b, then it's decimal equivalent is just a/b * 1.0  So, for example, the decimal equivalent of 1/2 is 1/2 * 1.0 = 0.5.",
    "###Human: I have fresh whole chicken in my fridge. What dish can I prepare using it that will take me less than an hour to cook? ###Assistant: Are you interested in a quick and easy recipe you can prepare with chicken you have on hand, or something more involved?  In terms of both effort and time, what are you looking for?",
]

# Score the texts; with `return_all_scores` each result is a list of label
# dicts, and the first entry's score is taken as the reward.
pipe_outputs = rm_pipe(test_texts, **pipe_kwargs)
rewards = [label_scores[0]["score"] for label_scores in pipe_outputs]

### Training Loop

In [None]:
# Sampling settings for the policy's generation step. These are deliberately
# separate from `pipe_kwargs`, which holds reward-pipeline arguments that are
# not valid generation arguments.
generation_kwargs = {
    "min_length": -1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    # Cap on the response length; tune for the task.
    "max_new_tokens": 128,
}

# Wrap the dataloader (not the enumerate object) so tqdm knows the total.
for epoch, batch in enumerate(tqdm(ppo_trainer.dataloader)):
    # The trainer works on a list of 1-D tensors, one per prompt.
    query_tensors = [torch.as_tensor(ids) for ids in batch["input_ids"]]

    #### Get response from SFTModel
    # return_prompt=False keeps only the newly generated tokens, so the prompt
    # is not duplicated when query and response are concatenated below.
    response_tensors = ppo_trainer.generate(
        query_tensors, return_prompt=False, **generation_kwargs
    )
    batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]

    #### Compute reward score
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    pipe_outputs = rm_pipe(texts, **pipe_kwargs)
    # `step` expects one scalar tensor per sample, not plain floats.
    rewards = [torch.tensor(output[0]["score"]) for output in pipe_outputs]

    #### Run PPO step
    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

#### Save model
# The PPOTrainer API used in this notebook saves through `save_pretrained`;
# it does not define a `save_model` method.
ppo_trainer.save_pretrained("hhrlhf_ppo_model")