# [InstructGPT: Training language models to follow instructions with human feedback](https://arxiv.org/pdf/2203.02155.pdf)


### Reference Code 
- https://github.com/xrsrke/instructGOOSE/tree/main

<img src="./figures/instructGPT.png" title="instructGPT" />


In [1]:
# !pip3 install instruct_goose

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from tqdm.auto import tqdm

import torch
from torch.utils.data import DataLoader, random_split
from torch import optim

from instruct_goose import Agent, RewardModel, RLHFTrainer, RLHFConfig, create_reference_model

import torch
import os
# Machine-specific defaults for GPU selection and the HTTP(S) proxy.
# `setdefault` only fills in a value when the variable is not already exported,
# so a GPU index or proxy configured in the shell is respected, not clobbered.
# These are set here, before any CUDA call is made in this cell.
os.environ.setdefault("CUDA_VISIBLE_DEVICES", "1")

os.environ.setdefault("http_proxy", "http://192.41.170.23:3128")
os.environ.setdefault("https_proxy", "http://192.41.170.23:3128")

# Use the GPU when one is visible to this process; otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

  from .autonotebook import tqdm as notebook_tqdm


device(type='cuda')

## 1. Load dataset

In [2]:
# Load the IMDB training split, then keep only a tiny subset so the notebook
# runs quickly (for demonstration purposes).
imdb_train = load_dataset("imdb", split="train")

subset_size = 10
dataset, _ = random_split(
    imdb_train,
    lengths=[subset_size, len(imdb_train) - subset_size],
    # Seeded generator: the same examples are drawn on every Restart & Run All.
    generator=torch.Generator().manual_seed(42),
)
dataset

Found cached dataset imdb (/home/todsavadt/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


<torch.utils.data.dataset.Subset at 0x7f3ea014f340>

In [3]:
# Batches of up to 16 examples drawn from `dataset`, reshuffled on every pass.
train_dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

In [4]:
# for batch in train_dataloader:
#     break

In [5]:
# batch['text']

In [6]:
# batch['label']

## 2. Load the pre-trained model and tokenizer

In [7]:
# Single checkpoint shared by the tokenizer, the base LM and the reward model
# (for demonstration purposes).
model_name_or_path = "gpt2"

# Tokenizer: pads on the left and reuses the EOS token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side="left")
tokenizer.pad_token = tokenizer.eos_token
eos_token_id = tokenizer.eos_token_id

# Base causal LM (wrapped into the agent later) and the reward model.
model_base = AutoModelForCausalLM.from_pretrained(model_name_or_path)
reward_model = RewardModel(model_name_or_path)

## 3. Create the RL-based language model agent and the reference model

In [8]:
# Wrap the pre-trained causal LM in instruct_goose's `Agent`; this is the model
# whose parameters the optimizer updates during training.
model = Agent(model_base)
# Reference model derived from the agent and handed to the trainer alongside it.
# Presumably a frozen copy used as the baseline policy — confirm against the
# instruct_goose documentation.
ref_model = create_reference_model(model)

In [10]:
# ref_model.to(device)

In [12]:
# model.to(device)

## 4. Training

In [13]:
# Number of tokens sampled per prompt; the training loop reads this value too.
max_new_tokens = 20
# Sampling settings forwarded to `model.generate` in the training loop.
generation_kwargs = {
    "min_length": -1,
    "top_k": 0,  # integer, as the generation API documents; 0 leaves top-k filtering off
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    "max_new_tokens": max_new_tokens
}

config = RLHFConfig()
number_epochs = 1 # for demonstration purposes
# The trainer pairs the trainable agent with the reference model; the optimizer
# only updates the agent's parameters.
trainer = RLHFTrainer(model, ref_model, config)
optimizer = optim.SGD(model.parameters(), lr=1e-3)

In [14]:
# Leave room for the sampled tokens: a prompt truncated to the tokenizer's full
# `model_max_length` would push the sequence past that limit as soon as
# `max_new_tokens` more positions are generated.
max_prompt_length = tokenizer.model_max_length - max_new_tokens

for epoch in range(number_epochs):
    for batch in tqdm(train_dataloader, leave=True):
        # Tokenize the raw review texts into one padded batch of prompts.
        inputs = tokenizer(
            batch["text"],
            padding=True,
            truncation=True,
            max_length=max_prompt_length,
            return_tensors="pt"
        )
        prompt_length = inputs["input_ids"].shape[1]

        generated_ids = model.generate(
            inputs["input_ids"], attention_mask=inputs["attention_mask"],
            **generation_kwargs
        )
        # Extract the generated text. Slice by prompt length rather than by
        # `-max_new_tokens`: if sampling stops early, fewer tokens come back and
        # a fixed negative slice would pull prompt tokens into the response.
        # Assumes `generate` returns prompt + continuation — confirm for `Agent`.
        response_ids = generated_ids[:, prompt_length:]
        # NOTE(review): every response position is marked as attended, including
        # any padding emitted after an early EOS — confirm this is intended.
        response_attention_mask = torch.ones_like(response_ids)

        # Score each prompt + response sequence with the reward model.
        with torch.no_grad():
            text_input_ids = torch.cat([inputs["input_ids"], response_ids], dim=1)
            rewards = reward_model(text_input_ids)

        # calculate PPO loss
        loss = trainer.compute_loss(
            query_ids=inputs["input_ids"],
            query_attention_mask=inputs["attention_mask"],
            response_ids=response_ids,
            response_attention_mask=response_attention_mask,
            rewards=rewards
        )

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(f"loss={loss}")

100%|██████████| 1/1 [00:47<00:00, 47.24s/it]

loss=-253.5457305908203



