# [InstructGPT : Training language models to follow instructions with human feedback](https://arxiv.org/pdf/2203.02155.pdf)


### Reference Code 
- https://github.com/xrsrke/instructGOOSE/tree/main

<img src="../figures/instructGPT.png" title="instructGPT" />


In [1]:
# !pip3 install instruct_goose

In [2]:
from datasets import load_dataset
from tqdm.auto import tqdm

import torch
from torch.utils.data import DataLoader, random_split
from torch import optim

import os
# Set GPU device
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

os.environ['http_proxy']  = 'http://192.41.170.23:3128'
os.environ['https_proxy'] = 'http://192.41.170.23:3128'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

  from .autonotebook import tqdm as notebook_tqdm


device(type='cuda')

## 1. Load dataset

In [3]:
dataset = load_dataset("imdb", split="train")
dataset

Found cached dataset imdb (/home/todsavadt/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0)


Dataset({
    features: ['text', 'label'],
    num_rows: 25000
})

In [4]:
dataset, _ = random_split(
    dataset, 
    lengths=[10, len(dataset) - 10]
) # for demenstration purposes

In [5]:
train_dataloader = DataLoader(
    dataset, 
    batch_size=16, 
    shuffle=True
)

In [6]:
# for batch in train_dataloader:
#     break

# batch['text'], batch['label']

## 2. Load the pre-trained model and tokenizer

In [7]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from instruct_goose import (
    Agent, 
    RewardModel, 
    RLHFTrainer, 
    RLHFConfig, 
    create_reference_model
)

model_name_or_path = "gpt2"

model_base = AutoModelForCausalLM.from_pretrained(model_name_or_path) # for demonstration purposes
reward_model = RewardModel(model_name_or_path)

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, padding_side="left")
eos_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

## 3. Create the RL-based language model agent and the reference model

In [8]:
model = Agent(model_base)
ref_model = create_reference_model(model)

In [9]:
# ref_model.to(device)

In [10]:
# model.to(device)

## 4. Training

In [11]:
max_new_tokens = 20
generation_kwargs = {
    "min_length":-1,
    "top_k": 0.0,
    "top_p": 1.0,
    "do_sample": True,
    "pad_token_id": tokenizer.eos_token_id,
    "max_new_tokens": max_new_tokens
}

config = RLHFConfig()
trainer = RLHFTrainer(model, ref_model, config)

In [12]:
'''
def forward(
        self,
        query_ids: TensorType["batch_size", "seq_len"],
        query_attention_mask: TensorType["batch_size", "seq_len"],
        response_ids: TensorType["batch_size", "seq_len"],
        response_attention_mask: TensorType["batch_size", "seq_len"]
    ) -> Tuple[
        TensorType["batch_size"], # main model's logprobs
        TensorType["batch_size"], # entropy
        TensorType["batch_size"], # value
        TensorType["batch_size"], # reference model's log prob
    ]:
        input_ids = torch.cat([query_ids, response_ids], dim=1)
        attention_mask = torch.cat([query_attention_mask, response_attention_mask], dim=1)

        _, logprobs, entropy, value = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        _, ref_logprob, _, _ = self.ref_model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        return logprobs, entropy, value, ref_logprob

def compute_loss(
    self,
    query_ids: TensorType["batch_size", "seq_len"],
    query_attention_mask: TensorType["batch_size", "seq_len"],
    response_ids: TensorType["batch_size", "seq_len"],
    response_attention_mask: TensorType["batch_size", "seq_len"],
    rewards: TensorType["batch_size"],
) -> TensorType["1"]:
    """Calculate PPO's loss."""
    logprobs, values, entropies, ref_logprobs = self.forward(
        query_ids=query_ids,
        query_attention_mask=query_attention_mask,
        response_ids=response_ids,
        response_attention_mask=response_attention_mask
    )

    ratio = (logprobs - ref_logprobs).exp()
    clipped_ratio = torch.clamp(ratio, min=1-self.epsilon, max=1+self.epsilon)

    advantages, returns = self.compute_advantage_and_return(rewards, values)
    value_loss = (values - returns).pow(2).mean()

    pg_loss_1 = ratio * advantages
    pg_loss_2 = ratio * clipped_ratio
    pg_loss = torch.min(pg_loss_1, pg_loss_2).mean()

    loss = pg_loss - self.ent_coef * entropies.mean() + self.vf_coef * value_loss
    return loss
'''

'\ndef compute_loss(\n    self,\n    query_ids: TensorType["batch_size", "seq_len"],\n    query_attention_mask: TensorType["batch_size", "seq_len"],\n    response_ids: TensorType["batch_size", "seq_len"],\n    response_attention_mask: TensorType["batch_size", "seq_len"],\n    rewards: TensorType["batch_size"],\n) -> TensorType["1"]:\n    """Calculate PPO\'s loss."""\n    logprobs, values, entropies, ref_logprobs = self.forward(\n        query_ids=query_ids,\n        query_attention_mask=query_attention_mask,\n        response_ids=response_ids,\n        response_attention_mask=response_attention_mask\n    )\n\n    ratio = (logprobs - ref_logprobs).exp()\n    clipped_ratio = torch.clamp(ratio, min=1-self.epsilon, max=1+self.epsilon)\n\n    advantages, returns = self.compute_advantage_and_return(rewards, values)\n    value_loss = (values - returns).pow(2).mean()\n\n    pg_loss_1 = ratio * advantages\n    pg_loss_2 = ratio * clipped_ratio\n    pg_loss = torch.min(pg_loss_1, pg_loss_2).mean

In [13]:
num_epochs = 3 # for demonstration purposes
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [14]:
for epoch in range(num_epochs):
    for step, batch in enumerate(tqdm(train_dataloader, leave=True)):
        inputs = tokenizer(
            batch["text"], 
            padding=True, 
            truncation=True, 
            return_tensors="pt"
        )
        
        response_ids = model.generate(
            inputs["input_ids"], attention_mask=inputs["attention_mask"],
            **generation_kwargs
        )
        
        # extract the generated text
        response_ids = response_ids[:, -max_new_tokens:]
        response_attention_mask = torch.ones_like(response_ids)

        # evaluate from the reward model
        with torch.no_grad():
            text_input_ids = torch.stack([torch.concat([q, r]) for q, r in zip(inputs["input_ids"], response_ids)], dim=0)
            rewards = reward_model(text_input_ids)

        # calculate PPO loss
        loss = trainer.compute_loss(
            query_ids=inputs["input_ids"],
            query_attention_mask=inputs["attention_mask"],
            response_ids=response_ids,
            response_attention_mask=response_attention_mask,
            rewards=rewards
        )
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(f'Epoch: {epoch+1} | loss = {loss:.4f}')

100%|██████████| 1/1 [00:34<00:00, 34.01s/it]


Epoch: 1 | loss = -7.5626


100%|██████████| 1/1 [00:31<00:00, 31.00s/it]


Epoch: 2 | loss = 2.6821


100%|██████████| 1/1 [00:26<00:00, 26.60s/it]


Epoch: 3 | loss = 0.0515


100%|██████████| 1/1 [00:26<00:00, 26.12s/it]


Epoch: 4 | loss = -0.8420


100%|██████████| 1/1 [00:30<00:00, 30.54s/it]

Epoch: 5 | loss = 16.6571





In [21]:
# Encode input text
input_text = dataset[0]['text']
input_ids = tokenizer.encode(input_text, return_tensors="pt")

# Generate output
output = model_base.generate(input_ids, max_length=256, num_beams=5, no_repeat_ngram_size=2, top_k=50, top_p=0.95, temperature=0.7)

# Decode and print the generated text
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated text:\n", generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated text:
 I stumbled on this series rather by accident. After half an episode, I was hooked. American Gothic was a dark, strange series with Gary Cole as the mysterious, probably evil Sheriff Buck who is trying to gain control of his illegitimate son Caleb, played by Lucas Black. I was impressed with Gary Cole's sinister sheriff and I was even more impressed with Lucas Black. Lucas Black's Caleb was able to stand up against Sheriff Buck, one of the most frightening characters ever created for a TV series. I have rarely seen a child actor with as much presence or talent as Lucas Black. If you were not lucky enough to see Lucas in American Gothic, see him in Slingblade.<br /><br />It was a remarkable show with many ambiguities and mysteries that were never explained during it's short run. chance chance chances chance on chance free chance
 chance no chance hope chance not chance name chance opportunity chance one chanceon chance that chance life chance test chance time chance prom