In [13]:
%load_ext autoreload
%autoreload 2

import torch
print ('number of GPUs:', torch.cuda.device_count())

from tqdm.notebook import tqdm

from minichatgpt.experiments.imdb import config, sent_kwargs
from minichatgpt import Lab
from minichatgpt.processdata.collators import imdb_dataloader_collator

# Demo-speed override: the batch size is temporarily cut from 256 down to 4 so
# this walkthrough runs quickly; the forward batch size is set to half of it.
batch_size = 4
config.batch_size, config.forward_batch_size = batch_size, batch_size // 2
# Last expression of the cell: displays the seed currently held by the config.
config.seed

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
number of GPUs: 2


0

In [None]:
# Build the Lab from the (batch-size-overridden) experiment config; every
# object below is created through it.
lab = Lab(config)

# IMDB dataset. NOTE(review): the min/max text-length arguments presumably
# bound the length of each query -- confirm the unit (tokens vs. characters).
dataset = lab.build_dataset(
    dataset_name="imdb",
    input_min_text_length=2,
    input_max_text_length=8,
)

new_policy, old_policy, tokenizer = lab.init_policies_tokenizer()

# Sampling-based generation; the tokenizer's EOS token id is reused as the
# pad token id.
lab.set_generation_config(
    do_sample=True,
    output_min_length=4,
    output_max_length=16,
    pad_token_id=tokenizer.eos_token_id,
)

ppo_trainer = lab.init_ppo_trainer(
    config,
    new_policy,
    old_policy,
    tokenizer,
    dataset,
    dataloader_collator=imdb_dataloader_collator,
)

# NOTE(review): the training loop calls `lab.reward_model`, not this variable;
# presumably init_reward_model() also stores the model on `lab` -- confirm.
reward_model = lab.init_reward_model()

In [12]:
# Show which device holds the pretrained model and the value head's summary
# weight, one per line.
print(
    new_policy.pretrained_model.device,
    new_policy.v_head.summary.weight.device,
    sep="\n",
)

cuda:0
cuda:0


In [14]:
# PPO loop over the trainer's dataloader (stopped after the first batch, see
# the `break` at the bottom).
# tqdm wraps the dataloader itself rather than the enumerate() generator: a
# generator has no length, so the bar could only show "0it [00:00, ?it/s]";
# wrapping the dataloader lets tqdm read its length and display a total.
for batch_step, batch in enumerate(tqdm(ppo_trainer.dataloader)):

    queries = batch['input_ids']

    #### Get response from gpt2
    responses = []
    for query in queries:
        # Draw a response length for this query and use it as the generation cap.
        gen_len = lab.output_length_sampler()
        lab.generation_kwargs["max_new_tokens"] = gen_len
        response = ppo_trainer.generate(query, **lab.generation_kwargs)
        # Keep only the trailing gen_len tokens.
        # NOTE(review): presumably generate() returns query + continuation and
        # always emits exactly gen_len new tokens -- confirm, otherwise this
        # slice could include query tokens.
        responses.append(response.squeeze()[-gen_len:])

    batch['response'] = [tokenizer.decode(r.squeeze()) for r in responses]

    #### Compute sentiment score
    texts = [q + r for q, r in zip(batch['query'], batch['response'])]
    pipe_outputs = lab.reward_model(texts, **sent_kwargs)
    # NOTE(review): index 1 is presumably the positive-sentiment entry of each
    # pipeline output -- confirm against sent_kwargs / the reward model labels.
    rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]

    #### Run PPO step
    stats = ppo_trainer.step(queries, responses, rewards)
    ppo_trainer.log_stats(stats, batch, rewards)

    # Demonstration only: stop after a single batch.
    break

0it [00:00, ?it/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


tensor([[0.8712, 0.7366, 1.3744, 0.0074, 0.1823, 0.5760, 1.2990]],
       device='cuda:0') 0.2
tensor([[-0.1447,  0.1331, -2.1336, -1.6075, -1.5900,  1.6841,  0.1250, -1.1569,
          2.1941]], device='cuda:0') 0.2
tensor([[ 2.8135, -0.7756,  0.8125, -0.3667]], device='cuda:0') 0.2
tensor([[ 1.1719,  1.1426,  0.3003,  0.0666,  0.6122, -0.7323,  1.8992,  1.2045,
         -0.6964, -0.0479,  0.7701, -0.4125,  0.3040,  0.6388,  0.7322]],
       device='cuda:0') 0.2
tensor([[0.8712, 0.7366, 1.3744, 0.0074, 0.1823, 0.5760, 1.2990]],
       device='cuda:0') 0.2
tensor([[-0.1447,  0.1331, -2.1336, -1.6075, -1.5900,  1.6841,  0.1250, -1.1569,
          2.1941]], device='cuda:0') 0.2
tensor([[ 1.1719,  1.1426,  0.3003,  0.0666,  0.6122, -0.7323,  1.8992,  1.2045,
         -0.6964, -0.0479,  0.7701, -0.4125,  0.3040,  0.6388,  0.7322]],
       device='cuda:0') 0.2
tensor([[ 2.8135, -0.7756,  0.8125, -0.3667]], device='cuda:0') 0.2
tensor([[0.8712, 0.7366, 1.3744, 0.0074, 0.1823, 0.5760, 1.2990]