In [1]:
!pip install transformers
!pip install trl

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.3-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m69.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.3
Looking in indexes: https://pypi.org/simple, https://us

In [2]:
import os

import numpy as np
import torch
from google.colab import drive
from transformers import AutoTokenizer, AutoModelForSequenceClassification, GPT2LMHeadModel, GPT2Tokenizer
from trl import PPOTrainer, PPOConfig, AutoModelForCausalLMWithValueHead, create_reference_model
from trl.core import respond_to_batch

# Mount Google Drive and derive the project folder from the mount point itself,
# so the path stays valid even if the working directory is not /content.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)
project_path = os.path.join(drive_mount_point, 'MyDrive/Colab Notebooks/GPT_community/')

Mounted at /content/drive


In [3]:
# Get model to train and reference model for KL divergence.
# The tokenizer reuses its EOS token as the padding token.
gpt2_tokenizer = AutoTokenizer.from_pretrained('gpt2')
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token
kant_model = AutoModelForCausalLMWithValueHead.from_pretrained('Linus4Lyf/Kant_Metaphysics_Of_Morals').to("cuda")
kant_model_ref = create_reference_model(kant_model)

# Get reward model (single output logit, used directly as the reward value)
bert_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
reward_model = AutoModelForSequenceClassification.from_pretrained('rjuggins/philosophy_reward_test', num_labels=1).to("cuda")

# Load in prompts, one per line. Blank / whitespace-only lines are skipped:
# an empty prompt would encode to an empty tensor and break generation later.
prompts_file = os.path.join(project_path, 'data/brighton_philosophy_prompts.txt')
with open(prompts_file, encoding='utf-8') as file:
    prompts = [text for text in (line.rstrip() for line in file) if text]
if not prompts:
    raise ValueError(f"No prompts found in {prompts_file}")

# Define and initialise Trainer (one query/response pair per PPO step)
ppo_config = PPOConfig(batch_size=1)
ppo_trainer = PPOTrainer(ppo_config, kant_model, kant_model_ref, gpt2_tokenizer)

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/962 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/510M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/788 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]



In [4]:
# PPO fine-tuning loop: pick a prompt, sample a response from the policy,
# score prompt + response with the reward model, then take one PPO step.
num_steps = 1000
for step in range(num_steps):
  # Select random prompt from list as query
  query_text = np.random.choice(prompts)

  # Encode query and get response from model
  query_ids = gpt2_tokenizer.encode(query_text, return_tensors='pt').to('cuda')
  response_ids = respond_to_batch(kant_model, query_ids)
  response_text = gpt2_tokenizer.batch_decode(response_ids)[0]

  # Encode response and get reward value for it.
  # truncation=True keeps the input within the reward model's maximum length.
  response_bert = bert_tokenizer.encode(query_text + response_text, truncation=True, return_tensors='pt').to('cuda')
  # The reward model is only used for scoring, so skip autograd bookkeeping and
  # hand PPO a reward tensor that carries no gradient history.
  with torch.no_grad():
    reward = [reward_model(response_bert).logits[0]]
  # Debug aids:
  # print(query_text + response_text, '\n')
  # print(reward[0].item(), '\n')

  # train model for one step with ppo
  train_stats = ppo_trainer.step([query_ids[0]], [response_ids[0]], reward)

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [5]:
# Test model vs reference model on a load of prompts

def sample_and_score(policy, query_text, query_ids):
  """Sample one response from `policy` and return its reward as a float.

  Args:
    policy: model passed to `respond_to_batch` to generate the response.
    query_text: the prompt string; it is prepended to the decoded response
      before scoring.
    query_ids: the GPT-2-encoded prompt tensor fed to `policy`.

  Returns:
    The reward model's single logit for prompt + response, via `.item()`.
  """
  response_ids = respond_to_batch(policy, query_ids)
  response_text = gpt2_tokenizer.batch_decode(response_ids)[0]
  # truncation=True keeps the input within the reward model's maximum length.
  response_bert = bert_tokenizer.encode(query_text + response_text, truncation=True, return_tensors='pt').to('cuda')
  return reward_model(response_bert).logits.item()


ref_rewards = []
model_rewards = []

test_steps = 25
# Pure evaluation: no gradients are needed for generation or scoring.
with torch.no_grad():
  for step in range(test_steps):
    # Select random prompt from list as query; both models answer the same prompt
    query_text = np.random.choice(prompts)
    query_ids = gpt2_tokenizer.encode(query_text, return_tensors='pt').to('cuda')

    # Reference model is sampled first, then the fine-tuned model
    ref_rewards.append(sample_and_score(kant_model_ref, query_text, query_ids))
    model_rewards.append(sample_and_score(kant_model, query_text, query_ids))

print(f"Mean reference model reward = {np.mean(ref_rewards)}")
print(f"Mean fine-tuned model reward = {np.mean(model_rewards)}")

Mean reference model reward = 0.516905267238617
Mean fine-tuned model reward = 0.5801490080356598


In [6]:
# Display the reference model's `wte` weight matrix so it can be compared by
# eye with the same matrix of the trained model in the next cell.
ref_backbone = kant_model_ref.pretrained_model.transformer
ref_backbone.wte.weight

Parameter containing:
tensor([[-0.1100, -0.0390,  0.0352,  ..., -0.1373,  0.0148,  0.0453],
        [ 0.0423, -0.0484,  0.0459,  ...,  0.0866,  0.0037,  0.0426],
        [-0.1238,  0.0439,  0.1907,  ...,  0.0889, -0.1273, -0.0891],
        ...,
        [-0.0432, -0.0557,  0.0172,  ...,  0.1040,  0.0963, -0.0679],
        [ 0.1852,  0.0175,  0.0490,  ..., -0.0952,  0.0787, -0.0225],
        [ 0.0477, -0.0219,  0.0553,  ...,  0.0137,  0.1611,  0.1183]],
       device='cuda:0')

In [7]:
# Display the trained model's `wte` weight matrix; differences from the
# reference model's values in the previous cell show the weights were updated.
policy_backbone = kant_model.pretrained_model.transformer
policy_backbone.wte.weight

Parameter containing:
tensor([[-0.1098, -0.0399,  0.0333,  ..., -0.1385,  0.0138,  0.0459],
        [ 0.0432, -0.0472,  0.0454,  ...,  0.0867,  0.0041,  0.0423],
        [-0.1269,  0.0451,  0.1900,  ...,  0.0889, -0.1264, -0.0894],
        ...,
        [-0.0445, -0.0569,  0.0170,  ...,  0.1036,  0.0971, -0.0661],
        [ 0.1826,  0.0141,  0.0514,  ..., -0.0963,  0.0808, -0.0222],
        [ 0.0484, -0.0210,  0.0529,  ...,  0.0158,  0.1590,  0.1171]],
       device='cuda:0', requires_grad=True)