In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
from tqdm import tqdm, trange
# Alternative: install the Open-Assistant `model_training` module (e.g. run `pip install -e .`
# in the `model/` directory of the Open-Assistant repository) and then use:
# import model_training.models.reward_model

# Instead, a copy of the reward model code is kept alongside this notebook so that it can be imported directly.
import reward_model
import matplotlib.pyplot as plt
import seaborn as sns

# Global plot appearance: white grid background, "paper" context with fonts scaled by 1.5.
sns.set_style(style="whitegrid")
sns.set_context(context="paper", font_scale=1.5)

In [None]:
# Two answer sets that are later compared row by row (matching `instruction` fields
# are asserted where the rows are paired up).
gpt4_alpaca = load_dataset(path="vicgalle/alpaca-gpt4")
gpt35_alpaca = load_dataset(path="tatsu-lab/alpaca")

In [None]:
# Hub identifier shared by the reward model and its tokenizer.
model_name = "OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5"

# Load the sequence-classification (reward) model with float16 weights.
model = AutoModelForSequenceClassification.from_pretrained(model_name, torch_dtype=torch.float16)

# Matching tokenizer from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Switch the model to evaluation mode and move it to the GPU; both calls return
# the module itself, so the cell still displays the model as its output.
model.eval().cuda()

In [None]:
def format_example(example):
    """Render one Alpaca-style record as a prompter/assistant dialogue string.

    The target layout is the one used by the Open-Assistant reward model, e.g.
    "<|prompter|>Hi how are you?<|endoftext|><|assistant|>Hi, I am Open-Assistant a large open-source language model trained by LAION AI. How can I help you today?<|endoftext|>"

    Args:
        example: mapping with the string fields ``instruction``, ``input`` and
            ``output``.

    Returns:
        The formatted dialogue string. The prompt is ``instruction`` and
        ``input`` joined by a single space, with surrounding whitespace
        stripped (so an empty ``input`` leaves just the instruction).
    """
    prompt = (example['instruction'] + ' ' + example['input']).strip()
    # The assistant turn must be closed with <|endoftext|> as well, exactly as in
    # the reference format above; previously the trailing token was missing.
    text = f"<|prompter|>{prompt}<|endoftext|><|assistant|>{example['output']}<|endoftext|>"
    return text

def get_score(example):
    """Score a single record with the reward model.

    Args:
        example: mapping accepted by ``format_example`` (``instruction``,
            ``input`` and ``output`` fields).

    Returns:
        The first logit of the model output for this example, as a Python float.
    """
    text = format_example(example)
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    # Put the inputs on whatever device the model currently lives on.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.logits[0][0].item()

In [None]:
# Interleave the two answer sets: examples[2 * i] is the tatsu-lab/alpaca answer
# and examples[2 * i + 1] is the alpaca-gpt4 answer for the same prompt.
train_35 = gpt35_alpaca["train"]
train_4 = gpt4_alpaca["train"]
# Every tatsu-lab/alpaca row needs a GPT-4 counterpart at the same index.
assert len(train_4) >= len(train_35), "alpaca-gpt4 has fewer rows than tatsu-lab/alpaca"

examples = []
# Iterate each dataset once instead of re-fetching rows by index several times.
for row_35, row_4 in tqdm(zip(train_35, train_4), total=len(train_35)):
    assert row_35["instruction"] == row_4["instruction"]
    examples.extend([format_example(row_35), format_example(row_4)])

In [None]:
# Score every formatted example with the reward model, batch_size texts at a time.
batch_size = 8
scores = []
# Inference only: without no_grad each forward pass would keep an autograd graph alive.
with torch.no_grad():
    for i in trange(0, len(examples), batch_size):
        inputs = tokenizer(examples[i:i+batch_size], return_tensors="pt", truncation=True, padding=True)
        # Put the batch on whatever device the model currently lives on.
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        outputs = model(**inputs)
        # First logit of every row in the batch, converted to plain Python floats.
        scores.extend(outputs.logits[:, 0].cpu().numpy().tolist())

# Later cells index `scores` by position, so every example must have exactly one score.
assert len(scores) == len(examples)

In [None]:
# Pair up the interleaved scores (even index: tatsu-lab/alpaca answer, odd index:
# alpaca-gpt4 answer) and turn their difference into a preference in (0, 1).
train_35 = gpt35_alpaca["train"]
train_4 = gpt4_alpaca["train"]
# Every tatsu-lab/alpaca row needs a GPT-4 counterpart at the same index.
assert len(train_4) >= len(train_35), "alpaca-gpt4 has fewer rows than tatsu-lab/alpaca"
# Guard against stale or partial `scores` left over from out-of-order cell execution.
assert len(scores) == 2 * len(train_35), "scores does not match the dataset size"

router_dataset = []

# Iterate each dataset once instead of re-fetching rows by index several times.
for index, (row_35, row_4) in enumerate(tqdm(zip(train_35, train_4), total=len(train_35))):
    assert row_35["instruction"] == row_4["instruction"]
    input_text = f"{row_35['instruction']} {row_35['input']}".strip()
    score_35 = scores[index * 2]
    score_4 = scores[index * 2 + 1]
    router_dataset.append({
        "score_35": score_35,
        "score_4": score_4,
        # sigmoid of the score gap: above 0.5 means the GPT-4 answer scored higher.
        "preference": torch.sigmoid(torch.tensor(score_4 - score_35)).item(),
        "input": input_text,
    })

In [None]:
# Pull the "preference" value out of every router record, keeping dataset order.
preference_list = list(map(lambda record: record["preference"], router_dataset))

In [None]:
# Distribution of the preference values; labelled so the figure stands on its own.
fig, ax = plt.subplots(figsize=(10, 5))
ax.hist(preference_list, bins=20)
ax.set_xlabel("preference = sigmoid(score_4 - score_35)")
ax.set_ylabel("number of prompts")
ax.set_title("Reward-model preference for the GPT-4 answer")
plt.show()

In [None]:
# Persist the list of router records to disk in the working directory.
torch.save(obj=router_dataset, f="router_dataset.pt")