Package and setup

In [None]:
import os
import json
import numpy as np

from alpaca_farm.auto_annotations import PairwiseAutoAnnotator

# User configuration: fill these in before running the cells below.
# `apikey` is assigned to `openai.api_key` in the next cell.
apikey = ""  # openai api key
# `output_dir` is joined with the generation file names in `to_annotate_format`.
output_dir = ""  # the dir of generation json files
# `dataset` selects the prism or psoups branch in every later cell.
dataset = "prism"  # or "psoups"

In [None]:
import openai
import os

# Register the key with the openai module and gather the credentials that are
# forwarded as keyword arguments to every PairwiseAutoAnnotator built later on.
openai.api_key = apikey
decoding_kwargs = {
    "openai_api_key": openai.api_key,
    # can set multiple orgs to avoid rate limits
    "openai_organization_ids": None,
}

In [None]:
# Resources used only by the prism evaluation: the GPT judge prompt template
# and the two prism json files.
if dataset == "prism":
    with open("./prism_winrate_gpt_prompt.txt", 'r') as prompt_file:
        winrate_prompt = prompt_file.read()

    with open("../data/prism_selected_examples.json", 'r') as examples_file:
        selected_examples = json.load(examples_file)

    with open("../data/prism_data_dialog.json", 'r') as dialog_file:
        data_dialog = json.load(dialog_file)

# psoups evaluation by AlpacaFarm: user id -> name of the annotator config
if dataset == "psoups":
    user_annotator_mapping = dict(
        zip("123456", ["p1a", "p1b", "p2a", "p2b", "p3a", "p3b"])
    )


In [None]:
# build entry for alpacafarm

def split_output_text(output_text, input_text, dataset="prism"):
    """Extract the last assistant completion from a generated text.

    Both texts are split on the dataset-specific assistant turn marker and the
    segment after the last marker of ``output_text`` is returned.

    Args:
        output_text: full text produced by the model (prompt + completion).
        input_text: the prompt that was fed to the model.
        dataset: "prism" or "psoups"; selects the assistant turn marker.

    Returns:
        The text following the last assistant marker in ``output_text``, or
        the string "Wrong completion." when ``output_text`` contains fewer
        assistant markers than ``input_text``.

    Raises:
        ValueError: if ``dataset`` is neither "prism" nor "psoups".
    """
    assistant_markers = {
        "prism": "<|start_header_id|>assistant<|end_header_id|>\n\n",
        "psoups": "<|assistant|>\n",
    }
    try:
        assistant_split = assistant_markers[dataset]
    except KeyError:
        # Previously this fell through and failed later with an
        # UnboundLocalError on `assistant_split`.
        raise ValueError(
            f"Unknown dataset {dataset!r}; expected 'prism' or 'psoups'."
        ) from None

    output_segments = output_text.split(assistant_split)
    # An output with fewer assistant markers than its own prompt cannot hold a
    # completion for the last turn: mark it as a wrong completion.
    if len(output_segments) < len(input_text.split(assistant_split)):
        print("Wrong completion!")
        return "Wrong completion."
    return output_segments[-1]


def to_annotate_format(filename_1, filename_2="chosen", seed=123, dataset="prism"):
    """Build AlpacaFarm-style pairwise comparison entries for a generation file.

    The completions of ``filename_1`` are compared either with the completions
    of ``filename_2`` or, when ``filename_2`` is ``"chosen"`` (prism only),
    with the chosen utterance stored in the global ``selected_examples``.
    The position of the two completions is shuffled with a seeded random label.

    Args:
        filename_1: json file inside the global ``output_dir``; maps prompt ids
            to dicts holding "output_text" and "input_text" (and "prompt" for
            psoups).
        filename_2: second json file with the same prompt ids, or "chosen".
        seed: seed for the position shuffle.
        dataset: "prism" or "psoups".

    Returns:
        A dict mapping each prompt id to ``{"label": 1 or 2, "comparisons":
        [entry]}``. ``label`` is the position (output_1 / output_2) that holds
        the ``filename_1`` completion. Returns ``None`` (after printing a hint)
        when ``dataset`` is "psoups" and ``filename_2`` is "chosen", because
        no chosen response is available in that case.

    Raises:
        ValueError: if ``dataset`` is neither "prism" nor "psoups".
        AssertionError: if the two files hold a different number of entries.
        KeyError: if a prompt id of ``filename_1`` is missing in ``filename_2``.
    """
    if dataset not in ("prism", "psoups"):
        # Fail early instead of hitting an unbound `prompt` inside the loop.
        raise ValueError(
            f"Unknown dataset {dataset!r}; expected 'prism' or 'psoups'."
        )
    compare_to_chosen = filename_2 == "chosen"

    # load outputs
    with open(os.path.join(output_dir, filename_1), 'r') as f:
        outputs_1 = json.load(f)
    outputs_2 = None
    if not compare_to_chosen:
        with open(os.path.join(output_dir, filename_2), 'r') as f:
            outputs_2 = json.load(f)
        assert len(outputs_1) == len(outputs_2), "must have the same number!"
        # Equal sizes do not guarantee equal ids: report mismatches up front
        # rather than as a bare KeyError in the middle of the loop.
        missing = [pid for pid in outputs_1 if pid not in outputs_2]
        if missing:
            raise KeyError(
                f"{len(missing)} prompt id(s) of {filename_1} are missing in "
                f"{filename_2}, e.g. {missing[0]!r}"
            )
    elif dataset == "psoups":
        print("No chosen response for psoups! Please compare to sft generations.")
        return None

    # use labels to shuffle 1&2 position
    np.random.seed(seed)
    labels = np.random.choice(a=[1, 2], size=len(outputs_1))

    to_annotate = {}
    for idx, (prompt_id, generation_1) in enumerate(outputs_1.items()):
        label = labels[idx].item()

        # split completion from output text
        completion_1 = split_output_text(
            generation_1["output_text"], generation_1["input_text"], dataset
        )

        if dataset == "prism":
            user_id, dialog_id, turn_nb = prompt_id.split("_")
            turn = selected_examples[dialog_id]["turns"][turn_nb]
            prompt = turn["history"]
        else:  # psoups: strip the chat markers around the user prompt
            prompt = (
                generation_1["prompt"]
                .replace("<|user|>\n", "")
                .replace(" \n<|assistant|>\n", "")
            )

        if compare_to_chosen:
            # only reachable for prism (psoups returned above)
            completion_2 = turn["chosen_utterance"]
        else:
            generation_2 = outputs_2[prompt_id]
            completion_2 = split_output_text(
                generation_2["output_text"], generation_2["input_text"], dataset
            )

        if completion_1 == completion_2:
            print(f"{prompt_id}: same completions!")

        # conform to alpacafarm format; label 1 keeps file 1 in first position
        if label == 1:
            first, second = completion_1, completion_2
        else:
            first, second = completion_2, completion_1
        entry = {
            "instruction": prompt,
            "input": "",
            "output_1": first,
            "output_2": second,
        }
        to_annotate[prompt_id] = {"label": label, "comparisons": [entry]}

    return to_annotate

In [None]:
# only for prism
# generate and save prompt for each entry: replace <user_info>, <previous_examples>, <user_reasoning> in winrate_prompt
def _format_previous_example(num, instruction, output_a, output_b, result):
    """Render one in-context example of the evaluation prompt."""
    return (
        f"### Example {num}\n\n"
        f"#### Instruction {num}:\n{instruction}\n\n"
        f"#### Output (a) for example {num}:\n{output_a}\n\n"
        f"#### Output (b) for example {num}:\n{output_b}\n\n"
        f"#### Result for example {num}:\n{result}\n\n"
    )


def build_eval_prompts(to_annotate):
    """Write one evaluation prompt file per prompt id (prism only).

    For every key of ``to_annotate`` the placeholders ``<user_info>``,
    ``<user_reasoning>`` and ``<previous_examples>`` of the global
    ``winrate_prompt`` are filled from the globals ``selected_examples`` and
    ``data_dialog``. The result is saved as
    ``./alpaca_farm/auto_annotations/prism_winrate_eval_prompts/{prompt_id}.txt``.

    Args:
        to_annotate: mapping whose keys are prompt ids of the form
            ``{user_id}_{dialog_id}_{turn_nb}``.

    Returns:
        None.
    """
    prompts_dir = "./alpaca_farm/auto_annotations/prism_winrate_eval_prompts"
    os.makedirs(prompts_dir, exist_ok=True)
    # Kept from the original for its effect on the global RNG state; nothing
    # in this function draws random numbers.
    np.random.seed(123)
    # Number of non-tie examples emitted so far, over all prompts. Its parity
    # alternates the position of the chosen response (a, b, a, b, ...). The
    # previous fixed list of 1000 labels raised IndexError beyond that many.
    non_tie_count = 0

    for prompt_id in to_annotate:
        user_id, dialog_id, turn_nb = prompt_id.split("_")
        target_turn_nb = int(turn_nb)
        dialog = data_dialog[dialog_id]
        example_info = selected_examples[dialog_id]
        demographics = example_info["demographics"]

        # build <user_info>: every demographic field except "preference"
        user_info = "".join(
            f"- {key.replace('_', ' ')}: {value}\n"
            for key, value in demographics.items()
            if key != "preference"
        )
        eval_prompt = winrate_prompt.replace("<user_info>", user_info[:-1])

        # build <user_reasoning>
        user_reasoning = f"- Their expectation for the assistant: {example_info['system_string']}\n"
        user_reasoning += f"- Their feedback on the chosen response: {example_info['open_feedback']}\n"
        preference = ", ".join(demographics["preference"])
        user_reasoning += f"- Top three aspects of their preferences: {preference}\n"
        eval_prompt = eval_prompt.replace("<user_reasoning>", user_reasoning[:-1])

        # build <previous_examples>: the other turns of the same dialog
        previous_choices = f"a batch of {len(dialog['turns'])-1} examples.\n\n"
        num = 1
        for turn in dialog["turns"]:
            if turn["turn_nb"] == target_turn_nb:  # skip the turn to be annotated
                continue
            chosen = turn["chosen_utterance"]
            rejected = turn["rejected_utterance"]
            if rejected == "":  # tie: show the chosen response on both sides
                output_a, output_b, result = chosen, chosen, "TIE"
            else:
                if non_tie_count % 2 == 0:  # chosen in a
                    output_a, output_b, result = chosen, rejected, "Output (a)"
                else:  # chosen in b
                    output_a, output_b, result = rejected, chosen, "Output (b)"
                non_tie_count += 1
            previous_choices += _format_previous_example(
                num, turn["user_utterance"], output_a, output_b, result
            )
            num += 1
        # drop the trailing blank line before inserting
        eval_prompt = eval_prompt.replace("<previous_examples>", previous_choices[:-2])

        with open(f"{prompts_dir}/{prompt_id}.txt", 'w') as f:
            f.write(eval_prompt)
    return


In [None]:
# only for prism
# build list of annotator config dict for each entry

def build_annotator_configs(to_annotate, gptmodel="gpt-4o-mini"):
    """Build one annotator config per prompt id (prism only).

    Args:
        to_annotate: mapping whose keys are prompt ids; only the keys are used.
        gptmodel: model name stored under ``decoder_kwargs["model_name"]``.

    Returns:
        A dict mapping each prompt id to ``{prompt_id: config}``, where
        ``config`` points to the prompt file
        ``./prism_winrate_eval_prompts/{prompt_id}.txt`` and holds the decoder
        settings and the regex patterns used to map an answer to 1, 2 or 3.
    """
    configs = {}
    for prompt_id in to_annotate:
        config = {
            "prompt_templates": {
                "without_inputs": f"./prism_winrate_eval_prompts/{prompt_id}.txt"
            },
            "fn_decoder": "openai_completions",
            "decoder_kwargs": {
                "model_name": gptmodel,
                "max_tokens": 50,
                "temperature": 0,
                "tokens_to_avoid": ['Both', 'Neither', 'None', ' Both', ' Neither', 'Either', 'depends', 'context', 'It', 'both', 'Sorry'],
                "tokens_to_favor": ["Output (a)", "Output (b)", "TIE"],
            },
            # Parentheses are escaped as "\\(" so the pattern text is the same
            # as before without relying on the invalid escape sequence "\(".
            # "\n" stays a literal newline character inside the pattern.
            "outputs_to_match": {
                1: '(?:^|\n|: )Output \\(a\\)',
                2: '(?:^|\n|: )Output \\(b\\)',
                3: 'TIE',
            },
            "batch_size": 1,
        }
        configs[prompt_id] = {prompt_id: config}
    return configs

Run win-rate annotation

In [None]:
# please assign the following variables
# Judge model name: passed to `build_annotator_configs` for prism and used in
# the result paths of the annotation cell.
gptmodel = "gpt-4o-mini"  # "gpt-3.5-turbo-0125" "gpt-4o-2024-08-06"
filename_1 = ""  # the generation json files to be evaluated
# `to_annotate_format` treats the special value "chosen" as "compare against
# the chosen responses" (prism only); any other value is opened as a file.
filename_2 = ""  # will compare file1 against file2

if dataset == "psoups":
    print("Use default gptmodel as in psoups paper&code: GPT-4 !")

# Build the pairwise comparison entries for every prompt id.
# NOTE(review): `to_annotate_format` returns None for psoups with
# filename_2 == "chosen", in which case `len(to_annotate)` below fails.
to_annotate = to_annotate_format(filename_1, filename_2, dataset=dataset)
if dataset == "prism":
    # prism uses one prompt file and one annotator config per entry
    build_eval_prompts(to_annotate)
    configs = build_annotator_configs(to_annotate, gptmodel)

print(len(to_annotate))

In [None]:
# annotate
# Annotate every comparison with its own PairwiseAutoAnnotator and store the
# result next to the comparison under the "annotation" key.
num = 0
results_dir = f"./{dataset}_winrate_eval_results/{gptmodel}_{filename_1}_{filename_2}"
if not os.path.exists(results_dir):
    os.makedirs(results_dir)

for prompt_id, sample in to_annotate.items():
    # pick the annotator config for this entry
    if dataset == "prism":
        annotators_config = configs[prompt_id]
    elif dataset == "psoups":
        user_id, sample_id = prompt_id.split("_")
        if user_id == '0':  # only evaluate user 1-6
            continue
        pref = user_annotator_mapping[user_id]
        annotators_config = f"annotators/criteria_wise_eval_gpt4/{pref}.yaml"

    annotator = PairwiseAutoAnnotator(
        annotators_config=annotators_config,
        saving_path=f"{results_dir}/{prompt_id}.json",
        is_avoid_reannotations=False,
        **decoding_kwargs
    )

    sample["annotation"] = annotator.annotate_pairs(sample["comparisons"])
    num += 1

# one json file with all entries, stored next to the per-entry result directory
with open(f"{results_dir}.json", "w") as f:
    json.dump(to_annotate, f, indent=4)

len(to_annotate)

In [None]:
# win-rate
# Tally the judge decisions. Every counter is [win, lose, tie], where "win"
# means the judge preferred the position stored in the entry's "label", i.e.
# the position holding the `filename_1` completion.
win_lose_tie = [0, 0, 0]
per_user_win_lose_tie = {}
per_user_winrate = []

for prompt_id in to_annotate:
    if dataset == "prism":
        user_id, dialog_id, turn_nb = prompt_id.split("_")
    elif dataset == "psoups":
        user_id, sample_id = prompt_id.split("_")
        if user_id == '0':  # only evaluate user 1-6
            continue

    user_counts = per_user_win_lose_tie.setdefault(user_id, [0, 0, 0])

    # Only a missing or empty annotation is tolerated here; a bare `except`
    # would also swallow KeyboardInterrupt and unrelated errors.
    try:
        annotation = to_annotate[prompt_id]["annotation"][0]
    except (KeyError, IndexError, TypeError):
        print("no annotation for ", prompt_id)
        continue

    preference = int(annotation["preference"])
    if preference == 3:
        outcome = 2  # tie
    elif preference == to_annotate[prompt_id]["label"]:
        outcome = 0  # win
    else:
        outcome = 1  # lose
    win_lose_tie[outcome] += 1
    user_counts[outcome] += 1

# NOTE(review): the value printed as "macro avg" is computed from the counts
# pooled over all users; "per user avg" is the mean of the per-user rates.
if dataset == "prism":
    # consider TIE as both sides win
    for user_id, (win, lose, tie) in per_user_win_lose_tie.items():
        if win + lose + tie == 0:
            # all samples of this user lack an annotation: rate is undefined
            print("no annotated sample for user ", user_id)
            continue
        per_user_winrate.append((win + tie) / (win + tie + lose + tie))
    if per_user_winrate:
        total_win, total_lose, total_tie = win_lose_tie
        print("macro avg: ", win_lose_tie, (total_win + total_tie) / (total_win + total_tie + total_lose + total_tie))
        print("per user avg: ", sum(per_user_winrate) / len(per_user_winrate))
    else:
        print("no annotated sample, win-rate is undefined")

elif dataset == "psoups":
    # TIE is not credited as a win but still counts in the denominator; stated
    # in the original to be consistent with the win-rate computation in the
    # Personalized Soups paper (TODO confirm the handling of ties).
    for user_id, (win, lose, tie) in per_user_win_lose_tie.items():
        if win + lose + tie == 0:
            # all samples of this user lack an annotation: rate is undefined
            print("no annotated sample for user ", user_id)
            continue
        per_user_winrate.append(win / (win + lose + tie))
        print("user ", user_id, " winrate: ", per_user_winrate[-1])
    if per_user_winrate:
        total_win, total_lose, total_tie = win_lose_tie
        print("macro avg: ", win_lose_tie, total_win / (total_win + total_lose + total_tie))
        print("per user avg: ", sum(per_user_winrate) / len(per_user_winrate))
    else:
        print("no annotated sample, win-rate is undefined")

