## Calculate percentage (%) scores

In [1]:
from pathlib import Path
import pandas as pd
import re

In [2]:
# Result directories live next to the notebook's current working directory,
# which is resolved once to an absolute path.
_WORKING_DIR = Path().resolve()
GPT35_RESULT_DIR = _WORKING_DIR / "gpt35_comparison_results"
LLAMA_BASE_RESULT_DIR = _WORKING_DIR / "llama_base_comparison_results"
for _result_dir in (GPT35_RESULT_DIR, LLAMA_BASE_RESULT_DIR):
    # Fail fast when a directory is missing, before any CSV is read.
    assert _result_dir.exists()

# Names of the CSV columns read when scoring a result file.
GPT4_COMPARISON_RESPONSE = "gpt4_comparison_response"
IS_SHUFFLED = "is_shuffled"

In [3]:
def get_scores(file: Path) -> str:
    """Return the mean comparison score of one result CSV as a percentage.

    Every row's ``GPT4_COMPARISON_RESPONSE`` text must contain exactly one
    verdict tag: ``[[A]]``, ``[[B]]`` or ``[[C]]``. For rows whose
    ``IS_SHUFFLED`` value is falsy the verdicts score 0, 1 and 0.5
    respectively; for rows where it is truthy the A and B scores are swapped
    (presumably because the two answers were shown to the judge in swapped
    order -- confirm against the code that produced the CSVs). The literal
    response ``"Error in response"`` always scores 0.

    Args:
        file: Path to a CSV file that has the ``GPT4_COMPARISON_RESPONSE``
            and ``IS_SHUFFLED`` columns.

    Returns:
        The mean row score multiplied by 100, formatted with no decimals and
        a trailing percent sign, e.g. ``"80%"``.

    Raises:
        ValueError: If the file has no rows, a response is not a string, or
            a response does not contain exactly one verdict tag.
    """
    score_mapping = {"A": 0, "B": 1, "C": 0.5}
    # For shuffled rows A and B trade scores; C is unaffected.
    shuffled_score_mapping = {
        "A": score_mapping["B"],
        "B": score_mapping["A"],
        "C": score_mapping["C"],
    }
    # Compiled once per file instead of once per row.
    verdict_pattern = re.compile(r"\[\[([ABC])\]\]")

    def get_score(text: str, is_shuffled: bool) -> float:
        """Score a single response, honouring the shuffle flag."""
        # Empty CSV cells are read as NaN (a float); report them clearly
        # instead of letting the regex call fail with a TypeError.
        if not isinstance(text, str):
            raise ValueError(f"response is not text: {text!r}")
        if text == "Error in response":
            return score_mapping["A"]
        matches = verdict_pattern.findall(text)
        # Explicit raise rather than assert so the check survives `python -O`.
        if len(matches) != 1:
            raise ValueError(f"text is {text}\nmatches is {matches}")
        mapping = shuffled_score_mapping if is_shuffled else score_mapping
        return mapping[matches[0]]

    df = pd.read_csv(file)
    # A mean over zero rows is undefined; say so instead of failing later
    # with an unrelated formatting error.
    if df.empty:
        raise ValueError(f"{file} contains no rows to score")
    winners = pd.Series(
        [
            get_score(text, is_shuffled)
            for text, is_shuffled in zip(df[GPT4_COMPARISON_RESPONSE], df[IS_SHUFFLED])
        ],
        dtype=float,
    )
    return f"{winners.mean() * 100:.0f}%"

In [4]:
# Score every CSV in GPT35_RESULT_DIR, keyed by file name without extension.
gpt35_result_files = sorted(GPT35_RESULT_DIR.glob("*.csv"))
gpt35_win_rates = {}
for result_file in gpt35_result_files:
    try:
        gpt35_win_rates[result_file.stem] = get_scores(result_file)
    except Exception as e:
        # Best-effort: a file that cannot be scored is left out of the
        # results, so name it to make the omission traceable.
        print(f"{result_file.name}: {e}")
display(gpt35_win_rates)

{'cleaned_llama2_lr_3e-4_ckpt117_GPT4_comparison_evaluations': '10%',
 'cleaned_llama2_lr_3e-5_ckpt117_GPT4_comparison_evaluations': '80%',
 'cleaned_llama2_lr_3e-6_ckpt117_GPT4_comparison_evaluations': '50%',
 'cleaned_llama3_lr_3e-4_ckpt117_GPT4_comparison_evaluations': '100%',
 'cleaned_llama3_lr_3e-5_ckpt117_GPT4_comparison_evaluations': '90%',
 'cleaned_llama3_lr_3e-6_ckpt117_GPT4_comparison_evaluations': '30%'}

In [5]:
# Score every CSV in LLAMA_BASE_RESULT_DIR, keyed by file name without extension.
llama_base_result_files = sorted(LLAMA_BASE_RESULT_DIR.glob("*.csv"))
llama_base_win_rates = {}
for result_file in llama_base_result_files:
    try:
        llama_base_win_rates[result_file.stem] = get_scores(result_file)
    except Exception as e:
        # Best-effort: a file that cannot be scored is left out of the
        # results, so name it to make the omission traceable.
        print(f"{result_file.name}: {e}")
display(llama_base_win_rates)

{'cleaned_llama2_lr_3e-4_ckpt117_LlaMA_base_comparison_evaluations': '50%',
 'cleaned_llama2_lr_3e-5_ckpt117_LlaMA_base_comparison_evaluations': '50%',
 'cleaned_llama2_lr_3e-6_ckpt117_LlaMA_base_comparison_evaluations': '50%',
 'cleaned_llama3_lr_3e-4_ckpt117_LlaMA_base_comparison_evaluations': '50%',
 'cleaned_llama3_lr_3e-5_ckpt117_LlaMA_base_comparison_evaluations': '40%',
 'cleaned_llama3_lr_3e-6_ckpt117_LlaMA_base_comparison_evaluations': '45%'}