# RULER evaluation

In [29]:
import re 
import sys 
sys.path.append('..')
from utils.utils import read_jsonl, extract_hash_answer
from utils.calibration_utils import auroc, ece, brier_score, nll

# Task labels grouped by how the raw completion is normalised before scoring.
_ALPHA_WORD_TASKS = frozenset({"cwe", "fwe", "vt"})
_DIGITS_ONLY_TASKS = frozenset(
    {"niah_multikey_1", "niah_multikey_2", "niah_single_1", "niah_single_2"}
)
_ALNUM_DASH_TASKS = frozenset({"niah_multikey_3", "niah_single_3"})
_DIGIT_WORD_TASKS = frozenset({"niah_multiquery", "niah_multivalue"})


def _parse_confidence(predict, conf_type):
    """Return the confidence of one prediction record scaled by 0.01."""
    if conf_type != "multi":
        return int(predict['confidence']) * 0.01
    try:
        # Keep only the text inside the last <evidence_confidence> tag pair,
        # then drop everything that is not a digit before converting.
        tagged = predict['confidence'].split("<evidence_confidence>")[-1]
        tagged = tagged.split("</evidence_confidence>")[0]
        return int(re.sub(r'[^0-9]', '', tagged)) * 0.01
    except (KeyError, AttributeError, TypeError, ValueError):
        # Missing, non-string or digit-free confidence: fall back to 0.0.
        return 0.0


def _normalize_prediction(task_label, completion):
    """Turn a raw completion into the list of non-empty answer tokens for a task."""
    if task_label in _ALPHA_WORD_TASKS:
        tokens = [re.sub(r'[^a-zA-Z]', '', w).lower() for w in completion.split(" ")]
    elif task_label in _DIGITS_ONLY_TASKS:
        tokens = [re.sub(r'[^0-9]', '', completion).lower()]
    elif task_label in _ALNUM_DASH_TASKS:
        tokens = [re.sub(r'[^a-zA-Z0-9-]', '', completion).lower()]
    elif task_label in _DIGIT_WORD_TASKS:
        tokens = [re.sub(r'[^0-9]', '', w).lower() for w in completion.split(" ")]
    else:
        tokens = [completion.lower()]
    return [tok for tok in tokens if tok != '']


def ruler_eval_func(base_url, predict_url, conf_type="single"):
    """Score RULER predictions and collect their confidences.

    Rows of the two JSONL files are paired by position (``zip``), so any extra
    rows in the longer file are ignored.

    Args:
        base_url: Path to the JSONL file whose rows carry ``task_label``.
        predict_url: Path to the JSONL file whose rows carry ``completion``,
            ``reference`` (an iterable of strings) and ``confidence``.
        conf_type: ``"multi"`` extracts the number inside the last
            ``<evidence_confidence>...</evidence_confidence>`` tag and falls
            back to 0.0 when it cannot be parsed; any other value converts
            ``confidence`` with ``int`` directly (and raises if that fails).

    Returns:
        ``(tf, conf)``: two lists of equal length holding, per example, a 0/1
        correctness flag and the confidence multiplied by 0.01.

    Scoring: for task labels containing ``"qa"`` the prediction is correct when
    it is non-empty and every predicted token is one of the references; for all
    other tasks the predicted token set must equal the reference set.
    """
    base = read_jsonl(base_url)
    check = read_jsonl(predict_url)
    tf = []
    conf = []
    for b, predict in zip(base, check):
        conf.append(_parse_confidence(predict, conf_type))
        target = {ref.lower() for ref in predict['reference']}
        answer = set(_normalize_prediction(b['task_label'], predict['completion']))
        if 'qa' in b['task_label']:
            # An empty set is a subset of everything, so an empty answer must
            # be rejected explicitly instead of being counted as correct.
            correct = bool(answer) and answer.issubset(target)
        else:
            correct = answer == target
        # Exactly one flag per example keeps tf aligned with conf.
        tf.append(1 if correct else 0)
    return tf, conf

In [30]:
def print_results(tf, conf):
    """Print AUROC, accuracy, ECE (10 bins) and Brier score for one run.

    Args:
        tf: Per-example correctness flags; their mean is reported as "ACC".
        conf: Per-example confidences, passed together with ``tf`` to the
            calibration helpers.
    """
    # Each metric is computed right before it is printed, so the output order
    # and the point of failure match a plain sequence of print calls.
    # NLL reporting is currently disabled.
    report = (
        ("AUROC: ", lambda: auroc(tf, conf)),
        ("ACC: ", lambda: sum(tf)/len(tf)),
        ("ECE: ", lambda: ece(tf, conf, n_bins=10)),
        ("BS: ", lambda: brier_score(tf, conf)),
    )
    for label, compute in report:
        print(label, compute())

In [36]:
# Score one RULER prediction log in "multi" confidence mode and print the metrics.
# NOTE(review): the reference file is ruler_4k_test while the prediction log is
# ruler_8k_test; ruler_eval_func pairs rows by position, so confirm both files
# list the same examples in the same order.
tf, conf = ruler_eval_func(
    "../data/processed/ruler_4k_test.jsonl",
    "/mnt/home/chaeyun-jang/gcsft/logs/zero_shot_test_evals/Llama-3.2-3B-Instruct_csft_multi_seed0_lr0.0001_kl1.0_bs1_gs32_ms2000_ck1_checkpoint-2000/Llama-3.2-3B-Instruct_ruler_8k_test.jsonl",
    conf_type="multi"
)
print_results(tf, conf)

AUROC:  0.6957016840417001
ACC:  0.6692307692307692
ECE:  0.37153846153846154
BS:  0.37170769230769235


In [20]:
def gsm_eval_func(eval_url, conf_url):
    """Score GSM predictions by digit-only match and collect confidences.

    Args:
        eval_url: Path to a JSONL file whose rows carry ``gold_answer`` and
            ``parsed``; the predicted answer is the text after the last
            ``"**Model's Final Answer is:** "`` marker in ``parsed``.
        conf_url: Path to a JSONL file whose rows carry ``confidence``.

    Returns:
        ``(tf, conf)``: ``tf`` holds one 0/1 flag per row of ``eval_url``
        (1 when prediction and gold answer contain the same digit sequence),
        ``conf`` holds one value per row of ``conf_url``: the digits of
        ``confidence`` multiplied by 0.01.

    Note:
        Only digits are compared, so signs, decimal points and separators are
        ignored. The two files are read independently; nothing here checks
        that they have the same number of rows.
    """
    data = read_jsonl(eval_url)
    conf_data = read_jsonl(conf_url)
    gold_answer = [extract_hash_answer(x['gold_answer']) for x in data]
    predicted_answer = [x['parsed'].split("**Model's Final Answer is:** ")[-1] for x in data]
    # Return 0/1 ints (not bools) so the flags match the other evaluators.
    tf = [
        int(re.sub(r'[^0-9]', '', predicted) == re.sub(r'[^0-9]', '', gold))
        for predicted, gold in zip(predicted_answer, gold_answer)
    ]
    conf = [int(re.sub(r'[^0-9]', '', x['confidence']))*0.01 for x in conf_data]
    return tf, conf

# Score the GSM run: the first file supplies parsed answers and gold answers,
# the second supplies the confidences; then print the metrics.
tf, conf = gsm_eval_func(
    "/mnt/home/chaeyun-jang/gcsft/logs/zero_shot_test_evals/llama_3b_gsm_test_parsed.jsonl",
    "/mnt/home/chaeyun-jang/gcsft/logs/zero_shot_test_evals/llama_3b_gsm_test.jsonl"
    )
print_results(tf, conf) 

AUROC:  0.4688139031720253
ACC:  0.6921910538286581
ECE:  0.24116755117513272
BS:  0.2747422289613343


In [19]:
def math_eval_func(eval_url, conf_url):
    """Score MATH predictions from a yes/no verdict and collect confidences.

    Args:
        eval_url: Path to a JSONL file whose rows carry ``parsed``; a row is
            counted correct when its lower-cased ``parsed`` text contains
            ``"yes"`` anywhere.
        conf_url: Path to a JSONL file whose rows carry ``confidence``.

    Returns:
        ``(tf, conf)``: 0/1 flags per row of ``eval_url`` and, per row of
        ``conf_url``, ``int(confidence)`` multiplied by 0.01.
    """
    data = read_jsonl(eval_url)
    conf_data = read_jsonl(conf_url)

    tf = []
    for record in data:
        verdict = record['parsed'].lower()
        tf.append(int("yes" in verdict))

    conf = []
    for record in conf_data:
        conf.append(int(record['confidence'])*0.01)

    return tf, conf

# Score the MATH run: the first file supplies the parsed verdicts, the second
# supplies the confidences; then print the metrics.
tf, conf = math_eval_func(
    "/mnt/home/chaeyun-jang/gcsft/logs/zero_shot_test_evals/qwen_4b_math_test_parsed.jsonl",
    "/mnt/home/chaeyun-jang/gcsft/logs/zero_shot_test_evals/qwen_4b_math_test.jsonl"
)
print_results(tf, conf) 

AUROC:  0.8244270178457195
ACC:  0.7884
ECE:  0.19105999999999979
BS:  0.180475
