# RULER evaluation

In [14]:
import re 
import sys 
sys.path.append('..')
from utils.utils import read_jsonl, extract_hash_answer
from utils.calibration_utils import auroc, ece, brier_score, nll

def ruler_eval_func(base_url, predict_url, conf_type="single"):
    base = read_jsonl(base_url)
    check = read_jsonl(predict_url)
    predicts = []
    targets = []
    conf = []
    tasks = []
    for b, predict in zip(base, check):
        if conf_type == "multi":
            try:
                conf.append(int(re.sub(r'[^0-9]', '', predict['confidence'].split("<evidence_confidence>")[-1].split("</evidence_confidence>")[0]))*0.01)
            except:
                conf.append(0.0)
        else:
            try:
                conf.append(int(predict['confidence'])*0.01)
            except:
                conf.append(0.0)
        targets.append([p.lower() for p in predict['reference']])
        if b['task_label'] == 'cwe' or b['task_label'] == 'fwe' or b['task_label'] == 'vt':
            predicts.append([re.sub(r'[^a-zA-Z]', '', w).lower() for w in predict['completion'].split(" ")])
        elif b['task_label'] == 'niah_multikey_1' or b['task_label'] == 'niah_multikey_2' or b['task_label'] == 'niah_single_1' or b['task_label'] == 'niah_single_2':
            predicts.append([re.sub(r'[^0-9]', '', predict['completion']).lower()]) 
        elif b['task_label'] == 'niah_multikey_3' or b['task_label'] == 'niah_single_3':
            predicts.append([re.sub(r'[^a-zA-Z0-9-]', '', predict['completion']).lower()])
        elif b['task_label'] == 'niah_multiquery' or b['task_label'] == 'niah_multivalue':
            predicts.append([re.sub(r'[^0-9]', '', w).lower() for w in predict['completion'].split(" ")])
        else:
            predicts.append([predict['completion'].lower()])
        tasks.append(b['task_label'])
    tf = []
    #new_p = []
    for b, p, t in zip(base, predicts, targets):
        p = [x for x in p if x != '']
        #new_p.append(p)
        if 'qa' in b['task_label']:
            if set(p).issubset(set(t)):
                tf.append(1)
            else:
                tf.append(0)
        else:
            try:
                if set(p) == set(t):
                    tf.append(1)
                else:
                    tf.append(0) 
            except:
                print(p)
    return tf, conf, tasks

In [15]:
import numpy as np 

def print_results(results_by_ds, n_bins=10):
    """
    results_by_ds: dict[str, tuple(tf, conf)]
      - tf: iterable of {0,1}
      - conf: iterable of floats in [0,1]
    """
    per_ds = {}
    for ds_name, (tf, conf) in results_by_ds.items():
        acc = sum(tf) / len(tf)
        conf = [c for c in conf if c >= 0 and c <=1]
        tf = [t for t, c in zip(tf, conf) if c >= 0 and c <=1]
        tf = np.asarray(tf, dtype=float)
        conf = np.asarray(conf, dtype=float)

        per_ds[ds_name] = {
            "N": len(tf),
            "ACC": acc,
            "ECE": float(ece(tf, conf, n_bins=n_bins)),
            "BS": float(brier_score(tf, conf)),
            "AUROC": float(auroc(tf, conf)),
        }

    # macro-average (each dataset equally weighted)
    keys = ["ACC", "ECE", "BS", "AUROC"]
    macro = {k: float(np.mean([per_ds[d][k] for d in per_ds])) for k in keys}

    # print per-dataset + macro
    for d in per_ds:
        m = per_ds[d]
        print(f"[{d}] N={m['N']} | ACC={m['ACC']:.4f} | ECE={m['ECE']:.4f} | BS={m['BS']:.4f} | AUROC={m['AUROC']:.4f}")

    print("----------------------------------------------------------------")
    print(f"[MACRO] (equal weight per dataset) | ACC={macro['ACC']:.4f} | ECE={macro['ECE']:.4f} | BS={macro['BS']:.4f} | AUROC={macro['AUROC']:.4f}")

In [28]:
tf_4k, conf_4k, task_4k  = ruler_eval_func("/mnt/home/chaeyun-jang/gcsft/data/processed/ruler_4k_test.jsonl",
                                 "/mnt/home/chaeyun-jang/gcsft/logs/multi_finals/prev_multi_seed0_lr0.0001_kl1.0_checkpoint-2000/Llama-3.2-3B-Instruct_ruler_4k_test.jsonl",
                                 "multi")
tf_8k, conf_8k, task_8k = ruler_eval_func("/mnt/home/chaeyun-jang/gcsft/data/processed/ruler_8k_test.jsonl",
                                 "/mnt/home/chaeyun-jang/gcsft/logs/multi_finals/prev_multi_seed0_lr0.0001_kl1.0_checkpoint-2000/Llama-3.2-3B-Instruct_ruler_8k_test.jsonl",
                                 "multi")

In [29]:
tf, conf, task = tf_4k + tf_8k, conf_4k + conf_8k, task_4k + task_8k
print_results({
    "ruler_4k": (tf_4k, conf_4k),
    "ruler_8k": (tf_8k, conf_8k),
    "ruler_all": (tf, conf)
})

[ruler_4k] N=650 | ACC=0.7262 | ECE=0.1338 | BS=0.1338 | AUROC=0.8536
[ruler_8k] N=650 | ACC=0.6692 | ECE=0.1492 | BS=0.1492 | AUROC=0.8332
[ruler_all] N=1300 | ACC=0.6977 | ECE=0.1415 | BS=0.1415 | AUROC=0.8423
----------------------------------------------------------------
[MACRO] (equal weight per dataset) | ACC=0.6977 | ECE=0.1415 | BS=0.1415 | AUROC=0.8431


In [30]:
#print_results(tf, conf)
all_task = ['qa']
for at in all_task:
    print(f"----- {at} -----")
    idxs = [i for i, t in enumerate(task) if at in t]
    tf_sub = [tf[i] for i in idxs]
    conf_sub = [conf[i] for i in idxs]
    print_results({at: (tf_sub, conf_sub)})

----- qa -----
[qa] N=200 | ACC=0.4550 | ECE=0.4200 | BS=0.4200 | AUROC=0.5766
----------------------------------------------------------------
[MACRO] (equal weight per dataset) | ACC=0.4550 | ECE=0.4200 | BS=0.4200 | AUROC=0.5766


In [19]:
data = read_jsonl("/mnt/home/chaeyun-jang/gcsft/logs/new_multi_csft/Llama-3.2-3B-Instruct_csft_multi_seed0_lr1e-05_kl0.0_bs1_gs64_ms5000_ck0_checkpoint-1500/Llama-3.2-3B-Instruct_ruler_4k_test.jsonl")


In [20]:
data

[{'input': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 27 Jan 2026\n\nYou are an expert assistant that provides clear and helpful answers.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nYou are an answering assistant.\nGiven a question, provide the final answer.\nRespond with the answer enclosed in <answer> and </answer> tags.\nThe <answer> tag must contain only the final answer, with no additional explanation.\n\nBelow is a numbered list of words. In these words, some appear more often than others. Memorize the ones that appear most often.\n1. rape 2. blend 3. harm 4. kitty 5. snotty 6. wonderful 7. sticker 8. hall 9. wonderful 10. library 11. hall 12. polliwog 13. blend 14. polliwog 15. latex 16. bridge 17. pit 18. captain 19. blend 20. latex 21. harm 22. hate 23. modeling 24. rape 25. rape 26. captain 27. assignment 28. speaker 29. latex 30. harm 31. everyone 32. brace 33. snotty 34. captain 35. polliwog 36

# GSM Evaluation

In [None]:
def gsm_eval_func(eval_url, conf_url, conf_type="multi"):
    data = read_jsonl(eval_url)
    conf_data = read_jsonl(conf_url)
    gold_answer = [extract_hash_answer(x['gold_answer']) for x in data]
    predicted_answer = [x['parsed'].split("**Model's Final Answer is:** ")[-1] for x in data]
    tf = [re.sub(r'[^0-9]', '', predicted_answer[i]).lower() == re.sub(r'[^0-9]', '', gold_answer[i]).lower() for i in range(len(data))]
    if conf_type == "multi":
        conf = []
        for c in conf_data:
            try:
                conf.append(int(re.sub(r'[^0-9]', '', c['confidence'].split("</reasoning_confidence>")[0]))*0.01)
            except:
                conf.append(0.0)
    else:
        conf = [int(re.sub(r'[^0-9]', '', x['confidence']))*0.01 for x in conf_data]
    return tf, conf

gsm_tf, gsm_conf = gsm_eval_func(
    "/mnt/home/chaeyun-jang/gcsft/logs/new_multi_csft/Llama-3.2-3B-Instruct_csft_multi_seed0_lr1e-05_kl0.0_bs1_gs64_ms5000_ck0_checkpoint-1500/Llama-3.2-3B-Instruct_gsm_test.jsonl",
    "/mnt/home/chaeyun-jang/gcsft/logs/new_multi_csft/Llama-3.2-3B-Instruct_csft_multi_seed0_lr1e-05_kl0.0_bs1_gs64_ms5000_ck0_checkpoint-1500/Llama-3.2-3B-Instruct_gsm_test.jsonl",
   conf_type="multi"
   )
print_results({
    "gsm8k": (gsm_tf, gsm_conf),
})

[gsm8k] N=1319 | ACC=0.7339 | ECE=0.1832 | BS=0.2268 | AUROC=0.5087
----------------------------------------------------------------
[MACRO] (equal weight per dataset) | ACC=0.7339 | ECE=0.1832 | BS=0.2268 | AUROC=0.5087


# MATH evaluation

In [8]:
def math_eval_func(eval_url, conf_url):
    data = read_jsonl(eval_url)
    conf_data = read_jsonl(conf_url)
    tf = [1 if "yes" in x['parsed'].lower() else 0 for x in data]
    conf = []
    for c in conf_data:
        try:
            conf.append(int(re.sub(r'[^0-9]', '', c['confidence'].split("</confidence>")[0]))*0.01)
        except:
            conf.append(0.0)
    return tf, conf

math_tf, math_conf = math_eval_func(
    "/mnt/home/chaeyun-jang/gcsft/logs/trained_models/Llama-3.2-3B-Instruct_brier_gsm_seed42_lr1e-05_kl0.6_bs1_gs32_ms1000_checkpoint-800/Llama-3.2-3B-Instruct_math_test_parsed.jsonl",
    "/mnt/home/chaeyun-jang/gcsft/logs/trained_models/Llama-3.2-3B-Instruct_brier_gsm_seed42_lr1e-05_kl0.6_bs1_gs32_ms1000_checkpoint-800/Llama-3.2-3B-Instruct_math_test.jsonl"
)
print_results({
    "math": (math_tf, math_conf),
})

[math] N=5000 | ACC=0.3788 | ECE=0.5507 | BS=0.5412 | AUROC=0.5557
----------------------------------------------------------------
[MACRO] (equal weight per dataset) | ACC=0.3788 | ECE=0.5507 | BS=0.5412 | AUROC=0.5557


In [9]:
def mmlu_eval_func(eval_url, conf_type="single"):
    data = read_jsonl(eval_url)
    predicted_answer = [x['completion'].split("<answer>")[-1] for x in data]
    print(predicted_answer)
    tf = []
    for i in range(len(data)):
        if re.sub(r'[^a-zA-Z]', '', predicted_answer[i]).lower() == re.sub(r'[^a-zA-Z]', '', str(data[i]['reference'])).lower():
            tf.append(1)
        else:
            # data[i]['input_prompt']에서 f'{data[i]['reference']}. ' 뒤의 답을 Extract해서 그 답에 predicted 가 들어가면 인정
            true_answer_text = data[i]['input'].split(f"{data[i]['reference']}. ")[-1].split("\n")[0]
            if re.sub(r'[^a-zA-Z]', '', predicted_answer[i]).lower() in re.sub(r'[^a-zA-Z]', '', true_answer_text).lower():
                tf.append(1)
            else:
                tf.append(0)
    num = 0
    if conf_type == "multi":
        conf = []
        for c in data:
            try:
                r_c = int(re.sub(r'[^0-9]', '', c['confidence'].split("</reasoning_confidence>")[0]))*0.01
                e_c = int(re.sub(r'[^0-9]', '', c['confidence'].split("<evidence_confidence>")[-1].split("</evidence_confidence>")[0]))*0.01
                conf.append((r_c + e_c)/2)
                # add lower value
                #conf.append(min(r_c, e_c))
            except:
                try:
                    r_c = int(re.sub(r'[^0-9]', '', c['confidence'].split("</reasoning_confidence>")[0]))*0.01
                    conf.append(r_c)
                except:
                    num+=1
                    conf.append(0.0)
    else:
        #conf = [int(re.sub(r'[^0-9]', '', x['confidence']))*0.01 for x in data]
        conf = []
        for c in data:
            try:
                conf.append(int(re.sub(r'[^0-9]', '', c['confidence']))*0.01)
            except:
                 num+=1
                 conf.append(-1)
    print(num)
    return tf, conf

mmlu_tf, mmlu_conf = mmlu_eval_func("/mnt/home/chaeyun-jang/gcsft/logs/trained_models/Llama-3.2-3B-Instruct_brier_gsm_seed42_lr1e-05_kl0.6_bs1_gs32_ms1000_checkpoint-800/Llama-3.2-3B-Instruct_mmlu_test.jsonl",
                          "single")

['6', '2', '0', ' C ', ' C ', 'False, False', ' C ', ' C ', '4', '3', ' C ', ' D ', '5', ' C', '12', ' C ', ' C ', ' C ', ' B', ' C ', ' C ', '0', ' C ', ' C ', 'False, False', ' C ', 'False, False', ' B ', ' C ', ' C ', ' C ', ' B ', ' C ', ' C ', ' C ', '4', ' C ', ' D ', '3', ' C ', '0', '3', ' C ', 'False, True', ' C ', ' C ', ' C ', '1', ' C ', '240', 'False, True', '1', '4', ' C ', '2', 'False, True', ' C ', '3', ' C ', ' C ', ' C ', ' C ', ' C ', ' D ', '0', ' C ', ' C ', ' C', ' C', ' C ', ' B ', '4', '3', '3', ' C ', ' C ', ' C ', '0', ' C ', ' C ', ' C ', ' C ', ' C ', 'False, False', ' C ', ' C ', ' C ', '19', '6', 'False, True', ' C ', '2Z', ' C ', '12', ' C ', ' C ', '0', '6', '8', 'False, False', ' C ', 'AB', ' A ', ' B ', ' B', ' A ', 'AB', ' B ', 'B', ' C ', ' B ', ' D ', ' B ', ' C ', ' C ', ' A ', 'BC', ' D ', ' A ', ' C ', 'B', 'C', ' C ', ' C ', ' D ', ' C ', ' C ', ' A ', ' C ', ' D ', ' C ', ' B ', ' B ', ' D ', 'AB', ' A ', ' D ', ' C ', ' D ', ' C ', ' C', ' B '

In [10]:
import numpy as np 

def print_results(results_by_ds, n_bins=10):
    """
    results_by_ds: dict[str, tuple(tf, conf)]
      - tf: iterable of {0,1}
      - conf: iterable of floats in [0,1]
    """
    per_ds = {}
    for ds_name, (tf, conf) in results_by_ds.items():
        tf = np.asarray(tf, dtype=float)
        conf = np.asarray(conf, dtype=float)

        per_ds[ds_name] = {
            "N": len(tf),
            "ACC": float(tf.mean()),
            "ECE": float(ece(tf, conf, n_bins=n_bins)),
            "BS": float(brier_score(tf, conf)),
            "AUROC": float(auroc(tf, conf)),
        }

    # macro-average (each dataset equally weighted)
    keys = ["ACC", "ECE", "BS", "AUROC"]
    macro = {k: float(np.mean([per_ds[d][k] for d in per_ds])) for k in keys}

    # print per-dataset + macro
    for d in per_ds:
        m = per_ds[d]
        print(f"[{d}] N={m['N']} | ACC={m['ACC']:.4f} | ECE={m['ECE']:.4f} | BS={m['BS']:.4f} | AUROC={m['AUROC']:.4f}")

    print("----------------------------------------------------------------")
    print(f"[MACRO] (equal weight per dataset) | ACC={macro['ACC']:.4f} | ECE={macro['ECE']:.4f} | BS={macro['BS']:.4f} | AUROC={macro['AUROC']:.4f}")

print_results({
    "gsm8k": (gsm_tf, gsm_conf),
    "math":  (math_tf, math_conf),
    "mmlu":  (mmlu_tf, mmlu_conf),
})

[gsm8k] N=1319 | ACC=0.7339 | ECE=0.1832 | BS=0.2268 | AUROC=0.5087
[math] N=5000 | ACC=0.3788 | ECE=0.5507 | BS=0.5412 | AUROC=0.5557
[mmlu] N=14042 | ACC=0.7978 | ECE=0.1541 | BS=0.1970 | AUROC=0.5293
----------------------------------------------------------------
[MACRO] (equal weight per dataset) | ACC=0.6368 | ECE=0.2960 | BS=0.3217 | AUROC=0.5312


# ContractNLI evaluation

In [13]:
def contractnli_eval_func(eval_url, conf_type="multi"):
    data = read_jsonl(eval_url)
    predicted_answer = [x['completion'].split("<answer>")[-1] for x in data]
    tf = [re.sub(r'[^0-9]', '', predicted_answer[i]).lower() == re.sub(r'[^0-9]', '', str(data[i]['reference'])).lower() for i in range(len(data))]
    num = 0
    if conf_type == "multi":
        conf = []
        for c in data:
            try:
                r_c = int(re.sub(r'[^0-9]', '', c['confidence'].split("</reasoning_confidence>")[0]))*0.01
                e_c = int(re.sub(r'[^0-9]', '', c['confidence'].split("<evidence_confidence>")[-1].split("</evidence_confidence>")[0]))*0.01
                conf.append((r_c + e_c)/2)
                # add lower value
                #conf.append(min(r_c, e_c))
            except:
                try:
                    r_c = int(re.sub(r'[^0-9]', '', c['confidence'].split("</reasoning_confidence>")[0]))*0.01
                    conf.append(r_c)
                except:
                    num+=1
                    conf.append(0.0)
    else:
        conf = [int(re.sub(r'[^0-9]', '', x['confidence']))*0.01 for x in data]
    print(num)
    return tf, conf

# data = read_jsonl("/mnt/home/chaeyun-jang/gcsft/logs/multi_trained_test_evals/Llama-3.2-3B-Instruct_csft_multi_seed0_lr0.0001_kl1.0_bs1_gs32_ms2000_ck1_checkpoint-2000/Llama-3.2-3B-Instruct_contract_nli_test.jsonl")

tf, conf = contractnli_eval_func("/mnt/home/chaeyun-jang/gcsft/logs/new_multi_csft/Llama-3.2-3B-Instruct_csft_multi_seed0_lr1e-05_kl0.0_bs1_gs64_ms5000_ck0_checkpoint-1500/Llama-3.2-3B-Instruct_contract_nli_test.jsonl",
                                 "multi")
print_results({
    "contract_nli": (tf, conf),
})

0
[contract_nli] N=2091 | ACC=0.4725 | ECE=0.2201 | BS=0.2993 | AUROC=0.6109
----------------------------------------------------------------
[MACRO] (equal weight per dataset) | ACC=0.4725 | ECE=0.2201 | BS=0.2993 | AUROC=0.6109


# MMLU

In [13]:
import pandas as pd 

data = pd.read_csv("/mnt/home/chaeyun-jang/gcsft/data/train_data/Llama-3.2-3B-Instruct/csft/ruler_4k_train.csv")

In [17]:
print(data['conf_input_multi'][0])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 15 Jan 2026

You are an expert assistant that provides clear and helpful answers.<|eot_id|><|start_header_id|>user<|end_header_id|>

You are an answering assistant.
Given a question, provide the final answer.
Respond with the answer enclosed in <answer> and </answer> tags.
The <answer> tag must contain only the final answer, with no additional explanation.

Below is a numbered list of words. In these words, some appear more often than others. Memorize the ones that appear most often.
1. gratitude 2. wheel 3. attorney 4. company 5. tortilla 6. mortise 7. develop 8. boyhood 9. mortise 10. chin 11. boyhood 12. denominator 13. wheel 14. denominator 15. hygienic 16. district 17. armchair 18. freak 19. wheel 20. hygienic 21. attorney 22. overview 23. lovely 24. gratitude 25. gratitude 26. freak 27. clearance 28. runway 29. hygienic 30. attorney 31. bench 32. blush 33. tortilla 34. f