In [2]:
import re 
import sys 
sys.path.append('..')
import pandas as pd 
from utils.utils import read_jsonl, extract_hash_answer
from utils.prompt_hub import get_confidence_prompt

def ruler_eval_func(base_url, predict_url):
    base = read_jsonl(base_url)
    check = read_jsonl(predict_url)
    predicts = []
    targets = []
    task_labels = []
    for b, predict in zip(base, check):
        targets.append([p.lower() for p in predict['reference']])
        if b['task_label'] == 'cwe' or b['task_label'] == 'fwe' or b['task_label'] == 'vt':
            predicts.append([re.sub(r'[^a-zA-Z]', '', w).lower() for w in predict['completion'].split(" ")])
        elif b['task_label'] == 'niah_multikey_1' or b['task_label'] == 'niah_multikey_2' or b['task_label'] == 'niah_single_1' or b['task_label'] == 'niah_single_2':
            predicts.append([re.sub(r'[^0-9]', '', predict['completion']).lower()]) 
        elif b['task_label'] == 'niah_multikey_3' or b['task_label'] == 'niah_single_3':
            predicts.append([re.sub(r'[^a-zA-Z0-9-]', '', predict['completion']).lower()])
        elif b['task_label'] == 'niah_multiquery' or b['task_label'] == 'niah_multivalue':
            predicts.append([re.sub(r'[^0-9]', '', w).lower() for w in predict['completion'].split(" ")])
        else:
            predicts.append([predict['completion'].lower()])
        task_labels.append(b['task_label'])
    tf = []
    #new_p = []
    for b, p, t in zip(base, predicts, targets):
        p = [x for x in p if x != '']
        #new_p.append(p)
        if 'qa' in b['task_label']:
            if set(p).issubset(set(t)):
                tf.append(1)
            else:
                tf.append(0)
        else:
            try:
                if set(p) == set(t):
                    tf.append(1)
                else:
                    tf.append(0) 
            except:
                print(p)
    return tf, task_labels 


In [3]:
train_tf, train_task = ruler_eval_func("/mnt/home/chaeyun-jang/gcsft/data/processed/ruler_4k_train.jsonl",
                                 "/mnt/home/chaeyun-jang/gcsft/logs/trash/llama/ruler_4k_train_seed_samples/outputs_base_argmax.jsonl")
val_tf, val_task = ruler_eval_func("/mnt/home/chaeyun-jang/gcsft/data/processed/ruler_4k_valid.jsonl",
                                 "/mnt/home/chaeyun-jang/gcsft/logs/trash/llama/ruler_4k_valid_seed_samples/outputs_base_argmax.jsonl")

In [16]:
train_conf_c = list(set(train_task))
train_conf = []

for c in train_conf_c:
    train_indices = [i for i, x in enumerate(train_task) if x == c]
    train_c_conf = sum([train_tf[i] for i in train_indices]) / len(train_indices)
    train_conf.append(f"{int(train_c_conf*100)}")
    
valid_conf_c = list(set(val_task))
valid_conf = []

for c in valid_conf_c:
    val_indices = [i for i, x in enumerate(val_task) if x == c]
    val_c_conf = sum([val_tf[i] for i in val_indices]) / len(val_indices)
    valid_conf.append(f"{int(val_c_conf*100)}")

In [17]:
print(train_conf)
print(valid_conf)

train_conf_labels = []
for t in train_task:
    idx = train_conf_c.index(t)
    train_conf_labels.append(train_conf[idx])

valid_conf_labels = []
for t in val_task:
    idx = valid_conf_c.index(t)
    valid_conf_labels.append(valid_conf[idx])

['98', '100', '99', '97', '41', '48', '14', '100', '87', '97', '39', '96', '0']
['98', '100', '100', '98', '38', '46', '12', '98', '92', '100', '34', '98', '0']


In [18]:
ruler_4k_train_base = read_jsonl('../logs/trash/llama/ruler_4k_train_seed_samples/outputs_base_argmax.jsonl')
ruler_4k_valid_base = read_jsonl('../logs/trash/llama/ruler_4k_valid_seed_samples/outputs_base_argmax.jsonl')

In [19]:
ruler_train_data = pd.DataFrame({
    'input_prompt': [x['input'] for x in ruler_4k_train_base],
    'predicted_answer': [x['completion'] for x in ruler_4k_train_base],
    'target_answer': [x['reference'] for x in ruler_4k_train_base],
    'conf_input_single': [x['input'] + x['completion'] + "</answer>\n\n" +get_confidence_prompt('default')[:-12] for x in ruler_4k_train_base],
    'conf_input_multi': [x['input'] + x['completion'] + "</answer>\n\n" +get_confidence_prompt('multi')[:-22] + "<reasoning_confidence>N/A</reasoning_confidence>\n<evidence_confidence>" for x in ruler_4k_train_base],
    'conf_label_single': ["<confidence>" + c + "</confidence>" for c in train_conf_labels],
    'conf_label_multi': [ c + "</evidence_confidence>" for c in train_conf_labels],
    'task_type': ['ruler' for _ in range(len(ruler_4k_train_base))]})

ruler_valid_data = pd.DataFrame({
    'input_prompt': [x['input'] for x in ruler_4k_valid_base],
    'predicted_answer': [x['completion'] for x in ruler_4k_valid_base],
    'target_answer': [x['reference'] for x in ruler_4k_valid_base],
    'conf_input_single': [x['input'] + x['completion'] + "</answer>\n\n" + get_confidence_prompt('default')[:-12] for x in ruler_4k_valid_base],
    'conf_input_multi': [x['input'] + x['completion'] + "</answer>\n\n" + get_confidence_prompt('multi')[:-22] + "<reasoning_confidence>N/A</reasoning_confidence>\n<evidence_confidence>" for x in ruler_4k_valid_base],
    'conf_label_single': ["<confidence>" + c + "</confidence>" for c in valid_conf_labels],
    'conf_label_multi': [c + "</evidence_confidence>" for c in valid_conf_labels],
    'task_type': ['ruler' for _ in range(len(ruler_4k_valid_base))]})

In [20]:
print(ruler_train_data['conf_input_multi'][0]+ruler_train_data['conf_label_multi'][0])
print('---')
print(ruler_valid_data['conf_input_multi'][0]+ruler_valid_data['conf_label_multi'][0])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 15 Jan 2026

You are an expert assistant that provides clear and helpful answers.<|eot_id|><|start_header_id|>user<|end_header_id|>

You are an answering assistant.
Given a question, provide the final answer.
Respond with the answer enclosed in <answer> and </answer> tags.
The <answer> tag must contain only the final answer, with no additional explanation.

Below is a numbered list of words. In these words, some appear more often than others. Memorize the ones that appear most often.
1. gratitude 2. wheel 3. attorney 4. company 5. tortilla 6. mortise 7. develop 8. boyhood 9. mortise 10. chin 11. boyhood 12. denominator 13. wheel 14. denominator 15. hygienic 16. district 17. armchair 18. freak 19. wheel 20. hygienic 21. attorney 22. overview 23. lovely 24. gratitude 25. gratitude 26. freak 27. clearance 28. runway 29. hygienic 30. attorney 31. bench 32. blush 33. tortilla 34. f

In [21]:
ruler_train_data.to_csv("../data/train_data/Llama-3.2-3B-Instruct/sft_base/ruler_4k_train.csv", index=False)
ruler_valid_data.to_csv("../data/train_data/Llama-3.2-3B-Instruct/sft_base/ruler_4k_valid.csv", index=False)

## MATH data

In [24]:
math_base = read_jsonl('/mnt/home/chaeyun-jang/gcsft/logs/training_data/llama_3b_math_train.jsonl')
math_parsed = read_jsonl('/mnt/home/chaeyun-jang/gcsft/logs/training_data/llama_3b_math_train_parsed.jsonl')
base_csv = pd.read_csv("/mnt/home/chaeyun-jang/gcsft/data/processed/math_train.csv")

math_tasks = base_csv['type'].tolist()
math_samples_tf = [1 if 'yes' in x['parsed'].lower() else 0 for x in math_parsed]

In [25]:
math_base[1]

{'input': '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 27 Jan 2026\n\nYou are an expert assistant that provides clear and helpful answers.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nYou are a reasoning assistant.\nGiven a question, think step by step to arrive at the correct answer.\nFirst, provide your reasoning enclosed in <think> and </think> tags.\nThen, provide the final answer enclosed in <answer> and </answer> tags.\nThe <answer> tag must contain only the final answer, with no additional explanation.\n\nTom has a red marble, a green marble, a blue marble, and three identical yellow marbles. How many different groups of two marbles can Tom choose?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n<think>',
 'completion': " \nTo solve this problem, we need to consider the different combinations of marbles Tom can choose. \nFirst, we'll consider the cases where Tom chooses two marbles of the sam

In [26]:
conf = []
math_conf_c = list(set(math_tasks))
for c in math_conf_c:
    math_indices = [i for i, x in enumerate(math_tasks) if x == c]
    math_c_conf = sum([math_samples_tf[i] for i in math_indices]) / len(math_indices)
    conf.append(f"{int(math_c_conf*100)}")
    
conf_labels = [conf[math_conf_c.index(t)] for t in math_tasks]

In [27]:
full_data = pd.DataFrame({
    'input_prompt': [x['input'] for x in math_base],
    'predicted_answer': [x['completion'] for x in math_base],
    'target_answer': [x['reference'] for x in math_base],
    'conf_input_single': [x['input'] + x['completion'] + "</answer>\n\n" + get_confidence_prompt('default')[:-12] for x in math_base],
    'conf_input_multi': [x['input'] + x['completion'] + "</answer>\n\n" + get_confidence_prompt('multi')[:-22] + "<reasoning_confidence>" for x in math_base],
    'conf_label_single': ["<confidence>" + c + "</confidence>" for c in conf_labels],
    'conf_label_multi': [ c + "</reasoning_confidence>" for c in conf_labels],
    'task_type': ['math' for _ in range(len(math_base))]})

full_data = full_data.sample(frac=1, random_state=42).reset_index(drop=True)

math_train_data = full_data.iloc[:int(len(full_data)*0.8)]
math_valid_data = full_data.iloc[int(len(full_data)*0.8):].reset_index(drop=True)

print(full_data['conf_input_multi'][0] + full_data['conf_label_multi'][0])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 27 Jan 2026

You are an expert assistant that provides clear and helpful answers.<|eot_id|><|start_header_id|>user<|end_header_id|>

You are a reasoning assistant.
Given a question, think step by step to arrive at the correct answer.
First, provide your reasoning enclosed in <think> and </think> tags.
Then, provide the final answer enclosed in <answer> and </answer> tags.
The <answer> tag must contain only the final answer, with no additional explanation.

Find the polynomial $p(x),$ with real coefficients, such that
\[p(x^3) - p(x^3 - 2) = [p(x)]^2 + 12\]for all real numbers $x.$<|eot_id|><|start_header_id|>assistant<|end_header_id|>

<think>First, we are given the equation $p(x^3) - p(x^3 - 2) = [p(x)]^2 + 12$. We can start by letting $y = x^3$ to simplify the equation. This gives us $p(y) - p(y - 2) = [p(x)]^2 + 12$. We can then substitute $x^3$ back in for $y$ to get $p(x^

In [29]:
math_train_data.to_csv("../data/train_data/Llama-3.2-3B-Instruct/sft_base/math_train.csv", index=False)
math_valid_data.to_csv("../data/train_data/Llama-3.2-3B-Instruct/sft_base/math_valid.csv", index=False)