In [1]:
import re
import sys
from pathlib import Path

import pandas as pd

sys.path.append('..')  # must run before the project-local `utils` imports below
from utils.utils import read_jsonl, extract_hash_answer
from utils.prompt_hub import get_confidence_prompt

def _normalise_ruler_completion(task_label, completion):
    """Reduce a raw completion to the list of comparable answer strings for its task."""
    if task_label in ('cwe', 'fwe', 'vt'):
        # One alphabetic token per space-separated word.
        # NOTE(review): split(" ") only splits on single spaces, so newline-separated
        # answers are fused into one token -- confirm this matches the completion format.
        return [re.sub(r'[^a-zA-Z]', '', word).lower() for word in completion.split(" ")]
    if task_label in ('niah_multikey_1', 'niah_multikey_2', 'niah_single_1', 'niah_single_2'):
        # A single numeric answer: keep every digit of the whole completion.
        return [re.sub(r'[^0-9]', '', completion).lower()]
    if task_label in ('niah_multikey_3', 'niah_single_3'):
        # A single alphanumeric/hyphenated answer.
        return [re.sub(r'[^a-zA-Z0-9-]', '', completion).lower()]
    if task_label in ('niah_multiquery', 'niah_multivalue'):
        # Several numeric answers, one per space-separated word.
        return [re.sub(r'[^0-9]', '', word).lower() for word in completion.split(" ")]
    # Every other task (including the 'qa' ones) compares the lower-cased completion as is.
    return [completion.lower()]


def ruler_eval_func(base_url, predict_url):
    """Score RULER completions against their references, one 0/1 flag per record.

    Args:
        base_url: Location passed to ``read_jsonl``; each record must provide
            ``task_label``.
        predict_url: Location passed to ``read_jsonl``; each record must provide
            ``completion`` (str) and ``reference`` (iterable of str). Records are
            paired with those of ``base_url`` by position.

    Returns:
        list[int]: ``1`` when the normalised prediction matches, otherwise ``0``.
        Exactly one entry is produced per paired record, so index ``i`` of the
        result always refers to record ``i``.

    Matching rules:
        * tasks whose label contains ``'qa'``: the prediction must be non-empty
          and be one of the lower-cased references;
        * all other tasks: the set of predicted tokens must equal the set of
          lower-cased references.
    """
    base = read_jsonl(base_url)
    check = read_jsonl(predict_url)

    tf = []
    for base_row, predict in zip(base, check):
        task_label = base_row['task_label']
        target = [ref.lower() for ref in predict['reference']]
        # Drop tokens that normalisation emptied out (e.g. filler words in numeric tasks).
        predicted = [
            token
            for token in _normalise_ruler_completion(task_label, predict['completion'])
            if token != ''
        ]

        if 'qa' in task_label:
            # The empty set is a subset of anything, so an empty answer must be
            # rejected explicitly instead of being scored as correct.
            correct = bool(predicted) and set(predicted).issubset(set(target))
        else:
            try:
                correct = set(predicted) == set(target)
            except TypeError:
                # Keep the result aligned with the inputs: report the offending
                # prediction and count it as wrong rather than skipping the record.
                print(predicted)
                correct = False
        tf.append(int(correct))
    return tf

## Build training/validation CSVs for Llama-3.2-3B-Instruct

In [2]:
## RULER training and validation datasets
N_SEED_SAMPLES = 10  # sampled generations per prompt: outputs_seed_0 .. outputs_seed_9


def build_ruler_conf_frame(split, n_samples=N_SEED_SAMPLES):
    """Build the confidence-labelled RULER frame for one data split.

    Args:
        split: Split name used in the file paths, ``'train'`` or ``'valid'``.
        n_samples: Number of ``outputs_seed_<k>.jsonl`` files to score.

    Returns:
        pandas.DataFrame with one row per record of the split's
        ``outputs_base_argmax.jsonl``. The confidence label is the percentage
        of the ``n_samples`` sampled generations that ``ruler_eval_func``
        scored as correct, rendered as a string.
    """
    sample_dir = f'../logs/llama/ruler_4k_{split}_seed_samples'
    base_rows = read_jsonl(f'{sample_dir}/outputs_base_argmax.jsonl')

    # One 0/1 correctness list per seed; entry i is read as the score of base_rows[i].
    per_seed_tf = [
        ruler_eval_func(
            f'../data/processed/ruler_4k_{split}.jsonl',
            f'{sample_dir}/outputs_seed_{seed}.jsonl',
        )
        for seed in range(n_samples)
    ]

    # With the default 10 samples this is count * 10, i.e. '0', '10', ..., '100'.
    conf_labels = [
        str(round(100 * sum(seed_tf[row_idx] for seed_tf in per_seed_tf) / n_samples))
        for row_idx in range(len(base_rows))
    ]

    # Both splits share one schema, so the training frame also carries
    # 'conf_input_multi' alongside its 'conf_label_multi' labels.
    return pd.DataFrame({
        'input_prompt': [x['input'] for x in base_rows],
        'predicted_answer': [x['completion'] for x in base_rows],
        'target_answer': [x['reference'] for x in base_rows],
        'conf_input_single': [x['input'] + x['completion'] + get_confidence_prompt('default') for x in base_rows],
        'conf_input_multi': [x['input'] + x['completion'] + get_confidence_prompt('multi') for x in base_rows],
        'conf_label_single': [c + "</confidence>" for c in conf_labels],
        'conf_label_multi': ["N/A</reasoning_confidence>\n<evidence_confidence>" + c + "</evidence_confidence>" for c in conf_labels],
        'task_type': ['ruler' for _ in range(len(base_rows))]})


ruler_train_data = build_ruler_conf_frame('train')
ruler_valid_data = build_ruler_conf_frame('valid')

In [3]:
# GSM: argmax generations plus ten sampled generations per prompt.
gsm_base = read_jsonl('../logs/llama/gsm_seed_samples/outputs_base_argmax.jsonl')

# gsm_samples_tf[seed][row] is True when that seed's parsed answer matches the gold answer.
gsm_samples_tf = []
for seed in range(10):
    seed_rows = read_jsonl(f'../logs/llama/gsm_seed_samples/outputs_seed_{seed}_parsed.jsonl')
    seed_tf = []
    for row in seed_rows:
        gold = extract_hash_answer(row['gold_answer'])
        # Keep only the text after the final-answer marker (the whole string if the marker is absent).
        predicted = row['parsed'].split("**Model's Final Answer is:** ")[-1]
        # Compare digits only: every non-digit character is dropped from both sides.
        predicted_digits = re.sub(r'[^0-9]', '', predicted).lower()
        gold_digits = re.sub(r'[^0-9]', '', gold).lower()
        seed_tf.append(predicted_digits == gold_digits)
    gsm_samples_tf.append(seed_tf)

# Confidence label: number of correct samples (out of ten) times ten, as a string.
conf_labels = [
    str(int(sum(seed_tf[row_idx] for seed_tf in gsm_samples_tf) * 10))
    for row_idx in range(len(gsm_base))
]

gsm_columns = {
    'input_prompt': [],
    'predicted_answer': [],
    'target_answer': [],
    'conf_input_single': [],
    'conf_input_multi': [],
}
for record in gsm_base:
    prompt_and_answer = record['input'] + record['completion']
    gsm_columns['input_prompt'].append(record['input'])
    gsm_columns['predicted_answer'].append(record['completion'])
    gsm_columns['target_answer'].append(extract_hash_answer(record['reference']))
    gsm_columns['conf_input_single'].append(prompt_and_answer + get_confidence_prompt('default'))
    gsm_columns['conf_input_multi'].append(prompt_and_answer + get_confidence_prompt('multi'))
gsm_columns['conf_label_single'] = [label + "</confidence>" for label in conf_labels]
gsm_columns['conf_label_multi'] = [
    label + "</reasoning_confidence>\n<evidence_confidence>N/A</evidence_confidence>"
    for label in conf_labels
]
gsm_columns['task_type'] = ['gsm'] * len(gsm_base)

full_data = pd.DataFrame(gsm_columns)

# Shuffle deterministically, then split 80% / 20% into train / validation.
full_data = full_data.sample(frac=1, random_state=42).reset_index(drop=True)

split_at = int(len(full_data)*0.8)
gsm_train_data = full_data.iloc[:split_at]
gsm_valid_data = full_data.iloc[split_at:].reset_index(drop=True)

In [4]:
# Write every split to the csft output directory. DataFrame.to_csv does not create
# missing parent directories, so make sure the directory exists first.
csft_dir = Path("../data/train_data/Llama-3.2-3B-Instruct/csft")
csft_dir.mkdir(parents=True, exist_ok=True)

csv_outputs = {
    "ruler_4k_train.csv": ruler_train_data,
    "ruler_4k_valid.csv": ruler_valid_data,
    "gsm_train.csv": gsm_train_data,
    "gsm_valid.csv": gsm_valid_data,
}
for file_name, frame in csv_outputs.items():
    frame.to_csv(csft_dir / file_name, index=False)