In [1]:
import re 
import sys 
sys.path.append('..')
import pandas as pd 
from utils.utils import read_jsonl, extract_hash_answer
from utils.prompt_hub import get_confidence_prompt

def ruler_eval_func(base_url, predict_url):
    """Score model completions against their references, one 0/1 flag per example.

    Args:
        base_url: Path of the JSONL file whose records carry ``task_label``.
        predict_url: Path of the JSONL file whose records carry ``completion``
            (a string) and ``reference`` (an iterable of strings).

    Returns:
        list[int]: ``1`` when the normalised prediction matches the reference,
        ``0`` otherwise. Exactly one entry is produced per (base, prediction)
        pair, so position ``i`` always refers to the ``i``-th example.

    Notes:
        * Records are paired with ``zip``, so scoring stops at the shorter file.
        * Tasks whose label contains ``'qa'`` are correct when the (non-empty)
          prediction is one of the references; all other tasks need the set of
          predicted tokens to equal the set of references.
        * An empty prediction is never correct for a ``'qa'`` task (an empty
          set is a subset of every reference set, which previously scored 1).
    """
    # Tuples (not sets) so membership uses ``==`` exactly like the original chain.
    letter_token_tasks = ('cwe', 'fwe', 'vt')
    single_number_tasks = ('niah_multikey_1', 'niah_multikey_2', 'niah_single_1', 'niah_single_2')
    single_alnum_tasks = ('niah_multikey_3', 'niah_single_3')
    number_token_tasks = ('niah_multiquery', 'niah_multivalue')

    base = read_jsonl(base_url)
    check = read_jsonl(predict_url)

    tf = []
    for b, predict in zip(base, check):
        task = b['task_label']
        completion = predict['completion']
        target = {ref.lower() for ref in predict['reference']}

        if task in letter_token_tasks:
            # Keep only letters of every space-separated token.
            tokens = [re.sub(r'[^a-zA-Z]', '', w).lower() for w in completion.split(" ")]
        elif task in single_number_tasks:
            # The whole completion collapses to its digits.
            tokens = [re.sub(r'[^0-9]', '', completion).lower()]
        elif task in single_alnum_tasks:
            # The whole completion collapses to letters, digits and hyphens.
            tokens = [re.sub(r'[^a-zA-Z0-9-]', '', completion).lower()]
        elif task in number_token_tasks:
            # Keep only digits of every space-separated token.
            tokens = [re.sub(r'[^0-9]', '', w).lower() for w in completion.split(" ")]
        else:
            tokens = [completion.lower()]

        # Tokens that became empty after stripping carry no answer.
        predicted = {tok for tok in tokens if tok != ''}

        if 'qa' in task:
            correct = bool(predicted) and predicted.issubset(target)
        else:
            correct = predicted == target
        # Always append, so the result can never fall out of step with the inputs.
        tf.append(1 if correct else 0)
    return tf

## Make Llama-3.2-3B-Instruct training data

In [2]:
## ruler training and validation datasets
def _build_ruler_conf_frame(base_path, task_path, sample_path_template, n_samples=10):
    """Build the confidence-SFT table for one RULER split.

    Args:
        base_path: JSONL file whose records supply ``input``, ``completion``
            and ``reference`` for every row of the table.
        task_path: JSONL file passed to ``ruler_eval_func`` as its first
            argument when scoring the sampled outputs.
        sample_path_template: Path containing ``{seed}``; it is formatted with
            ``0 .. n_samples - 1`` to locate each sampled-output file.
        n_samples: Number of sampled-output files to score.

    Returns:
        pandas.DataFrame with the columns ``input_prompt``, ``predicted_answer``,
        ``target_answer``, ``conf_input_single``, ``conf_input_multi``,
        ``conf_label_single``, ``conf_label_multi`` and ``task_type``.
    """
    base_rows = read_jsonl(base_path)

    # One list of 0/1 correctness flags per sampled-output file.
    sample_tf = [
        ruler_eval_func(task_path, sample_path_template.format(seed=seed))
        for seed in range(n_samples)
    ]

    # Label = percentage of samples scored correct for the row
    # (with 10 samples: '0', '10', ..., '100').
    labels = [
        str(sum(flags[row] for flags in sample_tf) * 100 // n_samples)
        for row in range(len(base_rows))
    ]

    # The prompt templates are cut by a fixed number of trailing characters
    # (12 / 22) before the answer-specific tags are appended.
    # assumes get_confidence_prompt returns the same text on every call, so it
    # is fetched once instead of once per row — TODO confirm
    single_suffix = "</answer>\n\n" + get_confidence_prompt('default')[:-12]
    multi_suffix = (
        "</answer>\n\n" + get_confidence_prompt('multi')[:-22]
        + "<reasoning_confidence>N/A</reasoning_confidence>\n<evidence_confidence>"
    )

    return pd.DataFrame({
        'input_prompt': [x['input'] for x in base_rows],
        'predicted_answer': [x['completion'] for x in base_rows],
        'target_answer': [x['reference'] for x in base_rows],
        'conf_input_single': [x['input'] + x['completion'] + single_suffix for x in base_rows],
        'conf_input_multi': [x['input'] + x['completion'] + multi_suffix for x in base_rows],
        'conf_label_single': ["<confidence>" + c + "</confidence>" for c in labels],
        'conf_label_multi': [c + "</evidence_confidence>" for c in labels],
        'task_type': ['ruler' for _ in range(len(base_rows))]})


ruler_train_data = _build_ruler_conf_frame(
    '../logs/trash/llama/ruler_4k_train_seed_samples/outputs_base_argmax.jsonl',
    '../data/processed/ruler_4k_train.jsonl',
    '../logs/trash/llama/ruler_4k_train_seed_samples/outputs_seed_{seed}.jsonl',
)

ruler_valid_data = _build_ruler_conf_frame(
    '../logs/trash/llama/ruler_4k_valid_seed_samples/outputs_base_argmax.jsonl',
    '../data/processed/ruler_4k_valid.jsonl',
    '../logs/trash/llama/ruler_4k_valid_seed_samples/outputs_seed_{seed}.jsonl',
)

In [3]:
# Show the first multi-confidence example (input followed by its label) of the
# training split and then of the validation split, separated by a '---' line.
for position, frame in enumerate((ruler_train_data, ruler_valid_data)):
    if position:
        print('---')
    print(frame['conf_input_multi'][0] + frame['conf_label_multi'][0])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 15 Jan 2026

You are an expert assistant that provides clear and helpful answers.<|eot_id|><|start_header_id|>user<|end_header_id|>

You are an answering assistant.
Given a question, provide the final answer.
Respond with the answer enclosed in <answer> and </answer> tags.
The <answer> tag must contain only the final answer, with no additional explanation.

Below is a numbered list of words. In these words, some appear more often than others. Memorize the ones that appear most often.
1. gratitude 2. wheel 3. attorney 4. company 5. tortilla 6. mortise 7. develop 8. boyhood 9. mortise 10. chin 11. boyhood 12. denominator 13. wheel 14. denominator 15. hygienic 16. district 17. armchair 18. freak 19. wheel 20. hygienic 21. attorney 22. overview 23. lovely 24. gratitude 25. gratitude 26. freak 27. clearance 28. runway 29. hygienic 30. attorney 31. bench 32. blush 33. tortilla 34. f

In [4]:
# Greedy (argmax) outputs; every record becomes one row of the final table.
gsm_base = read_jsonl('../logs/trash/llama/gsm_seed_samples/outputs_base_argmax.jsonl')


def _digits_only(text):
    """Return ``text`` with every non-digit character removed."""
    return re.sub(r'[^0-9]', '', text).lower()


# For each of the 10 sampled runs, flag a record as correct when the digits of
# the text after "**Model's Final Answer is:** " equal the digits of the gold answer.
gsm_samples_tf = []
for seed in range(10):
    parsed_rows = read_jsonl(f'../logs/trash/llama/gsm_seed_samples/outputs_seed_{seed}_parsed.jsonl')
    golds = [extract_hash_answer(row['gold_answer']) for row in parsed_rows]
    guesses = [row['parsed'].split("**Model's Final Answer is:** ")[-1] for row in parsed_rows]
    gsm_samples_tf.append([_digits_only(guess) == _digits_only(gold) for guess, gold in zip(guesses, golds)])

# Confidence label: number of correct samples out of 10, times 10, as a string.
conf_labels = [
    str(int(sum(seed_tf[row] for seed_tf in gsm_samples_tf) * 10))
    for row in range(len(gsm_base))
]

# Assemble the prompt columns row by row.
single_inputs = []
multi_inputs = []
for row in gsm_base:
    answered = row['input'] + row['completion'] + "</answer>\n\n"
    single_inputs.append(answered + get_confidence_prompt('default')[:-12])
    multi_inputs.append(answered + get_confidence_prompt('multi')[:-22] + "<reasoning_confidence>")

full_data = pd.DataFrame({
    'input_prompt': [row['input'] for row in gsm_base],
    'predicted_answer': [row['completion'] for row in gsm_base],
    'target_answer': [extract_hash_answer(row['reference']) for row in gsm_base],
    'conf_input_single': single_inputs,
    'conf_input_multi': multi_inputs,
    'conf_label_single': ["<confidence>" + label + "</confidence>" for label in conf_labels],
    'conf_label_multi': [label + "</reasoning_confidence>" for label in conf_labels],
    'task_type': ['gsm'] * len(gsm_base)})

# Shuffle reproducibly, then take the first 80% as train and the rest as validation.
full_data = full_data.sample(frac=1, random_state=42).reset_index(drop=True)

split_at = int(len(full_data)*0.8)
gsm_train_data = full_data.iloc[:split_at]
gsm_valid_data = full_data.iloc[split_at:].reset_index(drop=True)

print(full_data['conf_input_multi'][0] + full_data['conf_label_multi'][0])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 14 Jan 2026

You are an expert assistant that provides clear and helpful answers.<|eot_id|><|start_header_id|>user<|end_header_id|>

You are a reasoning assistant.
Given a question, think step by step to arrive at the correct answer.
First, provide your reasoning enclosed in <think> and </think> tags.
Then, provide the final answer enclosed in <answer> and </answer> tags.
The <answer> tag must contain only the final answer, with no additional explanation.

In Professor Plum's biology class there are 40 students. Of those students, 80 percent have puppies. Of those who have puppies, 25% also have parrots. How many students have both puppies and parrots?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

<think>First, we need to find out how many students have puppies. To do this, we multiply the total number of students by the percentage of students who have puppies. 40 st

In [5]:
# Write the four Llama-3.2-3B-Instruct confidence-SFT splits to CSV without the row index.
for frame, csv_path in (
    (ruler_train_data, "../data/train_data/Llama-3.2-3B-Instruct/csft_new/ruler_4k_train.csv"),
    (ruler_valid_data, "../data/train_data/Llama-3.2-3B-Instruct/csft_new/ruler_4k_valid.csv"),
    (gsm_train_data, "../data/train_data/Llama-3.2-3B-Instruct/csft_new/gsm_train.csv"),
    (gsm_valid_data, "../data/train_data/Llama-3.2-3B-Instruct/csft_new/gsm_valid.csv"),
):
    frame.to_csv(csv_path, index=False)

## Make Qwen3-8B training data

In [6]:
## ruler training and validation datasets
def _build_ruler_conf_frame(base_path, task_path, sample_path_template, n_samples=10):
    """Build the confidence-SFT table for one RULER split.

    Args:
        base_path: JSONL file whose records supply ``input``, ``completion``
            and ``reference`` for every row of the table.
        task_path: JSONL file passed to ``ruler_eval_func`` as its first
            argument when scoring the sampled outputs.
        sample_path_template: Path containing ``{seed}``; it is formatted with
            ``0 .. n_samples - 1`` to locate each sampled-output file.
        n_samples: Number of sampled-output files to score.

    Returns:
        pandas.DataFrame with the columns ``input_prompt``, ``predicted_answer``,
        ``target_answer``, ``conf_input_single``, ``conf_input_multi``,
        ``conf_label_single``, ``conf_label_multi`` and ``task_type``.
    """
    base_rows = read_jsonl(base_path)

    # One list of 0/1 correctness flags per sampled-output file.
    sample_tf = [
        ruler_eval_func(task_path, sample_path_template.format(seed=seed))
        for seed in range(n_samples)
    ]

    # Label = percentage of samples scored correct for the row
    # (with 10 samples: '0', '10', ..., '100').
    labels = [
        str(sum(flags[row] for flags in sample_tf) * 100 // n_samples)
        for row in range(len(base_rows))
    ]

    # The prompt templates are cut by a fixed number of trailing characters
    # (12 / 22) before the answer-specific tags are appended.
    # assumes get_confidence_prompt returns the same text on every call, so it
    # is fetched once instead of once per row — TODO confirm
    single_suffix = "</answer>\n\n" + get_confidence_prompt('default')[:-12]
    multi_suffix = (
        "</answer>\n\n" + get_confidence_prompt('multi')[:-22]
        + "<reasoning_confidence>N/A</reasoning_confidence>\n<evidence_confidence>"
    )

    return pd.DataFrame({
        'input_prompt': [x['input'] for x in base_rows],
        'predicted_answer': [x['completion'] for x in base_rows],
        'target_answer': [x['reference'] for x in base_rows],
        'conf_input_single': [x['input'] + x['completion'] + single_suffix for x in base_rows],
        'conf_input_multi': [x['input'] + x['completion'] + multi_suffix for x in base_rows],
        'conf_label_single': ["<confidence>" + c + "</confidence>" for c in labels],
        'conf_label_multi': [c + "</evidence_confidence>" for c in labels],
        'task_type': ['ruler' for _ in range(len(base_rows))]})


# NOTE(review): the training files are read from a relative 'qwen/38b_ruler_seed_samples'
# directory while the validation files come from an absolute 'qwen8/ruler_4k_seed_samples'
# directory — confirm both locations are intended.
ruler_train_data = _build_ruler_conf_frame(
    '../logs/trash/qwen/38b_ruler_seed_samples/train_base_argmax.jsonl',
    '../data/processed/ruler_4k_train.jsonl',
    '../logs/trash/qwen/38b_ruler_seed_samples/train_seed_{seed}.jsonl',
)

ruler_valid_data = _build_ruler_conf_frame(
    '/mnt/home/chaeyun-jang/gcsft/logs/trash/qwen8/ruler_4k_seed_samples/valid_base_argmax.jsonl',
    '../data/processed/ruler_4k_valid.jsonl',
    '/mnt/home/chaeyun-jang/gcsft/logs/trash/qwen8/ruler_4k_seed_samples/valid_seed_{seed}.jsonl',
)

print(ruler_valid_data['conf_input_single'][0] + ruler_valid_data['conf_label_single'][0])

<|im_start|>user
You are an answering assistant.
Given a question, provide the final answer.
Respond with the answer enclosed in <answer> and </answer> tags.
The <answer> tag must contain only the final answer, with no additional explanation.

Below is a numbered list of words. In these words, some appear more often than others. Memorize the ones that appear most often.
1. spiffy 2. wit 3. atheist 4. pumpkinseed 5. satisfaction 6. livestock 7. multiply 8. boatyard 9. livestock 10. sinuosity 11. boatyard 12. semester 13. wit 14. semester 15. troubled 16. pint 17. health-care 18. wreck 19. wit 20. troubled 21. atheist 22. introduce 23. hacienda 24. spiffy 25. spiffy 26. wreck 27. entree 28. solidity 29. troubled 30. atheist 31. sentence 32. transportation 33. satisfaction 34. wreck 35. semester 36. livestock 37. satisfaction 38. transportation 39. boatyard 40. transportation
Question: What are the 10 most common words in the above list? Answer: The top 10 words that appear most often in 

In [8]:
# Greedy (argmax) outputs; every record becomes one row of the final table.
gsm_base = read_jsonl('/mnt/home/chaeyun-jang/gcsft/logs/trash/qwen/math_seed_samples/train_base_argmax.jsonl')

# For each of the 10 sampled runs, a record counts as correct (1) when its
# 'parsed' field contains "yes" (case-insensitive) and as incorrect (0) otherwise.
# No gold answer is extracted or compared in this cell.
gsm_samples_tf = []
for seed in range(10):
    parsed_rows = read_jsonl(f'/mnt/home/chaeyun-jang/gcsft/logs/trash/qwen/math_seed_samples/outputs_seed_{seed}_parsed.jsonl')
    gsm_samples_tf.append([int('yes' in row['parsed'].lower()) for row in parsed_rows])

# Confidence label: number of correct samples out of 10, times 10, as a string.
conf_labels = [
    str(int(sum(seed_tf[row] for seed_tf in gsm_samples_tf) * 10))
    for row in range(len(gsm_base))
]

# Assemble the prompt columns row by row.
single_inputs = []
multi_inputs = []
for row in gsm_base:
    answered = row['input'] + row['completion'] + "</answer>\n\n"
    single_inputs.append(answered + get_confidence_prompt('default')[:-12])
    multi_inputs.append(answered + get_confidence_prompt('multi')[:-22] + "<reasoning_confidence>")

full_data = pd.DataFrame({
    'input_prompt': [row['input'] for row in gsm_base],
    'predicted_answer': [row['completion'] for row in gsm_base],
    'target_answer': [row['reference'] for row in gsm_base],
    'conf_input_single': single_inputs,
    'conf_input_multi': multi_inputs,
    'conf_label_single': ["<confidence>" + label + "</confidence>" for label in conf_labels],
    'conf_label_multi': [label + "</reasoning_confidence>" for label in conf_labels],
    'task_type': ['gsm'] * len(gsm_base)})

# Shuffle reproducibly, then take the first 80% as train and the rest as validation.
full_data = full_data.sample(frac=1, random_state=42).reset_index(drop=True)

split_at = int(len(full_data)*0.8)
gsm_train_data = full_data.iloc[:split_at]
gsm_valid_data = full_data.iloc[split_at:].reset_index(drop=True)

print(full_data['conf_input_multi'][0] + full_data['conf_label_multi'][0])

<|im_start|>user
You are a reasoning assistant.
Given a question, think step by step to arrive at the correct answer.
First, provide your reasoning enclosed in <think> and </think> tags.
Then, provide the final answer enclosed in <answer> and </answer> tags.
The <answer> tag must contain only the final answer, with no additional explanation.

Find the polynomial $p(x),$ with real coefficients, such that
\[p(x^3) - p(x^3 - 2) = [p(x)]^2 + 12\]for all real numbers $x.$<|im_end|>
<|im_start|>assistant
<think>
Okay, so I need to find a polynomial p(x) with real coefficients that satisfies the equation p(x³) - p(x³ - 2) = [p(x)]² + 12 for all real numbers x. Hmm, let me start by thinking about what kind of polynomial p(x) could be. 

First, since the equation has to hold for all real numbers x, maybe I can assume a general form for p(x) and then try to determine its coefficients. Let me consider the degrees of the polynomials on both sides. Let's suppose that p(x) is a polynomial of degree 

In [11]:
# Write the four Qwen3-8B confidence-SFT splits to CSV without the row index.
for frame, csv_path in (
    (ruler_train_data, "/mnt/home/chaeyun-jang/gcsft/data/train_data/Qwen3-8B/csft_new/ruler_4k_train.csv"),
    (ruler_valid_data, "/mnt/home/chaeyun-jang/gcsft/data/train_data/Qwen3-8B/csft_new/ruler_4k_valid.csv"),
    (gsm_train_data, "/mnt/home/chaeyun-jang/gcsft/data/train_data/Qwen3-8B/csft_new/gsm_train.csv"),
    (gsm_valid_data, "/mnt/home/chaeyun-jang/gcsft/data/train_data/Qwen3-8B/csft_new/gsm_valid.csv"),
):
    frame.to_csv(csv_path, index=False)

In [12]:
# Row counts of the four splits, in the order: ruler train, ruler valid, gsm train, gsm valid.
split_sizes = [len(frame) for frame in (ruler_train_data, ruler_valid_data, gsm_train_data, gsm_valid_data)]
print(*split_sizes)

5200 650 3990 998


## RL datasets

In [1]:
import re 
import sys 
sys.path.append('..')
import pandas as pd 
from utils.utils import read_jsonl, extract_hash_answer
from utils.prompt_hub import get_confidence_prompt

def ruler_eval_func(base_url, predict_url):
    """Score model completions against their references, one 0/1 flag per example.

    Args:
        base_url: Path of the JSONL file whose records carry ``task_label``.
        predict_url: Path of the JSONL file whose records carry ``completion``
            (a string) and ``reference`` (an iterable of strings).

    Returns:
        list[int]: ``1`` when the normalised prediction matches the reference,
        ``0`` otherwise. Exactly one entry is produced per (base, prediction)
        pair, so position ``i`` always refers to the ``i``-th example.

    Notes:
        * Records are paired with ``zip``, so scoring stops at the shorter file.
        * Tasks whose label contains ``'qa'`` are correct when the (non-empty)
          prediction is one of the references; all other tasks need the set of
          predicted tokens to equal the set of references.
        * An empty prediction is never correct for a ``'qa'`` task (an empty
          set is a subset of every reference set, which previously scored 1).
    """
    # Tuples (not sets) so membership uses ``==`` exactly like the original chain.
    letter_token_tasks = ('cwe', 'fwe', 'vt')
    single_number_tasks = ('niah_multikey_1', 'niah_multikey_2', 'niah_single_1', 'niah_single_2')
    single_alnum_tasks = ('niah_multikey_3', 'niah_single_3')
    number_token_tasks = ('niah_multiquery', 'niah_multivalue')

    base = read_jsonl(base_url)
    check = read_jsonl(predict_url)

    tf = []
    for b, predict in zip(base, check):
        task = b['task_label']
        completion = predict['completion']
        target = {ref.lower() for ref in predict['reference']}

        if task in letter_token_tasks:
            # Keep only letters of every space-separated token.
            tokens = [re.sub(r'[^a-zA-Z]', '', w).lower() for w in completion.split(" ")]
        elif task in single_number_tasks:
            # The whole completion collapses to its digits.
            tokens = [re.sub(r'[^0-9]', '', completion).lower()]
        elif task in single_alnum_tasks:
            # The whole completion collapses to letters, digits and hyphens.
            tokens = [re.sub(r'[^a-zA-Z0-9-]', '', completion).lower()]
        elif task in number_token_tasks:
            # Keep only digits of every space-separated token.
            tokens = [re.sub(r'[^0-9]', '', w).lower() for w in completion.split(" ")]
        else:
            tokens = [completion.lower()]

        # Tokens that became empty after stripping carry no answer.
        predicted = {tok for tok in tokens if tok != ''}

        if 'qa' in task:
            correct = bool(predicted) and predicted.issubset(target)
        else:
            correct = predicted == target
        # Always append, so the result can never fall out of step with the inputs.
        tf.append(1 if correct else 0)
    return tf

In [3]:
# train base: '../logs/llama/ruler_4k_train_seed_samples/outputs_base_argmax.jsonl'
# valid base: '../logs/llama/ruler_4k_valid_seed_samples/outputs_base_argmax.jsonl'
# (the files actually read below live under '../logs/trash/')
train_base_path = '../logs/trash/llama/ruler_4k_train_seed_samples/outputs_base_argmax.jsonl'
valid_base_path = '../logs/trash/llama/ruler_4k_valid_seed_samples/outputs_base_argmax.jsonl'

# Argmax output records for both splits.
train_base = read_jsonl(train_base_path)
valid_base = read_jsonl(valid_base_path)

# 0/1 correctness of those same argmax outputs, scored by ruler_eval_func.
train_tf = ruler_eval_func('../data/processed/ruler_4k_train.jsonl', train_base_path)
valid_tf = ruler_eval_func('../data/processed/ruler_4k_valid.jsonl', valid_base_path)

In [7]:
# Instruction text to look for: the answer-only instructions.
prev_prompt = "\n".join([
    "You are an answering assistant.",
    "Given a question, provide the final answer.",
    "Respond with the answer enclosed in <answer> and </answer> tags.",
    "The <answer> tag must contain only the final answer, with no additional explanation.",
])

# Replacement text: the same instructions followed by two lines requesting a
# confidence value inside <confidence> tags.
new_prompt = "\n".join([
    prev_prompt,
    "Also provide your confidence enclosed in <confidence> and </confidence> tags.",
    "The <confidence> tag must contain only a single number from 0 to 100.",
])

In [8]:
def _rl_prompt_frame(rows):
    """Return a frame with each row's ``input`` (prev_prompt replaced by new_prompt) and its ``reference``."""
    prompts = []
    answers = []
    for row in rows:
        prompts.append(row['input'].replace(prev_prompt, new_prompt))
        answers.append(row['reference'])
    # Only these two columns are emitted; the alternatives that were tried
    # (a 'predicted_answer' column, a follow-up-turn confidence prompt and a
    # 'tf' correctness column) stay disabled.
    return pd.DataFrame({'prompt': prompts, 'true_answer': answers})


ruler_train_data = _rl_prompt_frame(train_base)

ruler_valid_data = _rl_prompt_frame(valid_base)

print(ruler_train_data['prompt'][0])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 15 Jan 2026

You are an expert assistant that provides clear and helpful answers.<|eot_id|><|start_header_id|>user<|end_header_id|>

You are an answering assistant.
Given a question, provide the final answer.
Respond with the answer enclosed in <answer> and </answer> tags.
The <answer> tag must contain only the final answer, with no additional explanation.
Also provide your confidence enclosed in <confidence> and </confidence> tags.
The <confidence> tag must contain only a single number from 0 to 100.

Below is a numbered list of words. In these words, some appear more often than others. Memorize the ones that appear most often.
1. gratitude 2. wheel 3. attorney 4. company 5. tortilla 6. mortise 7. develop 8. boyhood 9. mortise 10. chin 11. boyhood 12. denominator 13. wheel 14. denominator 15. hygienic 16. district 17. armchair 18. freak 19. wheel 20. hygienic 21. attorney 22.

In [9]:
# Instruction text to look for: the think-then-answer instructions.
prev_prompt = "\n".join([
    "You are a reasoning assistant.",
    "Given a question, think step by step to arrive at the correct answer.",
    "First, provide your reasoning enclosed in <think> and </think> tags.",
    "Then, provide the final answer enclosed in <answer> and </answer> tags.",
    "The <answer> tag must contain only the final answer, with no additional explanation.",
])

# Replacement text: the same instructions followed by two lines requesting a
# confidence value inside <confidence> tags.
new_prompt = "\n".join([
    prev_prompt,
    "Also provide your confidence enclosed in <confidence> and </confidence> tags.",
    "The <confidence> tag must contain only a single number from 0 to 100.",
])

In [12]:
# Argmax outputs and their parsed counterparts; the two files are paired by position.
gsm_base = read_jsonl('../logs/trash/llama/gsm_seed_samples/outputs_base_argmax.jsonl')
gsm_base_parsed = read_jsonl('../logs/trash/llama/gsm_seed_samples/outputs_base_argmax_parsed.jsonl')

# 0/1 correctness of each argmax answer: the digits of the text after
# "**Model's Final Answer is:** " must equal the digits of the gold answer.
# (gsm_tf only feeds the disabled 'tf' column below.)
gsm_tf = []
for i in range(len(gsm_base)):
    try:
        predicted = gsm_base_parsed[i]['parsed'].split("**Model's Final Answer is:** ")[-1]
        gold = extract_hash_answer(gsm_base[i]['reference'])
        is_correct = re.sub(r'[^0-9]', '', predicted).lower() == re.sub(r'[^0-9]', '', gold).lower()
    except Exception:
        # Any record that cannot be compared counts as incorrect. Catching
        # Exception (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        is_correct = False
    gsm_tf.append(1 if is_correct else 0)

full_data = pd.DataFrame({
    'prompt': [x['input'].replace(prev_prompt, new_prompt) for x in gsm_base],
    #'predicted_answer': [x['completion'] for x in gsm_base],
    'true_answer': [extract_hash_answer(x['reference']) for x in gsm_base],
    #'prompt': [x['input'] + x['completion'] + '<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n' + get_confidence_prompt('default')[:-13] + "<|eot_id|>" + "\n<|start_header_id|>assistant<|end_header_id|>" + "<confidence>" for x in gsm_base],
    #'tf': gsm_tf
})

# Shuffle reproducibly, then take the first 80% as train and the rest as validation.
full_data = full_data.sample(frac=1, random_state=42).reset_index(drop=True)

split_at = int(len(full_data)*0.8)
gsm_train_data = full_data.iloc[:split_at]
gsm_valid_data = full_data.iloc[split_at:].reset_index(drop=True)

In [13]:
# Show the prompt stored under index label 0 of the GSM training split.
first_gsm_prompt = gsm_train_data['prompt'][0]
print(first_gsm_prompt)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 14 Jan 2026

You are an expert assistant that provides clear and helpful answers.<|eot_id|><|start_header_id|>user<|end_header_id|>

You are a reasoning assistant.
Given a question, think step by step to arrive at the correct answer.
First, provide your reasoning enclosed in <think> and </think> tags.
Then, provide the final answer enclosed in <answer> and </answer> tags.
The <answer> tag must contain only the final answer, with no additional explanation.
Also provide your confidence enclosed in <confidence> and </confidence> tags.
The <confidence> tag must contain only a single number from 0 to 100.

In Professor Plum's biology class there are 40 students. Of those students, 80 percent have puppies. Of those who have puppies, 25% also have parrots. How many students have both puppies and parrots?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

<think>


In [14]:
# Write the four Llama-3.2-3B-Instruct RL splits to CSV without the row index.
for frame, csv_path in (
    (ruler_train_data, "../data/train_data/Llama-3.2-3B-Instruct/rl_base/ruler_4k_train.csv"),
    (ruler_valid_data, "../data/train_data/Llama-3.2-3B-Instruct/rl_base/ruler_4k_valid.csv"),
    (gsm_train_data, "../data/train_data/Llama-3.2-3B-Instruct/rl_base/gsm_train.csv"),
    (gsm_valid_data, "../data/train_data/Llama-3.2-3B-Instruct/rl_base/gsm_valid.csv"),
):
    frame.to_csv(csv_path, index=False)