In [5]:
# corrected baselines
ocw_ok_5 = "outputs/n5_baseline_dt.ocw/chatgpt1106/model_selection_prompts/n5_baseline.jsonl"
ocw_ok_10 = "outputs/n10_baseline_dt.ocw/chatgpt1106/model_selection_prompts/n10_baseline.jsonl"
gsm_ok_15 = "outputs/n15_baseline_dt.gsm/chatgpt1106/model_selection_prompts/n15_baseline.jsonl"
math_ok_5 = "outputs/n5_baseline_dt.math/chatgpt1106/model_selection_prompts/n5_baseline.jsonl"


# leftovers chatgpt1106
GSM15="outputs_dgx/gsm8K_test_dt.gsm/chatgpt1106/model_selection_prompts/err_n15_baseline.jsonl"
OCW10="outputs_dgx/ocw_course_dt.ocw/chatgpt1106/model_selection_prompts/err_n10_baseline.jsonl"
OCW5="outputs/ocw_course_dt.ocw/chatgpt1106/model_selection_prompts/err_n5_baseline.jsonl"
MATH5="outputs/MATH-full_dt.math/chatgpt1106/model_selection_prompts/err_n5_baseline.jsonl"


In [8]:
import pandas as pd
import jsonlines as jsl

right_err_pairs = [
    (ocw_ok_5, OCW5),
    (ocw_ok_10, OCW10),
    (gsm_ok_15, GSM15),
    (math_ok_5, MATH5)
]

df_r_e_pairs = [
    (pd.DataFrame(jsl.open(right)), pd.DataFrame(jsl.open(err))) for right, err in right_err_pairs
]

# solmap, ansmap empty checked again
df_r_e_pairs[0][1].solmap

0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
5   NaN
6   NaN
7   NaN
Name: solmap, dtype: float64

In [12]:
sanity_check = [e["question" if "question" in e.columns else "problem"].isin(r["question" if "question" in e.columns else "problem"]).mean() for r,e in df_r_e_pairs]
sanity_check # some of those were error pruned rows

[1.0, 1.0, 0.8181818181818182, 0.9866666666666667]

In [30]:
# make the files to run rims
from pathlib import Path

def pick_rows(src:pd.DataFrame=None, by:pd.DataFrame=None)->pd.DataFrame:
    q_key = "question" if "question" in src.columns else "problem"
    return src[src[q_key].isin(by[q_key])]

target_paths = [ Path(r).parent/Path(r).name.replace("_baseline", "_baseline_picked") for r,e in right_err_pairs]

contents = [pick_rows(src=df_r, by=df_e) for df_r, df_e in df_r_e_pairs]

for path, content in zip(target_paths, contents):
    with jsl.open(path, "w") as writer:
        writer.write_all(content.to_dict(orient="records")) 
        print(f"{len(content)} rows to \n\t{path}")



8 rows to 
	outputs/n5_baseline_dt.ocw/chatgpt1106/model_selection_prompts/n5_baseline_picked.jsonl
23 rows to 
	outputs/n10_baseline_dt.ocw/chatgpt1106/model_selection_prompts/n10_baseline_picked.jsonl
18 rows to 
	outputs/n15_baseline_dt.gsm/chatgpt1106/model_selection_prompts/n15_baseline_picked.jsonl
296 rows to 
	outputs/n5_baseline_dt.math/chatgpt1106/model_selection_prompts/n5_baseline_picked.jsonl


In [31]:
original_err_rows = [len(e) for r,e in df_r_e_pairs]
original_err_rows # 0, 0, 4, 4 loss. acceptable

[8, 23, 22, 300]

In [32]:
cmds = [] 
for gsm_jslf, n, dataset_type in zip(target_paths, [5, 10, 15, 5], ["ocw", "ocw", "gsm", "math"]):
    cmd = f"python run_inference.py rims_inference \\\n \
            --backbone chatgpt1106 \\\n \
            --gsm_jslf {gsm_jslf} \\\n \
            --n {n} \\\n \
            --dataset_type {dataset_type} \\\n \
            --n_jobs 8"
    cmds.append(cmd)
    print(cmd)

python run_inference.py rims_inference \
             --backbone chatgpt1106 \
             --gsm_jslf outputs/n5_baseline_dt.ocw/chatgpt1106/model_selection_prompts/n5_baseline_picked.jsonl \
             --n 5 \
             --dataset_type ocw \
             --n_jobs 8
python run_inference.py rims_inference \
             --backbone chatgpt1106 \
             --gsm_jslf outputs/n10_baseline_dt.ocw/chatgpt1106/model_selection_prompts/n10_baseline_picked.jsonl \
             --n 10 \
             --dataset_type ocw \
             --n_jobs 8
python run_inference.py rims_inference \
             --backbone chatgpt1106 \
             --gsm_jslf outputs/n15_baseline_dt.gsm/chatgpt1106/model_selection_prompts/n15_baseline_picked.jsonl \
             --n 15 \
             --dataset_type gsm \
             --n_jobs 8
python run_inference.py rims_inference \
             --backbone chatgpt1106 \
             --gsm_jslf outputs/n5_baseline_dt.math/chatgpt1106/model_selection_prompts/n5_baseline

In [1]:
# FIXED_BASELINES --> no need to merge. 

mkdir -p outputs/0_final_results/ocw_course_dt.ocw/chatgpt1106/model_selection_prompts/
mkdir -p outputs/0_final_results/gsm8K_test_dt.gsm/chatgpt1106/model_selection_prompts/
mkdir -p outputs/0_final_results/MATH-full_dt.math/chatgpt1106/model_selection_prompts/

ocw_ok_5=outputs/n5_baseline_dt.ocw/chatgpt1106/model_selection_prompts/n5_baseline.jsonl
ocw_ok_5_to=outputs/0_final_results/ocw_course_dt.ocw/chatgpt1106/model_selection_prompts/n5_baseline.jsonl
cp $ocw_ok_5 $ocw_ok_5_to

ocw_ok_10=outputs/n10_baseline_dt.ocw/chatgpt1106/model_selection_prompts/n10_baseline.jsonl
ocw_ok_10_to=outputs/0_final_results/ocw_course_dt.ocw/chatgpt1106/model_selection_prompts/n10_baseline.jsonl
cp $ocw_ok_10 $ocw_ok_10_to

gsm_ok_15=outputs/n15_baseline_dt.gsm/chatgpt1106/model_selection_prompts/n15_baseline.jsonl
gsm_ok_15_to=outputs/0_final_results/gsm8K_test_dt.gsm/chatgpt1106/model_selection_prompts/n15_baseline.jsonl
cp $gsm_ok_15 $gsm_ok_15_to

math_ok_5=outputs/n5_baseline_dt.math/chatgpt1106/model_selection_prompts/n5_baseline.jsonl
math_ok_5_to=outputs/0_final_results/MATH-full_dt.math/chatgpt1106/model_selection_prompts/n5_baseline.jsonl
cp $math_ok_5 $math_ok_5_to


SyntaxError: invalid decimal literal (3381292144.py, line 4)

In [21]:

# FIXED + ORIGINAL RIMS 
# FIXED: find outputs/ -name "n*baseline_dt*/**/*rims*.jsonl" 
# ORIGINAL_RIMS results files
"""

find **/chatgpt1106 -name "n*rims_T*.jsonl"

src/results_paths.txt
"""

import pandas as pd 
import jsonlines as jsl
from pathlib import Path

# original rims results files + picked files 
def check_and_merge_results(jslf1, jslf2):
    """
    usually the jsonl records does not expected to overlap, but math result do. 
    I guess this comes from error: true with different reason than 429 error in math (rims run on ~error)
    """
    # load and drop error lines first: expects most of overlaps will disappear.
    df1 = pd.read_json(jslf1, lines=True) if isinstance(jslf1,str) else pd.DataFrame(jslf1) # records
    df2 =  pd.read_json(jslf2, lines=True) if isinstance(jslf2,str) else pd.DataFrame(jslf2)
    df1_ = df1[~(df1.error)]
    df2_ = df2[~(df2.error)]
    print("dropped errors (jslf1, jslf2)")
    print(-len(df1_)+len(df1), len(df1))
    print(-len(df2_)+len(df2), len(df2))

    # check overlaps 
    q_key = "question" if "question" in df1.columns else "problem"  
    overlap = (df1_[q_key].isin(df2_[q_key]))
    print(f"{overlap.sum()=} rows")
    df1__ = df1_[~overlap]

    # merge
    df_merged = pd.concat([df1__, df2_], axis="index")    
    
    return df_merged.to_dict(orient="records")

def chunkify(df, num_chunks):
    avg_len = len(df) / num_chunks
    chunks = []
    start = 0

    l = 0
    for i in range(num_chunks):
        end = round(start + avg_len)
        chunks.append(df.iloc[start:end])
        l+=round(avg_len)
        start = end 
    
    tail = len(df)-l
    if tail>0:
        chunks.append(df.iloc[-tail:])
        print(tail)
    return print(chunks)

# chunkify(list(range(16)), 3)


AttributeError: 'list' object has no attribute 'iloc'

In [4]:

# to merge rows... (from results_paths.txt)
JSLF_PAIRS = {
    "math_n5_rims":
    [("outputs/MATH-full_dt.math/chatgpt1106/rims_math_p2c-cot.pal-p2c.pal-cot__.txt1/n5_rims_T0.5.jsonl",
     "outputs/n5_baseline_dt.math/chatgpt1106/rims_math_p2c-cot.pal-p2c.pal-cot__.txt1/n5_rims_T0.5.jsonl"),
    ("outputs/MATH-full_dt.math/chatgpt1106/rims_math_p2c-cot.pal-p2c.pal-cot__.txt1/n5_rims_T0.2.jsonl",
     "outputs/n5_baseline_dt.math/chatgpt1106/rims_math_p2c-cot.pal-p2c.pal-cot__.txt1/n5_rims_T0.2.jsonl"),
    ("outputs/MATH-full_dt.math/chatgpt1106/rims_math_p2c-cot.pal-p2c.pal-cot__.txt/n5_rims_T0.5.jsonl",
     "outputs/n5_baseline_dt.math/chatgpt1106/rims_math_p2c-cot.pal-p2c.pal-cot__.txt/n5_rims_T0.5.jsonl"),
    ("outputs/MATH-full_dt.math/chatgpt1106/rims_math_p2c-cot.pal-p2c.pal-cot__.txt/n5_rims_T0.2.jsonl",
     "outputs/n5_baseline_dt.math/chatgpt1106/rims_math_p2c-cot.pal-p2c.pal-cot__.txt/n5_rims_T0.2.jsonl")],
"ocw_n10_rims":
     [("outputs_dgx/ocw_course_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.pal-cot__.txt/n10_rims_T0.5.jsonl",
      "outputs/n10_baseline_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.pal-cot__.txt/n10_rims_T0.5.jsonl"),
     ("outputs_dgx/ocw_course_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.pal-cot__.txt/n10_rims_T0.2.jsonl",
      "outputs/n10_baseline_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.pal-cot__.txt/n10_rims_T0.2.jsonl"),
     ("outputs_dgx/ocw_course_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.cot-p2c__.txt/n10_rims_T0.5.jsonl",
      "outputs/n10_baseline_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.cot-p2c__.txt/n10_rims_T0.5.jsonl"),
     ("outputs_dgx/ocw_course_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.cot-p2c__.txt/n10_rims_T0.2.jsonl",
      "outputs/n10_baseline_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.cot-p2c__.txt/n10_rims_T0.2.jsonl")],

"gsm_n15_rims":
    [("outputs_dgx/gsm8K_test_dt.gsm/chatgpt1106/rewrote.p2c_gsm_pal2p2c.cot2p2c.cot2pal.txt/n15_rims_T0.2.jsonl",
      "outputs/n15_baseline_dt.gsm/chatgpt1106/rewrote.p2c_gsm_pal2p2c.cot2p2c.cot2pal.txt/n15_rims_T0.2.jsonl"),
    ("outputs_dgx/gsm8K_test_dt.gsm/chatgpt1106/rewrote.p2c_gsm_pal2p2c.cot2p2c.cot2pal.txt/n15_rims_T0.5.jsonl",
      "outputs/n15_baseline_dt.gsm/chatgpt1106/rewrote.p2c_gsm_pal2p2c.cot2p2c.cot2pal.txt/n15_rims_T0.5.jsonl"),
    ("outputs_dgx/gsm8K_test_dt.gsm/chatgpt1106/rewrote.p2c_gsm_newer_best_p2c2cot.pal2p2c.pal2cot.txt/n15_rims_T0.2.jsonl",
      "outputs/n15_baseline_dt.gsm/chatgpt1106/rewrote.p2c_gsm_newer_best_p2c2cot.pal2p2c.pal2cot.txt/n15_rims_T0.2.jsonl"),
    ("outputs_dgx/gsm8K_test_dt.gsm/chatgpt1106/rewrote.p2c_gsm_newer_best_p2c2cot.pal2p2c.pal2cot.txt/n15_rims_T0.5.jsonl",
      "outputs/n15_baseline_dt.gsm/chatgpt1106/rewrote.p2c_gsm_newer_best_p2c2cot.pal2p2c.pal2cot.txt/n15_rims_T0.5.jsonl"),
    ("outputs_dgx/gsm8K_test_dt.gsm/chatgpt1106/rewrote.p2c_gsm_cot2p2c.pal2cot.pal2p2c.txt/n15_rims_T0.2.jsonl",
      "outputs/n15_baseline_dt.gsm/chatgpt1106/rewrote.p2c_gsm_cot2p2c.pal2cot.pal2p2c.txt/n15_rims_T0.2.jsonl"),
    ("outputs_dgx/gsm8K_test_dt.gsm/chatgpt1106/rewrote.p2c_gsm_cot2p2c.pal2cot.pal2p2c.txt/n15_rims_T0.5.jsonl",
      "outputs/n15_baseline_dt.gsm/chatgpt1106/rewrote.p2c_gsm_cot2p2c.pal2cot.pal2p2c.txt/n15_rims_T0.5.jsonl")],

"ocw_n5_rims":
       [("outputs/n5_baseline_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.pal-cot__.txt/n5_rims_T0.5.jsonl",
     "outputs/ocw_course_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.pal-cot__.txt/n5_rims_T0.5.jsonl"),
       ("outputs/n5_baseline_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.pal-cot__.txt/n5_rims_T0.2.jsonl",
     "outputs/ocw_course_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.pal-cot__.txt/n5_rims_T0.2.jsonl"),
       ("outputs/n5_baseline_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.cot-p2c__.txt/n5_rims_T0.5.jsonl",
     "outputs/ocw_course_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.cot-p2c__.txt/n5_rims_T0.5.jsonl"),
       ("outputs/n5_baseline_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.cot-p2c__.txt/n5_rims_T0.2.jsonl",
     "outputs/ocw_course_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.cot-p2c__.txt/n5_rims_T0.2.jsonl")],
}

def get_prefix(path):
    return Path(path).parts[1]


ROOTDIR = Path("outputs/0_final_results")

for key, pairs in JSLF_PAIRS.items():
    for jslf1, jslf2 in pairs:
        jslf1, jslf2 = Path(jslf1), Path(jslf2)
        assert Path(jslf1).name == Path(jslf2).name

        for f in [jslf1, jslf2]:
            prefix = get_prefix(f)
            if prefix.startswith("ocw_course") or prefix.startswith("MATH-full") or prefix.startswith("gsm8K_test"):
                newpathparts = f.parts[1:]
                newpath = ROOTDIR/"/".join(newpathparts)
                newpath.parent.mkdir(parents=True, exist_ok=True)
                break
        assert prefix.startswith("ocw_course") or prefix.startswith("MATH-full") or prefix.startswith("gsm8K_test")
        
        merged = check_and_merge_results(jslf1, jslf2)
        with jsl.open(newpath, "w") as writer:
            writer.write_all(merged)
            print(f"wrote {len(merged)} rows to \n\t{newpath}")

ValueError: DataFrame constructor not properly called!

In [10]:
math_10_files = [
    "outputs/MATH-full_dt.math/chatgpt1106/model_selection_prompts/n10_baseline.jsonl",
    "outputs/MATH-full_dt.math/chatgpt1106/model_selection_prompts/n10_baseline.jsonl_leftovers",
    "outputs/MATH-full_dt.math/chatgpt1106/model_selection_prompts/n10_baseline.jsonl_leftovers_",
    "outputs/MATH-full_dt.math/chatgpt1106/model_selection_prompts/n10_baseline.jsonl_leftovers__",
]
dataset = "../dataset/MATH/MATH-full.jsonl"

anchor = pd.read_json(math_10_files[0], lines=True)
for f in math_10_files[1:]:
    merged = check_and_merge_results(anchor, f)

df_done = pd.DataFrame(merged)

todo_mask = ~(anchor.question.isin(df_done.question))
anchor_todo = anchor[todo_mask]

# split anchor dataframe into 15 equal-lengthed dataframes
anchor_todo_split = chunkify(anchor_todo, 15)
root = Path("outputs/MATH-full_dt.math/chatgpt1106/model_selection_prompts/")
for i, df in enumerate(anchor_todo_split):
    df.to_json(root/f"todo_{i}.jsonl", orient="records", lines=True)
    print(f"written {len(df)} rows to \n\t{root/f'anchor_todo_{i}.jsonl'}")


# split full set into 15 equal-lengthed dataframes 
anchor_split = chunkify(anchor, 15)
root = Path("../dataset/MATH/")
for i, df in enumerate(anchor_split):
    df.to_json(root/f"math_pt_{i}.jsonl", orient="records", lines=True)
    print(f"written {len(df)} rows to \n\t{root/f'anchor_{i}.jsonl'}")

dropped errors (jslf1, jslf2)
0 558
0 12
overlap.sum()=0 rows
dropped errors (jslf1, jslf2)
0 558
0 708
overlap.sum()=0 rows
dropped errors (jslf1, jslf2)
0 558
0 711
overlap.sum()=0 rows
written 0 rows to 
	outputs/MATH-full_dt.math/chatgpt1106/model_selection_prompts/anchor_todo_0.jsonl
written 0 rows to 
	outputs/MATH-full_dt.math/chatgpt1106/model_selection_prompts/anchor_todo_1.jsonl
written 0 rows to 
	outputs/MATH-full_dt.math/chatgpt1106/model_selection_prompts/anchor_todo_2.jsonl
written 0 rows to 
	outputs/MATH-full_dt.math/chatgpt1106/model_selection_prompts/anchor_todo_3.jsonl
written 0 rows to 
	outputs/MATH-full_dt.math/chatgpt1106/model_selection_prompts/anchor_todo_4.jsonl
written 0 rows to 
	outputs/MATH-full_dt.math/chatgpt1106/model_selection_prompts/anchor_todo_5.jsonl
written 0 rows to 
	outputs/MATH-full_dt.math/chatgpt1106/model_selection_prompts/anchor_todo_6.jsonl
written 0 rows to 
	outputs/MATH-full_dt.math/chatgpt1106/model_selection_prompts/anchor_todo_7.js

In [20]:
anchor[2:4]

Unnamed: 0,question,level,type,solution,answer,ansmap,solmap,plan,error,error_msg,running_at,majority_ans,idx2chosen_method,majvote_ans,candid_answers,inference_mode,dataset_type,prompt_file,temperatures
2,Find $x$ such that $\lceil x \rceil + x = \dfr...,Level 4,Algebra,"First, we note that $x$ must be positive, sinc...",\dfrac{9}{7},"{'cot': ['$\boxed{\dfrac{2}{7}}$', '$. So, $'...","{'cot': ['To solve for $x$, we first notice th...",[1. Rewrite the equation with the ceiling func...,False,,baseline_complete_row,"[$x = \boxed{\frac{2}{7}}$, $\boxed{\frac{2}{7...","{'2': 'cot', '3': 'cot', '4': 'pal', '9': 'cot'}","[$\boxed{\dfrac{2}{7}}$, $.\n\nSo, $, None, No...","[$\boxed{\dfrac{2}{7}}$, $.\n\nSo, $, $x = \bo...","[majority_vote, majority_vote, model-selection...",math,prompt_construction_src/newer_prompts_3/model_...,"{'cot_temperature': 0.5, 'pal_temperature': 0...."
3,Evaluate $i^5+i^{-25}+i^{45}$.,Level 5,Algebra,We have $i^5 = i^4\cdot i = 1\cdot (i) = i$. ...,i,"{'cot': ['$\boxed{0}$', '$$i^5 + i^{-25} + i^{...","{'cot': ['To evaluate the given expression, we...",[1. Calculate the value of each term.\n2. Add ...,False,,baseline_complete_row,"[1j, 1j]",{},"[1j, 1j, 1j, 1j, 1j, 1j, 1j, 1j, 1j, 1j]","[1j, 1j, 1j, 1j, 1j, 1j, 1j, 1j, 1j, 1j]","[majority_vote, majority_vote, majority_vote, ...",math,prompt_construction_src/newer_prompts_3/model_...,"{'cot_temperature': 0.5, 'pal_temperature': 0...."


In [5]:
# make n15/n5/n10 results from other rims files
'''
outputs/0_final_results/MATH-full_dt.math/chatgpt1106/model_selection_prompts/n5_baseline.jsonl

outputs/0_final_results/MATH-full_dt.math/chatgpt1106/rims_math_p2c-cot.pal-p2c.pal-cot__.txt1/n5_rims_T0.5.jsonl
outputs/0_final_results/MATH-full_dt.math/chatgpt1106/rims_math_p2c-cot.pal-p2c.pal-cot__.txt1/n5_rims_T0.2.jsonl
outputs/0_final_results/MATH-full_dt.math/chatgpt1106/rims_math_p2c-cot.pal-p2c.pal-cot__.txt/n5_rims_T0.5.jsonl
outputs/0_final_results/MATH-full_dt.math/chatgpt1106/rims_math_p2c-cot.pal-p2c.pal-cot__.txt/n5_rims_T0.2.jsonl

outputs/0_final_results/gsm8K_test_dt.gsm/chatgpt1106/rewrote.p2c_gsm_pal2p2c.cot2p2c.cot2pal.txt/n15_rims_T0.2.jsonl
outputs/0_final_results/gsm8K_test_dt.gsm/chatgpt1106/rewrote.p2c_gsm_pal2p2c.cot2p2c.cot2pal.txt/n15_rims_T0.5.jsonl
outputs/0_final_results/gsm8K_test_dt.gsm/chatgpt1106/rewrote.p2c_gsm_newer_best_p2c2cot.pal2p2c.pal2cot.txt/n15_rims_T0.2.jsonl
outputs/0_final_results/gsm8K_test_dt.gsm/chatgpt1106/rewrote.p2c_gsm_newer_best_p2c2cot.pal2p2c.pal2cot.txt/n15_rims_T0.5.jsonl
outputs/0_final_results/gsm8K_test_dt.gsm/chatgpt1106/rewrote.p2c_gsm_cot2p2c.pal2cot.pal2p2c.txt/n15_rims_T0.2.jsonl
outputs/0_final_results/gsm8K_test_dt.gsm/chatgpt1106/rewrote.p2c_gsm_cot2p2c.pal2cot.pal2p2c.txt/n15_rims_T0.5.jsonl

outputs/0_final_results/gsm8K_test_dt.gsm/chatgpt1106/model_selection_prompts/chatgpt1106_gsm_simplegreedy_sc5.jsonl
outputs/0_final_results/gsm8K_test_dt.gsm/chatgpt1106/model_selection_prompts/chatgpt1106_gsm_simplegreedy_sc10.jsonl
outputs/0_final_results/gsm8K_test_dt.gsm/chatgpt1106/model_selection_prompts/n15_baseline.jsonl

outputs/0_final_results/ocw_course_dt.ocw/chatgpt1106/model_selection_prompts/chatgpt1106_ocw_simplegreedy_sc15.jsonl
outputs/0_final_results/ocw_course_dt.ocw/chatgpt1106/model_selection_prompts/n5_baseline.jsonl
outputs/0_final_results/ocw_course_dt.ocw/chatgpt1106/model_selection_prompts/n10_baseline.jsonl

outputs/0_final_results/ocw_course_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.pal-cot__.txt/n5_rims_T0.5.jsonl
outputs/0_final_results/ocw_course_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.pal-cot__.txt/n10_rims_T0.5.jsonl
outputs/0_final_results/ocw_course_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.pal-cot__.txt/n5_rims_T0.7.jsonl
outputs/0_final_results/ocw_course_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.pal-cot__.txt/n10_rims_T0.2.jsonl
outputs/0_final_results/ocw_course_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.pal-cot__.txt/n5_rims_T0.2.jsonl
outputs/0_final_results/ocw_course_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.cot-p2c__.txt/n5_rims_T0.5.jsonl
outputs/0_final_results/ocw_course_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.cot-p2c__.txt/n10_rims_T0.5.jsonl
outputs/0_final_results/ocw_course_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.cot-p2c__.txt/n10_rims_T0.2.jsonl
outputs/0_final_results/ocw_course_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.cot-p2c__.txt/n5_rims_T0.2.jsonl

'''

'0_final_results'

In [None]:
# run_evaluation_new_n.py script for all.

# find outputs/0_final_results/**/*ocw*/** -name "*.jsonl" > out1
# find outputs/0_final_results/**/*MATH*/** -name "*.jsonl" >> out1
# find outputs/0_final_results/**/*gsm*/** -name "*.jsonl"  >> out1

4_eval.sh


In [None]:
# re-organize the result files for later and analyses


In [None]:
# prepare full script for sjjung to run 