In [5]:
# corrected baselines
ocw_ok_5 = "outputs/n5_baseline_dt.ocw/chatgpt1106/model_selection_prompts/n5_baseline.jsonl"
ocw_ok_10 = "outputs/n10_baseline_dt.ocw/chatgpt1106/model_selection_prompts/n10_baseline.jsonl"
gsm_ok_15 = "outputs/n15_baseline_dt.gsm/chatgpt1106/model_selection_prompts/n15_baseline.jsonl"
math_ok_5 = "outputs/n5_baseline_dt.math/chatgpt1106/model_selection_prompts/n5_baseline.jsonl"


# leftovers chatgpt1106
GSM15="outputs_dgx/gsm8K_test_dt.gsm/chatgpt1106/model_selection_prompts/err_n15_baseline.jsonl"
OCW10="outputs_dgx/ocw_course_dt.ocw/chatgpt1106/model_selection_prompts/err_n10_baseline.jsonl"
OCW5="outputs/ocw_course_dt.ocw/chatgpt1106/model_selection_prompts/err_n5_baseline.jsonl"
MATH5="outputs/MATH-full_dt.math/chatgpt1106/model_selection_prompts/err_n5_baseline.jsonl"


In [8]:
import pandas as pd
import jsonlines as jsl

right_err_pairs = [
    (ocw_ok_5, OCW5),
    (ocw_ok_10, OCW10),
    (gsm_ok_15, GSM15),
    (math_ok_5, MATH5)
]

df_r_e_pairs = [
    (pd.DataFrame(jsl.open(right)), pd.DataFrame(jsl.open(err))) for right, err in right_err_pairs
]

# solmap, ansmap empty checked again
df_r_e_pairs[0][1].solmap

0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
5   NaN
6   NaN
7   NaN
Name: solmap, dtype: float64

In [12]:
sanity_check = [e["question" if "question" in e.columns else "problem"].isin(r["question" if "question" in e.columns else "problem"]).mean() for r,e in df_r_e_pairs]
sanity_check # some of those were error pruned rows

[1.0, 1.0, 0.8181818181818182, 0.9866666666666667]

In [30]:
# make the files to run rims
from pathlib import Path

def pick_rows(src:pd.DataFrame=None, by:pd.DataFrame=None)->pd.DataFrame:
    q_key = "question" if "question" in src.columns else "problem"
    return src[src[q_key].isin(by[q_key])]

target_paths = [ Path(r).parent/Path(r).name.replace("_baseline", "_baseline_picked") for r,e in right_err_pairs]

contents = [pick_rows(src=df_r, by=df_e) for df_r, df_e in df_r_e_pairs]

for path, content in zip(target_paths, contents):
    with jsl.open(path, "w") as writer:
        writer.write_all(content.to_dict(orient="records")) 
        print(f"{len(content)} rows to \n\t{path}")



8 rows to 
	outputs/n5_baseline_dt.ocw/chatgpt1106/model_selection_prompts/n5_baseline_picked.jsonl
23 rows to 
	outputs/n10_baseline_dt.ocw/chatgpt1106/model_selection_prompts/n10_baseline_picked.jsonl
18 rows to 
	outputs/n15_baseline_dt.gsm/chatgpt1106/model_selection_prompts/n15_baseline_picked.jsonl
296 rows to 
	outputs/n5_baseline_dt.math/chatgpt1106/model_selection_prompts/n5_baseline_picked.jsonl


In [31]:
original_err_rows = [len(e) for r,e in df_r_e_pairs]
original_err_rows # 0, 0, 4, 4 loss. acceptable

[8, 23, 22, 300]

In [32]:
cmds = [] 
for gsm_jslf, n, dataset_type in zip(target_paths, [5, 10, 15, 5], ["ocw", "ocw", "gsm", "math"]):
    cmd = f"python run_inference.py rims_inference \\\n \
            --backbone chatgpt1106 \\\n \
            --gsm_jslf {gsm_jslf} \\\n \
            --n {n} \\\n \
            --dataset_type {dataset_type} \\\n \
            --n_jobs 8"
    cmds.append(cmd)
    print(cmd)

python run_inference.py rims_inference \
             --backbone chatgpt1106 \
             --gsm_jslf outputs/n5_baseline_dt.ocw/chatgpt1106/model_selection_prompts/n5_baseline_picked.jsonl \
             --n 5 \
             --dataset_type ocw \
             --n_jobs 8
python run_inference.py rims_inference \
             --backbone chatgpt1106 \
             --gsm_jslf outputs/n10_baseline_dt.ocw/chatgpt1106/model_selection_prompts/n10_baseline_picked.jsonl \
             --n 10 \
             --dataset_type ocw \
             --n_jobs 8
python run_inference.py rims_inference \
             --backbone chatgpt1106 \
             --gsm_jslf outputs/n15_baseline_dt.gsm/chatgpt1106/model_selection_prompts/n15_baseline_picked.jsonl \
             --n 15 \
             --dataset_type gsm \
             --n_jobs 8
python run_inference.py rims_inference \
             --backbone chatgpt1106 \
             --gsm_jslf outputs/n5_baseline_dt.math/chatgpt1106/model_selection_prompts/n5_baseline

In [1]:
# FIXED_BASELINES --> no need to merge. 

mkdir -p outputs/0_final_results/ocw_course_dt.ocw/chatgpt1106/model_selection_prompts/
mkdir -p outputs/0_final_results/gsm8K_test_dt.gsm/chatgpt1106/model_selection_prompts/
mkdir -p outputs/0_final_results/MATH-full_dt.math/chatgpt1106/model_selection_prompts/

ocw_ok_5=outputs/n5_baseline_dt.ocw/chatgpt1106/model_selection_prompts/n5_baseline.jsonl
ocw_ok_5_to=outputs/0_final_results/ocw_course_dt.ocw/chatgpt1106/model_selection_prompts/n5_baseline.jsonl
cp $ocw_ok_5 $ocw_ok_5_to

ocw_ok_10=outputs/n10_baseline_dt.ocw/chatgpt1106/model_selection_prompts/n10_baseline.jsonl
ocw_ok_10_to=outputs/0_final_results/ocw_course_dt.ocw/chatgpt1106/model_selection_prompts/n10_baseline.jsonl
cp $ocw_ok_10 $ocw_ok_10_to

gsm_ok_15=outputs/n15_baseline_dt.gsm/chatgpt1106/model_selection_prompts/n15_baseline.jsonl
gsm_ok_15_to=outputs/0_final_results/gsm8K_test_dt.gsm/chatgpt1106/model_selection_prompts/n15_baseline.jsonl
cp $gsm_ok_15 $gsm_ok_15_to

math_ok_5=outputs/n5_baseline_dt.math/chatgpt1106/model_selection_prompts/n5_baseline.jsonl
math_ok_5_to=outputs/0_final_results/MATH-full_dt.math/chatgpt1106/model_selection_prompts/n5_baseline.jsonl
cp $math_ok_5 $math_ok_5_to


SyntaxError: invalid decimal literal (3381292144.py, line 4)

In [6]:

# FIXED + ORIGINAL RIMS 
# FIXED: find outputs/ -name "n*baseline_dt*/**/*rims*.jsonl" 
# ORIGINAL_RIMS results files
"""

find **/chatgpt1106 -name "n*rims_T*.jsonl"

src/results_paths.txt
"""

import pandas as pd 
import jsonlines as jsl
from pathlib import Path

# original rims results files + picked files 
def check_and_merge_results(jslf1, jslf2):
    """
    usually the jsonl records does not expected to overlap, but math result do. 
    I guess this comes from error: true with different reason than 429 error in math (rims run on ~error)
    """
    # load and drop error lines first: expects most of overlaps will disappear.
    df1, df2 = pd.read_json(jslf1, lines=True), pd.read_json(jslf2, lines=True)
    df1_ = df1[~(df1.error)]
    df2_ = df2[~(df2.error)]
    print("dropped errors (jslf1, jslf2)")
    print(-len(df1_)+len(df1), len(df1))
    print(-len(df2_)+len(df2), len(df2))

    # check overlaps 
    q_key = "question" if "question" in df1.columns else "problem"  
    overlap = (df1_[q_key].isin(df2_[q_key]))
    print(f"{overlap.sum()=} rows")
    df1__ = df1_[~overlap]

    # merge
    df_merged = pd.concat([df1__, df2_], axis="index")    
    
    return df_merged.to_dict(orient="records")


# to merge rows... (from results_paths.txt)
JSLF_PAIRS = {
    "math_n5_rims":
    [("outputs/MATH-full_dt.math/chatgpt1106/rims_math_p2c-cot.pal-p2c.pal-cot__.txt1/n5_rims_T0.5.jsonl",
     "outputs/n5_baseline_dt.math/chatgpt1106/rims_math_p2c-cot.pal-p2c.pal-cot__.txt1/n5_rims_T0.5.jsonl"),
    ("outputs/MATH-full_dt.math/chatgpt1106/rims_math_p2c-cot.pal-p2c.pal-cot__.txt1/n5_rims_T0.2.jsonl",
     "outputs/n5_baseline_dt.math/chatgpt1106/rims_math_p2c-cot.pal-p2c.pal-cot__.txt1/n5_rims_T0.2.jsonl"),
    ("outputs/MATH-full_dt.math/chatgpt1106/rims_math_p2c-cot.pal-p2c.pal-cot__.txt/n5_rims_T0.5.jsonl",
     "outputs/n5_baseline_dt.math/chatgpt1106/rims_math_p2c-cot.pal-p2c.pal-cot__.txt/n5_rims_T0.5.jsonl"),
    ("outputs/MATH-full_dt.math/chatgpt1106/rims_math_p2c-cot.pal-p2c.pal-cot__.txt/n5_rims_T0.2.jsonl",
     "outputs/n5_baseline_dt.math/chatgpt1106/rims_math_p2c-cot.pal-p2c.pal-cot__.txt/n5_rims_T0.2.jsonl")],
"ocw_n10_rims":
     [("outputs_dgx/ocw_course_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.pal-cot__.txt/n10_rims_T0.5.jsonl",
      "outputs/n10_baseline_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.pal-cot__.txt/n10_rims_T0.5.jsonl"),
     ("outputs_dgx/ocw_course_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.pal-cot__.txt/n10_rims_T0.2.jsonl",
      "outputs/n10_baseline_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.pal-cot__.txt/n10_rims_T0.2.jsonl"),
     ("outputs_dgx/ocw_course_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.cot-p2c__.txt/n10_rims_T0.5.jsonl",
      "outputs/n10_baseline_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.cot-p2c__.txt/n10_rims_T0.5.jsonl"),
     ("outputs_dgx/ocw_course_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.cot-p2c__.txt/n10_rims_T0.2.jsonl",
      "outputs/n10_baseline_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.cot-p2c__.txt/n10_rims_T0.2.jsonl")],

"gsm_n15_rims":
    [("outputs_dgx/gsm8K_test_dt.gsm/chatgpt1106/rewrote.p2c_gsm_pal2p2c.cot2p2c.cot2pal.txt/n15_rims_T0.2.jsonl",
      "outputs/n15_baseline_dt.gsm/chatgpt1106/rewrote.p2c_gsm_pal2p2c.cot2p2c.cot2pal.txt/n15_rims_T0.2.jsonl"),
    ("outputs_dgx/gsm8K_test_dt.gsm/chatgpt1106/rewrote.p2c_gsm_pal2p2c.cot2p2c.cot2pal.txt/n15_rims_T0.5.jsonl",
      "outputs/n15_baseline_dt.gsm/chatgpt1106/rewrote.p2c_gsm_pal2p2c.cot2p2c.cot2pal.txt/n15_rims_T0.5.jsonl"),
    ("outputs_dgx/gsm8K_test_dt.gsm/chatgpt1106/rewrote.p2c_gsm_newer_best_p2c2cot.pal2p2c.pal2cot.txt/n15_rims_T0.2.jsonl",
      "outputs/n15_baseline_dt.gsm/chatgpt1106/rewrote.p2c_gsm_newer_best_p2c2cot.pal2p2c.pal2cot.txt/n15_rims_T0.2.jsonl"),
    ("outputs_dgx/gsm8K_test_dt.gsm/chatgpt1106/rewrote.p2c_gsm_newer_best_p2c2cot.pal2p2c.pal2cot.txt/n15_rims_T0.5.jsonl",
      "outputs/n15_baseline_dt.gsm/chatgpt1106/rewrote.p2c_gsm_newer_best_p2c2cot.pal2p2c.pal2cot.txt/n15_rims_T0.5.jsonl"),
    ("outputs_dgx/gsm8K_test_dt.gsm/chatgpt1106/rewrote.p2c_gsm_cot2p2c.pal2cot.pal2p2c.txt/n15_rims_T0.2.jsonl",
      "outputs/n15_baseline_dt.gsm/chatgpt1106/rewrote.p2c_gsm_cot2p2c.pal2cot.pal2p2c.txt/n15_rims_T0.2.jsonl"),
    ("outputs_dgx/gsm8K_test_dt.gsm/chatgpt1106/rewrote.p2c_gsm_cot2p2c.pal2cot.pal2p2c.txt/n15_rims_T0.5.jsonl",
      "outputs/n15_baseline_dt.gsm/chatgpt1106/rewrote.p2c_gsm_cot2p2c.pal2cot.pal2p2c.txt/n15_rims_T0.5.jsonl")],

"ocw_n5_rims":
       [("outputs/n5_baseline_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.pal-cot__.txt/n5_rims_T0.5.jsonl",
     "outputs/ocw_course_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.pal-cot__.txt/n5_rims_T0.5.jsonl"),
       ("outputs/n5_baseline_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.pal-cot__.txt/n5_rims_T0.2.jsonl",
     "outputs/ocw_course_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.pal-cot__.txt/n5_rims_T0.2.jsonl"),
       ("outputs/n5_baseline_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.cot-p2c__.txt/n5_rims_T0.5.jsonl",
     "outputs/ocw_course_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.cot-p2c__.txt/n5_rims_T0.5.jsonl"),
       ("outputs/n5_baseline_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.cot-p2c__.txt/n5_rims_T0.2.jsonl",
     "outputs/ocw_course_dt.ocw/chatgpt1106/rims_ocw_p2c-cot.pal-p2c.cot-p2c__.txt/n5_rims_T0.2.jsonl")],
}

def get_prefix(path):
    return Path(path).parts[1]


ROOTDIR = Path("outputs/0_final_results")

for key, pairs in JSLF_PAIRS.items():
    for jslf1, jslf2 in pairs:
        jslf1, jslf2 = Path(jslf1), Path(jslf2)
        assert Path(jslf1).name == Path(jslf2).name

        for f in [jslf1, jslf2]:
            prefix = get_prefix(f)
            if prefix.startswith("ocw_course") or prefix.startswith("MATH-full") or prefix.startswith("gsm8K_test"):
                newpathparts = f.parts[1:]
                newpath = ROOTDIR/"/".join(newpathparts)
                newpath.parent.mkdir(parents=True, exist_ok=True)
                break
        assert prefix.startswith("ocw_course") or prefix.startswith("MATH-full") or prefix.startswith("gsm8K_test")
        
        merged = check_and_merge_results(jslf1, jslf2)
        with jsl.open(newpath, "w") as writer:
            writer.write_all(merged)
            print(f"wrote {len(merged)} rows to \n\t{newpath}")

TypeError: expected str, bytes or os.PathLike object, not tuple

In [5]:
# make SC15, SC5, 10 with 
# run_modif_SC_results.py
ROOTDIR.parts[1]

'0_final_results'

In [None]:
# run_evaluation_new_n.py script for all.

In [None]:
# re-organize the result files for later and analyses


In [None]:
# prepare full script for sjjung to run 