# Init

In [None]:
import pandas as pd
import glob

In [None]:
# Level whose attempts are analysed; used as the sub-directory under "attempts/".
level = "l1"
# Model whose attempt files are loaded; used as the file-name prefix in the globs.
model = "gpt4"

In [None]:
# Load every attempt file matching attempts/{level}/{model}* into one frame.
# Files are ordered by the integer found between the last "_" and the next "."
# of their path, and each file's rows are tagged with a 1-based "test" number.
files = sorted(
    glob.glob(f"attempts/{level}/{model}*"),
    key=lambda x: int(x.split("_")[-1].split(".")[0]),
)
if not files:
    # Fail with a clear message instead of a KeyError on the "test" column below.
    raise FileNotFoundError(f"no attempt files match attempts/{level}/{model}*")
# Concatenate once: growing the frame inside the loop re-copies all accumulated
# rows on every iteration and starts from an empty DataFrame.
df = pd.concat(
    [pd.read_csv(fp).assign(test=i) for i, fp in enumerate(files, start=1)],
    ignore_index=True,
)
# Displayed as the cell result: the test numbers that were loaded.
df["test"].unique()

In [None]:
# Load every attempt file matching attempts/{level}/cot/{model}* into one frame.
# Files are ordered by the integer found between the last "_" and the next "."
# of their path, and each file's rows are tagged with a 1-based "test" number.
files = sorted(
    glob.glob(f"attempts/{level}/cot/{model}*"),
    key=lambda x: int(x.split("_")[-1].split(".")[0]),
)
if not files:
    # Fail with a clear message instead of a KeyError on the "test" column below.
    raise FileNotFoundError(f"no attempt files match attempts/{level}/cot/{model}*")
# Concatenate once: growing the frame inside the loop re-copies all accumulated
# rows on every iteration and starts from an empty DataFrame.
dfcot = pd.concat(
    [pd.read_csv(fp).assign(test=i) for i, fp in enumerate(files, start=1)],
    ignore_index=True,
)
# Displayed as the cell result: the test numbers that were loaded.
dfcot["test"].unique()

# Results

## By chapter

In [None]:
# Questions per chapter: number of rows in df for each distinct chapter_name
# (counted over every loaded test file, not per test).
df["chapter_name"].value_counts()

In [None]:
# Chapter % of test: each chapter's row count as a percentage of all rows in df.
df["chapter_name"].value_counts() / len(df) * 100

In [None]:
# No CoT
# One line per chapter of df: "<chapter>: correct/total  accuracy%".
# Labels are padded to the longest chapter name plus four so the columns align.
spacing = 4 + max(len(name) for name in df["chapter_name"].unique())
for chapter in df["chapter_name"].unique():
    df_chapter = df.loc[df["chapter_name"] == chapter]
    # A row counts as correct when its "correct" column holds the string "yes".
    num_correct = int((df_chapter["correct"] == "yes").sum())
    score = num_correct / len(df_chapter)
    spacer = " " * (spacing - len(chapter))

    line = f"  {chapter}:{spacer}{num_correct:>3}/{len(df_chapter):<3}  {score*100:.2f}%"
    print(line)

In [None]:
# CoT
# One line per chapter of dfcot: "<chapter>: correct/total  accuracy%".
# Labels are padded to the longest chapter name plus four so the columns align.
spacing = 4 + max(len(name) for name in dfcot["chapter_name"].unique())
for chapter in dfcot["chapter_name"].unique():
    dfcot_chapter = dfcot.loc[dfcot["chapter_name"] == chapter]
    # A row counts as correct when its "correct" column holds the string "yes".
    num_correct = int((dfcot_chapter["correct"] == "yes").sum())
    score = num_correct / len(dfcot_chapter)
    spacer = " " * (spacing - len(chapter))

    line = f"  {chapter}:{spacer}{num_correct:>3}/{len(dfcot_chapter):<3}  {score*100:.2f}%"
    print(line)

In [None]:
# Compare per-chapter accuracy of the no-CoT frame (df) and the CoT frame (dfcot).
# Chapters and label width are taken from dfcot; each line shows both rates and
# a "<", ">" or "=" symbol relating the no-CoT rate to the CoT rate.
spacing = max(map(len, dfcot["chapter_name"].unique())) + 4
for chapter in dfcot["chapter_name"].unique():
    df_chapter = df[df["chapter_name"] == chapter]
    dfcot_chapter = dfcot[dfcot["chapter_name"] == chapter]
    num_correct = int((df_chapter["correct"] == "yes").sum())
    num_correct_cot = int((dfcot_chapter["correct"] == "yes").sum())
    # Each rate is divided by its OWN frame's row count. The no-CoT rate was
    # previously divided by the CoT row count, which is wrong whenever the two
    # frames hold a different number of rows for this chapter.
    # A chapter absent from df keeps the previous output of 0.00% rather than
    # dividing by zero.
    score = num_correct / len(df_chapter) if len(df_chapter) else 0.0
    score_cot = num_correct_cot / len(dfcot_chapter)
    spacer = " " * (spacing - len(chapter))

    symbol = "<" if score < score_cot else ">" if score > score_cot else "="
    print(
        f"  {chapter}:{spacer}  noCoT ({score*100:.2f}%)   {symbol}  CoT ({score_cot*100:.2f}%)"
    )

In [None]:
# Total number of rows in the CoT frame (all loaded CoT test files combined).
len(dfcot)

In [None]:
# Column names available in the CoT frame.
dfcot.columns

In [None]:
# Distinct chapter names in the CoT frame, in order of first appearance.
list(dfcot["chapter_name"].unique())

In [None]:
# Rows of dfcot where the CoT attempt was wrong ("no") while the row with the
# same index in df (no CoT) was right ("yes").
# NOTE(review): the two conditions are combined by row index, so this assumes
# df and dfcot hold the same questions in the same order — confirm.
# .copy() makes the selection an independent frame, so the columns added to it
# further down do not trigger pandas' SettingWithCopyWarning.
dfcot_wrong = dfcot[(dfcot["correct"] == "no") & (df["correct"] == "yes")].copy()
print(f"cot got {len(dfcot_wrong)} wrong nocot got right")
# Break the count down by chapter, in order of first appearance.
for chapter in dfcot_wrong["chapter_name"].unique():
    print(f"{len(dfcot_wrong[dfcot_wrong['chapter_name'] == chapter])} in {chapter}")

In [None]:
# Columns printed for each reviewed row, in this order.
cols = [
    "id",
    "test",
    "chapter_name",
    "question",
    "choice_a",
    "choice_b",
    "choice_c",
    "answer",
    "explanation",
    "guess",
    "correct",
    "thinking",
]
# Chapters to include in the dump; rows from any other chapter are skipped.
chapters = [
    "Quantitative Methods",
    "Economics",
    "Financial Statement Analysis",
    "Corporate Issuers",
    "Ethics",
    "Equity",
    "Fixed Income",
    "Derivatives",
    "Alternative Investments",
    "Portfolio Management",
]

lookat = dfcot_wrong[dfcot_wrong["chapter_name"].isin(chapters)]

# Dump every selected row as markdown-style text: a "# id <value>" heading, then
# a "### <column>" heading followed by the value for each remaining column.
for _, row in lookat.iterrows():
    for col, val in row[cols].items():
        if col == "id":
            print("#", col, val)
        else:
            print("###", col)
            # isinstance is the idiomatic type check (and accepts str subclasses).
            # Paragraph tags are stripped from strings; other values print as-is.
            if isinstance(val, str):
                print(val.replace("<p>", "").replace("</p>", ""))
            else:
                print(val)
        print()

In [None]:
# get error analysis from markdown
#
# Layout as read by the loop below: a line starting with "# id" opens the next
# record; the line right after a "### error_kind" heading is stored as that
# record's error_kind, and the line right after a "### error" heading as its
# error. Records are matched to rows of dfcot_wrong by position.

# make sure index is reset; rebinding to the returned frame (rather than
# inplace=True on a filtered selection) yields an independent frame, so the
# column assignments below do not trigger SettingWithCopyWarning
dfcot_wrong = dfcot_wrong.reset_index()
dfcot_wrong["error_kind"] = ""
dfcot_wrong["error"] = ""
with open("analysis/gpt4_cot_wrong_nocot_right.md", encoding="utf-8") as f:
    idx = -1
    # Column that the NEXT line should be written to, or None if no heading
    # is waiting for its value.
    pending = None
    for line in f:
        if pending is not None:
            dfcot_wrong.loc[idx, pending] = line.strip()
            pending = None
        if line.startswith("# id"):
            idx += 1
        # "### error_kind" must be tested first: it also starts with "### error".
        if line.startswith("### error_kind"):
            pending = "error_kind"
        elif line.startswith("### error"):
            pending = "error"

In [None]:
# Number of rows where CoT was wrong and no-CoT was right.
len(dfcot_wrong)

In [None]:
# Share of each error_kind value among the rows of dfcot_wrong, as a percentage.
dfcot_wrong["error_kind"].value_counts() / len(dfcot_wrong) * 100

In [None]:
# Percentage of dfcot_wrong that 2 rows represent.
# NOTE(review): the literal 2 is presumably a count read off the breakdown
# above — confirm which category it refers to.
2/len(dfcot_wrong) * 100