# Setup

In [None]:
# Notebook line magic: an autosave interval of 0 switches periodic autosave off.
%autosave 0

In [None]:
import numpy as np
import pandas as pd

# Load data

In [None]:
# fmt: off
# CSV location of each attempt file, keyed by a short exam label.
attempt_paths = {
    "wiley": "./attempts/chatgpt_nothought_l1_2017_wiley.csv",
    "frm_am": "./attempts/chatgpt_nothought_l1_2019_frm_am.csv",
    # "frm_am": "./attempts/chatgpt_l1_2019_frm_am.csv",
    "frm_pm": "./attempts/chatgpt_nothought_l1_2019_frm_pm.csv",

    # "pm_investopedia": "./attempts/chatgpt_nothought_investopedia_l1_2019_frm_am.csv",
}
# One DataFrame per label, read in the order listed above.
dfs = {label: pd.read_csv(csv_path) for label, csv_path in attempt_paths.items()}
# fmt: on

# Analysis

## Test Scores

In [None]:
# Print one "<label>: <accuracy>%" line per frame in `dfs`, where accuracy is
# the share of rows whose "correct" column equals "yes".
for name, df in dfs.items():
    # The mean of the boolean mask is the fraction of matching rows. Unlike
    # builtin sum(...) / len(df), it is vectorised and gives NaN rather than
    # raising ZeroDivisionError when a frame has no rows.
    accuracy = (df["correct"] == "yes").mean()
    print(f"{name}: {accuracy * 100:.2f}%")

In [None]:
# Per-label subsets of `dfs` keeping only rows whose "correct" value is "no".
dfs_wrong = {
    label: attempts[attempts["correct"] == "no"] for label, attempts in dfs.items()
}

In [None]:
# Per-label subsets of `dfs` keeping only rows whose "correct" value is "yes".
dfs_right = {
    label: attempts[attempts["correct"] == "yes"] for label, attempts in dfs.items()
}

In [None]:
# Inner-join the wrong-answer frames so that only rows with the same question,
# options, key and guess in every frame survive.
merge_cols = [
    "question",
    "A",
    "B",
    "C",
    "answer",
    "explanation",
    "category",  # not in wiley exam
    "guess",
]
# DataFrame.merge raises KeyError for an `on` column that either side lacks,
# so join only on the requested keys that every frame actually carries.
shared_cols = [
    col
    for col in merge_cols
    if all(col in frame.columns for frame in dfs_wrong.values())
]
same_guess = None
prev = None
for name, df in dfs_wrong.items():
    if same_guess is None:
        # First frame seeds the running join.
        same_guess = df
    else:
        same_guess = same_guess.merge(
            df,
            on=shared_cols,
            how="inner",
            suffixes=("_" + prev, "_" + name),
        )
    prev = name
if same_guess is None:
    raise ValueError("dfs_wrong is empty; there is nothing to merge")
# Suffixes are only applied to colliding column names, so the last frame's
# "correct" column can arrive unsuffixed; label it explicitly. rename() returns
# a new frame, which keeps the frames stored in dfs_wrong untouched when this
# cell is re-run.
same_guess = same_guess.rename(columns={"correct": "correct_" + prev})
same_guess.head()

In [None]:
# Inner-join the wrong-answer frames so that only rows with the same question,
# options, key and "correct" value in every frame survive.
merge_cols = [
    "question",
    "A",
    "B",
    "C",
    "answer",
    "explanation",
    "category",  # not in wiley exam
    "correct",
]
# DataFrame.merge raises KeyError for an `on` column that either side lacks,
# so join only on the requested keys that every frame actually carries.
shared_cols = [
    col
    for col in merge_cols
    if all(col in frame.columns for frame in dfs_wrong.values())
]
same_result = None
prev = None
for name, df in dfs_wrong.items():
    if same_result is None:
        # First frame seeds the running join.
        same_result = df
    else:
        same_result = same_result.merge(
            df,
            on=shared_cols,
            how="inner",
            suffixes=("_" + prev, "_" + name),
        )
    prev = name
if same_result is None:
    raise ValueError("dfs_wrong is empty; there is nothing to merge")
# Suffixes are only applied to colliding column names, so the last frame's
# "guess" column can arrive unsuffixed; label it explicitly. rename() returns
# a new frame, which keeps the frames stored in dfs_wrong untouched when this
# cell is re-run.
same_result = same_result.rename(columns={"guess": "guess_" + prev})
same_result.head()

In [None]:
# peek a wrong answer
# NOTE(review): `chatgptwrong` is not assigned in any cell visible in this
# notebook; confirm where it comes from before running on a fresh kernel.
i = 2
chatgpt_row = chatgptwrong.iloc[i]
# Key and guess first, then the reasoning and the explanation, each preceded
# by a blank line.
for field in ("answer", "guess"):
    print(chatgpt_row[field])
for field in ("thinking", "explanation"):
    print()
    print(chatgpt_row[field])

In [None]:
# Same peek into `gpt4wrong`, at the row position `i` chosen in the cell above.
# NOTE(review): `gpt4wrong` is not assigned in any cell visible in this
# notebook; confirm where it comes from before running on a fresh kernel.
gpt4_row = gpt4wrong.iloc[i]
for field in ("answer", "guess"):
    print(gpt4_row[field])
for field in ("thinking", "explanation"):
    print()
    print(gpt4_row[field])

In [None]:
# Copy the three label columns from the labelled CSV onto both attempt frames.
# NOTE(review): `chatgpt` and `gpt4` are not assigned in any cell visible in
# this notebook; confirm where they come from.
label_cols = ["math", "knowledge", "reasoning"]
labelled = pd.read_csv("data/l1_labelled.csv")
for attempt_frame in (chatgpt, gpt4):
    attempt_frame[label_cols] = labelled[label_cols]

In [None]:
# math
# For chatgpt, then gpt4: percentage of rows selected by the "math" column
# whose "correct" value is "yes".
for model_df in (chatgpt, gpt4):
    math_rows = model_df[model_df["math"]]
    print(sum(math_rows["correct"] == "yes") / len(math_rows) * 100)

In [None]:
# knowledge
# Start from the rows selected by the "knowledge" column and report the
# percentage answered correctly for chatgpt, then gpt4.
print("knowledge")
chatgptknowledge = chatgpt[chatgpt["knowledge"]]
print(sum(chatgptknowledge["correct"] == "yes") / len(chatgptknowledge) * 100)
gpt4knowledge = gpt4[gpt4["knowledge"]]
print(sum(gpt4knowledge["correct"] == "yes") / len(gpt4knowledge) * 100)

# Then narrow both subsets step by step, dropping rows that also carry the
# given label, and report again after each step.
narrowing_steps = (
    ("math", "knowledge - math"),
    ("reasoning", "knowledge - math & reasoning"),
)
for excluded_label, heading in narrowing_steps:
    print()
    print(heading)
    chatgptknowledge = chatgptknowledge[~chatgptknowledge[excluded_label]]
    print(sum(chatgptknowledge["correct"] == "yes") / len(chatgptknowledge) * 100)
    gpt4knowledge = gpt4knowledge[~gpt4knowledge[excluded_label]]
    print(sum(gpt4knowledge["correct"] == "yes") / len(gpt4knowledge) * 100)

In [None]:
# reasoning
# Two passes: rows selected by the "reasoning" column, then the same rows
# without those that also carry the "math" label. Each pass prints the
# percentage answered correctly for chatgpt, then gpt4.
chatgptreasoning = chatgpt[chatgpt["reasoning"]]
gpt4reasoning = gpt4[gpt4["reasoning"]]
for pass_no, heading in enumerate(("reasoning", "reasoning - math")):
    if pass_no:
        print()
        chatgptreasoning = chatgptreasoning[~chatgptreasoning["math"]]
        gpt4reasoning = gpt4reasoning[~gpt4reasoning["math"]]
    print(heading)
    print(sum(chatgptreasoning["correct"] == "yes") / len(chatgptreasoning) * 100)
    print(sum(gpt4reasoning["correct"] == "yes") / len(gpt4reasoning) * 100)

In [None]:
# questions both get wrong
# Join the two attempt frames on the question, its options, key, labels and
# the "correct" flag, so only rows on which both frames agree are paired.
shared_keys = [
    "question",
    "correct",
    "answer",
    "explanation",
    "math",
    "knowledge",
    "reasoning",
    "A",
    "B",
    "C",
]
agreed = chatgpt.merge(gpt4, on=shared_keys, how="inner")
# Of those, keep the rows flagged "no".
bothwrong = agreed[agreed["correct"] == "no"]
print(len(bothwrong))
bothwrong.head()

In [None]:
# Print up to three rows of `bothwrong`: the question, each side's guess with
# its reasoning split into "-" bullets, then the key and explanation.
# head(3) yields however many rows exist, so a frame with fewer than three
# rows no longer raises IndexError the way range(3) with .iloc[i] did.
for _, missed in bothwrong.head(3).iterrows():
    print(missed["question"])
    print()
    # The default merge suffixes mark the left frame as _x and the right as _y.
    for guess_col, thinking_col in (
        ("guess_x", "thinking_x"),
        ("guess_y", "thinking_y"),
    ):
        print(missed[guess_col])
        for line in missed[thinking_col].split("-"):
            print("-", line)
        print()
    print(missed["answer"])
    print(missed["explanation"])
    print()
    print("-" * 80)
    print()

In [None]:
# Percentage of `bothwrong` rows carrying each label, one line per label.
for label in ("math", "knowledge", "reasoning"):
    print(sum(bothwrong[label]) / len(bothwrong) * 100)

In [None]:
# questions both get right
# Join the two attempt frames on the question, its options, key, labels and
# the "correct" flag, so only rows on which both frames agree are paired.
bothright = chatgpt.merge(
    gpt4,
    on=[
        "question",
        "correct",
        "answer",
        "explanation",
        "math",
        "knowledge",
        "reasoning",
        "A",
        "B",
        "C",
    ],
    how="inner",
)
# Filter the merged frame itself and keep the rows flagged "yes". Filtering
# `bothwrong` for "no" here would simply reproduce the shared misses.
bothright = bothright[bothright["correct"] == "yes"]

In [None]:
# Print ten randomly chosen rows (drawn with replacement, unseeded) from the
# `bothwrong` rows selected by the "math" column.
df = bothwrong[bothwrong["math"]]
for _ in range(10):
    i = np.random.randint(len(df))
    picked = df.iloc[i]
    print(f"question {i}")
    for col in ("question", "A", "B", "C"):
        print(picked[col])
    # The column name doubles as the heading printed above its value.
    for col in ("answer", "explanation"):
        print(col)
        print(picked[col])
    for model_label, guess_col, thinking_col in (
        ("gpt3.5", "guess_x", "thinking_x"),
        ("gpt4", "guess_y", "thinking_y"),
    ):
        print()
        print(model_label)
        print(picked[guess_col])
        print(picked[thinking_col])
    print("-" * 80)

In [None]:
# Dump every column of one specific row (position 16) among the `bothwrong`
# rows selected by the "math" column.
math_misses = bothwrong[bothwrong["math"]]
print(math_misses.iloc[16])

In [None]:
# Read the two critique CSVs from the working directory, chatgpt first.
chatgpt_critique, gpt4_critique = (
    pd.read_csv(csv_name) for csv_name in ("chatgpt_critique.csv", "gpt4_critique.csv")
)

In [None]:
# Number of chatgpt critique rows whose "error" value is "math".
chatgpt_is_math = chatgpt_critique["error"] == "math"
len(chatgpt_critique[chatgpt_is_math])

In [None]:
# Number of chatgpt critique rows whose "error" value is "knowledge".
chatgpt_is_knowledge = chatgpt_critique["error"] == "knowledge"
len(chatgpt_critique[chatgpt_is_knowledge])

In [None]:
# Number of chatgpt critique rows whose "error" value is "both".
chatgpt_is_both = chatgpt_critique["error"] == "both"
len(chatgpt_critique[chatgpt_is_both])

In [None]:
# Number of gpt4 critique rows whose "error" value is "math".
gpt4_is_math = gpt4_critique["error"] == "math"
len(gpt4_critique[gpt4_is_math])

In [None]:
# Number of gpt4 critique rows whose "error" value is "knowledge".
gpt4_is_knowledge = gpt4_critique["error"] == "knowledge"
len(gpt4_critique[gpt4_is_knowledge])

In [None]:
# Number of gpt4 critique rows whose "error" value is "both".
gpt4_is_both = gpt4_critique["error"] == "both"
len(gpt4_critique[gpt4_is_both])

In [None]:
# what percentage was each error
# One section per critique frame: a heading, then the share of rows whose
# "error" value is "math", "knowledge" and "both", in that order. Sections are
# separated by a blank line.
critiques_by_model = (("chatgpt", chatgpt_critique), ("gpt4", gpt4_critique))
for section_no, (model_name, critique) in enumerate(critiques_by_model):
    if section_no:
        print()
    print(model_name)
    for error_kind in ("math", "knowledge", "both"):
        matching = critique[critique.error == error_kind]
        print(len(matching) / len(critique) * 100)

In [None]:
# Print the first chatgpt critique row whose "error" value is "math": the
# question with its options, key and explanation, then the guess and its
# reasoning, then the error label and details, with a blank line between groups.
row = chatgpt_critique[chatgpt_critique.error == "math"].iloc[0]
field_groups = (
    ("question", "A", "B", "C", "answer", "explanation"),
    ("guess", "thinking"),
    ("error", "error_details"),
)
for group_no, fields in enumerate(field_groups):
    if group_no:
        print()
    for field in fields:
        print(row[field])