In [54]:
from openai import OpenAI
import pandas as pd
import IPython.display as ipd
import random
import numpy as np
import os

from eedi.datasets import make_nice_df, make_complete_query
from pydantic import BaseModel
from dotenv import load_dotenv

load_dotenv()

True

In [26]:
# Raw input tables, addressed relative to the notebook's working directory.
TRAIN_CSV = "../data/train.csv"
MISCONCEPTION_CSV = "../data/misconception_mapping.csv"

# Questions first, then the misconception id -> name mapping.
df_train, df_mis = pd.read_csv(TRAIN_CSV), pd.read_csv(MISCONCEPTION_CSV)

# Optional preprocessing from `eedi.datasets`, currently disabled:
# df_train = make_nice_df(df_train)
# df_train["QuestionComplete"] = df_train.apply(make_complete_query, axis=1)

In [27]:
def merge_with_misconceptions(
    df_train: pd.DataFrame, df_mis: pd.DataFrame
) -> pd.DataFrame:
    """Attach the misconception name of every answer option (A-D) to each question.

    For each letter in "ABCD" the column ``Misconception{letter}Id`` of
    ``df_train`` is translated into a new column ``Misconception{letter}Name``.

    Args:
        df_train: Question table containing the four ``Misconception{A-D}Id``
            columns. Missing ids (NaN / None / pd.NA) mark options without a
            labelled misconception.
        df_mis: Mapping table with a ``MisconceptionName`` column. When it also
            has a ``MisconceptionId`` column, names are looked up by that id;
            otherwise the frame's index is used as the id.

    Returns:
        A copy of ``df_train`` with the four name columns added. The name
        columns have object dtype and hold ``None`` where the id is missing.
        ``df_train`` itself is not modified.

    Raises:
        KeyError: If an id in ``df_train`` has no entry in ``df_mis``.
    """
    # Key the lookup by the real id column rather than by row position, so a
    # filtered, shuffled or re-indexed mapping table still resolves correctly.
    if "MisconceptionId" in df_mis.columns:
        mis_dict = df_mis.set_index("MisconceptionId")["MisconceptionName"].to_dict()
    else:
        mis_dict = df_mis["MisconceptionName"].to_dict()

    def _name_for(misconception_id):
        # pd.isna also covers None and pd.NA, which np.isnan rejects.
        if pd.isna(misconception_id):
            return None
        return mis_dict[int(misconception_id)]

    df_merged = df_train.copy()
    for letter in "ABCD":
        ids = df_merged[f"Misconception{letter}Id"]
        # Explicit object dtype keeps missing values as None (falsy) instead of
        # letting dtype inference turn them into NaN (truthy).
        df_merged[f"Misconception{letter}Name"] = pd.Series(
            [_name_for(misconception_id) for misconception_id in ids],
            index=ids.index,
            dtype=object,
        )
    return df_merged

In [39]:
# Attach misconception names, then keep only the fully labelled questions:
# exactly three options carry a misconception id and the remaining option has
# none (NaN).
df_merged = merge_with_misconceptions(df_train, df_mis)

misconception_id_columns = [
    "MisconceptionAId",
    "MisconceptionBId",
    "MisconceptionCId",
    "MisconceptionDId",
]
labelled_options_per_question = df_merged[misconception_id_columns].notna().sum(axis=1)
criteria = labelled_options_per_question == 3

df_merged = df_merged.loc[criteria].reset_index(drop=True)
df_merged.head(3)

Unnamed: 0,QuestionId,ConstructId,ConstructName,SubjectId,SubjectName,CorrectAnswer,QuestionText,AnswerAText,AnswerBText,AnswerCText,AnswerDText,MisconceptionAId,MisconceptionBId,MisconceptionCId,MisconceptionDId,MisconceptionAName,MisconceptionBName,MisconceptionCName,MisconceptionDName
0,1,1612,Simplify an algebraic fraction by factorising ...,1077,Simplifying Algebraic Fractions,D,"Simplify the following, if possible: \( \frac{...",\( m+1 \),\( m+2 \),\( m-1 \),Does not simplify,2142.0,143.0,2142.0,,Does not know that to factorise a quadratic ex...,Thinks that when you cancel identical terms fr...,Does not know that to factorise a quadratic ex...,
1,2,2774,Calculate the range from a list of data,339,Range and Interquartile Range from a List of Data,B,Tom and Katie are discussing the \( 5 \) plant...,Only\nTom,Only\nKatie,Both Tom and Katie,Neither is correct,1287.0,,1287.0,1073.0,Believes if you changed all values by the same...,,Believes if you changed all values by the same...,Believes if you add the same value to all numb...
2,3,2377,Recall and use the intersecting diagonals prop...,88,Properties of Quadrilaterals,C,The angles highlighted on this rectangle with ...,acute,obtuse,\( 90^{\circ} \),Not enough information,1180.0,1180.0,,1180.0,Does not know the properties of a rectangle,Does not know the properties of a rectangle,,Does not know the properties of a rectangle


In [102]:
# Pick one fully labelled question at random and render it as a plain-text
# example (subject, construct, question, the four answers and the misconception
# behind each wrong answer).
i = random.randrange(0, len(df_merged))
row = df_merged.loc[i]

# Decide per option whether to show the misconception name or the
# "correct answer" placeholder. The check is explicit because NaN is truthy:
# `value or placeholder` would let a NaN through and print "nan" in the prompt.
misconception_names = {
    letter: row[f"Misconception{letter}Name"] for letter in "ABCD"
}
misconception_lines = {
    letter: (
        f"- (answer {letter} is correct, no misconception)"
        if pd.isna(name) or not name
        else name
    )
    for letter, name in misconception_names.items()
}

prompt = f"""
Subject: {row['SubjectName']}

Construct: {row['ConstructName']}

Question: {row['QuestionText']}

Answers:
A. {row['AnswerAText']}
B. {row['AnswerBText']}
C. {row['AnswerCText']}
D. {row['AnswerDText']}

Misconceptions:
A. {misconception_lines['A']}
B. {misconception_lines['B']}
C. {misconception_lines['C']}
D. {misconception_lines['D']}
"""

print(prompt)


Subject: Properties of Polygons

Construct: Given the name of a polygon, draw or identify a corresponding diagram

Question: Which of the following shapes is a hexagon?

Answers:
A. ![An irregular, five sided shape]()
B. ![A regular, five sided shape]()
C. ![A regular, eight sided shape]()
D. None of these

Misconceptions:
A. Confuses pentagon and hexagon
B. Confuses pentagon and hexagon
C. Confuses octagon and hexagon
D. - (answer D is correct, no misconception)



In [55]:
# Read the key from the environment; a missing OPENAI_API_KEY raises KeyError
# here rather than failing later on the first request.
openai_api_key = os.environ["OPENAI_API_KEY"]
client = OpenAI(api_key=openai_api_key)
client

<openai.OpenAI at 0x74e930d95590>

In [None]:
# The four answer option texts of one multiple-choice question, keyed by
# lower-case option letter.
class Answers(BaseModel):
    a: str  # text of answer option A
    b: str  # text of answer option B
    c: str  # text of answer option C
    d: str  # text of answer option D


# One entry per answer option: the misconception name for that option, or the
# "- (answer X is correct, no misconception)" placeholder for the correct one.
class Misconceptions(BaseModel):
    a: str  # misconception (or placeholder) for option A
    b: str  # misconception (or placeholder) for option B
    c: str  # misconception (or placeholder) for option C
    d: str  # misconception (or placeholder) for option D


# One complete question: subject, construct, question text, the four answers
# and the misconception attached to each answer.
class MathQuestion(BaseModel):
    subject: str  # subject name, e.g. the dataset's SubjectName
    construct_name: str  # can't use `construct` because it shadows pydantic internals
    question: str  # the question text shown to the student
    answers: Answers  # the four answer options
    misconceptions: Misconceptions  # per-option misconception or placeholder

# Top-level wrapper holding several questions; this is the model passed as
# `response_format` to the structured-output completion call.
class MathQuestionList(BaseModel):
    math_questions: list[MathQuestion]  # the generated questions

In [111]:
# Debugging peek: show what `iterrows()` yields for the first record.
# The loop uses its own variable names so it does not rebind the notebook-level
# `i` / `row` that the prompt cells read. Otherwise a top-to-bottom run would
# silently build the next prompt from the first row instead of the sampled one.
for _peek_index, _peek_row in df_merged.iterrows():
    print(_peek_row)
    print(type(_peek_row))
    break

QuestionId                                                            1
ConstructId                                                        1612
ConstructName         Simplify an algebraic fraction by factorising ...
SubjectId                                                          1077
SubjectName                             Simplifying Algebraic Fractions
CorrectAnswer                                                         D
QuestionText          Simplify the following, if possible: \( \frac{...
AnswerAText                                                   \( m+1 \)
AnswerBText                                                   \( m+2 \)
AnswerCText                                                   \( m-1 \)
AnswerDText                                           Does not simplify
MisconceptionAId                                                 2142.0
MisconceptionBId                                                  143.0
MisconceptionCId                                                

In [114]:
# Scratch check: a shallow copy of a list of lists yields a new outer list
# whose elements are the very same inner list objects.
lls = [[1, 2, 3], [4, 5, 6]]
d = list(lls)

In [103]:
# Render the sampled `row` twice: as a lower-case plain-text prompt whose field
# names mirror the MathQuestion schema, and as a validated MathQuestion object.

# Decide per option whether to use the misconception name or the "correct
# answer" placeholder. The check is explicit because NaN is truthy:
# `value or placeholder` would put "nan" into the text prompt and make the
# pydantic model reject the value.
misconception_names = {
    letter: row[f"Misconception{letter}Name"] for letter in "ABCD"
}
prompt_misconceptions = {
    letter.lower(): (
        f"- (answer {letter.lower()} is correct, no misconception)"
        if pd.isna(name) or not name
        else name
    )
    for letter, name in misconception_names.items()
}
strict_misconceptions = {
    letter.lower(): (
        f"- (answer {letter} is correct, no misconception)"
        if pd.isna(name) or not name
        else name
    )
    for letter, name in misconception_names.items()
}

prompt = f"""
subject: {row['SubjectName']}

construct_name: {row['ConstructName']}

question: {row['QuestionText']}

answers:
a. {row['AnswerAText']}
b. {row['AnswerBText']}
c. {row['AnswerCText']}
d. {row['AnswerDText']}

misconceptions:
a. {prompt_misconceptions['a']}
b. {prompt_misconceptions['b']}
c. {prompt_misconceptions['c']}
d. {prompt_misconceptions['d']}
"""
strict_prompt = MathQuestion(
    subject=row["SubjectName"],
    construct_name=row["ConstructName"],
    question=row["QuestionText"],
    answers=Answers(
        a=row["AnswerAText"],
        b=row["AnswerBText"],
        c=row["AnswerCText"],
        d=row["AnswerDText"],
    ),
    misconceptions=Misconceptions(**strict_misconceptions),
)
print(strict_prompt.model_dump_json(indent=4))


{
    "subject": "Properties of Polygons",
    "construct_name": "Given the name of a polygon, draw or identify a corresponding diagram",
    "question": "Which of the following shapes is a hexagon?",
    "answers": {
        "a": "![An irregular, five sided shape]()",
        "b": "![A regular, five sided shape]()",
        "c": "![A regular, eight sided shape]()",
        "d": "None of these"
    },
    "misconceptions": {
        "a": "Confuses pentagon and hexagon",
        "b": "Confuses pentagon and hexagon",
        "c": "Confuses octagon and hexagon",
        "d": "- (answer D is correct, no misconception)"
    }
}


In [93]:
# Show the plain-text example that is sent as the user message of the request.
print(prompt)


subject: Adding and Subtracting Negative Numbers

construct_name: Carry out addition problems involving one negative integer

question: ![Diagram of a thermometer showing positive and negative temperatures, with an arrow labelled +9 going from -3 up to 6]() Which of the following calculations is represented on the thermometer?

answers:
a. \( -3+9=6 \)
b. \( 3+6=9 \)
c. \( 6+9=-3 \)
d. \( -3+6=9 \)

misconceptions:
a. - (answer a is correct, no misconception)
b. Believes a negative number can be replaced with its positive equivalent when answering an addition problem from a numberline
c. Believes we move down a number line when we add
d. Believes the number of jumps between two numbers on a numberline represents the sum of those 2 numbers 



In [106]:
# Ask the model for new questions modelled on the example in `prompt`; the
# reply is parsed directly into a MathQuestionList.
system_message = {
    "role": "system",
    "content": "You are a mathematics teacher tasked to create questions to assess the student's understanding of math concepts. You will be presented with one example: the math question, 1 correct and 3 distraction answers, along with the math misconceptions that led students choosing the distractors instead. Your task is to create similar, but diverse set of 10 new questions. Feel free to choose your own math subject and construct_name, but still relate to the given example. Remember, each set must contain exactly one '- (answer X is correct, no misconception)'. Return the answer in json.",
}
user_message = {"role": "user", "content": prompt}

completion = client.beta.chat.completions.parse(
    model="gpt-4o",  # previously tried snapshot: "gpt-4o-2024-08-06"
    messages=[system_message, user_message],
    response_format=MathQuestionList,
)

In [108]:
# Display the unfiltered training table (before the three-misconception filter).
df_train

Unnamed: 0,QuestionId,ConstructId,ConstructName,SubjectId,SubjectName,CorrectAnswer,QuestionText,AnswerAText,AnswerBText,AnswerCText,AnswerDText,MisconceptionAId,MisconceptionBId,MisconceptionCId,MisconceptionDId
0,0,856,Use the order of operations to carry out calcu...,33,BIDMAS,A,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,\( 3 \times(2+4)-5 \),\( 3 \times 2+(4-5) \),\( 3 \times(2+4-5) \),Does not need brackets,,,,1672.0
1,1,1612,Simplify an algebraic fraction by factorising ...,1077,Simplifying Algebraic Fractions,D,"Simplify the following, if possible: \( \frac{...",\( m+1 \),\( m+2 \),\( m-1 \),Does not simplify,2142.0,143.0,2142.0,
2,2,2774,Calculate the range from a list of data,339,Range and Interquartile Range from a List of Data,B,Tom and Katie are discussing the \( 5 \) plant...,Only\nTom,Only\nKatie,Both Tom and Katie,Neither is correct,1287.0,,1287.0,1073.0
3,3,2377,Recall and use the intersecting diagonals prop...,88,Properties of Quadrilaterals,C,The angles highlighted on this rectangle with ...,acute,obtuse,\( 90^{\circ} \),Not enough information,1180.0,1180.0,,1180.0
4,4,3387,Substitute positive integer values into formul...,67,Substitution into Formula,A,The equation \( f=3 r^{2}+3 \) is used to find...,\( 30 \),\( 27 \),\( 51 \),\( 24 \),,,,1818.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1864,1864,2774,Calculate the range from a list of data,339,Range and Interquartile Range from a List of Data,C,What is the range of the following numbers?\n\...,\( 5 \),\( 11 \),\( 23 \),\( 16 \),2456.0,691.0,,1349.0
1865,1865,2695,"Describe an enlargement, with no centre of enl...",90,Length Scale Factors in Similar Shapes,B,Shape \( Q \) is an enlargement of shape \( P ...,\( 3 \div 11 \),\( 11 \div 3 \),\( 3 \times 11 \),\( 11-3 \),1500.0,,2442.0,1258.0
1866,1866,854,Use the order of operations to carry out calcu...,33,BIDMAS,B,What does the following equal?\n\[\n8-7+10 \ti...,\( 36 \),\( 31 \),\( -29 \),\( 33 \),,,2306.0,1507.0
1867,1867,2634,Distinguish between congruency and similarity,274,Congruency in Other Shapes,B,Tom and Katie are discussing congruence and si...,Only\nTom,Only Katie,Both Tom and Katie,Neither is correct,2312.0,,2312.0,2312.0


In [107]:
# Pull the parsed MathQuestionList out of the first choice and print every
# generated question in the same layout as the example prompt.
results = completion.choices[0].message.parsed
assert results is not None  # no parsed payload means there is nothing to show
for result in results.math_questions:
    generated_answers = result.answers
    generated_misconceptions = result.misconceptions
    asd = f"""
subject: {result.subject}

construct_name: {result.construct_name}

question: {result.question}

answers:
a. {generated_answers.a}
b. {generated_answers.b}
c. {generated_answers.c}
d. {generated_answers.d}

misconceptions:
a. {generated_misconceptions.a}
b. {generated_misconceptions.b}
c. {generated_misconceptions.c}
d. {generated_misconceptions.d}
"""
    print(asd)
    print("🔥")  # visual separator between questions


subject: Properties of Polygons

construct_name: Determine the number of sides of a polygon

question: Which of the following polygons has 7 sides?

answers:
a. Hexagon
b. Heptagon
c. Octagon
d. Nonagon

misconceptions:
a. Confuses hexagon and heptagon
b. - (answer b is correct, no misconception)
c. Confuses heptagon and octagon
d. Confuses heptagon and nonagon

🔥

subject: Properties of Polygons

construct_name: Identify the number of diagonals in a polygon

question: How many diagonals does a pentagon have?

answers:
a. 5
b. 8
c. 10
d. 3

misconceptions:
a. Incorrect diagonal formula application
b. Incorrect calculation of diagonals
c. Confuses total lines with diagonals
d. - (answer d is correct, no misconception)

🔥

subject: Properties of Polygons

construct_name: Identify polygon based on angle measurements

question: Given an interior angle of 120 degrees, what polygon is it?

answers:
a. Pentagon
b. Hexagon
c. Octagon
d. Decagon

misconceptions:
a. Confuses sum of interior ang