In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

# One checkpoint id feeds both loaders, so the tokenizer and the model weights
# are always taken from the same Hugging Face repository.
MODEL_ID = "Qwen/Qwen2.5-Math-7B"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(MODEL_ID)

In [38]:
import pandas as pd
# Path (relative to the notebook's working directory) of the fine-tuning CSV.
data_path = 'data/finetune_data.csv'

# Read the CSV into a DataFrame; header=0 takes the first line as column names.
df = pd.read_csv(filepath_or_buffer=data_path, header=0)

In [None]:
# One-shot instruction template. `{question}` is filled with a block of
# question / correct answer / wrong answer text; the doubled braces around the
# LaTeX fraction are str.format escapes that render as literal `{` and `}`.
prompt = """You are a math expert. Given a math question, its correct answer and wrong answer, \
tell me what kind of misconception might the student who answers the wrong answer have. \
Here is an example: Question Text: Which type of graph is represented by the equation \\( y=\\frac{{1}}{{x}} \\)?\n\
Correct Answer Text: A reciprocal graph\n\
Wrong Answer Text: A quadratic graph\n\
The misconception for the wrong answer is: Confuses reciprocal and quadratic graphs.\n\
Now tell me what misconception does the following wrong answer imply: {question}"""

# Build one prompt per row of `df['LLM_Response']`. The earlier loop stopped
# after the first row (leftover debug `break`), which left `prompts` with a
# single element even though it is meant to cover the whole dataset.
prompts = [prompt.format(question=question) for question in df['LLM_Response']]

# Preview only the first prompt so the cell output stays short; skip the
# preview when the dataset is empty.
if prompts:
    print(prompts[0])


You are a math expert. Given a math question, its correct answer and wrong answer,     tell me what kind of misconception might the student who answers the wrong answer have.         Here is an example: Question Text: Which type of graph is represented by the equation \( y=\frac{1}{x} \)?
             Correct Answer Text: A reciprocal graph                Wrong Answer Text: A quadratic graph                    The misconception for the wrong answer is Confuses reciprocal and quadratic graphs                        Now tell me what misconception does the following wrong answer imply:                             Question Text: In a triangle, the two base angles are 60 degrees and 70 degrees. What is the measure of the third angle?
Correct Answer Text: 50 degrees
Wrong Answer Text: 130 degrees
        


### Combine synthetic dataset with existing dataset

In [None]:
# Source tables: the MisconceptionId -> MisconceptionName lookup, and the
# multiple-choice questions whose wrong options carry misconception ids.
misconceptions = pd.read_csv('data/misconception_mapping.csv', header=0)
train = pd.read_csv('data/train.csv', header=0)

# Build the id -> name lookup once instead of scanning `misconceptions` for
# every answer option. Membership in this dict is also the validity check:
# comparing the id against len(misconceptions) accepted ids that are missing
# from the mapping (e.g. id == len(misconceptions)), and the following
# `.values[0]` then raised IndexError. keep='first' mirrors the old
# `.values[0]` choice should an id ever appear twice.
misconception_names = (
    misconceptions
    .drop_duplicates(subset='MisconceptionId', keep='first')
    .set_index('MisconceptionId')['MisconceptionName']
    .to_dict()
)

# Collect one record per (question, wrong option) pair that has a known
# misconception; the DataFrame is built from the list in a single step below.
rows = []

for _, row in train.iterrows():
    for option in ['A', 'B', 'C', 'D']:
        # The correct option can never be a "wrong answer" example, even if a
        # misconception id was attached to it by mistake.
        if option == row['CorrectAnswer']:
            continue

        misconception_id = row[f'Misconception{option}Id']
        # Options without a labelled misconception are stored as NaN.
        if pd.isna(misconception_id):
            continue

        misconception_name = misconception_names.get(int(misconception_id))
        if misconception_name is None:
            # Id is not present in the mapping table: skip instead of crashing.
            continue

        correct_answer_text = row[f'Answer{row.CorrectAnswer}Text']
        wrong_answer_text = row[f'Answer{option}Text']
        question_text = row['QuestionText']

        # Same three-line layout the prompt template expects for `{question}`.
        llm_response = (
            f"Question Text: {question_text} \n"
            f"Correct Answer Text: {correct_answer_text} \n"
            f"Wrong Answer Text: {wrong_answer_text}"
        )

        rows.append({
            'MisconceptionName': misconception_name,
            'LLM_Response': llm_response,
        })

# Create the new DataFrame from the collected records.
result_df = pd.DataFrame(rows)

# `df` was loaded from 'data/finetune_data.csv' and the result is written back
# to that same path, so without de-duplication every re-run of this cell would
# append the train-derived rows again. Dropping exact duplicate rows keeps the
# cell idempotent. NOTE(review): this also collapses exact duplicates that
# already existed in `df` -- confirm that is acceptable for the synthetic data.
combined_df = (
    pd.concat([df, result_df], ignore_index=True, axis=0)
    .drop_duplicates()
    .reset_index(drop=True)
)
combined_df.to_csv('data/finetune_data.csv', index=False)
