In [38]:
import pandas as pd
data_path = 'data/finetune_data.csv'
df = pd.read_csv(data_path, header=0)

In [56]:
from datasets import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("Qwen/CodeQwen1.5-7B-Chat", trust_remote_code=True)

sys_prompt = """You are a math expert. Help users find the misconception in the wrong math answer."""

question_prompt = """Given a math question, its correct answer and wrong answer, \
tell me what kind of misconception might the student who answers the wrong answer have. \
\nHere is an example: Question Text: Which type of graph is represented by the equation \\( y=\\frac{{1}}{{x}} \\)?\n\
Correct Answer Text: A reciprocal graph\n\
Wrong Answer Text: A quadratic graph\nThe misconception for the wrong answer is: Confuses reciprocal and quadratic graphs.\n\
\nNow tell me what misconception does the following wrong answer imply:\n\n"""

prompts = []
for question, answer in zip(df['LLM_Response'], df['MisconceptionName']):
    prompt = [
        {
            "role": "system",
            "content": sys_prompt
        },
        {
            "role": "user",
            "content": question_prompt + question
        },
        {
            "role": "assistant",
            "content": answer
        }

    ]
    prompts.append(prompt)

dataset = Dataset.from_dict({"chat": prompts})
dataset = dataset.map(lambda x: {"formatted_chat": tokenizer.apply_chat_template(x["chat"], tokenize=False, add_generation_prompt=False)})



Map: 100%|██████████| 6957/6957 [00:01<00:00, 5257.27 examples/s]


In [58]:
print(dataset['formatted_chat'][0])


<|im_start|>system
You are a math expert. Help users find the misconception in the wrong math answer.<|im_end|>
<|im_start|>user
Given a math question, its correct answer and wrong answer, tell me what kind of misconception might the student who answers the wrong answer have. 
Here is an example: Question Text: Which type of graph is represented by the equation \( y=\frac{{1}}{{x}} \)?
Correct Answer Text: A reciprocal graph
Wrong Answer Text: A quadratic graph
The misconception for the wrong answer is: Confuses reciprocal and quadratic graphs.

Now tell me what misconception does the following wrong answer imply:

Question Text: In a triangle, the two base angles are 60 degrees and 70 degrees. What is the measure of the third angle?
Correct Answer Text: 50 degrees
Wrong Answer Text: 130 degrees<|im_end|>
<|im_start|>assistant
Does not know that angles in a triangle sum to 180 degrees<|im_end|>



### Combine synthetic dataset with existing dataset

In [None]:
misconceptions = pd.read_csv('data/misconception_mapping.csv', header=0)
train = pd.read_csv('data/train.csv', header=0)

# Initialize an empty list to collect rows for the new DataFrame
rows = []

# Iterate through each row in the `train` DataFrame
for _, row in train.iterrows():
    # Iterate through the possible misconceptions A, B, C, D
    for option in ['A', 'B', 'C', 'D']:
        misconception_id_col = f'Misconception{option}Id'
        answer_text_col = f'Answer{option}Text'
        
        misconception_id = row[misconception_id_col]
        # Check if the misconception ID is valid
        if pd.notna(misconception_id) and int(misconception_id) <= len(misconceptions):
            misconception_name = misconceptions.loc[
                misconceptions['MisconceptionId'] == int(misconception_id), 'MisconceptionName'
            ].values[0]
            
            correct_answer_text = row[f'Answer{row.CorrectAnswer}Text']
            wrong_answer_text = row[answer_text_col]
            question_text = row['QuestionText']
            
            llm_response = (
                f"Question Text: {question_text} \n"
                f"Correct Answer Text: {correct_answer_text} \n"
                f"Wrong Answer Text: {wrong_answer_text}"
            )
            
            # Add the row to the list
            rows.append({
                'MisconceptionName': misconception_name,
                'LLM_Response': llm_response,
            })

# Create the new DataFrame
result_df = pd.DataFrame(rows)
combined_df = pd.concat([df, result_df], ignore_index=True, axis=0)
combined_df.to_csv('data/finetune_data.csv', index=False)


In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Math-7B")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-Math-7B")

In [None]:
# hyperparameterrs
num_epochs = 5
learning_rate = 3e-4
weight_decay = 0.1

In [49]:
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [51]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    # Can select any from the below:
    # "unsloth/Qwen2.5-0.5B", "unsloth/Qwen2.5-1.5B", "unsloth/Qwen2.5-3B"
    # "unsloth/Qwen2.5-14B",  "unsloth/Qwen2.5-32B",  "unsloth/Qwen2.5-72B",
    # And also all Instruct versions and Math. Coding verisons!
    model_name = "unsloth/Qwen2.5-7B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

AssertionError: Torch not compiled with CUDA enabled