In [1]:
# Prompt Engineering
# https://isamu-website.medium.com/understanding-the-current-state-of-reasoning-with-llms-dbd9fa3fc1a0
# https://www.promptingguide.ai/

# Techniques to try
# Chain of Thought
# Meta Prompting
# Tree of Thoughts
# Buffer of Thoughts
# Generated Knowledge Prompting
# Selective CoT
# Intermediate Step Evaluation

# If you are feeling more adventurous, you can also try knowledge-graph-based prompting.

# Tasks
# Data and Workflow Engineering
# EDA
# Tokenizing
# Data Cleaning
# Prediction
# Submission
# Model Review & Selection
# Feature Engineering
# CLIP
# Prompt Engineering
# Fine Tuning

In [None]:
"""
TASKS

basic tasks
- data ingestion
- data cleaning
- data wrapping
- tokenization
- prediction
- submission

advanced tasks
- fine tuning
- prompt engineering
"""

### Imports

In [None]:
import re

import pandas as pd
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    pipeline,
    BitsAndBytesConfig,
)

### Initialization

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("device:", device)

### Data Ingestion

In [3]:
# The four per-option misconception id columns are read as nullable Int64;
# afterwards every missing value in the training frame is replaced by -1.
df_train = pd.read_csv(
    "data/train.csv",
    dtype={f"Misconception{option}Id": "Int64" for option in "ABCD"},
).fillna(-1)
df_test = pd.read_csv("data/test.csv")

In [None]:
# First training row shown vertically (one field per line), then summary stats.
print(df_train.head(1).T)
print(df_train.describe())

In [None]:
# First test row shown vertically (one field per line), then summary stats.
print(df_test.head(1).T)
print(df_test.describe())

In [25]:
# Hugging Face model id used for both the tokenizer and the causal LM below.
# Alternative checkpoint kept for reference:
# model_id = "meta-llama/Llama-3.2-1B-Instruct"
model_id = "unsloth/llama-3-8b-bnb-4bit"

In [7]:
# User-message template; the {Question}, {IncorrectAnswer}, {CorrectAnswer},
# {ConstructName} and {SubjectName} placeholders are filled by `PROMPT.format(...)`
# in `apply_template`.
# Note that the template itself contains literal
# `<response>$$INSERT TEXT HERE$$</response>` tags, so any text that includes
# the prompt also matches a `<response>...</response>` pattern.
PROMPT = """Question: {Question}
Incorrect Answer: {IncorrectAnswer}
Correct Answer: {CorrectAnswer}
Construct Name: {ConstructName}
Subject Name: {SubjectName}

Your task: Identify the misconception behind Incorrect Answer. Answer concisely and generically inside <response>$$INSERT TEXT HERE$$</response>.
Before answering the question think step by step concisely in 1-2 sentence inside <thinking>$$INSERT TEXT HERE$$</thinking> tag and respond your final misconception inside <response>$$INSERT TEXT HERE$$</response> tag."""

In [8]:
def apply_template(row, tokenizer):
    """Render the chat-formatted misconception prompt for one answer row.

    Args:
        row: Mapping (e.g. a DataFrame row) providing "ConstructName",
            "SubjectName", "QuestionText", "CorrectAnswerText" and
            "AnswerText".
        tokenizer: Object exposing `apply_chat_template`.

    Returns:
        Whatever `tokenizer.apply_chat_template` returns for a single user
        message with `tokenize=False` and `add_generation_prompt=True`.
    """
    messages = [
        {
            "role": "user",
            "content": PROMPT.format(
                ConstructName=row["ConstructName"],
                SubjectName=row["SubjectName"],
                Question=row["QuestionText"],
                # "AnswerText" is the melted answer option under analysis (a
                # distractor once the correct option has been filtered out);
                # "CorrectAnswerText" is the text of the correct option.
                # These two were previously swapped.
                IncorrectAnswer=row["AnswerText"],
                CorrectAnswer=row["CorrectAnswerText"],
            ),
        }
    ]
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

In [2]:
def get_correct_answer(row):
    """Return the text of the option named by row["CorrectAnswer"].

    The option letter "A".."D" selects the matching "Answer<letter>Text"
    field; any other value yields None.
    """
    chosen = row["CorrectAnswer"]
    for letter in "ABCD":
        if chosen == letter:
            return row[f"Answer{letter}Text"]
    return None

In [None]:
# Add a column holding, per test row, the text of the option named by "CorrectAnswer".
df_test["CorrectAnswerText"] = df_test.apply(get_correct_answer, axis=1)

In [11]:
# Columns repeated on every melted row.
select_column = [
    "QuestionId",
    "ConstructName",
    "SubjectName",
    "CorrectAnswer",
    "QuestionText",
    "CorrectAnswerText",
]
# Wide -> long: one row per (question, answer option). "Option" receives the
# source column name and "AnswerText" the option's text.
df_answer = (
    df_test.melt(
        id_vars=select_column,
        value_vars=["AnswerAText", "AnswerBText", "AnswerCText", "AnswerDText"],
        var_name="Option",
        value_name="AnswerText",
    )
    .sort_values("QuestionId")
)

In [None]:
# Show the first melted row vertically to check the long-format columns.
print(df_answer.head(1).transpose())

In [13]:
def process_option(x):
    """Return the option letter (A-D) that follows "Answer" in `x`, or None if absent."""
    match = re.search(r"Answer([A-D])", x)
    return match.group(1) if match else None


# Reduce the melted column names ("AnswerAText", ...) to the bare option letter.
# NOTE: not idempotent - running this cell again on the bare letters yields None,
# because the "Answer" prefix that the pattern needs is gone.
df_answer["Option"] = df_answer["Option"].apply(process_option)

In [15]:
# Tokenizer for `model_id`; `apply_template` uses its chat template to render prompts.
tokenizer = AutoTokenizer.from_pretrained(model_id)
# NOTE(review): the disabled block below is a hand-written chat template using
# <|start_header_id|>/<|eot_id|> markers - presumably a fallback for checkpoints
# whose tokenizer ships without a chat template; confirm before deleting.
# tokenizer.chat_template = (
#     "{{bos_token}}"
#     "{% for message in messages %}"
#         "<|start_header_id|>{{message['role']}}<|end_header_id|>\n\n{{message['content']}}<|eot_id|>"
#     "{% endfor %}"
#     "{% if add_generation_prompt%}<|start_header_id|>assistant<|end_header_id|>\n\n"
#         "{% else %}{{eos_token}}"
#     "{% endif %}"
# )

In [16]:
# Keep only the distractors (rows whose option letter is not the correct one).
# The explicit .copy() makes the filtered frame independent of the original, so
# adding the "Prompt" column below does not trigger SettingWithCopyWarning.
df_answer = df_answer[df_answer["CorrectAnswer"] != df_answer["Option"]].copy()
df_answer["Prompt"] = df_answer.apply(
    lambda row: apply_template(row, tokenizer), axis=1
)
# Persist the prompts so generation can be resumed from disk.
df_answer.to_parquet("test.parquet", index=False)

In [17]:
# Reload the prompt table from test.parquet.
df = pd.read_parquet("test.parquet")

In [22]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

In [None]:
# 4-bit NF4 quantization with double quantization and a bfloat16 compute dtype.
nf4_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
# NOTE(review): torch_dtype is float16 while the bitsandbytes compute dtype
# above is bfloat16 - confirm the mismatch is intentional.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=nf4_config,
    device_map="auto",
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
)

# Alternative loading paths kept for reference:
# model = AutoModelForCausalLM.from_pretrained(model_id)
# pipeline = pipeline("text-generation", model=model_id, model_kwargs={"load_in_4bit": True}, device_map="auto")

In [20]:
import importlib

In [None]:
# `import importlib` alone does not guarantee that the `metadata` submodule is
# loaded; import it explicitly so this cell does not depend on another package
# having imported it first.
import importlib.metadata

importlib.metadata.version("bitsandbytes")

In [23]:
from tqdm import tqdm

In [None]:
# `pipeline` imported from transformers is a factory function, not a ready-made
# generator: calling it directly with a prompt would treat the prompt as a task
# name. Build the text-generation pipeline once from the loaded model and
# tokenizer. No `device` argument is passed because the model was loaded with
# device_map="auto".
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

# One pipeline call per prompt; each call's raw output is kept as returned.
responses = []
for prompt in tqdm(df["Prompt"].values):
    responses.append(generator(prompt, max_new_tokens=512))

In [None]:
# Inspect the raw generation output for the first prompt.
responses[0]

In [28]:
# Keep the "generated_text" of the first entry of each per-prompt result.
# NOTE: `responses` is rebound from raw results to plain strings here, so this
# cell cannot be run twice without re-running the generation cell.
responses = [x[0]["generated_text"] for x in responses]
df["FullResponse"] = responses

In [29]:
def extract_response(text):
    """Extract the model's answer(s) from `<response>...</response>` tags.

    Args:
        text: Generated text, possibly including the echoed prompt.

    Returns:
        The stripped contents of every non-empty `<response>` tag, joined with
        ",". Returns "" when no usable tag is found.
    """
    answers = []
    # DOTALL lets a response span several lines.
    for raw in re.findall(r"<response>(.*?)</response>", text, flags=re.DOTALL):
        answer = raw.strip()
        # The prompt template contains literal placeholder tags; when the
        # generated text echoes the prompt, those must not count as answers.
        if answer and answer != "$$INSERT TEXT HERE$$":
            answers.append(answer)
    return ",".join(answers)

In [30]:
# Derive the misconception from the stored full responses rather than by
# rebinding `responses` in place: re-running this cell then gives the same
# result instead of extracting from already-extracted text.
df["Misconception"] = df["FullResponse"].map(extract_response)
df.to_parquet("output.parquet", index=False)

In [31]:
# Reload the results from output.parquet.
df = pd.read_parquet("output.parquet")

In [None]:
# Print the stored full response for the row labelled 0.
print(df["FullResponse"][0])

In [None]:
# Display the first output row vertically (one field per line).
df.head(1).transpose()