In [1]:
import gc
import json
import time

import numpy as np
import pandas as pd
import torch
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    pipeline,
)
from transformers.tokenization_utils_fast import PreTrainedTokenizerFast

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device:", device)

device: cuda


In [3]:
# utils
def clear():
    for _ in range(5):
        torch.cuda.empty_cache()
        gc.collect()
        time.sleep(0.5)


def apk(actual, predicted, k=25):
    if not actual:
        return 0.0
    if len(predicted) > k:
        predicted = predicted[:k]
    score, hits = 0.0, 0.0
    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            hits += 1.0
            score += hits / (i + 1.0)
    return score / min(len(actual), k)


def mapk(actual, predicted, k=25):
    return np.mean([apk(a, p, k) for a, p in zip(actual, predicted)])

In [4]:
df_train = train_df = pd.read_csv(
    "data/train.csv",
    dtype={
        "MisconceptionAId": "Int64",
        "MisconceptionBId": "Int64",
        "MisconceptionCId": "Int64",
        "MisconceptionDId": "Int64",
    },
).fillna(-1)
df_test = pd.read_csv("data/test.csv")

In [49]:
len(df_train)

1869

In [5]:
df_train.head()

Unnamed: 0,QuestionId,ConstructId,ConstructName,SubjectId,SubjectName,CorrectAnswer,QuestionText,AnswerAText,AnswerBText,AnswerCText,AnswerDText,MisconceptionAId,MisconceptionBId,MisconceptionCId,MisconceptionDId
0,0,856,Use the order of operations to carry out calcu...,33,BIDMAS,A,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,\( 3 \times(2+4)-5 \),\( 3 \times 2+(4-5) \),\( 3 \times(2+4-5) \),Does not need brackets,-1,-1,-1,1672
1,1,1612,Simplify an algebraic fraction by factorising ...,1077,Simplifying Algebraic Fractions,D,"Simplify the following, if possible: \( \frac{...",\( m+1 \),\( m+2 \),\( m-1 \),Does not simplify,2142,143,2142,-1
2,2,2774,Calculate the range from a list of data,339,Range and Interquartile Range from a List of Data,B,Tom and Katie are discussing the \( 5 \) plant...,Only\nTom,Only\nKatie,Both Tom and Katie,Neither is correct,1287,-1,1287,1073
3,3,2377,Recall and use the intersecting diagonals prop...,88,Properties of Quadrilaterals,C,The angles highlighted on this rectangle with ...,acute,obtuse,\( 90^{\circ} \),Not enough information,1180,1180,-1,1180
4,4,3387,Substitute positive integer values into formul...,67,Substitution into Formula,A,The equation \( f=3 r^{2}+3 \) is used to find...,\( 30 \),\( 27 \),\( 51 \),\( 24 \),-1,-1,-1,1818


In [48]:
df_train["ConstructName"].head()

0    Use the order of operations to carry out calcu...
1    Simplify an algebraic fraction by factorising ...
2              Calculate the range from a list of data
3    Recall and use the intersecting diagonals prop...
4    Substitute positive integer values into formul...
Name: ConstructName, dtype: object

In [6]:
PROMPT = """
Question: {question}
Incorrect Answer: {incorrect_answer}
Correct Answer: {correct_answer}
Construct Name: {construct_name}
Subject Name: {subject_name}

Your task: Identify the misconception behind Incorrect Answer. Answer concisely and generically inside <response>$$INSERT TEXT HERE$$</response>. Before answering the question think step by step concisely in 1-2 sentence inside the <thinking>$$INSERT TEXT HERE$$</thinking> tag. Respond with your final misconception inside the <response>$$INSERT TEXT HERE$$</response> tag.
"""

In [7]:
tokenizer: PreTrainedTokenizerFast = AutoTokenizer.from_pretrained(
    "meta-llama/Llama-3.2-1B", chat_template=PROMPT
)

In [8]:
def apply_template(
    construct_name: str,
    subject_name: str,
    question: str,
    incorrect_answer: str,
    correct_answer: str,
    tokenizer: PreTrainedTokenizerFast,
) -> str:
    messages = [
        {
            "role": "user",
            "content": PROMPT.format(
                construct_name=construct_name,
                subject_name=subject_name,
                question=question,
                incorrect_answer=incorrect_answer,
                correct_answer=correct_answer,
            ),
        }
    ]
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return text

In [9]:
train_dataset = []
for i, row in df_train.iterrows():
    for target_option in ["A", "B", "C", "D"]:
        answer_col = f"Answer{target_option}Text"
        misconception_col = f"Misconception{target_option}Id"
        # ignore question-answer pairs that are correct and have no misconception
        if row["CorrectAnswer"] == target_option or row[misconception_col] == -1:
            continue
        question_answer_id = f"{row['QuestionId']}_{target_option}"
        train_dataset.append(
            (
                question_answer_id,
                apply_template(
                    construct_name=row["ConstructName"],
                    subject_name=row["SubjectName"],
                    question=row["QuestionText"],
                    incorrect_answer=row[misconception_col],
                    correct_answer=row[answer_col],
                    tokenizer=tokenizer,
                ),
            )
        )
df = pd.DataFrame(train_dataset, columns=["QuestionId_Answer", "Prompt"])

In [44]:
print(df.loc[0]["Prompt"])


Question: {question}
Incorrect Answer: {incorrect_answer}
Correct Answer: {correct_answer}
Construct Name: {construct_name}
Subject Name: {subject_name}

Your task: Identify the misconception behind Incorrect Answer. Answer concisely and generically inside <response>$$INSERT TEXT HERE$$</response>. Before answering the question think step by step concisely in 1-2 sentence inside the <thinking>$$INSERT TEXT HERE$$</thinking> tag. Respond with your final misconception inside the <response>$$INSERT TEXT HERE$$</response> tag.


In [11]:
miscon_df = pd.read_csv("data/misconception_mapping.csv", index_col="MisconceptionId")

In [12]:
miscon_df.describe()

Unnamed: 0,MisconceptionName
count,2587
unique,2587
top,Does not know that angles in a triangle sum to...
freq,1


In [13]:
miscon_df.head()

Unnamed: 0_level_0,MisconceptionName
MisconceptionId,Unnamed: 1_level_1
0,Does not know that angles in a triangle sum to...
1,Uses dividing fractions method for multiplying...
2,Believes there are 100 degrees in a full turn
3,Thinks a quadratic without a non variable term...
4,Believes addition of terms and powers of terms...


In [14]:
id2label = miscon_df.to_dict()["MisconceptionName"]
label2id = {v: k for k, v in id2label.items()}

In [15]:
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased",
    num_labels=len(miscon_df),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
training_args = TrainingArguments(
    output_dir="miscon_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [None]:
# trainer.train()

In [28]:
test_dataset = []
for i, row in df_test.iterrows():
    for target_option in ["A", "B", "C", "D"]:
        answer_col = f"Answer{target_option}Text"
        question_answer_id = f"{row['QuestionId']}_{target_option}"
        test_dataset.append(
            (
                question_answer_id,
                apply_template(
                    construct_name=row["ConstructName"],
                    subject_name=row["SubjectName"],
                    question=row["QuestionText"],
                    incorrect_answer="{misconception_col}",
                    correct_answer=row[answer_col],
                    tokenizer=tokenizer,
                ),
            )
        )
df = pd.DataFrame(test_dataset, columns=["QuestionId_Answer", "Prompt"])

In [36]:
df

Unnamed: 0,QuestionId_Answer,Prompt
0,1869_A,\nQuestion: {question}\nIncorrect Answer: {inc...
1,1869_B,\nQuestion: {question}\nIncorrect Answer: {inc...
2,1869_C,\nQuestion: {question}\nIncorrect Answer: {inc...
3,1869_D,\nQuestion: {question}\nIncorrect Answer: {inc...
4,1870_A,\nQuestion: {question}\nIncorrect Answer: {inc...
5,1870_B,\nQuestion: {question}\nIncorrect Answer: {inc...
6,1870_C,\nQuestion: {question}\nIncorrect Answer: {inc...
7,1870_D,\nQuestion: {question}\nIncorrect Answer: {inc...
8,1871_A,\nQuestion: {question}\nIncorrect Answer: {inc...
9,1871_B,\nQuestion: {question}\nIncorrect Answer: {inc...


In [18]:
clf = pipeline(
    "text-generation",
    model="meta-llama/Llama-3.2-1B",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Llama-3.2-1B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [37]:
clf("test")

[{'label': 'LABEL_1', 'score': 0.8921873569488525}]

In [None]:
import torch
from transformers import pipeline

model_id = "meta-llama/Llama-3.2-1B"

pipe = pipeline(
    "text-generation", model=model_id, torch_dtype=torch.bfloat16, device_map="auto"
)

pipe("The key to life is")

In [10]:
train_df = pd.read_csv(
    "data/train.csv",
    dtype={
        "MisconceptionAId": "Int64",
        "MisconceptionBId": "Int64",
        "MisconceptionCId": "Int64",
        "MisconceptionDId": "Int64",
    },
)

In [None]:
train_df

In [12]:
test_df = pd.read_csv("data/test.csv")

In [None]:
test_df

In [14]:
class_mapping_df = pd.read_csv("data/misconception_mapping.csv")

In [None]:
class_mapping_df

In [16]:
submission_df = pd.read_csv("data/sample_submission.csv")

In [None]:
submission_df