In [22]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# --- Configuration -----------------------------------------------------------
# Directory that holds test.csv and misconception_mapping.csv.
DATA_PATH = 'dataset'
# Location passed to SentenceTransformer(...) when the model is loaded.
MODEL_OUTPUT_PATH = 'model/'
# Number of top-ranked misconceptions kept for each (question, answer) row.
RETRIEVE_NUM = 25

# --- Input tables ------------------------------------------------------------
# The two reads are independent of each other.
misconception_mapping = pd.read_csv(f"{DATA_PATH}/misconception_mapping.csv")
test = pd.read_csv(f"{DATA_PATH}/test.csv")

In [24]:
# Columns that identify a question; melt() repeats them on every answer row.
common_col = [
    "QuestionId",
    "ConstructName",
    "SubjectName",
    "QuestionText",
    "CorrectAnswer",
]
# The four answer-option columns, built once and reused for both the column
# selection and the melt below.
answer_cols = [f"Answer{alpha}Text" for alpha in ["A", "B", "C", "D"]]

# Wide -> long: one row per (question, answer option). AnswerType holds the
# source column name and AnswerText holds that option's text.
test_long = test[common_col + answer_cols].melt(
    id_vars=common_col,
    value_vars=answer_cols,
    var_name="AnswerType",
    value_name="AnswerText",
)

# Text that gets embedded downstream. Missing fields are replaced by "" first:
# in pandas, str + NaN yields NaN, so a single missing field would otherwise
# turn the whole AllText value into NaN. Rows without missing fields produce
# exactly the same string as a plain concatenation.
text_parts = test_long[["ConstructName", "SubjectName", "QuestionText", "AnswerText"]].fillna("")
test_long["AllText"] = (
    text_parts["ConstructName"]
    + " " + text_parts["SubjectName"]
    + " " + text_parts["QuestionText"]
    + " " + text_parts["AnswerText"]
)
# Option letter (A-D) parsed out of the source column name, e.g. "AnswerBText" -> "B".
test_long["AnswerAlphabet"] = test_long["AnswerType"].str.extract(r"Answer([A-D])Text$")
# Row key in the form "<QuestionId>_<letter>", e.g. "1869_A".
test_long["QuestionId_Answer"] = test_long["QuestionId"].astype(str) + "_" + test_long["AnswerAlphabet"]


In [25]:
# Preview only the first rows instead of dumping the whole melted frame.
test_long.head(10)

Unnamed: 0,QuestionId,ConstructName,SubjectName,QuestionText,CorrectAnswer,AnswerType,AnswerText,AllText,AnswerAlphabet,QuestionId_Answer
0,1869,Use the order of operations to carry out calcu...,BIDMAS,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,A,AnswerAText,\( 3 \times(2+4)-5 \),Use the order of operations to carry out calcu...,A,1869_A
1,1870,Simplify an algebraic fraction by factorising ...,Simplifying Algebraic Fractions,"Simplify the following, if possible: \( \frac{...",D,AnswerAText,\( m+1 \),Simplify an algebraic fraction by factorising ...,A,1870_A
2,1871,Calculate the range from a list of data,Range and Interquartile Range from a List of Data,Tom and Katie are discussing the \( 5 \) plant...,B,AnswerAText,Only\nTom,Calculate the range from a list of data Range ...,A,1871_A
3,1869,Use the order of operations to carry out calcu...,BIDMAS,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,A,AnswerBText,\( 3 \times 2+(4-5) \),Use the order of operations to carry out calcu...,B,1869_B
4,1870,Simplify an algebraic fraction by factorising ...,Simplifying Algebraic Fractions,"Simplify the following, if possible: \( \frac{...",D,AnswerBText,\( m+2 \),Simplify an algebraic fraction by factorising ...,B,1870_B
5,1871,Calculate the range from a list of data,Range and Interquartile Range from a List of Data,Tom and Katie are discussing the \( 5 \) plant...,B,AnswerBText,Only\nKatie,Calculate the range from a list of data Range ...,B,1871_B
6,1869,Use the order of operations to carry out calcu...,BIDMAS,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,A,AnswerCText,\( 3 \times(2+4-5) \),Use the order of operations to carry out calcu...,C,1869_C
7,1870,Simplify an algebraic fraction by factorising ...,Simplifying Algebraic Fractions,"Simplify the following, if possible: \( \frac{...",D,AnswerCText,\( m-1 \),Simplify an algebraic fraction by factorising ...,C,1870_C
8,1871,Calculate the range from a list of data,Range and Interquartile Range from a List of Data,Tom and Katie are discussing the \( 5 \) plant...,B,AnswerCText,Both Tom and Katie,Calculate the range from a list of data Range ...,C,1871_C
9,1869,Use the order of operations to carry out calcu...,BIDMAS,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,A,AnswerDText,Does not need brackets,Use the order of operations to carry out calcu...,D,1869_D


In [26]:
# Load the sentence-embedding model stored at MODEL_OUTPUT_PATH.
model = SentenceTransformer(MODEL_OUTPUT_PATH)

# Embed every (question, answer) text; normalize_embeddings=True asks the
# model for normalized vectors.
query_texts = test_long["AllText"].to_list()
test_long_vec = model.encode(query_texts, normalize_embeddings=True)

# Embed every misconception name with the same settings.
misconception_names = misconception_mapping["MisconceptionName"].to_list()
misconception_mapping_vec = model.encode(misconception_names, normalize_embeddings=True)

# Sanity check: report the shape of each embedding matrix, one per line.
for embedding_matrix in (test_long_vec, misconception_mapping_vec):
    print(embedding_matrix.shape)


(12, 1024)
(2587, 1024)


In [27]:
# Similarity matrix: one row per test text, one column per misconception.
test_cos_sim_arr = cosine_similarity(test_long_vec, misconception_mapping_vec)

# Sorting the negated scores ascending ranks columns from most to least
# similar; only the first RETRIEVE_NUM positions of each row are kept.
descending_order = np.argsort(-test_cos_sim_arr, axis=1)
test_sorted_indices = descending_order[:, :RETRIEVE_NUM]

In [None]:
# test_sorted_indices holds ROW POSITIONS into misconception_mapping, ranked
# best-first. A position is only a valid id when the mapping's ids happen to
# equal its row order, so translate through the mapping's own MisconceptionId
# column when it is present (that column is not visible in this notebook,
# hence the guard); otherwise fall back to the raw positions.
if "MisconceptionId" in misconception_mapping.columns:
    misconception_id_lookup = misconception_mapping["MisconceptionId"].values
    test_long["MisconceptionId"] = [
        " ".join(map(str, misconception_id_lookup[indices])) for indices in test_sorted_indices
    ]
else:
    test_long["MisconceptionId"] = [" ".join(map(str, indices)) for indices in test_sorted_indices]

# Human-readable companion column: the retrieved misconception names, one per
# line, in the same rank order. The name array is pulled out once rather than
# slicing the DataFrame again for every row.
misconception_name_lookup = misconception_mapping["MisconceptionName"].values
test_long["MisconceptionText"] = [
    "\n".join(misconception_name_lookup[indices]) for indices in test_sorted_indices
]

# Keep only the rows whose answer option is not the correct answer.
filtered_test_long = test_long[test_long["CorrectAnswer"] != test_long["AnswerAlphabet"]]

# Select relevant columns and sort by QuestionId_Answer
submission = filtered_test_long[["QuestionId_Answer", "MisconceptionId", "MisconceptionText"]].sort_values(by="QuestionId_Answer")

In [29]:
# Preview only the first rows instead of dumping the whole submission frame.
submission.head(10)

Unnamed: 0,QuestionId_Answer,MisconceptionId,MisconceptionText
3,1869_B,1507,Carries out operations from left to right rega...
6,1869_C,1507,Carries out operations from left to right rega...
9,1869_D,1507,Carries out operations from left to right rega...
1,1870_A,2549,Believes only the first term needs to be divid...
4,1870_B,2549,Believes only the first term needs to be divid...
7,1870_C,2549,Believes only the first term needs to be divid...
2,1871_A,1287,Believes if you changed all values by the same...
8,1871_C,1287,Believes if you changed all values by the same...
11,1871_D,1287,Believes if you changed all values by the same...
