In [1]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, InputExample, SentenceTransformerTrainer, SentenceTransformerTrainingArguments
from sentence_transformers.losses import MultipleNegativesRankingLoss
from sentence_transformers.training_args import BatchSamplers
from datasets import Dataset
from sklearn.metrics.pairwise import cosine_similarity

  from tqdm.autonotebook import tqdm, trange


In [2]:
DATA_PATH = 'dataset'
MODEL_NAME = 'BAAI/bge-large-en-v1.5'
RETRIEVE_NUM = 25

BATCH_SIZE = 128
EPOCH = 2
LR = 2e-05
GRAD_ACC_STEP = 128 // BATCH_SIZE

MODEL_OUTPUT_PATH = 'model/'

train = pd.read_csv(f"{DATA_PATH}/train.csv")
misconception_mapping = pd.read_csv(f"{DATA_PATH}/misconception_mapping.csv")

In [3]:
common_col = [
    "QuestionId",
    "ConstructName",
    "SubjectName",
    "QuestionText",
    "CorrectAnswer",
]

train_long = (
    train[common_col + [f"Answer{alpha}Text" for alpha in ["A", "B", "C", "D"]]]
    .melt(
        id_vars=common_col,
        value_vars=[f"Answer{alpha}Text" for alpha in ["A", "B", "C", "D"]],
        var_name="AnswerType",
        value_name="AnswerText"
    )
)
train_long["AllText"] = train_long["ConstructName"] + " " + train_long["SubjectName"] + " " + train_long["QuestionText"] + " " + train_long["AnswerText"]
train_long["AnswerAlphabet"] = train_long["AnswerType"].str.extract(r"Answer([A-D])Text$")
train_long["QuestionId_Answer"] = train_long["QuestionId"].astype(str) + "_" + train_long["AnswerAlphabet"]

# Convert the misconceptions data to long format
train_misconception_long = (
    train[common_col + [f"Misconception{alpha}Id" for alpha in ["A", "B", "C", "D"]]]
    .melt(
        id_vars=common_col,
        value_vars=[f"Misconception{alpha}Id" for alpha in ["A", "B", "C", "D"]],
        var_name="MisconceptionType",
        value_name="MisconceptionId"
    )
)
train_misconception_long["AnswerAlphabet"] = train_misconception_long["MisconceptionType"].str.extract(r"Misconception([A-D])Id$")
train_misconception_long["QuestionId_Answer"] = train_misconception_long["QuestionId"].astype(str) + "_" + train_misconception_long["AnswerAlphabet"]
train_misconception_long = train_misconception_long[["QuestionId_Answer", "MisconceptionId"]].dropna()

# Join the misconceptions with the long format train data
train_long = train_long.merge(train_misconception_long, on="QuestionId_Answer", how="inner")

In [4]:
model = SentenceTransformer(MODEL_NAME)

# Compute embeddings for training data and misconception names
train_long_vec = model.encode(train_long["AllText"].tolist(), normalize_embeddings=True)
misconception_mapping_vec = model.encode(misconception_mapping["MisconceptionName"].tolist(), normalize_embeddings=True)

# Calculate cosine similarity between each train sample and each misconception
train_cos_sim_arr = cosine_similarity(train_long_vec, misconception_mapping_vec)
train_sorted_indices = np.argsort(-train_cos_sim_arr, axis=1)

# Retrieve top misconceptions based on similarity scores
train_long["PredictMisconceptionId"] = train_sorted_indices[:, :RETRIEVE_NUM].tolist()

# Explode to create positive and negative pairs
train_long_exploded = train_long.explode("PredictMisconceptionId")
train_long_exploded["PredictMisconceptionId"] = train_long_exploded["PredictMisconceptionId"].astype(int)

# Merge to obtain names for positive and negative pairs
train_retrieved = train_long_exploded.merge(misconception_mapping, left_on="MisconceptionId", right_on="MisconceptionId", how="left")
train_retrieved = train_retrieved.merge(misconception_mapping, left_on="PredictMisconceptionId", right_on="MisconceptionId", suffixes=("", "_predict"))

In [5]:
input_examples = []
for _, row in train_retrieved.iterrows():
    label = 1.0 if row["MisconceptionId"] == row["PredictMisconceptionId"] else 0.0
    input_examples.append(InputExample(texts=[row["AllText"], row["MisconceptionName"], row["MisconceptionName_predict"]], label=label))

train_dataset = Dataset.from_dict({
    "AllText": [ex.texts[0] for ex in input_examples],
    "MisconceptionName": [ex.texts[1] for ex in input_examples],
    "MisconceptionName_predict": [ex.texts[2] for ex in input_examples]
})

In [6]:
train_dataset

Dataset({
    features: ['AllText', 'MisconceptionName', 'MisconceptionName_predict'],
    num_rows: 98300
})

In [7]:
model = SentenceTransformer(MODEL_NAME)

loss = MultipleNegativesRankingLoss(model)

args = SentenceTransformerTrainingArguments(
    output_dir=MODEL_OUTPUT_PATH,
    num_train_epochs=EPOCH,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACC_STEP,
    per_device_eval_batch_size=BATCH_SIZE,
    eval_accumulation_steps=GRAD_ACC_STEP,
    learning_rate=LR,
    weight_decay=0.01,
    warmup_ratio=0.1,
    fp16=True,
    bf16=False,
    batch_sampler=BatchSamplers.NO_DUPLICATES,
    lr_scheduler_type="cosine_with_restarts",
    save_strategy="steps",
    save_steps=0.1,
    save_total_limit=2,
    logging_steps=100,
    do_eval=False
)

trainer = SentenceTransformerTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    loss=loss
)

trainer.train()
model.save_pretrained(MODEL_OUTPUT_PATH)

Step,Training Loss
100,2.1412
200,0.8396
300,0.6936
400,0.5966
500,0.5401
600,0.5643
700,0.6191
800,0.5084
900,0.2953
1000,0.224


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]