In [1]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Imports

In [2]:
import re
from pathlib import Path

import pandas as pd
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)
from transformers.models.llama.modeling_llama import LlamaForCausalLM
from tqdm import tqdm

In [3]:
# Locations of the input CSV files, relative to the working directory.
_data_dir = Path("data")
train_csv = _data_dir / "train.csv"
test_csv = _data_dir / "test.csv"

In [4]:
# Model identifier handed to `AutoTokenizer.from_pretrained` and to `pipeline` below.
model_id = "meta-llama/Llama-3.2-3B-Instruct"

In [5]:
# Load the training questions. The four per-option misconception id columns
# are parsed as nullable integers, and every missing value in the frame is
# then replaced by -1.
_misconception_id_dtypes = {
    f"Misconception{option}Id": "Int64" for option in ("A", "B", "C", "D")
}
df_train = pd.read_csv(train_csv, dtype=_misconception_id_dtypes).fillna(-1)

# The test questions are read with pandas' default dtype inference.
df_test = pd.read_csv(test_csv)

In [6]:
PROMPT  = """Question: {Question}
Incorrect Answer: {IncorrectAnswer}
Correct Answer: {CorrectAnswer}
Construct Name: {ConstructName}
Subject Name: {SubjectName}

Your task: Identify the misconception behind Incorrect Answer. Answer concisely and generically inside <response>$$INSERT TEXT HERE$$</response>.
Before answering the question think step by step concisely in 1-2 sentence inside <thinking>$$INSERT TEXT HERE$$</thinking> tag and respond your final misconception inside <response>$$INSERT TEXT HERE$$</response> tag."""

In [7]:
def apply_template(row, tokenizer):
    """Render the chat-formatted prompt for one (question, answer option) row.

    Parameters
    ----------
    row : mapping
        Must provide "ConstructName", "SubjectName", "QuestionText",
        "AnswerText" (text of the option under analysis, i.e. the incorrect
        answer once correct options have been filtered out) and
        "CorrectAnswerText" (text of the correct option).
    tokenizer
        Object exposing ``apply_chat_template(messages, tokenize=...,
        add_generation_prompt=...)``.

    Returns
    -------
    Whatever ``tokenizer.apply_chat_template`` returns when called with
    ``tokenize=False`` and ``add_generation_prompt=True`` on a single user
    message built from ``PROMPT``.
    """
    # "AnswerText" is the melted per-option text (the distractor) and
    # "CorrectAnswerText" is the right answer; they must feed the matching
    # placeholders, otherwise the model is asked to explain the wrong option.
    content = PROMPT.format(
        ConstructName=row["ConstructName"],
        SubjectName=row["SubjectName"],
        Question=row["QuestionText"],
        IncorrectAnswer=row["AnswerText"],
        CorrectAnswer=row["CorrectAnswerText"],
    )
    messages = [{"role": "user", "content": content}]
    return tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )

In [8]:
def get_correct_answer(row):
    """Look up the text of the option named by the row's "CorrectAnswer" field.

    The letters "A" to "D" select "AnswerAText" to "AnswerDText" respectively;
    any other value yields None.
    """
    letter = row["CorrectAnswer"]
    for candidate in ("A", "B", "C", "D"):
        if letter == candidate:
            return row[f"Answer{candidate}Text"]
    return None

In [9]:
# Attach the text of the correct option to every test question.
df_test["CorrectAnswerText"] = df_test.apply(get_correct_answer, axis=1)

# Columns that stay attached to each question while the four answer-text
# columns are unpivoted into one row per (question, option).
select_column = [
    "QuestionId",
    "ConstructName",
    "SubjectName",
    "CorrectAnswer",
    "QuestionText",
    "CorrectAnswerText",
]
_answer_text_columns = ["AnswerAText", "AnswerBText", "AnswerCText", "AnswerDText"]
_answers_long = df_test.melt(
    id_vars=select_column,
    value_vars=_answer_text_columns,
    var_name="Option",
    value_name="AnswerText",
)
df_answer = _answers_long.sort_values("QuestionId")

In [10]:
def process_option(x):
    """Pull the option letter out of a melted answer-column name.

    Returns the first capital letter A-D that directly follows "Answer" in
    `x` (for example "AnswerBText" gives "B"), or "" when nothing matches.
    """
    match = re.search(r"Answer([A-D])", x)
    if match is None:
        return ""
    return match.group(1)

# Replace the melted column names held in "Option" with what
# `process_option` extracts from them.
_option_letters = df_answer["Option"].map(process_option)
df_answer["Option"] = _option_letters

In [11]:
# Tokenizer for `model_id`; `apply_template` calls its `apply_chat_template` to render prompts.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [12]:
# Keep only the rows whose option letter differs from the correct answer,
# i.e. the distractors. `.copy()` makes the filtered frame independent of the
# frame it was sliced from, so the column assignment below does not write
# into a slice (which triggers pandas' SettingWithCopyWarning).
df_answer = df_answer[df_answer["CorrectAnswer"] != df_answer["Option"]].copy()

# Render one chat-formatted prompt per remaining (question, option) row.
df_answer["Prompt"] = df_answer.apply(
    lambda row: apply_template(row, tokenizer), axis=1
)

# Persist the prompts, then reload them into `df`, the frame the generation
# cells work on.
df_answer.to_parquet("test.parquet", index=False)

df = pd.read_parquet("test.parquet")

In [13]:
# Build the text-generation pipeline for `model_id` on the CUDA device.
# NOTE(review): `model_kwargs` is documented as being forwarded to the model's
# `from_pretrained` call, not to `generate`, so the `temperature`/`top_p`
# values given here presumably do not control sampling — confirm, and pass
# them when calling the pipeline if they are meant to.
generator = pipeline(
    "text-generation",
    model=model_id,
    model_kwargs={"temperature": 0.7, "top_p": 0.9},
    device="cuda",
)



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
# df = df.head(1)

In [15]:
# Generate one completion per prompt, sequentially.
# The sampling settings are passed at call time: keyword arguments given to
# the pipeline call are forwarded to `generate`, whereas values placed in the
# constructor's `model_kwargs` are only forwarded to `from_pretrained`.
responses = []
for v in tqdm(df["Prompt"].values):
    out = generator(
        v,
        max_new_tokens=4096,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
    # The pipeline returns a list of result dicts; keep the text of the first.
    responses.append(out[0]["generated_text"])

df["FullResponse"] = responses

  0%|          | 0/9 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 11%|█         | 1/9 [00:42<05:42, 42.80s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 22%|██▏       | 2/9 [01:24<04:54, 42.03s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 33%|███▎      | 3/9 [02:05<04:08, 41.44s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 44%|████▍     | 4/9 [03:46<05:25, 65.10s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 56%|█████▌    | 5/9 [04:35<03:56, 59.15s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 67%|██████▋   | 6/9 [06:29<03:53, 77.97s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 78%|███████▊  | 7/9 [13:21<06:14, 187.32s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
 89%|████████▉ | 8/9 [15:07<02:41, 161.16s/it]Setting `pad_token_id` to `eos_to

In [16]:
# df

In [17]:
def remove_prompt(record):
    """Return the generated text with the leading prompt removed.

    Parameters
    ----------
    record : mapping
        Must provide "Prompt" (the text sent to the model) and "FullResponse"
        (the generated text).

    Returns
    -------
    str
        "FullResponse" without its leading "Prompt" when it starts with the
        prompt; otherwise "FullResponse" unchanged.
    """
    prompt = record["Prompt"]
    full_response = record["FullResponse"]
    # Only strip when the prompt really is a prefix. Slicing blindly by the
    # prompt's length would cut off the start of the completion when the
    # prompt is absent, e.g. when this step is re-run on already-stripped text.
    if full_response.startswith(prompt):
        return full_response[len(prompt):]
    return full_response

# Overwrite "FullResponse" with what `remove_prompt` returns for each row.
df["FullResponse"] = df.apply(remove_prompt, axis=1)

In [18]:
# df

In [19]:
def extract_response(text):
    """Collect the text enclosed in <response>...</response> tag pairs.

    Parameters
    ----------
    text : str
        Raw model output; it may contain zero, one or several tag pairs, and
        a pair may span multiple lines.

    Returns
    -------
    str
        The whitespace-trimmed body of every tag pair, in order of
        appearance, joined by single spaces. Empty bodies are skipped.
        Returns "" when `text` contains no complete tag pair.
    """
    # Capture only the body; DOTALL lets a response span several lines.
    bodies = re.findall(r"<response>(.*?)</response>", text, flags=re.DOTALL)
    # Drop any stray opening tag left inside a body, then trim each body on
    # its own so whitespace just inside the tags does not leak into the result.
    cleaned = [body.replace("<response>", "").strip() for body in bodies]
    return " ".join(part for part in cleaned if part)

# Run `extract_response` over every stored generation and keep the result in
# the "Misconception" column.
responses = df["FullResponse"].map(extract_response).tolist()
df["Misconception"] = responses

# Write the frame out; the retrieval stage reads this file back.
df.to_parquet("output.parquet", index=False)

In [20]:
## get semantically similar misconceptions

In [21]:
# df = pd.read_parquet("output.parquet")

In [22]:
# df.head(1).transpose()

In [23]:
# !pip install -U sentence-transformers

In [24]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cdist
import numpy as np

# Reload the frame saved to "output.parquet".
df = pd.read_parquet("output.parquet")
# Table of candidate misconceptions; its "MisconceptionName" column is embedded below.
df_misconception_mapping = pd.read_csv("data/misconception_mapping.csv")

# model = SentenceTransformer('/kaggle/input/bge-large-en-v1-5')
# Sentence-embedding model used to encode both the "Misconception" texts and
# the misconception names.
model = SentenceTransformer('BAAI/bge-small-en-v1.5')
# model = SentenceTransformer('/kaggle/input/bge-small-en-v1.5/transformers/bge/2')
# PREFIX = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
# input_features = df["Misconception"].str.lstrip(PREFIX).str.split("\n\nYour task:").str[0]
# input_features = df["Misconception"]



# embedding_query = model.encode(input_features+ "\n----\n" + df["fullLLMText"]) # , convert_to_tensor=True
# embedding_Misconception = model.encode(df_misconception_mapping.MisconceptionName.values)

# # compute cosine and euclid distance
# # Compute similarities
# cosine_similarities = cosine_similarity(embedding_query, embedding_Misconception)
# # Euclidean distance
# euclidean_distances = cdist(embedding_query, embedding_Misconception, metric='euclidean')
# euclidean_similarities = 1 / (1 + euclidean_distances)  # Convert distance to similarity
# # Combination of cosine and euclidean
# combined_similarities = (cosine_similarities + euclidean_similarities) / 2
# # Use the combined_similarities for sorting
# test_sorted_indices = np.argsort(-combined_similarities, axis=1)

# # top25ids = util.semantic_search(embedding_query, embedding_Misconception, top_k=25)
# df["MisconceptionId"] = test_sorted_indices[:, :25].tolist()
# df["MisconceptionId"] = df["MisconceptionId"].apply(lambda x: ' '.join(map(str, x)))
# df[["QuestionId_Answer", "MisconceptionId"]].to_csv("submission.csv", index=False)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [25]:
# Embed the "Misconception" text of every row.
# A plain list is passed rather than the Series: `encode` is documented to
# take a string or a list of strings, and integer indexing on a Series goes
# by label, which matches position only for a default RangeIndex.
embedding_query = model.encode(df["Misconception"].tolist())

In [26]:
# Embed every entry of the mapping table's "MisconceptionName" column.
_misconception_names = df_misconception_mapping["MisconceptionName"].values
embedding_miscon = model.encode(_misconception_names)

In [27]:
# Pairwise cosine similarity between the query embeddings and the
# misconception-name embeddings (queries along the first axis).
cosine_similarities = cosine_similarity(embedding_query, embedding_miscon)

In [28]:
# For each query, column positions ordered from most to least similar
# (the similarities are negated so that the ascending argsort ranks the
# highest similarity first).
rev_sorted_indices = np.argsort(-cosine_similarities, axis=1)

In [29]:
# rev_sorted_indices[:, :25]

In [30]:
# df.head(1).transpose()

In [31]:
# Keep the 25 best-ranked candidates per row, stored as a list.
# NOTE(review): these are row positions in `df_misconception_mapping`, not
# values read from an id column — presumably the two coincide; verify
# against the mapping file.
df["MisconceptionId"] = rev_sorted_indices[:, :25].tolist()

In [32]:
# Turn each list of candidates into one space-separated string.
df["MisconceptionId"] = df["MisconceptionId"].map(
    lambda ids: " ".join(str(i) for i in ids)
)

In [33]:
# Submission key: "<QuestionId>_<option letter>" for the option this row is
# about. Each row describes one incorrect option, whose letter is in "Option";
# building the key from "CorrectAnswer" would give every incorrect option of a
# question the same key.
df["QuestionId_Answer"] = df["QuestionId"].astype(str) + "_" + df["Option"]

In [34]:
# df.head(1).transpose()

In [35]:
# Write the two submission columns to "submission.csv" without the index.
df[["QuestionId_Answer", "MisconceptionId"]].to_csv("submission.csv", index=False)

In [36]:
# def apk(actual, predicted, k=25):
#     """
#     Computes the average precision at k.

#     This function computes the average precision at k between two lists of
#     items.

#     Parameters
#     ----------
#     actual : list
#              A list of elements that are to be predicted (order doesn't matter)
#     predicted : list
#                 A list of predicted elements (order does matter)
#     k : int, optional
#         The maximum number of predicted elements

#     Returns
#     -------
#     score : double
#             The average precision at k over the input lists
#     """

#     if not actual:
#         return 0.0

#     if len(predicted)>k:
#         predicted = predicted[:k]

#     score = 0.0
#     num_hits = 0.0

#     for i,p in enumerate(predicted):
#         # first condition checks whether it is valid prediction
#         # second condition checks if prediction is not repeated
#         if p in actual and p not in predicted[:i]:
#             num_hits += 1.0
#             score += num_hits / (i+1.0)

#     return score / min(len(actual), k)

# def mapk(actual, predicted, k=25):
#     """
#     Computes the mean average precision at k.

#     This function computes the mean average precision at k between two lists
#     of lists of items.

#     Parameters
#     ----------
#     actual : list
#              A list of lists of elements that are to be predicted
#              (order doesn't matter in the lists)
#     predicted : list
#                 A list of lists of predicted elements
#                 (order matters in the lists)
#     k : int, optional
#         The maximum number of predicted elements

#     Returns
#     -------
#     score : double
#             The mean average precision at k over the input lists
#     """

#     return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])