In [1]:
# imports
import re

import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, pipeline
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

  from tqdm.autonotebook import tqdm, trange


In [2]:
# constants
# --- Input data (relative paths) ---
eedi_miscon_csv = "data/misconception_mapping.csv"
eedi_test_csv = "data/test.csv"
eedi_train_csv = "data/train.csv"

# --- Model identifiers ---
# Handed to AutoTokenizer.from_pretrained when the prompts are tokenized.
llm_model_id = "meta-llama/Llama-3.2-1B-Instruct"
# Not referenced in the cells visible here; presumably the SentenceTransformer
# checkpoint -- confirm against the retrieval cells.
sbert_model_id = "BAAI/bge-small-en-v1.5"

# --- Output ---
submission_csv = "submission.csv"

In [12]:
# quick util
def dfpeek(title: str, df: pd.DataFrame) -> None:
    """Show the first row of ``df`` as a single column, framed by banner lines.

    Args:
        title: Label printed inside the opening and closing banner lines.
        df: Frame to preview; only its first row is displayed, transposed so
            that each original column becomes one output row.

    Returns:
        None. The preview is emitted via ``print`` and the notebook's
        ``display`` function.
    """
    print(">>>>>>>>>>", title, ">>>>>>>>>")
    first_row = df.iloc[:1]
    display(first_row.T)
    # The closing banner ends with a blank line to separate consecutive previews.
    print("<<<<<<<<<<", title, "<<<<<<<<<<", end="\n\n")

#### eedi_train_csv

In [3]:
# The four per-answer misconception id columns contain gaps, so they are read
# as pandas' nullable "Int64" dtype rather than being coerced to float.
misconception_id_cols = [f"Misconception{answer}Id" for answer in "ABCD"]

# -1 is the sentinel for "no misconception id". It is applied to the id columns
# only: a blanket fillna(-1) would also write the integer -1 into any other
# column (e.g. question or answer text) that happens to contain a missing value.
df = pd.read_csv(
    eedi_train_csv,
    dtype={col: "Int64" for col in misconception_id_cols},
).fillna({col: -1 for col in misconception_id_cols})

In [4]:
# Transpose the first row so every column shows up as its own output line.
df.iloc[:1].T

Unnamed: 0,0
QuestionId,0
ConstructId,856
ConstructName,Use the order of operations to carry out calcu...
SubjectId,33
SubjectName,BIDMAS
CorrectAnswer,A
QuestionText,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...
AnswerAText,\( 3 \times(2+4)-5 \)
AnswerBText,\( 3 \times 2+(4-5) \)
AnswerCText,\( 3 \times(2+4-5) \)


In [5]:
# Summary statistics for the numeric columns of the training frame.
df.describe()

Unnamed: 0,QuestionId,ConstructId,SubjectId,MisconceptionAId,MisconceptionBId,MisconceptionCId,MisconceptionDId
count,1869.0,1869.0,1869.0,1869.0,1869.0,1869.0,1869.0
mean,934.0,1613.261637,225.370787,794.288925,782.036383,742.287854,701.191011
std,539.678145,1060.591804,238.536233,863.565676,873.726977,849.724372,846.165922
min,0.0,4.0,33.0,-1.0,-1.0,-1.0,-1.0
25%,467.0,575.0,92.0,-1.0,-1.0,-1.0,-1.0
50%,934.0,1470.0,203.0,450.0,380.0,340.0,217.0
75%,1401.0,2637.0,238.0,1535.0,1535.0,1495.0,1383.0
max,1868.0,3526.0,1984.0,2585.0,2586.0,2585.0,2583.0


#### eedi_miscon_csv

In [8]:
# Load the misconception id -> name mapping.
# NOTE(review): this rebinds `df`, so the training frame loaded into `df`
# earlier is no longer reachable from this point on.
df = pd.read_csv(eedi_miscon_csv)

In [10]:
# Transpose the first row so every column shows up as its own output line.
df.iloc[:1].T

Unnamed: 0,0
MisconceptionId,0
MisconceptionName,Does not know that angles in a triangle sum to...


In [11]:
# Summary statistics for the numeric columns of the misconception mapping.
df.describe()

Unnamed: 0,MisconceptionId
count,2587.0
mean,1293.0
std,746.946897
min,0.0
25%,646.5
50%,1293.0
75%,1939.5
max,2586.0


#### df_x and df_y

In [14]:
# Load the cached intermediate frames; both carry a QuestionId_Answer column
# (see the previews below).
df_x = pd.read_parquet(".intm/df_x.parquet")
df_y = pd.read_parquet(".intm/df_y.parquet")

# Preview the first row of each frame, in the same order they were loaded.
for label, frame in (("df_x", df_x), ("df_y", df_y)):
    dfpeek(label, frame)

>>>>>>>>>> df_x >>>>>>>>>


Unnamed: 0,0
QuestionId_Answer,0_A
ConstructName,Use the order of operations to carry out calcu...
SubjectName,BIDMAS
QuestionText,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...
CorrectAnswer,A
CorrectAnswerText,\( 3 \times(2+4)-5 \)
AnswerText,\( 3 \times(2+4)-5 \)
IsCorrectAnswer,True


<<<<<<<<<< df_x <<<<<<<<<<

>>>>>>>>>> df_y >>>>>>>>>


Unnamed: 0,0
QuestionId_Answer,0_A
MisconceptionId,-1
MisconceptionName,


<<<<<<<<<< df_y <<<<<<<<<<



#### Prompts

In [35]:
# Load the saved training prompts from the project-local .intm directory.
# A relative path is used (as for df_x / df_y) so the notebook is not tied to
# one machine's drive layout; it assumes the notebook runs from the project
# root that contains .intm. The UUID directory presumably identifies one
# prompt-generation run -- confirm before pointing this at another run.
df_prompts = pd.read_parquet(".intm/b7d6ed11-7c6e-4e8a-af2a-49323e7c5032/df_prompt_train.parquet")

In [37]:
# Preview the first prompt row (QuestionId_Answer key plus the Prompt text).
dfpeek("df_prompts", df_prompts)

>>>>>>>>>> df_prompts >>>>>>>>>


Unnamed: 0,0
QuestionId_Answer,753_C
Prompt,<|begin_of_text|><|start_header_id|>system<|en...


<<<<<<<<<< df_prompts <<<<<<<<<<



In [None]:
# Load the tokenizer that matches the LLM checkpoint named in llm_model_id.
tokenizer = AutoTokenizer.from_pretrained(llm_model_id)
# Reuse the end-of-sequence token as the padding token so batches can be padded.
# NOTE(review): presumably needed because this checkpoint ships without a
# dedicated pad token -- confirm.
tokenizer.pad_token = tokenizer.eos_token

In [42]:
# Length of the longest prompt, measured in characters (not tokens).
max(len(prompt_text) for prompt_text in df_prompts["Prompt"].to_list())

2242

In [None]:
# Tokenize every prompt into one padded batch of PyTorch tensors.
# - padding="longest" pads only up to the longest prompt in this batch.
#   padding="max_length" without an explicit max_length pads every row to the
#   tokenizer's maximum model input length instead, which for a long-context
#   model is far more than these prompts need (the longest is 2242 characters)
#   and makes the batch tensor needlessly large.
# - add_special_tokens=False because the stored prompts already begin with
#   <|begin_of_text|>; letting the tokenizer prepend its own special tokens
#   would duplicate it.
model_inputs = tokenizer(
    df_prompts["Prompt"].to_list(),
    return_tensors="pt",
    padding="longest",
    add_special_tokens=False,
)