In [1]:
# imports
import re

import numpy as np
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, pipeline
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

  from tqdm.autonotebook import tqdm, trange


In [2]:
# constants
# Competition CSV inputs; the paths are relative, so they resolve against the
# notebook's working directory.
eedi_train_csv = "data/train.csv"
eedi_test_csv = "data/test.csv"
eedi_miscon_csv = "data/misconception_mapping.csv"
# Hugging Face model id handed to AutoTokenizer.from_pretrained further down.
llm_model_id = "meta-llama/Llama-3.2-1B-Instruct"
# Presumably the sentence-transformers embedding model id — not used in the cells shown here; confirm.
sbert_model_id = "BAAI/bge-small-en-v1.5"
# Presumably the output file for predictions — not used in the cells shown here; confirm.
submission_csv = "submission.csv"

In [3]:
# quick util
def dfpeek(title: str, df: pd.DataFrame) -> None:
    """Show the first row of ``df`` between two printed banner lines.

    The first row is transposed before it is displayed, so every column of
    ``df`` appears on its own line. A blank line follows the closing banner.

    Args:
        title: Label printed inside the opening and closing banners.
        df: Frame to preview.
    """
    opening = (">>>>>>>>>>", title, ">>>>>>>>>")
    closing = ("<<<<<<<<<<", title, "<<<<<<<<<<")

    print(*opening)
    first_row = df.head(1)
    # `display` is supplied by the IPython/Jupyter kernel; it is not imported in this file.
    display(first_row.T)
    print(*closing, end="\n\n")

#### eedi_train_csv

In [4]:
# Load the training CSV. The four per-answer misconception id columns are parsed as
# nullable Int64, and every missing value in the resulting frame is then replaced by -1.
df = pd.read_csv(
    eedi_train_csv,
    dtype=dict.fromkeys(
        ("MisconceptionAId", "MisconceptionBId", "MisconceptionCId", "MisconceptionDId"),
        "Int64",
    ),
).fillna(-1)

In [10]:
# Show only the three descriptive text columns of the frame.
df.loc[:, ["ConstructName", "SubjectName", "QuestionText"]]

Unnamed: 0,ConstructName,SubjectName,QuestionText
0,Use the order of operations to carry out calcu...,BIDMAS,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...
1,Simplify an algebraic fraction by factorising ...,Simplifying Algebraic Fractions,"Simplify the following, if possible: \( \frac{..."
2,Calculate the range from a list of data,Range and Interquartile Range from a List of Data,Tom and Katie are discussing the \( 5 \) plant...
3,Recall and use the intersecting diagonals prop...,Properties of Quadrilaterals,The angles highlighted on this rectangle with ...
4,Substitute positive integer values into formul...,Substitution into Formula,The equation \( f=3 r^{2}+3 \) is used to find...
...,...,...,...
1864,Calculate the range from a list of data,Range and Interquartile Range from a List of Data,What is the range of the following numbers?\n\...
1865,"Describe an enlargement, with no centre of enl...",Length Scale Factors in Similar Shapes,Shape \( Q \) is an enlargement of shape \( P ...
1866,Use the order of operations to carry out calcu...,BIDMAS,What does the following equal?\n\[\n8-7+10 \ti...
1867,Distinguish between congruency and similarity,Congruency in Other Shapes,Tom and Katie are discussing congruence and si...


In [None]:
# First row shown vertically: one line per column.
df.head(1).T

In [None]:
# describe() summary of the frame loaded above. Missing values were replaced with -1
# at load time, so that -1 fill value is included in these statistics.
df.describe()

#### eedi_miscon_csv

In [None]:
# Rebinds df to the misconception mapping CSV; the training frame that df held
# before this cell is no longer reachable under that name once this runs.
df = pd.read_csv(eedi_miscon_csv)

In [None]:
# First row of the frame currently bound to df, shown vertically: one line per column.
df.head(1).T

In [None]:
# describe() summary of the frame currently bound to df (the misconception
# mapping when the cells are run in order).
df.describe()

#### df_x and df_y

In [None]:
# Read the two cached intermediate frames from the project-relative .intm directory,
# then preview the first row of each.
df_x, df_y = (
    pd.read_parquet(".intm/df_x.parquet"),
    pd.read_parquet(".intm/df_y.parquet"),
)
dfpeek("df_x", df_x)
dfpeek("df_y", df_y)

#### Prompts

In [None]:
# Load the cached training prompts for run b7d6ed11-... from the project-relative
# .intm directory. A machine-specific absolute D:/ path was used here before, which
# only worked on the author's computer; the other data paths in this notebook are
# already relative.
# NOTE(review): assumes the notebook runs from the project root (the directory that
# contains .intm/) — confirm.
df_prompts = pd.read_parquet(".intm/b7d6ed11-7c6e-4e8a-af2a-49323e7c5032/df_prompt_train.parquet")

In [None]:
# Preview the first prompt row, one line per column, between labelled banners.
dfpeek("df_prompts", df_prompts)

In [None]:
# Load the tokenizer that belongs to the LLM checkpoint named in llm_model_id.
tokenizer = AutoTokenizer.from_pretrained(llm_model_id)
# Reuse the end-of-sequence token as the padding token so that padded batches can be
# built below. Presumably the checkpoint's tokenizer ships without a pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Largest len() over the Prompt column (characters for string prompts, not tokens).
max(len(prompt_text) for prompt_text in df_prompts["Prompt"])

In [None]:
# Tokenize every prompt into a single PyTorch batch, padded to the longest prompt in
# the batch. padding="max_length" was used here without a max_length argument, which
# pads each row to the tokenizer's model maximum and makes the batch far larger than
# the prompts require.
# NOTE(review): if the prompts already start with <|begin_of_text|> (as the Prompt
# column of df_responses does), the tokenizer may prepend a second BOS token —
# consider add_special_tokens=False after confirming.
model_inputs = tokenizer(df_prompts["Prompt"].to_list(), return_tensors="pt", padding="longest")

#### df_responses

In [7]:
# Load the cached LLM responses for run be5e8e88-... from the project-relative .intm
# directory. A machine-specific absolute D:/ path was used here before, which only
# worked on the author's computer; the other data paths in this notebook are already
# relative.
# NOTE(review): assumes the notebook runs from the project root (the directory that
# contains .intm/) — confirm.
df_responses = pd.read_parquet(".intm/be5e8e88-a441-4ae2-a37b-c7aae27ff5bc/df_responses.parquet")

In [8]:
# Preview the first response row, one line per column, between labelled banners.
# NOTE(review): in the saved output the first row's Misconception value is the literal
# placeholder "$$INSERT TEXT HERE$$ $$INSERT TEXT HERE$$" — it looks like the model
# echoed the prompt template instead of answering; verify the extraction from FullResponse.
dfpeek("df_responses", df_responses)

>>>>>>>>>> df_responses >>>>>>>>>


Unnamed: 0,0
QuestionId_Answer,824_C
Prompt,<|begin_of_text|><|start_header_id|>system<|en...
FullResponse,system\n\nCutting Knowledge Date: December 202...
Misconception,$$INSERT TEXT HERE$$ $$INSERT TEXT HERE$$


<<<<<<<<<< df_responses <<<<<<<<<<

