In [1]:
%%time
# vLLM stack. Everything below is installed from wheels attached as Kaggle
# inputs (`--no-index` / local wheel paths), so no network access is needed.
# torch is uninstalled first — presumably so the vllm install can pull the
# torch build it was packaged against from the wheel directory (TODO confirm).
!pip uninstall -y torch
!pip install -q --no-index --find-links=/kaggle/input/making-wheels-of-necessary-packages-for-vllm vllm
# Specific grpcio / ray wheels from the "vllm-t4-fix" input; the dataset name
# suggests they work around a T4-specific problem — verify before changing.
!pip install -q -U /kaggle/input/vllm-t4-fix/grpcio-1.62.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
!pip install -q -U /kaggle/input/vllm-t4-fix/ray-2.11.0-cp310-cp310-manylinux2014_x86_64.whl
# `--no-deps`: only the named wheel is installed, its requirements are not
# resolved.
!pip install -q --no-deps --no-index /kaggle/input/hf-libraries/sentence-transformers/sentence_transformers-3.1.0-py3-none-any.whl
# NOTE(review): the captured output of this cell reports that
# logits-processor-zoo 0.1.0 wants accelerate<0.27.0 while 0.34.2 ends up
# installed; `--no-deps` here means pip does not try to reconcile that.
!pip install --no-deps --no-index /kaggle/input/logits-processor-zoo/logits_processor_zoo-0.1.0-py3-none-any.whl

# PEFT / quantisation stack used by the embedding-retrieval stage.
!pip install transformers peft accelerate \
    -q -U --no-index --find-links /kaggle/input/lmsys-wheel-files
!pip install -q --no-index /kaggle/input/bitsandbytes0-42-0/bitsandbytes-0.42.0-py3-none-any.whl --find-links=/kaggle/input/bitsandbytes0-42-0
!pip install -q --no-index  /kaggle/input/bitsandbytes0-42-0/optimum-1.21.2-py3-none-any.whl --find-links=/kaggle/input/bitsandbytes0-42-0
!pip install -q --no-index  /kaggle/input/bitsandbytes0-42-0/auto_gptq-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl --find-links=/kaggle/input/bitsandbytes0-42-0

Found existing installation: torch 2.4.0
Uninstalling torch-2.4.0:
  Successfully uninstalled torch-2.4.0
Processing /kaggle/input/logits-processor-zoo/logits_processor_zoo-0.1.0-py3-none-any.whl
Installing collected packages: logits-processor-zoo
Successfully installed logits-processor-zoo-0.1.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
logits-processor-zoo 0.1.0 requires accelerate<0.27.0,>=0.26.1, but you have accelerate 0.34.2 which is incompatible.[0m[31m
[0mCPU times: user 4.21 s, sys: 943 ms, total: 5.15 s
Wall time: 4min 53s


## LLM Reasoning
Prompt the LLM to identify the likely misconception that led to each wrong answer.

In [2]:
import pandas as pd


# Turn each question into one record per *incorrect* option. Every record
# carries the question context, the text of the correct answer and the text of
# that particular wrong answer, keyed by "<QuestionId>_<option letter>".
df = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/test.csv")

explode_df = [
    {
        "QuestionId_Answer": f"{row.QuestionId}_{option}",
        "ConstructName": row.ConstructName,
        "SubjectName": row.SubjectName,
        "QuestionText": row.QuestionText,
        "CorrectAnswer": row[f"Answer{row['CorrectAnswer']}Text"],
        "IncorrectAnswer": row[f"Answer{option}Text"],
    }
    for _, row in df.iterrows()
    for option in ["A", "B", "C", "D"]
    if option != row["CorrectAnswer"]  # the correct option has no misconception
]

# `df` is rebound to the exploded frame; later cells rely on this name.
df = pd.DataFrame(explode_df)
# Written with the default index column; vllm_reasoning.py reads this file back.
df.to_csv("explode_df.csv")
        

In [3]:
%%writefile vllm_reasoning.py

from tqdm import tqdm
from torch.utils.data import DataLoader
from vllm import LLM, SamplingParams
import json
from torch.utils.data import Dataset
import pandas as pd

class MathDataset(Dataset):
    """Dataset over the exploded question frame that yields chat prompts.

    Each item is a tuple ``(message, correct_answer, question_id)`` where
    ``message`` is a two-turn chat (system + user) asking for the misconception
    behind one wrong answer.
    """

    def __init__(self, df):
        # Rows are looked up by label with `.loc`, so `df` must be indexed by
        # the integers the sampler produces (a default RangeIndex does that).
        self.data = df

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        def field(column):
            return self.data.loc[idx, column]

        correct_answer = field("CorrectAnswer")

        # The two whitespace-only lines are part of the original prompt text
        # and are kept verbatim; only the outer ends are stripped below.
        prompt_lines = [
            f"Here is a question about {field('ConstructName')} ({field('SubjectName')}):",
            "        ",
            f"- Question: {field('QuestionText')}",
            f"- Correct Answer: {correct_answer}",
            f"- Wrong Answer: {field('IncorrectAnswer')}",
            "        ",
            "Please provide a detailed analysis on what misconception or reasoning error that cause the student to derive the wrong answer. Focus only on explaining the misconception.",
        ]
        prompt = "\n".join(prompt_lines) + "\n"

        system_turn = {
            "role": "system",
            "content": "You are a proficient Mathematics teacher. Your goal is to identify the likely misconception or reasoning error that led the student to choose the wrong answer.",
        }
        user_turn = {"role": "user", "content": prompt.strip()}

        return [system_turn, user_turn], correct_answer, str(field("QuestionId_Answer"))
    
def collate_batch(batch):
    """Render a batch of dataset items into chat-template prompt text.

    `batch` is a sequence of ``(message, label, question_id)`` tuples. The
    messages are passed through the module-level `tokenizer`'s chat template
    (as text, with the generation prompt appended); labels and question ids
    are returned as tuples in batch order.
    """
    messages, labels, question_ids = zip(*batch)
    rendered = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return rendered, labels, question_ids


if __name__ == "__main__":
    # Stage 1: ask the LLM to explain, in free text, the misconception behind
    # every (question, wrong answer) pair and save the explanations to CSV.
    df = pd.read_csv("explode_df.csv")

    model_name = "/kaggle/input/qwen2.5/transformers/32b-instruct-awq/1"
    device = "cuda" # the device to load the model onto
    engine_options = dict(
        quantization="awq",
        gpu_memory_utilization=1,
        tensor_parallel_size=2,  # shard the model across two GPUs
        trust_remote_code=True,
        dtype="half",
        max_model_len=4000,
    )
    model = LLM(model_name, **engine_options)

    # `collate_batch` looks this name up as a module global.
    tokenizer = model.get_tokenizer()

    train_dataset = MathDataset(df)
    loader = DataLoader(train_dataset, batch_size=32, collate_fn=collate_batch)
    train_pbar = tqdm(loader)

    llm_response = []
    for model_inputs, answers, question_ids in train_pbar:
        # temperature=0 -> greedy decoding of up to 1024 new tokens per prompt.
        sampling = SamplingParams(n=1, temperature=0, seed=111, max_tokens=1024)
        outputs = model.generate(model_inputs, sampling, use_tqdm=False)

        llm_response.extend(
            {"QuestionId_Answer": question_id, "Misconception": result.outputs[0].text}
            for question_id, result in zip(question_ids, outputs)
        )

    llm_misconception = pd.DataFrame(llm_response)
    llm_misconception.to_csv("llm_misconception.csv")

Writing vllm_reasoning.py


In [4]:
!python vllm_reasoning.py

INFO 11-29 09:48:47 config.py:715] Defaulting to use mp for distributed inference
INFO 11-29 09:48:47 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='/kaggle/input/qwen2.5/transformers/32b-instruct-awq/1', speculative_config=None, tokenizer='/kaggle/input/qwen2.5/transformers/32b-instruct-awq/1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=4000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=/kaggle/input/qwen2.5/transformers/32b-instruct-awq/1, use_v2_block_manager=F

In [5]:
import torch
import gc

# Housekeeping between stages: run a garbage-collection pass and ask PyTorch
# to release its cached CUDA memory in this kernel. The vLLM stage above ran in
# a separate `python` process, so this is presumably a precaution before the
# embedding model is loaded in-kernel.
gc.collect()
torch.cuda.empty_cache()
# Show the current GPU state in the cell output.
!nvidia-smi

Fri Nov 29 09:53:37 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   73C    P0             33W /   70W |       1MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                      

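## Compute cosine similarity of embeddings
Compute embeddings of each **MisconceptionName** and of the LLM-reasoned misconceptions using [SFR-Embedding-Mistral](https://huggingface.co/Salesforce/SFR-Embedding-Mistral) (the code below loads the SFR-Embedding-2_R checkpoint with a LoRA adapter), then rank the misconceptions for each query by cosine similarity.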

In [6]:
from torch import Tensor
from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig
import torch
from tqdm import trange
import numpy as np


def batch_to_device(batch, target_device):
    """Move every tensor value of `batch` to `target_device`, in place.

    Non-tensor values are left untouched. The same (mutated) mapping is
    returned for convenience.
    """
    for name in batch:
        value = batch[name]
        if not isinstance(value, Tensor):
            continue
        batch[name] = value.to(target_device)
    return batch


def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Pick, for every sequence, the hidden state at its last attended position.

    Args:
        last_hidden_states: tensor indexed as ``[batch, position, ...]``.
        attention_mask: tensor indexed as ``[batch, position]``; summed as
            numbers, so real tokens are expected to be 1 and padding 0.

    Returns:
        One hidden-state vector per sequence.
    """
    # If the final column of the mask is set for every row, the batch is
    # left-padded (or unpadded) and the last position is the last real token.
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        return last_hidden_states[:, -1]

    # Right padding: the last real token of row i sits at index (mask sum - 1).
    sequence_lengths = attention_mask.sum(dim=1) - 1
    batch_size = last_hidden_states.shape[0]
    return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]


def get_detailed_instruct(task_description: str, query: str) -> str:
    """Format a retrieval query as an ``Instruct: ... / Query: ...`` prompt."""
    template = "Instruct: {}\nQuery: {}"
    return template.format(task_description, query)


def inference(df, misconception_df):
    """Rank misconceptions for every row of `df` by embedding similarity.

    The LLM-written explanation in ``df["Misconception"]`` is used as the query
    and every ``misconception_df["MisconceptionName"]`` as a passage. Both are
    embedded with the 4-bit quantised backbone plus LoRA adapter, L2-normalised,
    and compared with a dot product (i.e. cosine similarity).

    Args:
        df: frame with a "Misconception" column (one query per row).
        misconception_df: frame with a "MisconceptionName" column.

    Returns:
        Integer array with one row per query holding the positions (row numbers
        in `misconception_df`) of the up-to-100 most similar misconceptions,
        best first.
    """
    # Imported here so the function works regardless of which notebook cell
    # imported peft (the original relied on a later cell's import).
    from peft import LoraConfig, get_peft_model

    device = "cuda"
    task = "Given the likely misconception or reasoning error that led the student to choose the wrong answer, please retrieve the most accurate description of the misconception."
    queries = [get_detailed_instruct(task, str(q)) for q in df["Misconception"].values]
    passages = [str(mis) for mis in misconception_df["MisconceptionName"].values]

    # load model and tokenizer
    lora_path = "/kaggle/input/v7-recall/epoch_19_model/adapter.bin"
    model_path = "/kaggle/input/sfr-embedding-mistral/SFR-Embedding-2_R"
    # The tokenizer is read from the directory that contains the adapter file.
    tokenizer = AutoTokenizer.from_pretrained(lora_path.replace("/adapter.bin", ""))
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    backbone = AutoModel.from_pretrained(model_path, quantization_config=bnb_config, device_map=device)
    config = LoraConfig(
        r=64,
        lora_alpha=128,
        target_modules=[
            "q_proj",
            "k_proj",
            "v_proj",
            "o_proj",
            "gate_proj",
            "up_proj",
            "down_proj",
        ],
        bias="none",
        lora_dropout=0.05,  # Conventional
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(backbone, config)

    # NOTE(review): torch.load unpickles the file; only point this at a
    # checkpoint you trust.
    adapter_state = torch.load(lora_path, map_location=model.device)
    # strict=False is needed because the file holds only adapter weights, but
    # it also hides naming mismatches, so report keys that matched nothing.
    load_result = model.load_state_dict(adapter_state, strict=False)
    if load_result.unexpected_keys:
        print(
            f"WARNING: {len(load_result.unexpected_keys)} of {len(adapter_state)} "
            "adapter keys did not match the model and were ignored"
        )
    model = model.eval()
    model = model.to(device)

    max_length = 4096
    input_texts = queries + passages
    batch_size = 8

    # Process texts longest-first so each batch holds similarly sized inputs;
    # the original order is restored after encoding.
    length_sorted_idx = np.argsort([-len(text) for text in input_texts])
    input_texts_sorted = [input_texts[idx] for idx in length_sorted_idx]

    embedding_batches = []
    for start_index in trange(0, len(input_texts), batch_size, desc="Batches", disable=False):
        batch_dict = tokenizer(input_texts_sorted[start_index: start_index + batch_size], max_length=max_length, padding=True, truncation=True, return_tensors="pt")
        batch_dict = batch_to_device(batch_dict, device)
        with torch.no_grad():
            outputs = model.model(**batch_dict)
            embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
            embeddings = embeddings.detach().cpu()
            embeddings = torch.nn.functional.normalize(embeddings, dim=-1)
        embedding_batches.append(embeddings)

    # One concatenation instead of converting a Python list of row tensors.
    sorted_embeddings = torch.cat(embedding_batches, dim=0).numpy()
    # argsort of the sort permutation is its inverse: back to input order.
    all_embeddings = sorted_embeddings[np.argsort(length_sorted_idx)]

    num_queries = len(queries)
    scores = (all_embeddings[:num_queries] @ all_embeddings[num_queries:].T) * 100
    top_100 = np.argsort(-scores, axis=1)[:, :100]  # top 100 similar misconception
    return top_100

In [7]:
from peft import (
    LoraConfig,
    get_peft_model,
)

# Attach the LLM's free-text reasoning to the exploded question frame.
llm_misconception = pd.read_csv("llm_misconception.csv")
# Align on the QuestionId_Answer key instead of on row position, so a
# reordered or partially written CSV cannot silently pair an explanation with
# the wrong question. Rows without a match get NaN, as before.
misconception_by_key = (
    llm_misconception
    .drop_duplicates("QuestionId_Answer")
    .set_index("QuestionId_Answer")["Misconception"]
)
df["Misconception"] = df["QuestionId_Answer"].map(misconception_by_key)
misconception_df = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv")

# For every row: positions of the best-matching misconceptions, best first.
top_100 = inference(df, misconception_df)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Batches: 100%|██████████| 325/325 [10:18<00:00,  1.90s/it]


In [8]:
# Store each row's candidate ids as one space-separated string so they survive
# the CSV round trip into vllm_rerank.py.
str_top_100 = [" ".join(candidate_ids.astype("str")) for candidate_ids in top_100]
df["top_100"] = str_top_100
df.to_csv("top_100_df.csv")
df

Unnamed: 0,QuestionId_Answer,ConstructName,SubjectName,QuestionText,CorrectAnswer,IncorrectAnswer,Misconception,top_100
0,1869_B,Use the order of operations to carry out calcu...,BIDMAS,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,\( 3 \times(2+4)-5 \),\( 3 \times 2+(4-5) \),The student's misconception likely stems from ...,315 1345 2488 1392 2586 2306 1054 1005 2532 17...
1,1869_C,Use the order of operations to carry out calcu...,BIDMAS,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,\( 3 \times(2+4)-5 \),\( 3 \times(2+4-5) \),The student's misconception likely stems from ...,2488 315 1345 1084 1392 2586 373 2532 77 969 2...
2,1869_D,Use the order of operations to carry out calcu...,BIDMAS,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,\( 3 \times(2+4)-5 \),Does not need brackets,The student's misconception likely stems from ...,2532 706 77 1392 1507 1672 1005 2586 1345 871 ...
3,1870_A,Simplify an algebraic fraction by factorising ...,Simplifying Algebraic Fractions,"Simplify the following, if possible: \( \frac{...",Does not simplify,\( m+1 \),The student's misconception likely stems from ...,891 143 59 1540 2398 2078 167 885 2021 1610 36...
4,1870_B,Simplify an algebraic fraction by factorising ...,Simplifying Algebraic Fractions,"Simplify the following, if possible: \( \frac{...",Does not simplify,\( m+2 \),The student's misconception likely stems from ...,143 891 885 2078 2398 1540 59 265 715 1610 159...
5,1870_C,Simplify an algebraic fraction by factorising ...,Simplifying Algebraic Fractions,"Simplify the following, if possible: \( \frac{...",Does not simplify,\( m-1 \),The student's misconception likely stems from ...,891 143 1610 2398 885 59 2078 715 979 1540 126...
6,1871_A,Calculate the range from a list of data,Range and Interquartile Range from a List of Data,Tom and Katie are discussing the \( 5 \) plant...,Only\nKatie,Only\nTom,"The student likely chose ""Only Tom"" because th...",1287 1408 2439 2408 1073 1059 1923 1975 227 16...
7,1871_C,Calculate the range from a list of data,Range and Interquartile Range from a List of Data,Tom and Katie are discussing the \( 5 \) plant...,Only\nKatie,Both Tom and Katie,The misconception that led the student to inco...,1287 1073 1408 2408 2439 557 1923 1765 1338 17...
8,1871_D,Calculate the range from a list of data,Range and Interquartile Range from a List of Data,Tom and Katie are discussing the \( 5 \) plant...,Only\nKatie,Neither is correct,The student's misconception likely stems from ...,1287 1073 1408 2439 1765 2408 1059 1975 1700 1...


In [9]:
import torch
import gc

# Housekeeping before the rerank stage: run a garbage-collection pass and ask
# PyTorch to release its cached CUDA memory in this kernel — presumably so the
# vLLM subprocess launched next has the GPUs to itself (TODO confirm).
gc.collect()
torch.cuda.empty_cache()
# Show the current GPU state in the cell output.
!nvidia-smi

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Fri Nov 29 10:05:17 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   75C    P0             44W /   70W |     151MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                      

## Retrieve Final Answer Using LLM 

In [10]:
%%writefile vllm_rerank.py

from torch.utils.data import Dataset
from tqdm import tqdm
from torch.utils.data import DataLoader
from vllm import LLM, SamplingParams
from logits_processor_zoo.vllm import MultipleChoiceLogitsProcessor
import pandas as pd
import json


class MathDataset(Dataset):
    """Dataset that yields multiple-choice rerank prompts.

    Each item is a tuple ``(message, correct_answer, question_id)``. The user
    turn lists the retrieved candidate misconceptions, numbered from 1 in
    retrieval order, and asks the model to pick one number.
    """

    def __init__(self, df, misconception_df):
        # Both frames are addressed by label with `.loc`, so they need integer
        # labels equal to row positions (a default RangeIndex does that).
        self.data = df
        self.misconception_df = misconception_df

    def __len__(self):
        return len(self.data)

    def _numbered_candidates(self, idx):
        """Return the candidate misconception names as '<n>. <name>' lines."""
        candidate_ids = self.data.loc[idx, "top_100"].split(" ")
        numbered = []
        for number, misconception_id in enumerate(candidate_ids, start=1):
            name = self.misconception_df.loc[int(misconception_id), "MisconceptionName"]
            numbered.append(f"{number}. {name}")
        return "\n".join(numbered)

    def __getitem__(self, idx):
        def field(column):
            return self.data.loc[idx, column]

        correct_answer = field("CorrectAnswer")

        # The trailing space after the incorrect answer and the whitespace-only
        # line are part of the original prompt text and are kept verbatim.
        prompt_lines = [
            f"Here is a question about {field('ConstructName')}({field('SubjectName')}).",
            f"Question: {field('QuestionText')}",
            f"Correct Answer: {correct_answer}",
            f"Incorrect Answer: {field('IncorrectAnswer')} ",
            "Answer concisely what misconception it is to lead to getting the Incorrect Answer. Pick the correct misconception number from the below:",
            "                ",
            self._numbered_candidates(idx),
        ]
        prompt = "\n".join(prompt_lines) + "\n"

        system_turn = {
            "role": "system",
            "content": "You are a Mathematics teacher. Your task is to reason and identify the misconception behind the Incorrect Answer with the Question.",
        }
        user_turn = {"role": "user", "content": prompt.strip()}

        return [system_turn, user_turn], correct_answer, str(field("QuestionId_Answer"))
    
def collate_batch(batch):
    """Render a batch of dataset items into chat-template prompt text.

    `batch` is a sequence of ``(message, label, question_id)`` tuples. The
    messages are passed through the module-level `tokenizer`'s chat template
    (as text, with the generation prompt appended); labels and question ids
    are returned as tuples in batch order.
    """
    messages, labels, question_ids = zip(*batch)
    rendered = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return rendered, labels, question_ids


if __name__ == "__main__":
    # Stage 3: show the LLM the retrieved candidates as a numbered list and
    # let it pick one number per (question, wrong answer) pair.
    df = pd.read_csv("top_100_df.csv")
    misconception_df = pd.read_csv("/kaggle/input/eedi-mining-misconceptions-in-mathematics/misconception_mapping.csv")
    train_dataset = MathDataset(df, misconception_df)
    # The loader is only iterated after `tokenizer` has been assigned below.
    loader = DataLoader(train_dataset, batch_size=32, collate_fn=collate_batch)
    train_pbar = tqdm(loader)

    model_name = "/kaggle/input/qwen2.5/transformers/32b-instruct-awq/1"
    device = "cuda" # the device to load the model onto
    engine_options = dict(
        quantization="awq",
        gpu_memory_utilization=1,
        tensor_parallel_size=2,  # shard the model across two GPUs
        trust_remote_code=True,
        dtype="half",
        max_model_len=4000,
    )
    model = LLM(model_name, **engine_options)
    # `collate_batch` looks this name up as a module global.
    tokenizer = model.get_tokenizer()

    # Allowed answers: the strings '1' .. '99'.
    # NOTE(review): the prompt numbers up to 100 candidates but only 1..99 are
    # offered here, and generation is capped at a single token, so how
    # multi-digit choices behave depends on how the tokenizer splits numbers —
    # confirm against the logits processor's implementation.
    choices = [str(number) for number in range(1, 100)]

    llm_response = []
    for model_inputs, _, _ in train_pbar:
        sampling = SamplingParams(
            n=1,
            temperature=0.,
            seed=111,
            max_tokens=1,
            logits_processors=[MultipleChoiceLogitsProcessor(tokenizer, choices=choices)],
        )
        outputs = model.generate(model_inputs, sampling, use_tqdm=False)

        llm_response.extend(result.outputs[0].text for result in outputs)

    with open("llm_rerank.json", "w") as fp:
        json.dump(llm_response, fp)



Writing vllm_rerank.py


In [11]:
!python vllm_rerank.py

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


INFO 11-29 10:05:22 config.py:715] Defaulting to use mp for distributed inference
INFO 11-29 10:05:22 llm_engine.py:176] Initializing an LLM engine (v0.5.3.post1) with config: model='/kaggle/input/qwen2.5/transformers/32b-instruct-awq/1', speculative_config=None, tokenizer='/kaggle/input/qwen2.5/transformers/32b-instruct-awq/1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=4000, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None), seed=0, served_model_name=/kaggle/input/qwen2.5/transformers/32b-instruct-awq/1, use_v2_block_manager=F

In [12]:
import json

# Load the answers written by vllm_rerank.py: one string per row of `df`,
# expected to be the 1-based number of the chosen candidate.
with open("llm_rerank.json", "r") as fp:
    llm_answer = json.load(fp)


def _promote_llm_choice(candidate_ids, answer):
    """Return the candidate ids as strings with the LLM's pick moved first.

    `answer` is the raw LLM output. If it is not an integer in
    ``1..len(candidate_ids)`` the retrieval order is returned unchanged; the
    original code would raise on non-numeric text and, for "0", wrap around to
    index -1 and promote the *last* candidate.
    """
    ranked = [str(candidate) for candidate in candidate_ids]
    try:
        choice = int(str(answer).strip()) - 1
    except ValueError:
        return ranked
    if not 0 <= choice < len(ranked):
        return ranked
    picked = ranked[choice]
    return [picked] + [candidate for candidate in ranked if candidate != picked]


submission = []
# Rows of `df`, rows of `top_100` and entries of `llm_answer` correspond by
# position, so iterate positionally rather than by index label.
for position, question_answer_id in enumerate(df["QuestionId_Answer"]):
    answer = llm_answer[position] if position < len(llm_answer) else None
    final_answer = _promote_llm_choice(top_100[position], answer)
    submission.append({"QuestionId_Answer": question_answer_id, "MisconceptionId": " ".join(final_answer)})
pd.DataFrame(submission).to_csv("submission.csv", index=False)

