In [1]:
!pip install transformers peft accelerate \
    -U --no-index --find-links /kaggle/input/lmsys-wheel-files

Looking in links: /kaggle/input/lmsys-wheel-files
Processing /kaggle/input/lmsys-wheel-files/peft-0.11.1-py3-none-any.whl
Installing collected packages: peft
Successfully installed peft-0.11.1


In [2]:
%%capture
!pip install --no-index /kaggle/input/bitsandbytes0-42-0/bitsandbytes-0.42.0-py3-none-any.whl --find-links=/kaggle/input/bitsandbytes0-42-0
# !pip install --no-index  /kaggle/input/bitsandbytes0-42-0/optimum-1.21.2-py3-none-any.whl --find-links=/kaggle/input/bitsandbytes0-42-0
# !pip install --no-index  /kaggle/input/bitsandbytes0-42-0/auto_gptq-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl --find-links=/kaggle/input/bitsandbytes0-42-0

In [None]:
# for explaining attention masks
# Original sentences
sentences = [
    "Hello world",           # 2 tokens
    "I love programming"     # 3 tokens
]

# After tokenization and padding to max_length=4
input_ids = [
    [101, 202, 303, 0],    # "Hello world [PAD]"
    [101, 202, 303, 404]   # "I love programming"
]

# Corresponding attention mask
attention_mask = [
    [1, 1, 1, 0],    # Last token is padding (0)
    [1, 1, 1, 1]     # All tokens are real (1)
]

In [3]:
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
import gc
import pandas as pd
import pickle
import sys
import numpy as np
from tqdm.autonotebook import trange
from sklearn.model_selection import GroupKFold
import json
import torch
from numpy.linalg import norm
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel,BitsAndBytesConfig
from peft import (
    LoraConfig,
    get_peft_model,
)
import json
import copy
import warnings
import os
warnings.filterwarnings('ignore')

def apk(actual, predicted, k=25):
    """
    Computes the average precision at k.
    
    This function computes the average prescision at k between two lists of
    items.
    
    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
        
    Returns
    -------
    score : double
            The average precision at k over the input lists
    """
    
    if not actual:
        return 0.0

    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        # first condition checks whether it is valid prediction
        # second condition checks if prediction is not repeated
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    return score / min(len(actual), k)

def mapk(actual, predicted, k=25):
    """
    Computes the mean average precision at k.
    
    This function computes the mean average prescision at k between two lists
    of lists of items.
    
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
        
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

def batch_to_device(batch, target_device):
    """
    send a pytorch batch to a device (CPU/GPU)
    """
    for key in batch:
        if isinstance(batch[key], Tensor):
            batch[key] = batch[key].to(target_device)
    return batch

def last_token_pool(last_hidden_states: Tensor,
                    attention_mask: Tensor) -> Tensor:
    """
    Extracts the last meaningful token from a sequence, handling both left and right padding
    """
    # Check if padding is on the left side
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        # If left-padded, last token is always at the end
        return last_hidden_states[:, -1]
    else:
        # For right-padded sequences, find the last actual token position
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        # Get the last actual token for each sequence in the batch
        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]

def get_detailed_instruct(task_description: str, query: str) -> str:
    return f'Instruct: {task_description}\nQuery: {query}'

def inference(df, model, tokenizer, device):
    # Initialize batch parameters
    batch_size = 16
    max_length = 512
    
    # Extract sentences and their IDs from dataframe
    sentences = list(df['query_text'].values)
    pids = list(df['order_index'].values)
    all_embeddings = []

    # Sort sentences by length (longest first) for efficient batching
    length_sorted_idx = np.argsort([-len(sen) for sen in sentences])
    sentences_sorted = [sentences[idx] for idx in length_sorted_idx]

    # Process sentences in batches
    for start_index in trange(0, len(sentences), batch_size, desc="Batches", disable=False):
        # Get current batch of sentences
        sentences_batch = sentences_sorted[start_index: start_index + batch_size]
        
        # Tokenize the batch
        features = tokenizer(
            sentences_batch, 
            max_length=max_length, 
            padding=True, 
            truncation=True,
            return_tensors="pt"
        )
        
        # Move batch to specified device (CPU/GPU)
        features = batch_to_device(features, device)
        
        # Generate embeddings
        with torch.no_grad():  # Disable gradient calculation for inference
            outputs = model(**features)
            # Get the last token embeddings
            embeddings = last_token_pool(outputs.last_hidden_state, features['attention_mask'])
            # Normalize the embeddings
            embeddings = torch.nn.functional.normalize(embeddings, dim=-1)
            # Convert to numpy and store
            embeddings = embeddings.detach().cpu().numpy().tolist()
        all_embeddings.extend(embeddings)

    # Restore original order of embeddings
    all_embeddings = [np.array(all_embeddings[idx]).reshape(1, -1) 
                     for idx in np.argsort(length_sorted_idx)]

    # Combine all embeddings into a single array
    sentence_embeddings = np.concatenate(all_embeddings, axis=0)
    
    # Create dictionary mapping IDs to embeddings
    result = {pids[i]: em for i, em in enumerate(sentence_embeddings)}
    return result

In [None]:
# # example usage
# # Moving batch to GPU
# batch = {
#     'input_ids': torch.tensor([[1, 2, 3]]),
#     'attention_mask': torch.tensor([[1, 1, 1]]),
#     'labels': torch.tensor([1])
# }
# batch = batch_to_device(batch, 'cuda')

# # Getting last tokens from sequences
# hidden_states = torch.randn(32, 128, 768)  # batch_size=32, seq_len=128, hidden_dim=768
# attention_mask = torch.ones(32, 128)  # batch_size=32, seq_len=128
# last_tokens = last_token_pool(hidden_states, attention_mask)

In [4]:
path_prefix = "/kaggle/input/eedi-mining-misconceptions-in-mathematics"
# model_path = "/kaggle/input/sfr-embedding-mistral/SFR-Embedding-2_R"
model_path = "/kaggle/input/qwen2.5-14/pytorch/default/1"

lora_path='/kaggle/input/qwen14b-it-lora/lora_weights/adapter.bin'
device='cuda:0'
VALID = False

In [5]:
# Load the tokenizer from the base model directory
# Removing 'lora_weights/adapter.bin' from the path to get to the base directory
# which contains the original model's tokenizer files
tokenizer = AutoTokenizer.from_pretrained(lora_path.replace("/adapter.bin",""))

# Configure 4-bit quantization settings using bitsandbytes
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                         # Enable 4-bit quantization
    bnb_4bit_use_double_quant=True,           # Enable double quantization for additional memory savings
    bnb_4bit_quant_type="nf4",                # Use 4-bit NormalFloat (NF4) data type
    bnb_4bit_compute_dtype=torch.bfloat16     # Use bfloat16 for intermediate computations
)

# Load the base model with quantization settings
model = AutoModel.from_pretrained(
    model_path,                               # Path to the base model
    quantization_config=bnb_config,           # Apply the quantization settings
    device_map=device,                        # Automatically map model across available devices
    trust_remote_code=True                    # Allow loading of remote code (needed for some models)
)

# If LoRA weights are provided, load and apply them
if lora_path:
    print("loading lora")
    
    # Configure LoRA architecture - must match the configuration used during original fine-tuning
    config = LoraConfig(
        r=64,                    # Rank of the LoRA update matrices
        lora_alpha=128,          # LoRA scaling factor
        target_modules=[         # List of model modules to apply LoRA to
            "q_proj",            # Query projection
            "k_proj",            # Key projection
            "v_proj",            # Value projection
            "o_proj",            # Output projection
            "gate_proj",         # Gate projection
            "up_proj",           # Upward projection
            "down_proj",         # Downward projection
        ],
        bias="none",            # Don't train bias parameters
        lora_dropout=0.05,      
        task_type="FEATURE_EXTRACTION",  
    )
    
    # Wrap the base model with LoRA architecture
    model = get_peft_model(model, config)
    
    # Load the pre-trained LoRA weights
    d = torch.load(lora_path, map_location=model.device)
    
    # Apply the LoRA weights to the model
    # strict=False allows loading partial state dictionaries
    model.load_state_dict(d, strict=False)
    
    # Merge LoRA weights with base model weights and cleanup
    # This combines the LoRA adaptations with the original weights
    model = model.merge_and_unload()

# Set model to evaluation mode
# This disables dropout and other training-specific behaviors
model = model.eval()
# model = model.to(device)

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

loading lora


In [None]:
# # 输出模型的参数名和参数值
# for name, param in model.named_parameters():
#     if "base_model.model.layers.12.input_layernorm.weight"  in name:
#         print(f"参数名: {name}")
#         print(f"参数值: {param}")


In [6]:
task_description = 'Given a math question with correct answer and a misconcepted incorrect answer, retrieve the most accurate misconception for the incorrect answer.'

In [7]:
if VALID:
    tra = pd.read_parquet("/kaggle/input/val-parquet/v1_val.parquet")
    print(tra.shape)
else:
    tra = pd.read_csv(f"{path_prefix}/test.csv")
    print(tra.shape)
misconception_mapping = pd.read_csv(f"{path_prefix}/misconception_mapping.csv")
if tra.shape[0]<10:
    misconception_mapping = misconception_mapping.sample(n=5,random_state=2023)

(3, 11)


In [8]:
if VALID:
    train_data = []
    for _,row in tra.iterrows():
        for c in ['A','B','C','D']:
            if str(row[f"Misconception{c}Id"])!="nan":
                # print(row[f"Misconception{c}Id"])
                real_answer_id = row['CorrectAnswer']
                real_text = row[f'Answer{real_answer_id}Text']
                query_text = f"### SubjectName: {row['SubjectName']}\n### ConstructName: {row['ConstructName']}\n### Question: {row['QuestionText']}\n### Correct Answer: {real_text}\n### Misconcepte Incorrect answer: {row[f'Answer{c}Text']}"
                row['query_text'] = get_detailed_instruct(task_description,query_text)
                row['answer_id'] = int(row[f"Misconception{c}Id"])
                train_data.append(copy.deepcopy(row))
    train_df = pd.DataFrame(train_data)
    train_df['order_index'] = list(range(len(train_df)))
else:
    train_data = []
    for _,row in tra.iterrows():
        for c in ['A','B','C','D']:
            if c ==row['CorrectAnswer']:
                continue
            if f'Answer{c}Text' not in row:
                continue
            real_answer_id = row['CorrectAnswer']
            real_text = row[f'Answer{real_answer_id}Text']
            query_text = f"### SubjectName: {row['SubjectName']}\n### ConstructName: {row['ConstructName']}\n### Question: {row['QuestionText']}\n### Correct Answer: {real_text}\n### Misconcepte Incorrect answer: {row[f'Answer{c}Text']}"
            row['query_text'] = get_detailed_instruct(task_description,query_text)
            row['answer_name'] = c
            train_data.append(copy.deepcopy(row))
    train_df = pd.DataFrame(train_data)
    train_df['order_index'] = list(range(len(train_df)))
train_df.shape

(9, 14)

In [9]:
train_df

Unnamed: 0,QuestionId,ConstructId,ConstructName,SubjectId,SubjectName,CorrectAnswer,QuestionText,AnswerAText,AnswerBText,AnswerCText,AnswerDText,query_text,answer_name,order_index
0,1869,856,Use the order of operations to carry out calcu...,33,BIDMAS,A,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,\( 3 \times(2+4)-5 \),\( 3 \times 2+(4-5) \),\( 3 \times(2+4-5) \),Does not need brackets,Instruct: Given a math question with correct a...,B,0
0,1869,856,Use the order of operations to carry out calcu...,33,BIDMAS,A,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,\( 3 \times(2+4)-5 \),\( 3 \times 2+(4-5) \),\( 3 \times(2+4-5) \),Does not need brackets,Instruct: Given a math question with correct a...,C,1
0,1869,856,Use the order of operations to carry out calcu...,33,BIDMAS,A,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,\( 3 \times(2+4)-5 \),\( 3 \times 2+(4-5) \),\( 3 \times(2+4-5) \),Does not need brackets,Instruct: Given a math question with correct a...,D,2
1,1870,1612,Simplify an algebraic fraction by factorising ...,1077,Simplifying Algebraic Fractions,D,"Simplify the following, if possible: \( \frac{...",\( m+1 \),\( m+2 \),\( m-1 \),Does not simplify,Instruct: Given a math question with correct a...,A,3
1,1870,1612,Simplify an algebraic fraction by factorising ...,1077,Simplifying Algebraic Fractions,D,"Simplify the following, if possible: \( \frac{...",\( m+1 \),\( m+2 \),\( m-1 \),Does not simplify,Instruct: Given a math question with correct a...,B,4
1,1870,1612,Simplify an algebraic fraction by factorising ...,1077,Simplifying Algebraic Fractions,D,"Simplify the following, if possible: \( \frac{...",\( m+1 \),\( m+2 \),\( m-1 \),Does not simplify,Instruct: Given a math question with correct a...,C,5
2,1871,2774,Calculate the range from a list of data,339,Range and Interquartile Range from a List of Data,B,Tom and Katie are discussing the \( 5 \) plant...,Only\nTom,Only\nKatie,Both Tom and Katie,Neither is correct,Instruct: Given a math question with correct a...,A,6
2,1871,2774,Calculate the range from a list of data,339,Range and Interquartile Range from a List of Data,B,Tom and Katie are discussing the \( 5 \) plant...,Only\nTom,Only\nKatie,Both Tom and Katie,Neither is correct,Instruct: Given a math question with correct a...,C,7
2,1871,2774,Calculate the range from a list of data,339,Range and Interquartile Range from a List of Data,B,Tom and Katie are discussing the \( 5 \) plant...,Only\nTom,Only\nKatie,Both Tom and Katie,Neither is correct,Instruct: Given a math question with correct a...,D,8


In [10]:
train_embeddings = inference(train_df, model, tokenizer, device)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [11]:
misconception_mapping['query_text'] = misconception_mapping['MisconceptionName']
misconception_mapping['order_index'] = misconception_mapping['MisconceptionId']
doc_embeddings = inference(misconception_mapping, model, tokenizer, device)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [12]:
sentence_embeddings = np.concatenate([e.reshape(1, -1) for e in list(doc_embeddings.values())])
index_text_embeddings_index = {index: paper_id for index, paper_id in
                                         enumerate(list(doc_embeddings.keys()))}

In [13]:
predicts_test = []
for _, row in tqdm(train_df.iterrows()):
    query_id = row['order_index']
    query_em = train_embeddings[query_id].reshape(1, -1)
    
    cosine_similarity = np.dot(query_em, sentence_embeddings.T).flatten()
    
    sort_index = np.argsort(-cosine_similarity)[:25]
    pids = [index_text_embeddings_index[index] for index in sort_index]
    predicts_test.append(pids)

0it [00:00, ?it/s]

In [14]:
if VALID:
    train_df['recall_ids'] = predicts_test
    print(mapk([[data] for data in train_df['answer_id'].values],train_df['recall_ids'].values))
else:
    train_df['MisconceptionId'] = [' '.join(map(str,c)) for c in predicts_test]
    sub = []
    for _,row in train_df.iterrows():
        sub.append(
            {
                "QuestionId_Answer":f"{row['QuestionId']}_{row['answer_name']}",
                "MisconceptionId":row['MisconceptionId']
            }
        )
    submission_df = pd.DataFrame(sub)
    submission_df.to_csv("submission.csv", index=False)
    print("Submission file created successfully!")

Submission file created successfully!
