In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/converted/b6_train_data_converted.csv
/kaggle/input/fpt-ai/dataset/b6_test_data.csv
/kaggle/input/fpt-ai/dataset/b6_train_data.csv
/kaggle/input/fpt-ai/dataset/test_data.pkl
/kaggle/input/fpt-ai/dataset/train_data.pkl


## Gen Reasoning

In [2]:
# Example of how to use this for a single question
import pandas as pd

# Read the questions
questions_df = pd.read_csv("/kaggle/input/fpt-ai/dataset/b6_test_data.csv")

# Function to convert A/B/C... to index
def char_to_index(char):
    return ord(char) - ord('A')

def format_prompt(question, answer):
    return f"""<|im_start|>system
Given a question and its correct answer. Explain why this is the correct answer. Be concise and clear in your explanation.
<|im_end|>
<|im_start|>user
Explain why this is the correct answer:
{question}
Correct Answer:
{answer}
<|im_end|>
<|im_start|>assistant
"""

In [3]:
import torch
from tqdm import tqdm

# Function to extract character (A, B, C...) from answer field
def extract_answer_char(answer):
    if pd.isna(answer) or not isinstance(answer, str):
        return None
    if "ANSWER:" in answer:
        answer_char = answer.split(":")[-1].strip()
    else:
        answer_char = answer.strip()
    return answer_char

def gen_reasoning(df, batch_size=8, max_new_tokens=256):
    # Collect processed rows
    reasonings = {}

    # Use a batch processing approach
    num_batches = len(df) // batch_size + (1 if len(df) % batch_size > 0 else 0)

    for batch_idx in tqdm(range(num_batches), total=num_batches):
        batch_start = batch_idx * batch_size
        batch_end = min((batch_idx + 1) * batch_size, len(df))
        batch_df = df.iloc[batch_start:batch_end]

        # Prepare the inputs for the batch
        batch_prompts = []
        task_ids = []

        for _, row in batch_df.iterrows():
            task_id = row['task_id']
            question = row['question']
            choices_str = row['choices']
            answer_raw = row['answer']

            # Extract clean answer char
            answer_char = extract_answer_char(answer_raw)
            if not answer_char:
                continue  # Skip rows with no valid answer

            try:
                choices = eval(choices_str)
            except Exception:
                continue  # Skip rows where choices can't be evaluated

            answer_index = char_to_index(answer_char)

            # Get correct answer text
            if 0 <= answer_index < len(choices):
                correct_answer_text = choices[answer_index]

            prompt = format_prompt(question, correct_answer_text)
            batch_prompts.append(prompt)
            task_ids.append(task_id)

        # Tokenize the entire batch and pass to the model
        inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True, truncation=True).to(model.device)

        # Generate outputs for the entire batch in one go
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            top_p=1.0,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id
        )

        # Decode the outputs for the entire batch
        for task_id, output in zip(task_ids, outputs):
            decoded = tokenizer.decode(output, skip_special_tokens=True)
            if "<|im_start|>assistant" in decoded:
                reasoning = decoded.split("<|im_start|>assistant")[-1].strip()
            else:
                reasoning = decoded.strip()

            # Save each task_id with the corresponding reasoning
            reasonings[task_id] = reasoning

            # Clear cache and free memory after each batch
        del inputs  # Delete the input batch
        del outputs  # Delete the model's output batch

        torch.cuda.empty_cache()  # Clear the GPU memory cache

    return reasonings


In [4]:
# df = pd.read_csv("/kaggle/input/fpt-ai/dataset/b6_train_data.csv")
# reasonings = gen_reasoning(df,8,256)

## Fine-tuning LLM

In [5]:
# Đọc dữ liệu
train_data = pd.read_csv('/kaggle/input/fpt-ai/dataset/b6_train_data.csv')
test_data = pd.read_csv('/kaggle/input/fpt-ai/dataset/b6_test_data.csv')

print("Kích thước dữ liệu huấn luyện:", train_data.shape)
print("Kích thước dữ liệu kiểm tra:", test_data.shape)

# Kiểm tra các giá trị null
print("\nSố lượng giá trị null trong tập huấn luyện:")
print(train_data.isnull().sum())
print("\nSố lượng giá trị null trong tập kiểm tra:")
print(test_data.isnull().sum())

Kích thước dữ liệu huấn luyện: (3963, 4)
Kích thước dữ liệu kiểm tra: (1253, 3)

Số lượng giá trị null trong tập huấn luyện:
task_id      0
question     0
choices      0
answer      14
dtype: int64

Số lượng giá trị null trong tập kiểm tra:
task_id     0
question    0
choices     0
dtype: int64


In [6]:
# Hàm để chuyển đổi chuỗi choices thành danh sách
def parse_choices(choices_str):
    try:
        if isinstance(choices_str, list):
            return choices_str

        # Sử dụng ast.literal_eval để chuyển đổi chuỗi thành danh sách
        try:
            return ast.literal_eval(choices_str)
        except:
            # Nếu không thể phân tích cú pháp, thực hiện xử lý thủ công
            choices_str = choices_str.strip('[]')
            choices = [choice.strip().strip("'").strip('"') for choice in choices_str.split(',')]
            return choices
    except:
        print(f"Lỗi xử lý choices: {choices_str}")
        return []

# Áp dụng hàm parse_choices cho cả tập huấn luyện và kiểm tra
train_data['parsed_choices'] = train_data['choices'].apply(parse_choices)
test_data['parsed_choices'] = test_data['choices'].apply(parse_choices)

# Chuẩn hóa câu trả lời (một số có "ANSWER: " ở đầu)
def normalize_answer(answer_str):
    if isinstance(answer_str, str):
        if "ANSWER: " in answer_str:
            return answer_str.split("ANSWER: ")[1]
    return answer_str

train_data['normalized_answer'] = train_data['answer'].apply(normalize_answer)

In [7]:
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    Trainer
)
from peft import PeftModel, PeftConfig, LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset
import warnings

# Hàm định dạng dữ liệu cho mô hình
def format_example(row, include_answer=True):
    question = row['question']
    choices = row['parsed_choices']

    # Định dạng các lựa chọn thành A, B, C, D
    formatted_choices = '\n'.join([f"{chr(65+i)}. {choice}" for i, choice in enumerate(choices)])

    if include_answer:
        return f"""{question}

Options:
{formatted_choices}

Answer: {row['normalized_answer']}"""
    else:
        return f"""{question}

Options:
{formatted_choices}

Answer:"""

# Tạo tập dữ liệu để huấn luyện
train_data['text'] = train_data.apply(format_example, axis=1)
test_data['text'] = test_data.apply(format_example, axis=1, include_answer=False)

# Phân chia tập huấn luyện và tập validation
train_df, val_df = train_test_split(train_data, test_size=0.2, random_state=42)

print(f"Số lượng mẫu trong tập huấn luyện: {len(train_df)}")
print(f"Số lượng mẫu trong tập validation: {len(val_df)}")

# Tạo dataset từ các pandas DataFrame
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_data)

# Hiển thị một ví dụ từ tập huấn luyện
print("\nVí dụ từ tập huấn luyện:")
print(train_dataset[1]['text'])

Số lượng mẫu trong tập huấn luyện: 3170
Số lượng mẫu trong tập validation: 793

Ví dụ từ tập huấn luyện:
Question: Which of the following statements is true about list comprehension?

Options:
A. It can only be used for creating lists of integers
B. It can include an optional if clause for filtering elements
C. It can include an optional if clause for filtering elements
D. It cannot be nested

Answer: B


In [8]:
!pip install -U bitsandbytes



In [9]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import ast
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    Trainer
)
from peft import PeftModel, PeftConfig, LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset
import warnings
warnings.filterwarnings('ignore')

In [10]:
model_id = "Qwen/Qwen2.5-Coder-0.5B-Instruct"

# Cấu hình lượng tử hóa 4-bit để giảm memory footprint
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True
)

# Tải tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Tải mô hình với cấu hình lượng tử hóa
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)


# Chuẩn bị mô hình cho huấn luyện 4-bit
model = prepare_model_for_kbit_training(model)

# Cấu hình LoRA để fine-tuning
lora_config = LoraConfig(
    r=8,  # Rank
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj", "gate_proj", "down_proj", "up_proj"]
)

# Áp dụng LoRA cho mô hình
peft_model = get_peft_model(model, lora_config)
print(f"Số lượng tham số huấn luyện: {peft_model.print_trainable_parameters()}")

# Hàm tiền xử lý dữ liệu để mã hóa
def preprocess_function(examples):
    # Cắt bớt các câu có kích thước vượt quá max_length
    max_length = 256  # Có thể điều chỉnh

    inputs = tokenizer(
        examples["text"],
        truncation=True,
        max_length=max_length,
        padding="max_length",
        return_tensors="pt",
    )

    inputs["labels"] = inputs["input_ids"].clone()
    return inputs

# Áp dụng tiền xử lý
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_val = val_dataset.map(preprocess_function, batched=True)

# Thiết lập tham số huấn luyện
training_args = TrainingArguments(
    output_dir="results/code_mlu_model",
    evaluation_strategy="epoch",
    learning_rate=5e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=1,
    num_train_epochs=50,
    weight_decay=0.01,
    save_strategy="epoch",
    fp16=True,
    report_to="none",
)

# Khởi tạo trainer
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
)

# Tiến hành huấn luyện mô hình
print("Bắt đầu huấn luyện mô hình...")
trainer.train()

# Lưu mô hình đã huấn luyện
peft_model.save_pretrained("results/code_mlu_model_final")
tokenizer.save_pretrained("results/code_mlu_model_final")

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.88G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.33G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

trainable params: 20,185,088 || all params: 7,635,801,600 || trainable%: 0.2643
Số lượng tham số huấn luyện: None


Map:   0%|          | 0/3170 [00:00<?, ? examples/s]

Map:   0%|          | 0/793 [00:00<?, ? examples/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Bắt đầu huấn luyện mô hình...


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

## Loading and Embedding

## RAG


In [None]:
import pickle
import torch 
from torch.nn import functional as F
import numpy as np
import pandas as pd

def vector_store(file_path):
    data = pickle.load(open(file_path, "rb"))

    task_id = list(data.keys())
    embeddings = np.array(list(data.values()))
    embeddings = torch.tensor(embeddings)

    matrix_embeddings = torch.cat([embeddings], dim=0)
    normalized_embeddings = F.normalize(matrix_embeddings, p=2, dim=1)

    return task_id, normalized_embeddings

def top_k_cosine_similarity(A, B, k=3):
    # Normalize A and B to unit vectors (L2 normalization)
    A_norm = A / A.norm(dim=1, keepdim=True)
    B_norm = B / B.norm(dim=1, keepdim=True)
    
    # Compute cosine similarity matrix
    similarity = torch.mm(A_norm, B_norm.T)  # [n, m] matrix
    
    # Get the top-k indices and values from the similarity matrix
    top_k_values, top_k_indices = torch.topk(similarity, k=k, dim=1, largest=True, sorted=True)
    
    return top_k_values, top_k_indices


def retrieval(query_index, csv_data_path, train_pkl_path, test_pkl_path, reasonings, k=3):
    task_id, train_db = vector_store(train_pkl_path)
    _, test_db = vector_store(test_pkl_path)

    # print(f"retrieving {k} most similar questions to query index {query_index}")
    top_k_values, top_k_indices = top_k_cosine_similarity(test_db, train_db, k=k)
    rag_indices = top_k_indices[query_index].tolist()
    top_k_task_id = [task_id[idx] for idx in rag_indices]

    raw_data = pd.read_csv(csv_data_path)
    qa_pairs = raw_data[raw_data["task_id"].isin(top_k_task_id)][["question", "choices", "answer", "reasoning"]]

    return qa_pairs.to_dict(orient='records')

In [None]:
def index_to_letter(index):
    try:
        letters = ['A', 'B', 'C', 'D']
        letter = letters[index]
    except:
        letter = 'A'
    return letter

letter = index_to_letter(2)
print(letter)

In [None]:
import ast
import re
import pandas as pd

# Read the questions
questions_df = pd.read_csv("/kaggle/input/fpt-ai/dataset/b6_test_data.csv")


questions_df["choices"] = questions_df["choices"].apply(ast.literal_eval)

import re

def extract_index_from_output(text):
    # Search for the pattern after "<|im_start|>assistant"
    # match = re.search(r"<\|im_start\|>assistant\s*(?:ANSWER:\s*)?(\d+)", text)
    match = re.search(r"assistant\s*(?:ANSWER:\s*)?(\d+)", text)
    if match:
        return int(match.group(1))  # Extract the number
    else:
        return 0  # Default to 0 if no match is found

def format_prompt(question, choices, qa_pairs):
    formatted_choices = "\n".join([f"{i}. {c}" for i, c in enumerate(choices)])
    return f"""<|im_start|>system
You are a helpful AI assistant tasked with answering multiple-choice questions about coding.

I will provide you with questions and their possible answer choices. I will also give you several examples. For each question:
1. Respond solely with the index number of the correct option in the list, starting from 0.
2. Do not include any explanations, text, or anything other than the letter of the correct answer.

Here are some examples:

1. Question: {qa_pairs[0]['question']}
   Choices:
   {qa_pairs[0]['choices']}
   Answer: {qa_pairs[0]['answer']}
   Reasoning: {qa_pairs[0]['reasoning']}

2. Question: {qa_pairs[1]['question']}
   Choices:
   {qa_pairs[1]['choices']}
   Answer: {qa_pairs[1]['answer']}
   Reasoning: {qa_pairs[1]['reasoning']}

3. Question: {qa_pairs[2]['question']}
   Choices:
   {qa_pairs[2]['choices']}
   Answer: {qa_pairs[2]['answer']}
   Reasoning: {qa_pairs[1]['reasoning']}

Now, please answer the following question:
<|im_end|>
<|im_start|>user
Question: {question}
Choices:
{formatted_choices}
<|im_end|>
<|im_start|>assistant
"""
import re

def extract_number_after_assistant(text):
    # Search for the number after "assistant" or "ANSWER:"
    match = re.search(r"assistant\s*(?:ANSWER:\s*)?(\d+)", text)
    if match:
        return int(match.group(1))  # Extract and return the number
    else:
        return None  # Return None if no match is found

def index_to_letter(index):
    try:
        letters = ['A', 'B', 'C', 'D', 'E', 'F', 'G']
        letter = letters[index]
    except:
        letter = 'A'
    return letter

def get_index_answer(prompt, max_new_tokens=5):
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    output = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        top_p=1.0,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    decoded = tokenizer.decode(output[0], skip_special_tokens=True)
    if "<|im_start|>assistant" in decoded:
        result = decoded.split("<|im_start|>assistant")[-1].strip()
    else:
        result = decoded.strip()
    # print(result)
    # print("--------------------------------")
    # print(extract_index_from_output(result))
    # print(extract_number_after_assistant(result))
    return index_to_letter((extract_index_from_output(result)))

In [None]:
def index_to_letter(index, zero_based=True):
    """
    Convert an index to a corresponding letter (A, B, C, ...).

    Args:
        index (int): The index to convert.
        zero_based (bool): If True, treats the index as zero-based (0 -> A).
                           If False, treats the index as one-based (1 -> A).

    Returns:
        str: The corresponding letter.
    """
    if not zero_based:
        index -= 1  # Convert 1-based index to 0-based
    return chr(65 + index)  # 65 is the ASCII value of 'A'

In [None]:
preds = []

from tqdm import tqdm

for idx, row in tqdm(questions_df.iterrows(), total=len(questions_df)):
    task_id = row['task_id']
    question = row['question']
    # print(question)
    choices_str = row['choices']

    qa_pairs = retrieval(idx, csv_data_path, train_pkl_path, test_pkl_path, k=3)

    prompt = format_prompt(question, choices_str, qa_pairs)

    index = get_index_answer(prompt)
    preds.append(index)


submission = pd.DataFrame({
    "task_id": test_df["task_id"],
    "answer": preds
})
submission.to_csv("submission.csv", index=False)
print("Đã tạo submission.csv xong.")
    

In [None]:
submission = pd.DataFrame({
    "task_id": questions_df["task_id"],
    "answer": preds
})
submission.to_csv("submission.csv", index=False)
print("Đã tạo submission.csv xong.")

In [None]:
import pandas as pd
from tqdm import tqdm

def batch_infer(questions_df, csv_data_path, train_pkl_path, test_pkl_path, batch_size=4):
    preds = []

    # Process in batches
    for start_idx in tqdm(range(0, len(questions_df), batch_size), total=(len(questions_df) // batch_size) + 1):
        end_idx = min(start_idx + batch_size, len(questions_df))
        batch = questions_df.iloc[start_idx:end_idx]

        for idx, row in batch.iterrows():
            task_id = row['task_id']
            question = row['question']
            choices_str = row['choices']

            # Retrieve relevant QA pairs
            qa_pairs = retrieval(idx, csv_data_path, train_pkl_path, test_pkl_path, k=3)

            # Format the prompt
            prompt = format_prompt(question, choices_str, qa_pairs)

            # Get the predicted index
            index = get_index_answer(prompt)
            preds.append(index)

    return preds



# Example usage
preds = batch_infer(questions_df, csv_data_path, train_pkl_path, test_pkl_path, batch_size=32)


# Create the submission DataFrame
submission = pd.DataFrame({
    "task_id": questions_df["task_id"],
    "answer": preds
})
submission.to_csv("submission.csv", index=False)
print("Đã tạo submission.csv xong.")

# Batch Processing