# Math Question Answer Verification Competition

## Starter Code

Borrowed from [official Unsloth implementation](https://colab.research.google.com/drive/1Ys44kVvmeZtnICzWz0xgpRnrIOjZAuxp?usp=sharing#scrollTo=MKX_XKs_BNZR)

In [None]:
# %%capture
# Install Unsloth from PyPI first. This cell takes a while to run.
!pip install unsloth
# Then swap it for the latest build from the Unsloth GitHub repository.
# The git install is not pinned to a commit, so the resolved version can differ between runs.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

Found existing installation: unsloth 2024.11.6
Uninstalling unsloth-2024.11.6:
  Successfully uninstalled unsloth-2024.11.6
Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-9_kfjs8f/unsloth_eb4d81c967d14d7080f5b56578adc943
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-9_kfjs8f/unsloth_eb4d81c967d14d7080f5b56578adc943
  Resolved https://github.com/unslothai/unsloth.git to commit d8ff860c842095f4729fdd1d5aedf567a9e2c4da
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: unsloth
  Building wheel for unsloth (pyproject.toml) ... [?25l[?25hdone
  Created wheel for unsloth: filename=unsloth-2024.11.6-py3-none-a

In [None]:
from unsloth import FastLanguageModel
import torch
# Shared loading options; the cells below pass them to FastLanguageModel.from_pretrained
# (max_seq_length is also passed to SFTTrainer).
max_seq_length = 4096 # Choose any
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


In [None]:
# Load the base checkpoint and its tokenizer through Unsloth, using the
# sequence length, dtype and 4-bit settings defined in the config cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2024.11.6: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 8.0. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [None]:
# Wrap the loaded model with LoRA adapters on the projection layers listed in
# target_modules. `model` is rebound to the wrapped model, so re-running this
# cell without reloading the base model would wrap it a second time.
model = FastLanguageModel.get_peft_model(
    model,
    r = 128, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = True,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

## Load model and wrap with LoRA adapters

## Competition dataset

In [None]:
# Download and load the competition dataset

from datasets import load_dataset
dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp")
# Display the DatasetDict (splits, column names and row counts)
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'is_correct', 'answer', 'solution'],
        num_rows: 1000000
    })
    test: Dataset({
        features: ['question', 'is_correct', 'answer', 'solution'],
        num_rows: 10000
    })
})

In [None]:
# Prompt template shared by training and inference. It has four positional
# `{}` slots, filled in this order: question, answer, solution, output.
# Training fills the last slot with the `is_correct` label; inference passes "".
prompt = """You are a Quiz Master and you are tasked with finding if an answer to a given maths question is correct or not. Your response should be 'True' if answer is correct, otherwise 'False'. Below is Question and Answer.

### Question:
{}

### Answer:
{}

### Solution:
{}

### Instructions:
1. Carefully read the question and the given answer.
2. Solve the problem independently to verify the result.
3. Compare your solution with the given solution.
4. Conclude with either 'True' if the given answer is correct, or 'False' if it is incorrect.


Output:
{}
"""



EOS_TOKEN = tokenizer.eos_token  # Appended to every training text (see below)
def formatting_prompts_func(examples):
    """Render a batch of dataset rows into complete training texts.

    Args:
        examples: A batch mapping with the columns "question", "answer",
            "solution" and "is_correct", each a sequence of equal length
            (this function is used with ``Dataset.map(..., batched=True)``).

    Returns:
        A dict with a single key "text" holding one formatted prompt per row,
        each terminated with the tokenizer's EOS token.
    """
    rows = zip(
        examples["question"],
        examples["answer"],
        examples["solution"],
        examples["is_correct"],
    )
    # The EOS token must terminate every text, otherwise generation goes on forever.
    rendered = [
        prompt.format(question, given_answer, solution, label) + EOS_TOKEN
        for question, given_answer, solution, label in rows
    ]
    return {"text": rendered}




In [None]:
# Process the training dataset and generate the prompt text for each datapoint.
# This formats every row of the train split; a later cell samples a subset from the result.

train_dataset = dataset['train'].map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/1000000 [00:00<?, ? examples/s]

In [None]:
import random
from sklearn.model_selection import train_test_split

# Set the size of your random subset
subset_size = 80000

# Randomly select indices from the training dataset.
# A dedicated, seeded generator makes the subset reproducible across kernel
# restarts without touching the global `random` state. The seed matches the
# one used for the LoRA adapters and the trainer.
subset_rng = random.Random(3407)
random_indices = subset_rng.sample(range(len(train_dataset)), subset_size)

# Create the random subset
full_subset = train_dataset.select(random_indices)

# Split the subset into training and validation sets (80% / 20%).
# The split is seeded too, so with the seeded sampling above both sets are
# now fully deterministic.
train_indices, val_indices = train_test_split(range(len(full_subset)), test_size=0.2, random_state=42)
train_subset = full_subset.select(train_indices)
val_subset = full_subset.select(val_indices)

print(f"Training set size: {len(train_subset)}")
print(f"Validation set size: {len(val_subset)}")

Training set size: 64000
Validation set size: 16000


In [None]:
# import random

# # Determine the size of your evaluation subset
# eval_size = 10000  # for example

# # Create a list of random indices
# eval_indices = random.sample(range(len(train_dataset)), eval_size)

# # Create the evaluation subset
# eval_subset = train_dataset.select(eval_indices)

In [None]:
# Print a sample training example (the fully formatted prompt text)
train_subset['text'][0]

"You are a Quiz Master and you are tasked with finding if an answer to a given maths question is correct or not. Your response should be 'True' if answer is correct, otherwise 'False'. Below is Question and Answer.\n\n### Question:\nIf $a,b,c>0$, find the smallest possible value of\n\\[\\left\\lfloor{\\frac{a+b}{c}}\\right\\rfloor+\\left\\lfloor{\\frac{b+c}{a}}\\right\\rfloor+\\left\\lfloor{\\frac{c+a}{b}}\\right\\rfloor.\\](Note that $\\lfloor{x}\\rfloor$ denotes the greatest integer less than or equal to $x$.)\n\n### Answer:\n2(2h+1)\n\n### Solution:\nLet's represent $a,b,c$ as $x+h$, $x+k$, and $x+j$, respectively.\nWe can now write out the expression that we want to minimize as follows.\n\\[\\left\\lfloor{\\frac{a+b}{c}}\\right\\rfloor+\\left\\lfloor{\\frac{b+c}{a}}\\right\\rfloor+\\left\\lfloor{\\frac{c+a}{b}}\\right\\rfloor=2\\left\\lfloor{\\frac{x+h+x+k}{x+j}}\\right\\rfloor+2\\left\\lfloor{\\frac{x+j+x+h}{x+k}}\\right\\rfloor+2\\left\\lfloor{\\frac{x+k+x+j}{x+h}}\\right\\rfloor

In [None]:
# import numpy as np
# from sklearn.metrics import accuracy_score, f1_score

# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
#     predictions = np.argmax(predictions, axis=1)
#     return {
#         "accuracy": accuracy_score(labels, predictions),
#         "f1": f1_score(labels, predictions, average="weighted")
#     }

In [109]:
# Release cached, currently unused GPU memory before training starts
torch.cuda.empty_cache()

## SFT

In [110]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Hyperparameters for supervised fine-tuning.
training_args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 2, # 2 x 2 = total batch size of 4 on one GPU
        warmup_ratio = 0.1,
        max_steps = 250, # Takes precedence over num_train_epochs below
        learning_rate = 0.00001047,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adafactor",
        weight_decay = 0.07122,
        max_grad_norm=1.0,
        lr_scheduler_type = "cosine_with_restarts",
        num_train_epochs = 1, # Overridden by max_steps above
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    )

# Trainer over the formatted "text" column of the training subset.
# No eval_dataset is passed, so val_subset is not used during training.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_subset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 4,
    packing = False, # Can make training 5x faster for short sequences.
    args = training_args
)

Map (num_proc=4):   0%|          | 0/64000 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [111]:
# Start training; the returned object carries the run's summary metrics
train_result = trainer.train()

# Print final results (runtime, throughput, mean training loss, epoch fraction)
print("\nTraining completed!")
print(f"Final metrics: {train_result.metrics}")

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 64,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 2
\        /    Total batch size = 4 | Total steps = 250
 "-____-"     Number of trainable parameters = 335,544,320


Step,Training Loss
1,0.1795
2,0.227
3,0.1669
4,0.2147
5,0.1294
6,0.2198
7,0.189
8,0.1203
9,0.2094
10,0.2041



Training completed!
Final metrics: {'train_runtime': 228.3029, 'train_samples_per_second': 4.38, 'train_steps_per_second': 1.095, 'total_flos': 1.991651614285824e+16, 'train_loss': 0.08561318024992942, 'epoch': 0.015625}


In [112]:
# # Run evaluation on the validation dataset
# results = trainer.evaluate(eval_dataset=val_subset)
# print("Validation Results:", results)

## inference

In [113]:
# Sample inference data point
test_dataset = dataset['test']

# Pull the question, the proposed answer and the stored label of test row 5
sample_ques = test_dataset['question'][5]
sample_ans = test_dataset['answer'][5]
answer = test_dataset['is_correct'][5]

print(sample_ques)
print(sample_ans)
print(answer)


For what values of $x$ is $2x^2+8x\le-6$? Express your answer in interval notation.
[-1 - \sqrt{5}, -1 + \sqrt{5} ]
True


In [99]:
# # Running inference on single test
# FastLanguageModel.for_inference(model) # Enable native 2x faster inference
# input_prompt = prompt.format(
#         sample_ques, # ques
#         sample_ans, # given answer
#         "", # output - leave this blank for generation! The LLM will generate whether it is True or False
#     )

# print("Input Promt:\n", input_prompt)
# inputs = tokenizer(
# [
#     input_prompt
# ], return_tensors = "pt").to("cuda")

# input_shape = inputs['input_ids'].shape
# input_token_len = input_shape[1] # 1 because of batch
# outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
# # you can get the whole generated text by uncommenting the below line
# text_generated = tokenizer.batch_decode(outputs, skip_special_tokens=True)
# print("")
# print("Text Generated: ", text_generated)

# response = tokenizer.batch_decode([outputs[0][input_token_len:]], skip_special_tokens=True)
# type(response)
# response

## saving model

In [114]:
# Save the trained model and its tokenizer to the local "lora_model" directory;
# the reload cell below loads from this same path.
model.save_pretrained("lora_model") # Local saving
tokenizer.save_pretrained("lora_model")

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/tokenizer.json')

In [115]:
# Release cached, currently unused GPU memory before reloading the model
torch.cuda.empty_cache()

In [116]:
# Reload the model and tokenizer saved in "lora_model" and switch to inference mode.
# `if True:` acts as a manual on/off switch for this cell; `model` and
# `tokenizer` are rebound to the reloaded objects.
if True:
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference



==((====))==  Unsloth 2024.11.6: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 8.0. CUDA Toolkit = 12.4.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [117]:
# import random
# import torch
# from torch.utils.data import Subset

# subset_size = 500  # Or any number smaller than the full dataset size
# indices = random.sample(range(len(test_dataset)), subset_size)
# test_subset = Subset(test_dataset, indices)
# print(len(test_subset))

In [118]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from tqdm import tqdm
import pandas as pd
import numpy as np

import os
# Debugging aid: asks CUDA to run kernel launches synchronously.
# NOTE(review): the GPU has already been used by earlier cells, so setting this
# here may come too late to take effect, and it slows execution if it does — confirm it is still wanted.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Enable faster inference
FastLanguageModel.for_inference(model)

# Prepare a list to collect results: one {"ID", "is_correct"} record per test row,
# where "is_correct" holds the raw generated text (converted to bool afterwards).
results = []

# Prepare a batch size for processing multiple samples at once
batch_size = 8  # Adjust based on your GPU memory capacity

# Batched generation with a decoder-only model needs LEFT padding: new tokens
# are appended after the last position of every row, so right padding would
# put pad tokens between a short prompt and its generated answer.
tokenizer.padding_side = "left"
if tokenizer.pad_token is None:
    # Padding requires a pad token; fall back to EOS if none is configured.
    tokenizer.pad_token = tokenizer.eos_token

num_examples = len(test_dataset)

# Number of batches, rounded up so a final partial batch is counted
num_batches = (num_examples + batch_size - 1) // batch_size

# Process samples in batches with progress bar
for start_idx in tqdm(range(0, num_examples, batch_size), total=num_batches, desc="Processing Batches"):
    end_idx = min(start_idx + batch_size, num_examples)

    # Slice the rows of this batch once (returns a dict of column -> list)
    # instead of indexing three whole columns and then slicing each of them.
    batch = test_dataset[start_idx:end_idx]

    # Create input prompts for the batch.
    # The template ends with "{}\n", so formatting with an empty output leaves
    # "Output:\n\n". Training texts have the label directly after "Output:\n",
    # so the extra blank line is removed to make the inference prompt an exact
    # prefix of the training format.
    input_prompts = [
        prompt.format(
            ques,    # question
            ans,     # given answer
            sol,     # given solution
            "",      # empty for generation
        ).rstrip("\n") + "\n"
        for ques, ans, sol in zip(batch['question'], batch['answer'], batch['solution'])
    ]

    inputs = tokenizer(
        input_prompts,
        return_tensors="pt",
        padding=True,  # Add padding to make all sequences the same length
        truncation=True,
    ).to("cuda")

    # Padded prompt length; everything after this position is newly generated
    input_token_len = inputs['input_ids'].shape[1]

    # Generate outputs from the model
    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)

    # Decode only the generated continuation of each row
    responses = tokenizer.batch_decode(
        [output[input_token_len:] for output in outputs],
        skip_special_tokens=True,
    )

    # Store results; ID is the row's position in the test split
    for idx, res in enumerate(responses):
        results.append({
            "ID": start_idx + idx,
            "is_correct": res
        })

def _parse_is_correct(response):
    """Convert one raw model response into a boolean verdict.

    Args:
        response: The decoded generation for one test row (normally a str),
            or an already numeric/boolean value.

    Returns:
        True when the first word of a string response is "true" (case
        insensitive, surrounding quotes/punctuation ignored), or when a
        numeric value is greater than zero. False in every other case,
        including empty strings.
    """
    if isinstance(response, str):
        # Only the first word decides: the model may keep generating after
        # its verdict, and an exact match on the whole text would then turn
        # a "True" answer into False.
        words = response.strip().split()
        return bool(words) and words[0].strip("'\".,:;!").lower() == "true"
    if isinstance(response, (int, float, np.number)):
        return float(response) > 0
    return False


# Convert results to a DataFrame. The columns are named explicitly so the
# frame has the expected schema even when `results` is empty.
results_df = pd.DataFrame(results, columns=["ID", "is_correct"])

# Replace the raw generated text with the parsed boolean verdict
results_df['is_correct'] = results_df['is_correct'].apply(_parse_is_correct)

# Save results to a CSV file
results_df.to_csv('inference_results.csv', index=False)

print("Inference results saved to 'inference_results.csv'")

Processing Batches: 100%|██████████| 1250/1250 [13:49<00:00,  1.51it/s]

Inference results saved to 'inference_results.csv'





In [119]:
import pandas as pd

# Read the CSV file written by the inference cell back in as a sanity check
df = pd.read_csv('inference_results.csv')

column_name = 'is_correct'  # Replace with your column name
column_type = df[column_name].dtype

# Report the dtype pandas inferred for the prediction column
print(f"The data type of '{column_name}' column is: {column_type}")

The data type of 'is_correct' column is: bool
