# Math Question Answer Verification Competition

## TEAM LLM training notebook

 ## INSTALLATION

In [None]:
%%capture
# Install Unsloth together with xformers pinned to 0.0.28.post2.
!pip install unsloth "xformers==0.0.28.post2"
# Then swap that release for the current Unsloth sources from GitHub.
# This install is unpinned, so the resulting version depends on when the cell runs.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

In [None]:
from unsloth import FastLanguageModel
import torch
# Loader settings. max_seq_length is passed both to FastLanguageModel.from_pretrained
# and to the SFTTrainer instances; dtype and load_in_4bit go to from_pretrained only.
max_seq_length = 1024 # Choose any
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [None]:
# Load the Llama 3.1 8B base checkpoint and its tokenizer through Unsloth,
# using the sequence length, dtype and 4-bit settings defined in the previous cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2024.11.7: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA A100-SXM4-40GB. Max memory: 39.564 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.0+cu121. CUDA = 8.0. CUDA Toolkit = 12.1.
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post2. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

In [None]:
# Prompt template for supervised fine-tuning. The three `{}` slots are filled, in order,
# with the question, the candidate solution and the expected output (see formatting_prompts_func).
# The text up to "Output:" is the same wording used by `inference_prompt` later in the notebook;
# keep the two in sync if either one is edited.
training_prompt = """You are a great mathematician and you are tasked with finding if an solution to a given maths question is correct or not. Your response should be 'True' if solution is correct, otherwise 'False'. Below is Question and Solution.
Question:
{}
Solution:
{}
Output:
{}"""

EOS_TOKEN = tokenizer.eos_token # Appended to every training example by formatting_prompts_func
def formatting_prompts_func(examples):
    """Render a batch of rows into complete training prompts.

    `examples` is indexed by the column names "question", "solution" and
    "is_correct"; each lookup yields one sequence per batch. The i-th entries
    are substituted into `training_prompt` and `EOS_TOKEN` is appended.

    Returns a dict with a single "text" key holding the rendered strings.
    """
    rows = zip(examples["question"], examples["solution"], examples["is_correct"])
    # The trailing EOS token tells the model where the answer ends; without it
    # generation would keep going after the True/False verdict.
    rendered = [
        training_prompt.format(problem, attempt, label) + EOS_TOKEN
        for problem, attempt, label in rows
    ]
    return {"text": rendered}

## Competition dataset

In [None]:
# Step 1: Load the dataset
from datasets import load_dataset
dataset = load_dataset("ad6398/nyu-dl-teach-maths-comp")
# print and see dataset
dataset

README.md:   0%|          | 0.00/2.09k [00:00<?, ?B/s]

train-00000-of-00002.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

train-00001-of-00002.parquet:   0%|          | 0.00/195M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/3.65M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/10000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['question', 'is_correct', 'answer', 'solution'],
        num_rows: 1000000
    })
    test: Dataset({
        features: ['question', 'is_correct', 'answer', 'solution'],
        num_rows: 10000
    })
})

In [None]:
from datasets import DatasetDict

# Sizes of the hold-out and fine-tuning subsets carved out of the full train split
VALIDATION_SIZE = 1_000
TRAIN_SIZE = 40_000

# Access the training split
train_dataset = dataset['train']

# Verify the type of the original train split
print(f"Original train_dataset type: {type(train_dataset)}")  # Should be datasets.Dataset

# This cell replaces dataset['train'] with a 40,000-row subset further down, so running it a
# second time would slice the already-reduced split. Fail early with an actionable message.
if len(train_dataset) < VALIDATION_SIZE + TRAIN_SIZE:
    raise ValueError(
        f"dataset['train'] has only {len(train_dataset)} rows; "
        "re-run the dataset loading cell before re-running this split."
    )

# Step 2: Shuffle the training dataset
# Setting a seed for reproducibility
shuffled_train = train_dataset.shuffle(seed=42)

# Step 3: Split into validation, training and the rest (three disjoint, contiguous slices)
validation_set = shuffled_train.select(range(VALIDATION_SIZE))
new_train_set = shuffled_train.select(range(VALIDATION_SIZE, VALIDATION_SIZE + TRAIN_SIZE))

# Step 4: Save the remaining samples for convenience
remaining_samples = shuffled_train.select(range(VALIDATION_SIZE + TRAIN_SIZE, len(shuffled_train)))

# Step 5: Verify the splits
print(f"Validation set size: {len(validation_set)}")          # Should be 1,000
print(f"New training set size: {len(new_train_set)}")        # Should be 40,000
print(f"Remaining samples size: {len(remaining_samples)}")  # Should be 959,000
assert len(validation_set) + len(new_train_set) + len(remaining_samples) == len(train_dataset)

# Step 6: Verify that the data types are consistent
print(f"Type of new_train_set: {type(new_train_set)}")
print(f"Type of original train_dataset: {type(train_dataset)}")
print(f"Are types equal? {type(new_train_set) == type(train_dataset)}")  # Should be True

# Step 7: Build a new DatasetDict with the new splits.
# The loaded object is not mutated (no `del`), and the result stays a DatasetDict:
# DatasetDict.copy() would hand back a plain dict. Splits other than 'train'
# (e.g. 'test') are carried over unchanged, followed by the three new splits.
splits = {name: split for name, split in dataset.items() if name != 'train'}
splits['train'] = new_train_set
splits['validation'] = validation_set
splits['remaining'] = remaining_samples
dataset = DatasetDict(splits)

# Verify the updated DatasetDict
print(dataset)

# Step 8: Verify data type consistency in the updated DatasetDict
print(f"Type of dataset['train']: {type(dataset['train'])}")  # Should be datasets.Dataset
print(f"Type of dataset['validation']: {type(dataset['validation'])}")  # Should be datasets.Dataset
print(f"Type of dataset['remaining']: {type(dataset['remaining'])}")

Original train_dataset type: <class 'datasets.arrow_dataset.Dataset'>
Validation set size: 1000
New training set size: 40000
Remaining samples size: 959000
Type of new_train_set: <class 'datasets.arrow_dataset.Dataset'>
Type of original train_dataset: <class 'datasets.arrow_dataset.Dataset'>
Are types equal? True
{'test': Dataset({
    features: ['question', 'is_correct', 'answer', 'solution'],
    num_rows: 10000
}), 'train': Dataset({
    features: ['question', 'is_correct', 'answer', 'solution'],
    num_rows: 40000
}), 'validation': Dataset({
    features: ['question', 'is_correct', 'answer', 'solution'],
    num_rows: 1000
}), 'remaining': Dataset({
    features: ['question', 'is_correct', 'answer', 'solution'],
    num_rows: 959000
})}
Type of dataset['train']: <class 'datasets.arrow_dataset.Dataset'>
Type of dataset['validation']: <class 'datasets.arrow_dataset.Dataset'>
Type of dataset['remaining']: <class 'datasets.arrow_dataset.Dataset'>


In [None]:
# Render every training row through formatting_prompts_func (called on batches of rows);
# the function returns the prompts under the "text" key.
train_dataset = dataset['train'].map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

In [None]:
# Print a sample training example
train_dataset['text'][0]

"You are a great mathematician and you are tasked with finding if an solution to a given maths question is correct or not. Your response should be 'True' if solution is correct, otherwise 'False'. Below is Question and Solution.\nQuestion:\nMr. Willson worked on making his furniture for 3/4 an hour on Monday. On Tuesday, he worked for half an hour. Then he worked for 2/3 an hour on Wednesday and 5/6 of an hour on Thursday. If he worked for 75 minutes on Friday, how many hours in all did he work from Monday to Friday?\nSolution:\nLet's solve this problem using Python code.\n<llm-code>\n# hour to minute conversion\nH2M = 60\n\n# work time in minutes\nwork_time = 0\n\n# working on Monday\nwork_time += 3 / 4 * H2M\n\n# working on Tuesday\nwork_time += 1 / 2 * H2M\n\n# working on Wednesday\nwork_time += 2 / 3 * H2M\n\n# working on Thursday\nwork_time += 5 / 6 * H2M\n\n# working on Friday\nwork_time += 75\n\n# minutes to hours conversion\nH2M = 60\nwork_time / H2M\n</llm-code>\n<llm-code-out

## Load model and wrap with LoRA adapters

In [None]:
# Wrap the loaded model with LoRA adapters on the attention and MLP projection layers.
# `model` is rebound to the adapter-wrapped model that the trainers below fine-tune.
model = FastLanguageModel.get_peft_model(
    model,
    r = 32, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2024.11.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


## SFT

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Supervised fine-tuning on the "text" column of the 40,000-row training subset.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        # Effective batch size = 2 (per device) x 6 (accumulation steps) = 12
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 6,
        warmup_steps = 5,
        num_train_epochs = 1, # Set this for 1 full training run.
        # max_steps = 1000,
        save_steps=5,               # Save a checkpoint every 5 steps
        save_total_limit=3,         # Keep at most 3 checkpoints in output_dir
        learning_rate = 2e-4,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Map (num_proc=2):   0%|          | 0/40000 [00:00<?, ? examples/s]

## TRAINING

In [None]:
# Number of formatted training examples (one "text" entry per row)
len(train_dataset)

40000

In [None]:
# Run the fine-tuning job configured in the trainer above; the return value is kept in `trainer_stats`
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 40,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 6
\        /    Total batch size = 12 | Total steps = 3,333
 "-____-"     Number of trainable parameters = 83,886,080


Step,Training Loss
1,1.5592
2,1.7622
3,1.5116
4,1.4883
5,1.446
6,1.1912
7,1.0703
8,0.9298
9,0.7874
10,0.9591


## SAVING MODEL

In [None]:
import shutil
import os
from IPython.display import FileLink


model_name="lora_model_62"
# Save the model and tokenizer into a local directory named after `model_name`
model.save_pretrained(model_name)
tokenizer.save_pretrained(model_name)

# Compress the saved model directory into <model_name>.zip; make_archive returns the archive
# path, which the notebook displays as the cell result
shutil.make_archive(model_name, 'zip', model_name)

# Generate a download link for the ZIP file
# display(FileLink(f'{model_name}.zip'))


'/content/lora_model_62.zip'

## INFERENCE

## INFERENCE PROMPT

In [None]:
# Prompt used at inference time. It carries the same instructions as the training prompt but
# stops right after "Output:", so the model's continuation is the True/False verdict.
inference_prompt = """You are a great mathematician and you are tasked with finding if an solution to a given maths question is correct or not. Your response should be 'True' if solution is correct, otherwise 'False'. Below is Question and Solution.
Question:
{}
Solution:
{}
Output:"""

def formatting_prompts_func_inference(examples):
    """Render a batch of rows into inference prompts.

    Only the "question" and "solution" columns are read. The label column is
    deliberately not touched, so the function also works on unlabelled data.
    No EOS token is appended: the prompt must end at "Output:" for the model
    to continue it.

    Args:
        examples: mapping from column name to a sequence of values (the batch
            layout passed by `Dataset.map(..., batched=True)`).

    Returns:
        A dict with a single "text" key holding one prompt per input row.
    """
    texts = [
        inference_prompt.format(question, solution)
        for question, solution in zip(examples["question"], examples["solution"])
    ]
    return {"text": texts}

## VALIDATION DATASET


In [None]:
# Render every validation row into an inference prompt (no answer filled in) under the "text" key
validation_dataset = dataset['validation'].map(formatting_prompts_func_inference, batched = True,)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
# Print a sample validation prompt
validation_dataset['text'][0]

"You are a great mathematician and you are tasked with finding if an solution to a given maths question is correct or not. Your response should be 'True' if solution is correct, otherwise 'False'. Below is Question and Solution.\nQuestion:\nA line is parameterized by\n\\[\\begin{pmatrix} x \\\\ y \\end{pmatrix} = \\begin{pmatrix} 2 \\\\ 3 \\end{pmatrix} + t \\begin{pmatrix} -1 \\\\ 5 \\end{pmatrix}.\\]A second line is parameterized by\n\\[\\begin{pmatrix} x \\\\ y \\end{pmatrix} = \\begin{pmatrix} 0 \\\\ 7 \\end{pmatrix} + u \\begin{pmatrix} -1 \\\\ 4 \\end{pmatrix}.\\]Find the point where the lines intersect.\nSolution:\nFirst, we need to solve the system of equations\n\\[\n\\begin{aligned}\n2 - t &= s\\\\\n3 + 5t &= 7 + 4s\n\\end{aligned}\n\\]\nby eliminating s.\nWe'll use sympy.\n<llm-code>\nfrom sympy import symbols, solve\n\n# define the variables\nt, s = symbols('t s')\n\n# define the equations\nequation1 = 2 - t - s\nequation2 = 3 + 5*t - (7 + 4*s)\n\n# solve the equations\nsolu

In [None]:
# Running inference on single validation sample

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

example_no=0

input_prompt=validation_dataset['text'][example_no]

print("Input Promt:\n", input_prompt)
# Tokenize a one-element batch and move the tensors to the GPU
inputs = tokenizer(
[
    input_prompt
], return_tensors = "pt").to("cuda")

input_shape = inputs['input_ids'].shape
input_token_len = input_shape[1] # prompt length in tokens (index 0 is the batch dimension)
outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
# you can get the whole generated text (prompt included) by uncommenting the below line
# text_generated = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Decode only the tokens that come after the prompt
response = tokenizer.batch_decode([outputs[0][input_token_len:]], skip_special_tokens=True)
response

Input Promt:
 You are a great mathematician and you are tasked with finding if an solution to a given maths question is correct or not. Your response should be 'True' if solution is correct, otherwise 'False'. Below is Question and Solution.
Question:
A line is parameterized by
\[\begin{pmatrix} x \\ y \end{pmatrix} = \begin{pmatrix} 2 \\ 3 \end{pmatrix} + t \begin{pmatrix} -1 \\ 5 \end{pmatrix}.\]A second line is parameterized by
\[\begin{pmatrix} x \\ y \end{pmatrix} = \begin{pmatrix} 0 \\ 7 \end{pmatrix} + u \begin{pmatrix} -1 \\ 4 \end{pmatrix}.\]Find the point where the lines intersect.
Solution:
First, we need to solve the system of equations
\[
\begin{aligned}
2 - t &= s\\
3 + 5t &= 7 + 4s
\end{aligned}
\]
by eliminating s.
We'll use sympy.
<llm-code>
from sympy import symbols, solve

# define the variables
t, s = symbols('t s')

# define the equations
equation1 = 2 - t - s
equation2 = 3 + 5*t - (7 + 4*s)

# solve the equations
solutions = solve([equation1, equation2], (t, s))
p

[' False']

In [None]:
## Running inference in full Validation set

# Switch the model to Unsloth's inference mode once, before the loop
FastLanguageModel.for_inference(model)

final_response = []
# Read the prompt column a single time instead of re-indexing
# validation_dataset['text'] on every iteration.
for input_prompt in validation_dataset['text']:
    inputs = tokenizer([input_prompt], return_tensors = "pt").to("cuda")

    # Prompt length in tokens; used below to keep only the tokens after the prompt
    input_token_len = inputs['input_ids'].shape[1]
    outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
    response = tokenizer.batch_decode([outputs[0][input_token_len:]], skip_special_tokens=True)
    final_response.append(response[0])

## VALIDATION ACCURACY

In [None]:
import re


def extract_last_true_false(s):
    """Return the verdict given by the last 'True'/'False' word in `s`.

    The search is case-insensitive and matches whole words only, so
    "Truely" or "falsehood" do not count.

    Args:
        s: generated response text.

    Returns:
        True if the last matching word is "true" (in any casing); False if it
        is "false" or if the text contains neither word.
    """
    matches = re.findall(r'\b(True|False)\b', s, flags=re.IGNORECASE)
    if not matches:
        # No verdict found in the response: fall back to False
        return False
    return matches[-1].lower() == 'true'

In [None]:
import re

In [None]:
# Parse each generated validation response into a boolean prediction
validation_prediction_list = [extract_last_true_false(s) for s in final_response]

In [None]:
# Ground-truth labels of the validation split, in the same row order as the prompts
validation_truth_list=dataset['validation']['is_correct']

In [None]:
# The comparison below is positional, so both lists must cover the same rows
assert len(validation_prediction_list) == len(validation_truth_list), "Lists must have the same length."

# Count the positions where the parsed prediction equals the ground-truth label
correct_predictions = len([
    1
    for pred, truth in zip(validation_prediction_list, validation_truth_list)
    if pred == truth
])

# Fraction of validation rows predicted correctly
accuracy = correct_predictions / len(validation_truth_list)

print(f"Validation Accuracy: {accuracy:.2%}")

Validation Accuracy: 80.40%


In [None]:
import pandas as pd

In [None]:
# One row per validation example: a 1-based row number next to the parsed boolean prediction
row_numbers = range(1, len(validation_prediction_list) + 1)
df = pd.DataFrame({'row_no': row_numbers, 'prediction': validation_prediction_list})

# Write the table to disk, leaving out the pandas index column
csv_file_name = 'validation_predictions.csv'
df.to_csv(csv_file_name, index=False)

print(f"Validation predictions saved to '{csv_file_name}'.")

Validation predictions saved to 'validation_predictions.csv'.


## INFERENCE ON TEST DATASET

In [None]:
# Render every test row into an inference prompt (no answer filled in) under the "text" key
test_dataset=dataset['test'].map(formatting_prompts_func_inference, batched = True,)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
# Print a sample test prompt
test_dataset['text'][0]

"You are a great mathematician and you are tasked with finding if an solution to a given maths question is correct or not. Your response should be 'True' if solution is correct, otherwise 'False'. Below is Question and Solution.\nQuestion:\nThe Parker family needs to leave the house by 5 pm for a dinner party. Mrs. Parker was waiting to get into the bathroom at 2:30 pm. Her oldest daughter used the bathroom for 45 minutes and her youngest daughter used the bathroom for another 30 minutes. Then her husband used it for 20 minutes. How much time will Mrs. Parker have to use the bathroom to leave on time?\nSolution:\nLet's solve this problem using Python code.\n<llm-code>\nminutes_per_hour = 60\nminutes_left_before_5 = 5 * minutes_per_hour\ntotal_time_spent_by_family = 45 + 30 + 20\nminutes_before_5_after_family = minutes_left_before_5 - total_time_spent_by_family\nminutes_before_5_after_family\n</llm-code>\n<llm-code-output>\n205\n</llm-code-output>\nThus Mrs. Parker will have \\boxed{205

In [None]:
# Running inference on single test sample

FastLanguageModel.for_inference(model) # Enable native 2x faster inference

example_no=0

input_prompt=test_dataset['text'][example_no]

print("Input Promt:\n", input_prompt)
# Tokenize a one-element batch and move the tensors to the GPU
inputs = tokenizer(
[
    input_prompt
], return_tensors = "pt").to("cuda")

input_shape = inputs['input_ids'].shape
input_token_len = input_shape[1] # prompt length in tokens (index 0 is the batch dimension)
outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
# you can get the whole generated text (prompt included) by uncommenting the below line
# text_generated = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# Decode only the tokens that come after the prompt
response = tokenizer.batch_decode([outputs[0][input_token_len:]], skip_special_tokens=True)
response

Input Promt:
 You are a great mathematician and you are tasked with finding if an solution to a given maths question is correct or not. Your response should be 'True' if solution is correct, otherwise 'False'. Below is Question and Solution.
Question:
The Parker family needs to leave the house by 5 pm for a dinner party. Mrs. Parker was waiting to get into the bathroom at 2:30 pm. Her oldest daughter used the bathroom for 45 minutes and her youngest daughter used the bathroom for another 30 minutes. Then her husband used it for 20 minutes. How much time will Mrs. Parker have to use the bathroom to leave on time?
Solution:
Let's solve this problem using Python code.
<llm-code>
minutes_per_hour = 60
minutes_left_before_5 = 5 * minutes_per_hour
total_time_spent_by_family = 45 + 30 + 20
minutes_before_5_after_family = minutes_left_before_5 - total_time_spent_by_family
minutes_before_5_after_family
</llm-code>
<llm-code-output>
205
</llm-code-output>
Thus Mrs. Parker will have \boxed{205} m

[' False']

In [None]:
## Running inference in full Test set

# Switch the model to Unsloth's inference mode once, before the loop
FastLanguageModel.for_inference(model)

test_final_response = []
# Read the prompt column a single time instead of re-indexing
# test_dataset['text'] on every one of the 10,000 iterations.
for input_prompt in test_dataset['text']:
    inputs = tokenizer([input_prompt], return_tensors = "pt").to("cuda")

    # Prompt length in tokens; used below to keep only the tokens after the prompt
    input_token_len = inputs['input_ids'].shape[1]
    outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
    response = tokenizer.batch_decode([outputs[0][input_token_len:]], skip_special_tokens=True)
    test_final_response.append(response[0])

In [None]:
# Parse each generated test response into a boolean prediction
test_prediction_list = [extract_last_true_false(s) for s in test_final_response]

In [None]:
# Prediction table: 0-based ID per test row next to the parsed boolean verdict
row_ids = range(0, len(test_prediction_list))
df = pd.DataFrame({'ID': row_ids, 'is_correct': test_prediction_list})

# The file name embeds the name of the most recently saved model
csv_file_name = f'test_predictions_{model_name}_.csv'

# Write the table to disk, leaving out the pandas index column
df.to_csv(csv_file_name, index=False)

print(f"Test predictions saved to '{csv_file_name}'.")

Test predictions saved to 'test_predictions_lora_model_62_.csv'.


In [None]:
from collections import Counter

# Printing every prediction floods the notebook output; show a short preview
# and the True/False counts instead.
print(test_prediction_list[:20])
print(Counter(test_prediction_list))

[False, False, True, True, True, False, False, False, False, False, False, False, False, False, True, True, False, True, True, False, False, True, False, True, True, True, False, False, False, True, True, False, False, False, False, False, False, False, True, False, True, True, False, False, True, False, True, False, False, True, False, True, False, False, False, False, False, False, True, False, False, True, True, False, True, True, False, False, True, False, False, True, False, True, False, False, False, False, False, False, False, True, True, False, False, False, False, False, False, False, True, True, True, True, True, True, True, True, False, True, True, True, False, False, False, True, True, False, False, True, True, True, True, True, True, False, False, True, False, False, True, False, False, False, False, True, False, False, False, False, True, True, False, True, False, True, True, True, False, False, True, False, True, True, False, False, False, False, False, False, True, Fals

In [None]:
# Raw-response table: 0-based ID per test row next to the decoded model output
row_ids = range(0, len(test_final_response))
df = pd.DataFrame({'ID': row_ids, 'Responses': test_final_response})

# The file name embeds the name of the most recently saved model
csv_file_name = f'test_responses_{model_name}_.csv'

# Write the table to disk, leaving out the pandas index column
df.to_csv(csv_file_name, index=False)

print(f"Test responses saved to '{csv_file_name}'.")

Test responses saved to 'test_responses_lora_model_62_.csv'.


## Retraining Model

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Second training pass: a new trainer around the same `model` object and the same
# 40,000-row `train_dataset` used for the first run.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        # Effective batch size = 2 (per device) x 6 (accumulation steps) = 12
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 6,
        warmup_steps = 5,
        num_train_epochs = 1, # Overridden by max_steps below (the trainer warns about this)
        max_steps = 1000,     # Training stops after 1,000 optimizer steps
        save_steps=5,               # Save a checkpoint every 5 steps
        save_total_limit=3,         # Keep at most 3 checkpoints in output_dir
        learning_rate = 2e-4,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

Map (num_proc=2):   0%|          | 0/40000 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Number of formatted training examples (one "text" entry per row)
len(train_dataset)

40000

In [None]:
# Run the retraining job configured in the trainer above; `trainer_stats` is overwritten with the new result
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 40,000 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 6
\        /    Total batch size = 12 | Total steps = 1,000
 "-____-"     Number of trainable parameters = 83,886,080


Step,Training Loss
1,1.3092
2,1.6202
3,0.8567
4,0.7072
5,0.6212
6,0.6582
7,0.5175
8,0.5405
9,0.4101
10,0.5773


## SAVING MODEL

In [None]:
import shutil
import os
from IPython.display import FileLink


model_name="lora_model_65"
# Save the retrained model and tokenizer into a local directory named after `model_name`
model.save_pretrained(model_name)
tokenizer.save_pretrained(model_name)

# Compress the saved model directory into <model_name>.zip; make_archive returns the archive
# path, which the notebook displays as the cell result
shutil.make_archive(model_name, 'zip', model_name)

# Generate a download link for the ZIP file
# display(FileLink(f'{model_name}.zip'))

'/content/lora_model_65.zip'

## INFERENCE

In [None]:
## Running inference in full Validation set

# Training put the model back into training mode; switch to Unsloth's
# inference mode once, before the loop.
FastLanguageModel.for_inference(model)

final_response = []
# Read the prompt column a single time instead of re-indexing
# validation_dataset['text'] on every iteration.
for input_prompt in validation_dataset['text']:
    inputs = tokenizer([input_prompt], return_tensors = "pt").to("cuda")

    # Prompt length in tokens; used below to keep only the tokens after the prompt
    input_token_len = inputs['input_ids'].shape[1]
    outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
    response = tokenizer.batch_decode([outputs[0][input_token_len:]], skip_special_tokens=True)
    final_response.append(response[0])

In [None]:
# Parse each generated validation response into a boolean prediction
validation_prediction_list = [extract_last_true_false(s) for s in final_response]

In [None]:
# Ground-truth labels of the validation split, in the same row order as the prompts
validation_truth_list=dataset['validation']['is_correct']

In [None]:
# The comparison below is positional, so both lists must cover the same rows
assert len(validation_prediction_list) == len(validation_truth_list), "Lists must have the same length."

# Count the positions where the parsed prediction equals the ground-truth label
correct_predictions = len([
    1
    for pred, truth in zip(validation_prediction_list, validation_truth_list)
    if pred == truth
])

# Fraction of validation rows predicted correctly after retraining
accuracy = correct_predictions / len(validation_truth_list)

print(f"Retraining Validation Accuracy: {accuracy:.2%}")

Retraining Validation Accuracy: 79.80%


In [None]:
# One row per validation example: a 1-based row number next to the parsed boolean prediction
row_numbers = range(1, len(validation_prediction_list) + 1)
df = pd.DataFrame({'row_no': row_numbers, 'prediction': validation_prediction_list})

# Write the table to disk, leaving out the pandas index column
csv_file_name = 'Retraining_validation_predictions.csv'
df.to_csv(csv_file_name, index=False)

print(f"Validation predictions saved to '{csv_file_name}'.")

## INFERENCE ON TEST DATASET

In [None]:
## Running inference in full Test set

# Switch the model to Unsloth's inference mode once, before the loop
FastLanguageModel.for_inference(model)

test_final_response = []
# Read the prompt column a single time instead of re-indexing
# test_dataset['text'] on every one of the 10,000 iterations.
for input_prompt in test_dataset['text']:
    inputs = tokenizer([input_prompt], return_tensors = "pt").to("cuda")

    # Prompt length in tokens; used below to keep only the tokens after the prompt
    input_token_len = inputs['input_ids'].shape[1]
    outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
    response = tokenizer.batch_decode([outputs[0][input_token_len:]], skip_special_tokens=True)
    test_final_response.append(response[0])

In [None]:
# Parse each generated test response into a boolean prediction
test_prediction_list = [extract_last_true_false(s) for s in test_final_response]

In [None]:
# Raw-response table: 0-based ID per test row next to the decoded model output
row_ids = range(0, len(test_final_response))
df = pd.DataFrame({'ID': row_ids, 'Responses': test_final_response})

# The file name embeds the name of the most recently saved model
csv_file_name = f'Retrain_test_responses_{model_name}_.csv'

# Write the table to disk, leaving out the pandas index column
df.to_csv(csv_file_name, index=False)

print(f"Test responses saved to '{csv_file_name}'.")

## Retraining on additional dataset

In [None]:
from datasets import DatasetDict

# Ensure that 'remaining' split exists in the dataset
if 'remaining' not in dataset:
    raise ValueError("The 'remaining' split does not exist in the dataset.")

# Step 9: Create an additional dataset of 10,000 samples from 'remaining'
# 'remaining' holds only rows that were not selected for training or validation,
# so these samples are new to the model. Shuffle (fixed seed) before selecting.
additional_dataset = dataset['remaining'].shuffle(seed=42).select(range(10000))

# Step 10: Verify the size of the additional dataset
print(f"Additional dataset size: {len(additional_dataset)}")  # Should be 10,000

# Step 11: Verify the type of the additional dataset
print(f"Type of additional_dataset: {type(additional_dataset)}")
# Should output: <class 'datasets.arrow_dataset.Dataset'>

# (Optional) Step 12: Add the additional dataset to the split mapping
# If you wish to keep track of this new split alongside the others
dataset = dataset.copy()  # Copy the mapping first so the previous mapping object does not gain the new key
dataset['additional'] = additional_dataset

# Verify the updated split mapping
print(dataset)

# (Optional) Step 13: Final type verification of the new split
print(f"Type of dataset['additional']: {type(dataset['additional'])}")
# Should output: <class 'datasets.arrow_dataset.Dataset'>


Additional dataset size: 10000
Type of additional_dataset: <class 'datasets.arrow_dataset.Dataset'>
{'test': Dataset({
    features: ['question', 'is_correct', 'answer', 'solution'],
    num_rows: 10000
}), 'train': Dataset({
    features: ['question', 'is_correct', 'answer', 'solution'],
    num_rows: 40000
}), 'validation': Dataset({
    features: ['question', 'is_correct', 'answer', 'solution'],
    num_rows: 1000
}), 'remaining': Dataset({
    features: ['question', 'is_correct', 'answer', 'solution'],
    num_rows: 959000
}), 'additional': Dataset({
    features: ['question', 'is_correct', 'answer', 'solution'],
    num_rows: 10000
})}
Type of dataset['additional']: <class 'datasets.arrow_dataset.Dataset'>


In [None]:
# Render the 10,000 additional rows into full training prompts under the "text" key
train_add_dataset = dataset['additional'].map(formatting_prompts_func, batched = True,)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Continue fine-tuning the same `model` on the 10,000 additional samples.
# The trainer must receive `train_add_dataset`; passing `train_dataset` would
# simply train on the original 40,000-row subset again.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_add_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        # Effective batch size = 2 (per device) x 6 (accumulation steps) = 12
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 6,
        warmup_steps = 5,
        num_train_epochs = 1, # Overridden by max_steps below
        # 1,000 steps x 12 examples = 12,000 examples, i.e. slightly more than
        # one pass over the 10,000 additional rows.
        max_steps = 1000,
        save_steps=5,               # Save a checkpoint every 5 steps
        save_total_limit=3,         # Keep at most 3 checkpoints in output_dir
        learning_rate = 2e-4,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none", # Use this for WandB etc
    ),
)

In [None]:
# Show the first formatted prompt of the original 40,000-row training subset.
# NOTE(review): this section is about the additional split, so
# `train_add_dataset['text'][0]` may have been intended here; confirm.
train_dataset['text'][0]