In [2]:
!pip install -q --upgrade transformers datasets peft bitsandbytes trl
!pip install -q accelerate

In [3]:
from accelerate.utils import write_basic_config
write_basic_config()

PosixPath('/root/.cache/huggingface/accelerate/default_config.yaml')

In [4]:
import os
import torch
import numpy as np
import pandas as pd
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    TrainingArguments,
    AutoConfig
)
from transformers import EarlyStoppingCallback  # Import EarlyStoppingCallback
from datasets import load_dataset, concatenate_datasets
from huggingface_hub import login
from peft import LoraConfig, get_peft_model,PeftModel
from trl import SFTTrainer
import transformers
from torch.utils.data import DataLoader
from tqdm import tqdm

In [5]:
seed = 7
torch.manual_seed(seed)
np.random.seed(seed)

In [6]:
# token = "your_hugging_face_token"
login(token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [7]:
# save_dir = ""
model_id = "meta-llama/Llama-3.2-1B-Instruct"

In [8]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.float16
)

In [9]:
lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.2,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj"
    ],
    bias="none",
    task_type="CAUSAL_LM",
)

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [11]:
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"

special_tokens_dict = {}
if tokenizer.pad_token is None:
    special_tokens_dict['pad_token'] = DEFAULT_PAD_TOKEN
if tokenizer.eos_token is None:
    special_tokens_dict['eos_token'] = DEFAULT_EOS_TOKEN
if tokenizer.bos_token is None:
    special_tokens_dict['bos_token'] = DEFAULT_BOS_TOKEN

if special_tokens_dict:
    tokenizer.add_special_tokens(special_tokens_dict)

In [12]:
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    use_auth_token=token
)



config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [13]:
model.resize_token_embeddings(len(tokenizer))
model = get_peft_model(model, lora_config)

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [14]:
dataset = load_dataset("Maxwell-Jia/MATH", trust_remote_code=True)

MATH.py:   0%|          | 0.00/4.10k [00:00<?, ?B/s]

data/algebra_train.jsonl:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

(…)ata/counting_and_probability_train.jsonl:   0%|          | 0.00/707k [00:00<?, ?B/s]

data/geometry_train.jsonl:   0%|          | 0.00/1.15M [00:00<?, ?B/s]

data/intermediate_algebra_train.jsonl:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

data/number_theory_train.jsonl:   0%|          | 0.00/639k [00:00<?, ?B/s]

data/prealgebra_train.jsonl:   0%|          | 0.00/778k [00:00<?, ?B/s]

data/precalculus_train.jsonl:   0%|          | 0.00/903k [00:00<?, ?B/s]

data/algebra_test.jsonl:   0%|          | 0.00/706k [00:00<?, ?B/s]

data/counting_and_probability_test.jsonl:   0%|          | 0.00/377k [00:00<?, ?B/s]

data/geometry_test.jsonl:   0%|          | 0.00/562k [00:00<?, ?B/s]

data/intermediate_algebra_test.jsonl:   0%|          | 0.00/860k [00:00<?, ?B/s]

data/number_theory_test.jsonl:   0%|          | 0.00/376k [00:00<?, ?B/s]

data/prealgebra_test.jsonl:   0%|          | 0.00/553k [00:00<?, ?B/s]

data/precalculus_test.jsonl:   0%|          | 0.00/614k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [15]:
def preprocess_function(examples):
    inputs = [
        f"<|begin_of_text|><|start_header_id|>system <|end_header_id|>"
        "You are an expert math assistant<|eot_id|><|start_header_id|>user <|end_header_id|>"
        f"Solve the following math problem: {problem}\n"
        "Show all intermediate steps and please mandatorily include the final answer in LaTeX format in a box like \\boxed{{}}."
        "<|eot_id|><|start_header_id|> assistant <|end_header_id|>"
        for problem in examples['problem']
    ]
    targets = [
        f"{solution}{tokenizer.eos_token}"
        for solution in examples['solution']
    ]
    full_texts = [inp + tgt for inp, tgt in zip(inputs, targets)]
    model_inputs = tokenizer(
        full_texts,
        max_length=512,
        truncation=True,
        padding="longest",
        return_tensors="pt"
    )
    labels = model_inputs["input_ids"].clone()

    for i in range(len(labels)):
        input_ids = tokenizer(inputs[i], add_special_tokens=False).input_ids
        input_len = len(input_ids)
        labels[i][:input_len] = -100  # Mask the input tokens
    model_inputs["labels"] = labels
    return model_inputs

In [16]:
tokenized_datasets = dataset.map(preprocess_function, batched=True, remove_columns=dataset['train'].column_names)

Map:   0%|          | 0/7500 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

In [17]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,  # For causal language modeling
    pad_to_multiple_of=8
)

In [18]:
training_args = TrainingArguments(
    output_dir="./fine_tuned_model",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    warmup_ratio=0.1,
    num_train_epochs=5,
    learning_rate=5e-5,
    fp16=True,
    logging_steps=100,
    optim="paged_adamw_8bit",
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,
    report_to="none",
    load_best_model_at_end=True,  # Load the best model at the end
    metric_for_best_model="eval_loss",  # Use eval_loss to select the best model
    greater_is_better=False,
    run_name="llama_finetuning_lora32",
    ddp_find_unused_parameters=False,
)

early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=2,  # Stop training if no improvement after 2 evaluations
)



In [19]:
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    peft_config=lora_config,
    args=training_args,
    data_collator=data_collator,
    callbacks=[early_stopping_callback],
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [20]:
trainer.train()

Step,Training Loss,Validation Loss
500,0.8234,0.796671
1000,0.7327,0.764295
1500,0.7496,0.7422
2000,0.6609,0.734427
2500,0.6692,0.725742
3000,0.6332,0.718953
3500,0.6673,0.711883
4000,0.5757,0.731826
4500,0.5685,0.726875




TrainOutput(global_step=4500, training_loss=0.7126264809502496, metrics={'train_runtime': 9713.7281, 'train_samples_per_second': 3.861, 'train_steps_per_second': 0.965, 'total_flos': 5.5057701666816e+16, 'train_loss': 0.7126264809502496, 'epoch': 2.4})

In [21]:
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

# Push the model to the Hugging Face Hub
# model.push_to_hub(, use_auth_token=token)
# tokenizer.push_to_hub(, use_auth_token=token)



adapter_model.safetensors:   0%|          | 0.00/1.14G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/GT1999/e2_mwp_sft_llama3.21b/commit/e97283b5fb82b28335ca81d717b16dd4bcebebda', commit_message='Upload tokenizer', commit_description='', oid='e97283b5fb82b28335ca81d717b16dd4bcebebda', pr_url=None, repo_url=RepoUrl('https://huggingface.co/GT1999/e2_mwp_sft_llama3.21b', endpoint='https://huggingface.co', repo_type='model', repo_id='GT1999/e2_mwp_sft_llama3.21b'), pr_revision=None, pr_num=None)

In [22]:
print(trainer.state.log_history)

[{'loss': 1.8476, 'grad_norm': 2.0876357555389404, 'learning_rate': 5.330490405117271e-06, 'epoch': 0.05333333333333334, 'step': 100}, {'loss': 1.0337, 'grad_norm': 1.036150574684143, 'learning_rate': 1.0660980810234541e-05, 'epoch': 0.10666666666666667, 'step': 200}, {'loss': 0.9164, 'grad_norm': 1.1199861764907837, 'learning_rate': 1.5991471215351813e-05, 'epoch': 0.16, 'step': 300}, {'loss': 0.8365, 'grad_norm': 1.557165265083313, 'learning_rate': 2.1321961620469083e-05, 'epoch': 0.21333333333333335, 'step': 400}, {'loss': 0.8234, 'grad_norm': 1.228502869606018, 'learning_rate': 2.6652452025586356e-05, 'epoch': 0.26666666666666666, 'step': 500}, {'eval_loss': 0.79667067527771, 'eval_runtime': 576.9453, 'eval_samples_per_second': 8.666, 'eval_steps_per_second': 1.083, 'epoch': 0.26666666666666666, 'step': 500}, {'loss': 0.764, 'grad_norm': 1.4241758584976196, 'learning_rate': 3.1982942430703626e-05, 'epoch': 0.32, 'step': 600}, {'loss': 0.7629, 'grad_norm': 1.2996435165405273, 'learn

In [23]:
model.eval()  # Set model to evaluation mode

test_samples = []
for idx in range(len(dataset['test'])):
    sample = dataset['test'][idx]
    input_text = (
        f"<|begin_of_text|><|start_header_id|>system <|end_header_id|>"
        "You are an expert math assistant<|eot_id|><|start_header_id|>user <|end_header_id|>"
        f"Solve the following math problem: {sample['problem']}\n"
        "Show all intermediate steps and please mandatorily include the final answer in LaTeX format in a box like \\boxed{{}}."
        "<|eot_id|><|start_header_id|> assistant <|end_header_id|>"
    )
    test_samples.append({
        "input_text": input_text,
        "problem": sample['problem'],
        "level": sample['level'],
        "type": sample['type'],
        "ground_truth": sample['solution']
    })
    
def collate_fn(batch):
    input_texts = [sample['input_text'] for sample in batch]
    model_inputs = tokenizer(
        input_texts,
        padding=True,
        truncation=True,
        max_length=1024,  # Adjust as needed
        return_tensors="pt"
    )
    model_inputs = {k: v.to(model.device) for k, v in model_inputs.items()}
    return model_inputs, batch

batch_size = 32  # Adjust based on your GPU memory
test_dataloader = DataLoader(test_samples, batch_size=batch_size, collate_fn=collate_fn)

results_list = []
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for batch_idx, (model_inputs, batch_samples) in enumerate(tqdm(test_dataloader, desc=f"Evaluating")):
    current_batch_size = model_inputs['input_ids'].size(0)
    # Generate predictions
    try:
        with torch.no_grad():
            output_ids = model.generate(
                input_ids=model_inputs['input_ids'],
                attention_mask=model_inputs['attention_mask'],
                max_new_tokens=512,  
                do_sample=False,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id
            )
        # Decode the outputs
        for i in range(current_batch_size):
            predicted_text = tokenizer.decode(output_ids[i], skip_special_tokens=True)
            # Store the results
            results_list.append({
                "problem": batch_samples[i]['problem'],
                "level": batch_samples[i]['level'],
                "type": batch_samples[i]['type'],
                "ground_truth": batch_samples[i]['ground_truth'],
                "predicted_solution": predicted_text
            })
    except Exception as e:
        print(f"Error during generation at batch {batch_idx+1}: {e}")
        # In case of error, record empty predictions for this batch
        for i in range(current_batch_size):
            results_list.append({
                "problem": batch_samples[i]['problem'],
                "level": batch_samples[i]['level'],
                "type": batch_samples[i]['type'],
                "ground_truth": batch_samples[i]['ground_truth'],
                "predicted_solution": ""  # Empty string for predicted_text
            })
        continue  # Proceed to the next batch

    # Optionally, save intermediate results every N batches
    if (batch_idx + 1) % 100 == 0:
        results_df = pd.DataFrame(results_list)
        results_save_path = os.path.join(save_dir, f"test_results_batch_{batch_idx+1}.csv")
        results_df.to_csv(results_save_path, index=False)
        print(f"Saved test results up to batch {batch_idx+1} to {results_save_path}")


results_df = pd.DataFrame(results_list)
results_save_path = os.path.join(save_dir, f"test_results_overall.csv")
results_df.to_csv(results_save_path, index=False)
print(f"Saved test results to {results_save_path}")

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Evaluating:   1%|          | 1/157 [00:49<2:08:41, 49.50s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Evaluating:   1%|▏         | 2/157 [01:44<2:15:30, 52.46s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Evaluating:   2%|▏         | 3/157 [02:32<2:09:51, 50.59s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Evaluating:   3%|▎         | 4/157 [03:27<2:13:15, 52.26s/it]A decoder-only architecture is being used, but right-padding was

Saved test results up to batch 100 to /kaggle/working/test_results_batch_100.csv


Evaluating:  64%|██████▍   | 101/157 [1:25:42<41:59, 44.99s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Evaluating:  65%|██████▍   | 102/157 [1:26:25<40:45, 44.47s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Evaluating:  66%|██████▌   | 103/157 [1:27:09<39:45, 44.17s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Evaluating:  66%|██████▌   | 104/157 [1:27:53<39:11, 44.36s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
Evaluating:  67%|██████▋   | 105/157 [1:28:37<38:16, 44.

Saved test results to /kaggle/working/test_results_overall.csv
