# Setup

In [2]:
import os
import json
import gc
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from model.setup import setup_model, save_model, load_model, load_tokenizer
from utils.console import isYes, printc, inputc, print_section
from model.training import preprocess_for_training, train_model
from model.custom_training import train_toolformer_model
from data.gsm8k import prepare_gsm8k_dataset
from data.svamp import prepare_svamp_dataset
from data.arithmetic import prepare_arithmetic_datasets
from evaluation.math_evaluation import evaluate_math_performance
from evaluation.eval_pipeline import eval_model
from constants import MODEL_NAME, INITIAL_SAVE_PATH, TOOL_FINETUNED_SAVE_PATH, DATASET, CHECKPOINTS, TOOL_TRAIN_DATASET_PATH, PURE_TRAIN_DATASET_PATH, EVAL_DATASET_PATH
from data.arithmetic import combine_and_tokenize
import wandb
from datasets import Dataset

In [3]:
# Constants
current_epochs_tool = 1
current_epochs_pure = 1
data_points = 1000

In [4]:
print_section("Loading Model")
gc.collect()
torch.cuda.empty_cache()
# Try to load from saved path first, if it fails, download from HF
try:
    model, metadata = load_model(os.path.join(CHECKPOINTS, "pretrained", INITIAL_SAVE_PATH))
    tokenizer = load_tokenizer(os.path.join(CHECKPOINTS, "pretrained", INITIAL_SAVE_PATH))
    print("Loaded model from saved path")
except FileNotFoundError:
    print(f"Initial model not found. Setting up from {MODEL_NAME}")
    tokenizer, model, metadata = setup_model(MODEL_NAME)
    save_model(model, tokenizer, os.path.join(CHECKPOINTS, "pretrained", INITIAL_SAVE_PATH))

print_section("Adding Tool Tokens")
tool_tokens = {
    "additional_special_tokens": [
        "<tool:calculator>",
        "</tool>",
    ]
}
num_added = tokenizer.add_special_tokens(tool_tokens)
print(f"Added {num_added} special tokens to the tokenizer")
model.resize_token_embeddings(len(tokenizer))
print(f"Resized model embeddings to {len(tokenizer)} tokens")
print("Special tokens:", tokenizer.all_special_tokens)


[032m
Loading Model
[0m
Loading model from local path: ./checkpoints\pretrained\qwen-initial


Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loaded model from saved path
[032m
Adding Tool Tokens
[0m
Added 2 special tokens to the tokenizer
Resized model embeddings to 151648 tokens
Special tokens: ['<|endoftext|>', '<tool:calculator>', '</tool>']


In [5]:
# LOAD DATA
print_section("Loading Data")
# Prepare datasets
dataset = prepare_arithmetic_datasets()
train_data = dataset["train_dict"]
test_data = dataset["test_dict"]
train_transformed_data = dataset["train_transformed_dict"]
test_transformed_data = dataset["test_transformed_dict"]
print(train_transformed_data['arithmetic_2da'][3])
print(test_transformed_data['arithmetic_2da'][3])
print(train_transformed_data['arithmetic_2da'][-1])
print(train_transformed_data['arithmetic_2da'][-1])

[032m
Loading Data
[0m
Processing dataset configuration: arithmetic_1dc
Processing dataset configuration: arithmetic_2da
{'question': 'Question: What is 31 plus 72?\nAnswer:', 'final_answer': '<tool:calculator>31 + 72</tool>'}
{'question': 'Question: What is 12 plus 63?\nAnswer:', 'final_answer': '<tool:calculator>12 + 63</tool>'}
{'question': 'Question: What is 46 plus 53?\nAnswer:', 'final_answer': '<tool:calculator>46 + 53</tool>'}
{'question': 'Question: What is 46 plus 53?\nAnswer:', 'final_answer': '<tool:calculator>46 + 53</tool>'}


# Pretrained

In [6]:
# PRETAINED MODEL EVALUATION
print_section("Pretrained Model Evaluation")
eval_model(MODEL_NAME, DATASET, test_data, model, tokenizer, use_tool=False)

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


[032m
Pretrained Model Evaluation
[0m
Evaluating math performance on dataset arithmetic_1dc
Evaluating math performance...
------
Question: What is (4 - 2) + 7?
Answer:
------


KeyboardInterrupt: 

# Toolformer

In [None]:
# TOOLFORMER FINE TUNING TRAINING
print_section("Toolformer Fine Tuning Training")

# Prepare the training data based on dataset type
train_dataset = combine_and_tokenize(train_transformed_data, tokenizer, path=TOOL_TRAIN_DATASET_PATH)

# Create a small evaluation dataset directly instead of using combine_and_tokenize
eval_examples = []
for config_name, config_dataset in test_transformed_data.items():
    # Take at most 5 examples from each configuration
    sample_size = 1
    for i in range(sample_size):
        if isinstance(config_dataset[i], dict):
            eval_examples.append({
                "question": config_dataset[i]["question"],
                "final_answer": config_dataset[i]["final_answer"]
            })

# Create the evaluation dataset directly
eval_dataset = Dataset.from_list(eval_examples)
eval_dataset = eval_dataset.map(
    lambda examples: preprocess_for_training(examples, tokenizer),
    batched=True,
    remove_columns=eval_dataset.column_names
)

print(f"Created evaluation dataset with {len(eval_dataset)} examples for monitoring")

# Load previous model if it exists
try:
    previous_path = os.path.join(CHECKPOINTS, "finetuned", TOOL_FINETUNED_SAVE_PATH)
    model, metadata = load_model(previous_path)
    
    # Get total epochs from metadata
    total_epochs = metadata.get("total_epochs", 0) + current_epochs_tool
    print(f"Continuing training from {metadata.get('total_epochs', 0)} epochs to {total_epochs} epochs")
except FileNotFoundError:
    # Start fresh training
    total_epochs = current_epochs_tool
    print(f"Starting fresh training for {current_epochs_tool} epochs")

# Train the model
model, tokenizer, metadata = train_model(model, tokenizer, train_dataset, num_epochs=current_epochs_tool, eval_dataset=eval_dataset)

# Save with updated epoch count
saved_path = save_model(
    model, 
    tokenizer, 
    os.path.join(CHECKPOINTS, "finetuned", TOOL_FINETUNED_SAVE_PATH),
    epochs=current_epochs_tool,
    total_epochs=total_epochs
)
print(f"Saved fine-tuned model to {saved_path} (Total epochs: {total_epochs})")

[032m
Toolformer Fine Tuning Training
[0m
Combining arithmetic datasets for training...
Combined 3600 examples for training


Map: 100%|██████████| 3600/3600 [00:00<00:00, 7812.67 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 3600/3600 [00:00<00:00, 476775.95 examples/s]


Saved 3600 examples to data


Map: 100%|██████████| 2/2 [00:00<00:00, 500.10 examples/s]


Created evaluation dataset with 2 examples for monitoring
Starting fresh training for 1 epochs
Model Type: qwen2
Tokenizer Type: Qwen2TokenizerFast
Question: What is (9 - 5) * 8?
Answer:<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoft

[34m[1mwandb[0m: Currently logged in as: [33mdaniel-chuang[0m ([33mdaniel-chuang-cornell[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin




KeyboardInterrupt: 

In [None]:
print_section("Latest Checkpoint Evaluation")
model, metadata = load_model(os.path.join(os.curdir, "toolformer_model", "checkpoint-225"))
print_section("Most recent training Model Evaluation")
eval_model(MODEL_NAME, DATASET, test_data, model, tokenizer, use_tool=True)

# Pure Fine Tuning

In [None]:
# PURE FINE TUNING TRAINING
print_section("Pure Fine Tuning Training")

# Prepare the training data based on dataset type
train_dataset = combine_and_tokenize(train_data, tokenizer, path=PURE_TRAIN_DATASET_PATH)

# Load previous model if it exists
try:
    previous_path = os.path.join(CHECKPOINTS, "finetuned", TOOL_FINETUNED_SAVE_PATH)
    tokenizer, model, metadata = load_model(previous_path)
    
    # Get total epochs from metadata
    total_epochs = metadata.get("total_epochs", 0) + current_epochs_pure
    print(f"Continuing training from {metadata.get('total_epochs', 0)} epochs to {total_epochs} epochs")
except FileNotFoundError:
    # Start fresh training
    total_epochs = current_epochs_pure
    print(f"Starting fresh training for {current_epochs_pure} epochs")

# Train the model
model, tokenizer, metadata = train_model(model, tokenizer, train_dataset, num_epochs=current_epochs_pure)

# Save with updated epoch count
saved_path = save_model(
    model, 
    tokenizer, 
    os.path.join(CHECKPOINTS, "finetuned", TOOL_FINETUNED_SAVE_PATH),
    epochs=current_epochs_pure,
    total_epochs=total_epochs
)
print(f"Saved fine-tuned model to {saved_path} (Total epochs: {total_epochs})")