In [1]:
from transformers import AutoModelForCausalLM, pipeline
from trl import SFTConfig, SFTTrainer
from constants import MODEL_NAME, HUGGINGFACE_TOKEN, CHECKPOINTS, INITIAL_SAVE_PATH
from model.setup import load_model, load_tokenizer
import os
import re
from datasets import load_dataset, concatenate_datasets
import copy

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
current_directory = os.getcwd()
current_directory

'c:\\Users\\user\\Desktop\\Programming\\Classes\\CS4782\\final_project\\code\\code'

In [2]:

# All available configs
configs = [
    'arithmetic_1dc',
    'arithmetic_2da',
    'arithmetic_2dm',
    'arithmetic_2ds',
    'arithmetic_3da',
    'arithmetic_3ds',
    'arithmetic_4da',
    'arithmetic_4ds',
    'arithmetic_5da',
    'arithmetic_5ds'
]

# Load and combine all datasets
all_datasets = []
for config in configs:
    dataset = load_dataset("EleutherAI/arithmetic", config, split="validation")
    all_datasets.append(dataset)

# Combine all datasets into one
combined_dataset = concatenate_datasets(all_datasets).shuffle(seed=42).select(range(1000))

# Now split the combined dataset
split_point = int(0.9 * len(combined_dataset))
dataset = combined_dataset
train_dataset = dataset.select(range(split_point))
eval_dataset = dataset.select(range(split_point, len(combined_dataset)))

# Convert the dataset for SFTTrainer
def combine_text(example):
    # Combine context and completion
    example["text"] = example["context"] + example["completion"]
    return example

# Apply the transformation for the pure dataset
sft_train_dataset = train_dataset.map(combine_text)
sft_train_dataset = sft_train_dataset.remove_columns(["context", "completion"])

# Replace the labels for the toolformer dataset
def transform_question(input_text):
    """
    Transform arithmetic questions to use calculator tool format.
    
    Args:
        input_text: The input question text
        
    Returns:
        Transformed question with calculator tool tags
    """
    # First replace word operators
    operator_replacements = [
        (r'\bplus\b', '+'),     # Replace "plus" with "+"
        (r'\bminus\b', '-'),    # Replace "minus" with "-"
        (r'\btimes\b', '*')     # Replace "times" with "*"
    ]
    
    result = input_text
    for pattern, replacement in operator_replacements:
        result = re.sub(pattern, replacement, result)
    
    # Then handle the overall structure
    question_regex = r'Question: What is (.*?)\?\nAnswer:'
    question_replacement = r'<tool:calculator>\1</tool>'
    
    transformed = re.sub(question_regex, question_replacement, result) 
    transformed = transformed.replace('</tool>?', '</tool>')

    return transformed

sft_train_tool_dataset = train_dataset.map(
    lambda x: {"prompt": x["context"], "completion": transform_question(x["context"])},
    remove_columns=["context", "completion"],
)

print("Transformed dataset:")
for i in range(10):
    print(sft_train_tool_dataset[i])

print("Original dataset:")
for i in range(10):
    print(train_dataset[i])

# Print dataset sizes for verification
print(f"\nDataset sizes:")
print(f"Total combined dataset: {len(combined_dataset)}")
print(f"Training set: {len(train_dataset)}")
print(f"Evaluation set: {len(eval_dataset)}")
print(f"Individual dataset sizes:")
for i, config in enumerate(configs):
    print(f"  {config}: {len(all_datasets[i])}")

Transformed dataset:
{'completion': '<tool:calculator>6204 + 2521</tool>', 'prompt': 'Question: What is 6204 plus 2521?\nAnswer:'}
{'completion': '<tool:calculator>53441 + 19903</tool>', 'prompt': 'Question: What is 53441 plus 19903?\nAnswer:'}
{'completion': '<tool:calculator>0 - 81</tool>', 'prompt': 'Question: What is 0 minus 81?\nAnswer:'}
{'completion': '<tool:calculator>934 - 935</tool>', 'prompt': 'Question: What is 934 minus 935?\nAnswer:'}
{'completion': '<tool:calculator>42324 + 24298</tool>', 'prompt': 'Question: What is 42324 plus 24298?\nAnswer:'}
{'completion': '<tool:calculator>7116 + 8508</tool>', 'prompt': 'Question: What is 7116 plus 8508?\nAnswer:'}
{'completion': '<tool:calculator>5381 + 7791</tool>', 'prompt': 'Question: What is 5381 plus 7791?\nAnswer:'}
{'completion': '<tool:calculator>99930 - 85074</tool>', 'prompt': 'Question: What is 99930 minus 85074?\nAnswer:'}
{'completion': '<tool:calculator>3 + 4</tool>', 'prompt': 'Question: What is 3 plus 4?\nAnswer:'}


In [13]:
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_NAME,
#     token=HUGGINGFACE_TOKEN,
#     device_map="auto"
# )

model, metadata = load_model(os.path.join(CHECKPOINTS, "pretrained", INITIAL_SAVE_PATH))
tokenizer = load_tokenizer(os.path.join(CHECKPOINTS, "pretrained", INITIAL_SAVE_PATH))

# Add tokens
special_tokens_dict = {
    'additional_special_tokens': ['<tool:calculator>', '</tool>']
}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print(f"Added {num_added_toks} tokens") 
model.resize_token_embeddings(len(tokenizer))

Loading model from local path: ./checkpoints\pretrained\qwen-initial
Added 2 tokens


Embedding(151648, 1024)

In [14]:
# PURE TRAINING
training_args = SFTConfig(output_dir="/tmp")

pure_trainer = SFTTrainer(
    model,
    train_dataset=sft_train_tool_dataset,
    args=training_args,
)

pure_trainer.train()

model.save_pretrained(os.path.join(CHECKPOINTS, "pure"))

[34m[1mwandb[0m: Currently logged in as: [33mdaniel-chuang[0m ([33mdaniel-chuang-cornell[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


In [None]:
# TOOLFORMER TRAINING
training_args = SFTConfig(output_dir="/tmp", completion_only_loss = True)

# Train the toolformer model
tool_trainer = SFTTrainer(
    model,
    train_dataset=sft_train_tool_dataset,
    # eval_dataset=sft_train_tool_dataset.select(range(100)),  # Small eval set
    args=training_args
)

tool_trainer.train()

model.save_pretrained(os.path.join(CHECKPOINTS, "tool"))

In [None]:
def evaluate_arithmetic(examples, pipe, use_tool, num_examples=100):
    correct = 0
    total = 0
    
    for i, example in enumerate(examples):
        if i >= num_examples:
            break
            
        # Extract the context (question)
        if use_tool:
            # Add a system message for the calculator tool
            # context = "Use the calculator tool to solve arithmetic. For example, given the problem 'What is 5+4', output <tool:calculator>5+4</tool>.\n" + example["context"]
            context = example["context"]
        else:
            context = example["context"]
        expected_answer = example["completion"].strip()
        
        # Generate answer
        generated = pipe(context, max_new_tokens=100)[0]["generated_text"]
        
        # Extract only the new text (the answer)
        generated_answer = generated[len(context):].strip()
        
        # Clean the answer (remove ex`tra text after the number)
        generated_answer = re.findall(r'-?\d+', generated_answer)
        if generated_answer:
            generated_answer = generated_answer[0]
        else:
            print("--")
            print(f"Failed to extract answer for example {i}: \n{generated}")
            print("--")
            generated_answer = "N/A"
        
        # Compare with expected answer
        if generated_answer == expected_answer.strip():
            correct += 1
        
        total += 1
        
        if i < 5:  # Print first 5 examples
            print(f"Question: {context}")
            print(f"Expected: {expected_answer}")
            print(f"Generated: {generated_answer}")
            print(f"Correct: {generated_answer == expected_answer.strip()}")
            print("-" * 50)
    
    accuracy = correct / total
    print(f"\nAccuracy: {accuracy:.2%} ({correct}/{total})")
    return accuracy

In [15]:
# Create a text generation pipeline
# model, _ = load_model(os.path.join(CHECKPOINTS, "pretrained", INITIAL_SAVE_PATH))
# pretrained_pipe = pipeline("text-generation", 
#                 model=model, 
#                 tokenizer=tokenizer,
#                 max_new_tokens=10,
#                 temperature=0.0,
#                 pad_token_id=tokenizer.eos_token_id)

# model, _ = load_model(os.path.join(CHECKPOINTS, "pure"))
pure_pipe = pipeline("text-generation", 
                model=model, 
                tokenizer=tokenizer,
                max_new_tokens=10,
                temperature=0.0,
                pad_token_id=tokenizer.eos_token_id)

# model, _ = load_model(os.path.join(CHECKPOINTS, "tool"))
# tool_pipe = pipeline("text-generation", 
#                 model=model, 
#                 tokenizer=tokenizer,
#                 max_new_tokens=40,
#                 temperature=0.0,
#                 pad_token_id=tokenizer.eos_token_id)

# Run evaluation
# accuracy = evaluate_arithmetic(eval_dataset, pretrained_pipe, use_tool = False, num_examples=100)
# print(f"Pretrained Model Accuracy: {accuracy:.2%}")

accuracy = evaluate_arithmetic(eval_dataset, pure_pipe, use_tool = False, num_examples=100)
print(f"Pure Model Accuracy: {accuracy:.2%}")

# accuracy = evaluate_arithmetic(eval_dataset, tool_pipe, use_tool = True, num_examples=100)
# print(f"Tool Model Accuracy: {accuracy:.2%}")

Device set to use cuda:0


Question: Question: What is 2955 minus 5769?
Answer:
Expected: -2814
Generated: 2955
Correct: False
--------------------------------------------------
Question: Question: What is 779 plus 685?
Answer:
Expected: 1464
Generated: 779
Correct: False
--------------------------------------------------
Question: Question: What is 82183 plus 87570?
Answer:
Expected: 169753
Generated: 82183
Correct: False
--------------------------------------------------
Question: Question: What is 535 minus 517?
Answer:
Expected: 18
Generated: 535
Correct: False
--------------------------------------------------
Question: Question: What is 81 plus 94?
Answer:
Expected: 175
Generated: 81
Correct: False
--------------------------------------------------

Accuracy: 2.00% (2/100)
Pure Model Accuracy: 2.00%
