# Generate Tool Traces for Reasoning Training

Generate synthetic tool-calling training data using the Claude API.

**Setup:**
1. Add `ANTHROPIC_API_KEY` to Colab Secrets (key icon in left sidebar)
2. Run all cells

In [None]:
# Setup and load API key from Colab Secrets
# This cell mounts Google Drive, loads the API key, installs the `anthropic`
# package, and makes a fresh clone of the svend repo under /content.
# The statements are order-dependent: the working directory is changed before
# the old clone is removed and the repo is re-cloned.
from google.colab import drive, userdata
drive.mount('/content/drive')

import os
try:
    # Read the secret and export it through os.environ as well as keeping it
    # in the ANTHROPIC_API_KEY notebook global.
    ANTHROPIC_API_KEY = userdata.get('ANTHROPIC_API_KEY')
    os.environ['ANTHROPIC_API_KEY'] = ANTHROPIC_API_KEY
    # Only the length is printed, never the key itself.
    print(f'Loaded ANTHROPIC_API_KEY from Colab Secrets (length: {len(ANTHROPIC_API_KEY)})')
except Exception as e:
    # NOTE(review): any failure here is only printed; execution continues, so
    # with "Run all" the remaining cells still run without a key in the
    # environment -- confirm this best-effort behaviour is intended.
    print(f'ERROR: {e}')
    print('Add ANTHROPIC_API_KEY to Colab Secrets (key icon in left sidebar)')

!pip install -q anthropic

# Remove any earlier checkout so re-running this cell starts from a fresh clone.
os.chdir('/content')
!rm -rf svend
!git clone https://github.com/ewolters/svend.git
os.chdir('/content/svend')

# Put the repo root first on the import path.
import sys
sys.path.insert(0, '/content/svend')

In [None]:
#@title Config { display-mode: "form" }
#@markdown ### Generation Settings
NUM_EXAMPLES = 10000  #@param {type:"integer"}
OUTPUT_FILE = '/content/drive/MyDrive/svend-checkpoints/data/tool_traces.jsonl'  #@param {type:"string"}
PARALLEL_WORKERS = 4  #@param {type:"integer"}

#@markdown ### Domain Selection (0 = use default distribution)
CALCULUS = 0  #@param {type:"integer"}
ALGEBRA = 0  #@param {type:"integer"}
PHYSICS = 0  #@param {type:"integer"}
CHEMISTRY = 0  #@param {type:"integer"}
LOGIC = 0  #@param {type:"integer"}
GEOMETRY = 0  #@param {type:"integer"}
STATISTICS = 0  #@param {type:"integer"}
FINANCE = 0  #@param {type:"integer"}
# NOTE(review): none of the eight per-domain values above is read again in the
# visible part of this notebook -- the generation command is built only from
# NUM_EXAMPLES, OUTPUT_FILE and PARALLEL_WORKERS. Unless something outside this
# view consumes them, changing them has no effect -- TODO confirm and wire up.

# Create the output file's directory (and any missing parents) up front;
# exist_ok=True makes re-running this cell harmless.
from pathlib import Path
Path(OUTPUT_FILE).parent.mkdir(parents=True, exist_ok=True)
print(f'Will generate {NUM_EXAMPLES} examples')
print(f'Output: {OUTPUT_FILE}')

In [None]:
# Run generation
# Launch scripts/generate_tool_data.py as a child process and stream its
# combined stdout/stderr into the cell output as it is produced.
import subprocess

# The child receives a copy of the notebook's current environment.
env = os.environ.copy()

# NOTE(review): only these three settings are forwarded to the script; the
# per-domain counts from the Config form are not part of the command --
# TODO confirm whether the script has flags for them.
cmd = [
    'python', 'scripts/generate_tool_data.py',
    '--num-examples', str(NUM_EXAMPLES),
    '--output', OUTPUT_FILE,
    '--workers', str(PARALLEL_WORKERS)
]

print(f'Running: {" ".join(cmd)}')
print('='*60)

# Run with live output. Using Popen as a context manager closes the stdout
# pipe and waits for the child when the block exits, so the pipe is no longer
# left open if the streaming loop raises.
with subprocess.Popen(
    cmd,
    cwd='/content/svend',
    env=env,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,  # fold stderr into the same stream
    text=True,
    bufsize=1,  # line-buffered in text mode
) as process:
    for line in process.stdout:
        print(line, end='')

# The context manager has already waited, so returncode is populated here.
print('='*60)
print(f'Exit code: {process.returncode}')

In [None]:
# Check results
!wc -l {OUTPUT_FILE}
!head -2 {OUTPUT_FILE} | python -m json.tool

In [None]:
# Domain distribution check
# Tally how often each domain and each tool appears in the generated JSONL
# file and print both tallies, most frequent first.
import json
from collections import Counter

domains = []  # one entry per example: its 'domain', or '?' when the key is absent
tools = []    # every 'tools_used' entry across all examples, flattened

# Read as UTF-8 explicitly rather than relying on the locale default encoding.
with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing empty line) instead of letting
        # json.loads raise on them.
        if not line.strip():
            continue
        data = json.loads(line)
        domains.append(data.get('domain', '?'))
        tools.extend(data.get('tools_used', []))

print('DOMAINS:')
for k, v in Counter(domains).most_common():
    print(f'  {k}: {v}')

print('\nTOOLS:')
for k, v in Counter(tools).most_common():
    print(f'  {k}: {v}')

print(f'\nTotal examples: {len(domains)}')