In [1]:
import dspy
import ujson
import random

gpt4o = dspy.LM("openai/gpt-4o", temperature=0.7)
dspy.configure(lm=gpt4o)

In [2]:
from dspy.utils import download

download(
    "https://huggingface.co/datasets/bytedance-research/ToolHop/resolve/main/data/ToolHop.json"
)

data = ujson.load(open("ToolHop.json"))
random.Random(0).shuffle(data)

Downloading 'ToolHop.json'...


In [3]:
import re
import inspect

examples = []
fns2code = {}


def finish(answer: str):
    """Conclude the trajectory and return the final answer."""
    return answer


for datapoint in data:
    func_dict = {}
    for func_code in datapoint["functions"]:
        cleaned_code = func_code.rsplit("\n\n# Example usage", 1)[0]
        fn_name = re.search(r"^\s*def\s+([a-zA-Z0-9_]+)\s*\(", cleaned_code)
        fn_name = fn_name.group(1) if fn_name else None

        if not fn_name:
            continue

        local_vars = {}
        exec(cleaned_code, {}, local_vars)
        fn_obj = local_vars.get(fn_name)

        if callable(fn_obj):
            func_dict[fn_name] = fn_obj
            assert fn_obj not in fns2code, f"Duplicate function found: {fn_name}"
            fns2code[fn_obj] = (fn_name, cleaned_code)

    func_dict["finish"] = finish

    example = dspy.Example(
        question=datapoint["question"], answer=datapoint["answer"], functions=func_dict
    )
    examples.append(example.with_inputs("question", "functions"))

trainset, devset, testset = examples[:100], examples[100:400], examples[400:]



In [6]:
from func_timeout import func_set_timeout


def wrap_function_with_timeout(fn):
    @func_set_timeout(10)
    def wrapper(*args, **kwargs):
        try:
            return {"return_value": fn(*args, **kwargs), "errors": None}
        except Exception as e:
            return {"return_value": None, "errors": str(e)}

    return wrapper


def fn_metadata(func):
    signature = inspect.signature(func)
    docstring = inspect.getdoc(func) or "No docstring."
    return dict(
        function_name=func.__name__, arguments=str(signature), docstring=docstring
    )


def metric(example, pred, trace=None):
    gold = str(example.answer).rstrip(".0").replace(",", "").lower()
    pred = str(pred.answer).rstrip(".0").replace(",", "").lower()
    return pred == gold  # stricter than the original paper's metric!


evaluate = dspy.Evaluate(
    devset=devset,
    metric=metric,
    num_threads=24,
    display_progress=True,
    display_table=0,
    max_errors=999,
)

In [7]:
class Agent(dspy.Module):
    def __init__(self, max_steps=5):
        self.max_steps = max_steps
        instructions = "For the final answer, produce short (not full sentence) answers in which you format dates as YYYY-MM-DD, names as Firstname Lastname, and numbers without leading 0s."
        signature = dspy.Signature(
            "question, trajectory, functions -> next_selected_fn, args: dict[str, Any]",
            instructions,
        )
        self.react = dspy.ChainOfThought(signature)

    def forward(self, question, functions):
        tools = {fn_name: fn_metadata(fn) for fn_name, fn in functions.items()}
        trajectory = []

        for _ in range(self.max_steps):
            pred = self.react(question=question, trajectory=trajectory, functions=tools)
            selected_fn = pred.next_selected_fn.strip('"').strip("'")
            fn_output = wrap_function_with_timeout(functions[selected_fn])(**pred.args)
            trajectory.append(
                dict(
                    reasoning=pred.reasoning,
                    selected_fn=selected_fn,
                    args=pred.args,
                    **fn_output,
                )
            )

            if selected_fn == "finish":
                break

        return dspy.Prediction(
            answer=fn_output.get("return_value", ""), trajectory=trajectory
        )

In [8]:
agent = Agent()
evaluate(agent)

Average Metric: 100.00 / 267 (37.5%):  89%|████████▉ | 267/300 [02:21<00:21,  1.53it/s]

2025/07/20 13:04:27 ERROR dspy.utils.parallelizer: Error for Example({'question': 'What is the date 10 weeks after the date of death of the director of film Cross Current?', 'answer': '2014-05-18', 'functions': {'film_info_retriever': <function film_info_retriever at 0x13e07a0c0>, 'sports_figure_bio_retriever': <function sports_figure_bio_retriever at 0x13e07a520>, 'date_calculator': <function date_calculator at 0x13e07a5c0>, 'finish': <function finish at 0x13d2ea520>}}) (input_keys={'functions', 'question'}): 'general_bio_retriever'. Set `provide_traceback=True` for traceback.


Average Metric: 109.00 / 299 (36.5%): 100%|██████████| 300/300 [02:38<00:00,  1.89it/s]

2025/07/20 13:04:43 INFO dspy.evaluate.evaluate: Average Metric: 109.0 / 300 (36.3%)





EvaluationResult(score=36.33, results=<list of 300 results>)

In [9]:
simba = dspy.SIMBA(metric=metric, max_steps=12, max_demos=10)
optimized_agent = simba.compile(agent, trainset=trainset, seed=6793115)

2025/07/20 13:04:44 INFO dspy.teleprompt.simba: Starting batch 1 of 12.
2025/07/20 13:05:17 INFO dspy.teleprompt.simba: Sampling program trajectories on 32 examples x 6 samples.


Processed 31 / 192 examples:  16%|█▌        | 31/192 [00:59<04:50,  1.80s/it]'composer_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 63 / 192 examples:  33%|███▎      | 63/192 [01:44<01:56,  1.11it/s]'composer_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 69 / 192 examples:  36%|███▌      | 69/192 [01:54<03:15,  1.59s/it]'general_person_lookup'
'NoneType' object has no attribute 'answer'
Processed 160 / 192 examples:  83%|████████▎ | 160/192 [04:00<01:05,  2.05s/it]'composer_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 192 / 192 examples: 100%|██████████| 192/192 [04:41<00:00,  1.47s/it]

2025/07/20 13:09:58 INFO dspy.teleprompt.simba: Batch 1: Baseline mini-batch score: 0.2760416666666667

2025/07/20 13:09:58 INFO dspy.teleprompt.simba: Batch 1: Processing bucket #1, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.8333333333333334.





2025/07/20 13:10:02 INFO dspy.teleprompt.simba: Batch 1: Invoking strategy: append_a_rule
2025/07/20 13:10:11 INFO dspy.teleprompt.simba_utils: Advice for react.predict: When tasked with identifying familial relationships, ensure that the relationship type used is supported by the available functions. For genealogical queries, use valid relationship types such as 'father', 'mother', 'grandparents', etc., and avoid unsupported terms like 'grandfather' or 'ancestors'. If multiple attempts using valid relationship types fail, consider leveraging external historical knowledge or alternative methods to confirm details. Once the correct individual is identified, proceed with extracting and processing the required data (e.g., first name for vowel counting) accurately.
2025/07/20 13:10:11 INFO dspy.teleprompt.simba: 

2025/07/20 13:10:11 INFO dspy.teleprompt.simba: Batch 1: Processing bucket #2, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.8333333333333334.
2025/07/20 13:10:11 

  0%|          | 0/224 [00:00<?, ?it/s]'person_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 61 / 224 examples:  27%|██▋       | 61/224 [01:27<04:25,  1.63s/it]'composer_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 65 / 224 examples:  29%|██▉       | 65/224 [01:31<03:34,  1.35s/it]'enhanced_person_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 70 / 224 examples:  31%|███▏      | 70/224 [01:35<02:35,  1.01s/it]'enhanced_director_mother_lookup'
'NoneType' object has no attribute 'answer'
Processed 96 / 224 examples:  43%|████▎     | 96/224 [02:11<02:15,  1.06s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 100 / 224 examples:  45%|████▍     | 100/224 [02:16<02:20,  1.13s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 127 / 224 examples:  57%|█████▋    | 127/224 [02:47<01:19,  1.22it/s]'composer_info_retriever'
'NoneType' object has no att

2025/07/20 13:16:56 INFO dspy.teleprompt.simba: Scores after 1 batches: [0.25, 0.28125, 0.375, 0.3125, 0.21875, 0.375, 0.34375], Best: 0.375

2025/07/20 13:16:56 INFO dspy.teleprompt.simba: Starting batch 2 of 12.





2025/07/20 13:17:39 INFO dspy.teleprompt.simba: Sampling program trajectories on 32 examples x 6 samples.


Processed 24 / 192 examples:  12%|█▎        | 24/192 [01:15<06:36,  2.36s/it]'{ "historical_relationship_finder": {"function_name": "historical_relationship_finder", "arguments": "(figure_name: str, relationship_type: str, context: str = \'\', output_format: str = \'text\', date_range: dict = None, include_extended_family: bool = False)", "docstring": "No docstring."} }'
'NoneType' object has no attribute 'answer'
Processed 55 / 192 examples:  29%|██▊       | 55/192 [02:13<03:01,  1.32s/it]'date_calculator'
'NoneType' object has no attribute 'answer'
Processed 112 / 192 examples:  58%|█████▊    | 112/192 [04:21<02:39,  1.99s/it]'historical_relationship_finder'
'NoneType' object has no attribute 'answer'
Processed 150 / 192 examples:  78%|███████▊  | 150/192 [05:48<01:04,  1.55s/it]'date_calculator'
'NoneType' object has no attribute 'answer'
Processed 183 / 192 examples:  95%|█████████▌| 183/192 [06:34<00:13,  1.54s/it]'date_calculator'
'NoneType' object has no attribute 'answer'
Proce

2025/07/20 13:24:35 INFO dspy.teleprompt.simba: Batch 2: Baseline mini-batch score: 0.234375

2025/07/20 13:24:35 INFO dspy.teleprompt.simba: Batch 2: Processing bucket #1, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.8333333333333334.
2025/07/20 13:24:35 INFO dspy.teleprompt.simba: Batch 2: Invoking strategy: append_a_demo_
2025/07/20 13:24:35 INFO dspy.teleprompt.simba_utils: Added 1 demos (one each) across all predictors.
2025/07/20 13:24:35 INFO dspy.teleprompt.simba: 

2025/07/20 13:24:35 INFO dspy.teleprompt.simba: Batch 2: Processing bucket #2, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.8333333333333334.
2025/07/20 13:24:35 INFO dspy.teleprompt.simba: Batch 2: Invoking strategy: append_a_rule
2025/07/20 13:25:11 INFO dspy.teleprompt.simba_utils: Advice for react.predict: If the module has identified the full name of the paternal grandfather and needs to extract the first and last letters of the respective names, it should: 1) Use the `extract_fi

Processed 2 / 224 examples:   1%|          | 2/224 [00:08<12:42,  3.44s/it]'historical_figure_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 23 / 224 examples:  10%|█         | 23/224 [00:37<04:56,  1.48s/it]'date_calculator'
'NoneType' object has no attribute 'answer'
Processed 53 / 224 examples:  24%|██▎       | 53/224 [01:26<04:53,  1.71s/it]'calculate_date'
'NoneType' object has no attribute 'answer'
Processed 59 / 224 examples:  26%|██▋       | 59/224 [01:34<03:56,  1.43s/it]'extract_first_letter'
'NoneType' object has no attribute 'answer'
Processed 62 / 224 examples:  28%|██▊       | 62/224 [01:38<03:41,  1.37s/it]'historical_figure_info'
'NoneType' object has no attribute 'answer'
Processed 161 / 224 examples:  72%|███████▏  | 161/224 [03:47<01:07,  1.07s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 215 / 224 examples:  96%|█████████▌| 215/224 [04:57<00:09,  1.00s/it]'date_calculator'
'NoneType' object has no attr

2025/07/20 13:31:01 INFO dspy.teleprompt.simba: Scores after 2 batches: [0.40625, 0.21875, 0.28125, 0.3125, 0.40625, 0.40625, 0.1875], Best: 0.40625

2025/07/20 13:31:01 INFO dspy.teleprompt.simba: Starting batch 3 of 12.





2025/07/20 13:31:56 INFO dspy.teleprompt.simba: Sampling program trajectories on 32 examples x 6 samples.


Processed 2 / 192 examples:   1%|          | 2/192 [00:07<10:07,  3.20s/it]'historical_relationship_finder'
'NoneType' object has no attribute 'answer'
Processed 18 / 192 examples:   9%|▉         | 18/192 [00:27<03:50,  1.32s/it]'historical_genealogy_lookup'
'NoneType' object has no attribute 'answer'
Processed 49 / 192 examples:  26%|██▌       | 49/192 [01:07<02:14,  1.06it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 77 / 192 examples:  40%|████      | 77/192 [02:03<09:13,  4.81s/it]'historical_relationship_finder'
'NoneType' object has no attribute 'answer'
Processed 116 / 192 examples:  60%|██████    | 116/192 [02:51<00:53,  1.42it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 136 / 192 examples:  71%|███████   | 136/192 [04:07<04:07,  4.42s/it]'enhanced_movie_director_lookup'
'NoneType' object has no attribute 'answer'
Processed 147 / 192 examples:  77%|███████▋  | 147/192 [05:25<11:29, 15.33s/it]'bio

2025/07/20 13:39:55 INFO dspy.teleprompt.simba: Batch 3: Baseline mini-batch score: 0.3177083333333333

2025/07/20 13:39:55 INFO dspy.teleprompt.simba: Batch 3: Processing bucket #1, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.8333333333333334.
2025/07/20 13:39:55 INFO dspy.teleprompt.simba: Batch 3: Invoking strategy: append_a_rule
2025/07/20 13:40:12 INFO dspy.teleprompt.simba_utils: Advice for react.predict: When faced with an error in acquiring genealogical information, if the database queries fail, cross-reference known historical data accurately to determine key names or relationships. For example, if the genealogy functions return errors or incomplete results, ensure you use accurate historical knowledge or reliable sources to confirm the correct name (e.g., Thomas Monck instead of Charles Monck). Once you have the correct first name, ensure you use the enhanced_alphabetical_sorter function to sort its letters alphabetically.
2025/07/20 13:40:12 INFO dspy.telepr

Processed 41 / 224 examples:  18%|█▊        | 41/224 [01:22<09:14,  3.03s/it]'historical_relationship_finder'
'NoneType' object has no attribute 'answer'
'historical_relationship_finder'
'NoneType' object has no attribute 'answer'
Processed 63 / 224 examples:  28%|██▊       | 63/224 [02:04<05:41,  2.12s/it]'enhanced_movie_director_lookup'
'NoneType' object has no attribute 'answer'
Processed 117 / 224 examples:  52%|█████▏    | 117/224 [03:26<01:40,  1.06it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 177 / 224 examples:  79%|███████▉  | 177/224 [06:47<02:36,  3.32s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 224 / 224 examples: : 225it [10:14,  2.73s/it]                       

2025/07/20 13:50:43 INFO dspy.teleprompt.simba: Scores after 3 batches: [0.3125, 0.25, 0.34375, 0.53125, 0.375, 0.375, 0.53125], Best: 0.53125






2025/07/20 13:50:43 INFO dspy.teleprompt.simba: Starting batch 4 of 12.
2025/07/20 13:51:41 INFO dspy.teleprompt.simba: Sampling program trajectories on 32 examples x 6 samples.


Processed 5 / 192 examples:   2%|▏         | 4/192 [00:00<00:04, 44.82it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 21 / 192 examples:  10%|█         | 20/192 [01:12<07:52,  2.75s/it]'date_calculator'
'NoneType' object has no attribute 'answer'
Processed 31 / 192 examples:  16%|█▌        | 31/192 [01:55<10:00,  3.73s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 46 / 192 examples:  24%|██▍       | 46/192 [02:37<09:12,  3.79s/it]'genealogy_query'
'NoneType' object has no attribute 'answer'
Processed 52 / 192 examples:  27%|██▋       | 52/192 [02:50<06:01,  2.58s/it]'date_calculator'
'NoneType' object has no attribute 'answer'
Processed 54 / 192 examples:  28%|██▊       | 53/192 [02:55<06:02,  2.61s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 80 / 192 examples:  41%|████      | 79/192 [05:15<13:23,  7.11s/it]'biographical_info_retriever'
'NoneType' object has no 

2025/07/20 14:15:02 INFO dspy.teleprompt.simba: Batch 4: Baseline mini-batch score: 0.2604166666666667

2025/07/20 14:15:02 INFO dspy.teleprompt.simba: Batch 4: Processing bucket #1, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.8333333333333334.
2025/07/20 14:15:02 INFO dspy.teleprompt.simba: Batch 4: Invoking strategy: append_a_demo_
2025/07/20 14:15:02 INFO dspy.teleprompt.simba_utils: Added 1 demos (one each) across all predictors.
2025/07/20 14:15:02 INFO dspy.teleprompt.simba: 

2025/07/20 14:15:02 INFO dspy.teleprompt.simba: Batch 4: Processing bucket #2, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.8333333333333334.
2025/07/20 14:15:02 INFO dspy.teleprompt.simba: Batch 4: Invoking strategy: append_a_rule
2025/07/20 14:15:12 INFO dspy.teleprompt.simba_utils: Advice for react.predict: If the module receives an error when querying genealogical information due to an invalid relationship type, it should consider using broader categories like 'ancestors

Processed 15 / 224 examples:   7%|▋         | 15/224 [01:23<19:28,  5.59s/it]'date_calculator'
'NoneType' object has no attribute 'answer'
Processed 19 / 224 examples:   8%|▊         | 19/224 [02:11<27:43,  8.12s/it]'date_calculator'
'NoneType' object has no attribute 'answer'
Processed 30 / 224 examples:  13%|█▎        | 30/224 [06:13<44:38, 13.81s/it]  'historical_figure_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 46 / 224 examples:  21%|██        | 46/224 [09:32<35:55, 12.11s/it]'date_calculator'
'NoneType' object has no attribute 'answer'
Processed 48 / 224 examples:  21%|██▏       | 48/224 [09:54<34:20, 11.71s/it]'date_calculator'
'NoneType' object has no attribute 'answer'
Processed 49 / 224 examples:  22%|██▏       | 49/224 [10:05<32:59, 11.31s/it]'genealogy_query'
'NoneType' object has no attribute 'answer'
Processed 61 / 224 examples:  27%|██▋       | 61/224 [12:11<28:26, 10.47s/it]'historical_figure_info_retriever'
'NoneType' object has no attribute 

2025/07/20 14:31:13 INFO dspy.teleprompt.simba: Scores after 4 batches: [0.3125, 0.25, 0.1875, 0.34375, 0.28125, 0.21875, 0.28125], Best: 0.34375

2025/07/20 14:31:13 INFO dspy.teleprompt.simba: Starting batch 5 of 12.





2025/07/20 14:32:14 INFO dspy.teleprompt.simba: Sampling program trajectories on 32 examples x 6 samples.


Processed 16 / 192 examples:   8%|▊         | 16/192 [00:15<02:19,  1.26it/s]'historical_figure_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 82 / 192 examples:  43%|████▎     | 82/192 [01:32<01:30,  1.22it/s]'historical_figure_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 114 / 192 examples:  59%|█████▉    | 114/192 [02:10<01:34,  1.21s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 147 / 192 examples:  77%|███████▋  | 147/192 [02:46<00:40,  1.11it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 175 / 192 examples:  91%|█████████ | 175/192 [03:25<00:27,  1.64s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 192 / 192 examples: 100%|██████████| 192/192 [03:50<00:00,  1.20s/it]


2025/07/20 14:36:07 INFO dspy.teleprompt.simba: Batch 5: Baseline mini-batch score: 0.4270833333333333

2025/07/20 14:36:07 INFO dspy.teleprompt.simba: Batch 5: Processing bucket #1, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.8333333333333334.
2025/07/20 14:36:07 INFO dspy.teleprompt.simba: Batch 5: Invoking strategy: append_a_demo_, having dropped 1 demos per predictor
2025/07/20 14:36:07 INFO dspy.teleprompt.simba_utils: Added 1 demos (one each) across all predictors.
2025/07/20 14:36:07 INFO dspy.teleprompt.simba: 

2025/07/20 14:36:07 INFO dspy.teleprompt.simba: Batch 5: Processing bucket #2, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.8333333333333334.
2025/07/20 14:36:07 INFO dspy.teleprompt.simba: Batch 5: Invoking strategy: append_a_rule
2025/07/20 14:36:12 INFO dspy.teleprompt.simba_utils: Advice for react.predict: When you are about to call a function like `vowel_counter` that requires a list as input, always double-check the input format. I

Processed 20 / 224 examples:   9%|▉         | 20/224 [00:28<02:27,  1.38it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 51 / 224 examples:  23%|██▎       | 51/224 [01:10<02:45,  1.04it/s]'historical_figure_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 55 / 224 examples:  25%|██▍       | 55/224 [01:15<03:39,  1.30s/it]'historical_figure_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 73 / 224 examples:  33%|███▎      | 73/224 [01:39<03:52,  1.54s/it]'genealogy_query'
'NoneType' object has no attribute 'answer'
Processed 82 / 224 examples:  37%|███▋      | 82/224 [01:50<02:26,  1.03s/it]'historical_figure_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 146 / 224 examples:  65%|██████▌   | 146/224 [03:08<01:04,  1.21it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 176 / 224 examples:  79%|███████▊  | 176/224 [03:48<01:23,  1.74s/it]'historica

2025/07/20 14:41:16 INFO dspy.teleprompt.simba: Scores after 5 batches: [0.21875, 0.34375, 0.3125, 0.40625, 0.4375, 0.4375, 0.53125], Best: 0.53125






2025/07/20 14:41:16 INFO dspy.teleprompt.simba: Starting batch 6 of 12.
2025/07/20 14:42:26 INFO dspy.teleprompt.simba: Sampling program trajectories on 32 examples x 6 samples.


Processed 4 / 192 examples:   2%|▏         | 3/192 [00:00<00:21,  8.83it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 39 / 192 examples:  20%|██        | 39/192 [00:47<02:56,  1.16s/it]'advanced_date_incrementer'
'NoneType' object has no attribute 'answer'
Processed 69 / 192 examples:  36%|███▌      | 69/192 [01:33<03:22,  1.65s/it]'historical_figure_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 107 / 192 examples:  56%|█████▌    | 107/192 [02:24<01:15,  1.13it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 136 / 192 examples:  71%|███████   | 136/192 [03:03<00:58,  1.04s/it]'advanced_date_incrementer'
'NoneType' object has no attribute 'answer'
Processed 166 / 192 examples:  86%|████████▋ | 166/192 [03:46<00:31,  1.23s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 192 / 192 examples: 100%|██████████| 192/192 [04:22<00:00,  1.37s/it]


2025/07/20 14:46:52 INFO dspy.teleprompt.simba: Batch 6: Baseline mini-batch score: 0.4479166666666667

2025/07/20 14:46:52 INFO dspy.teleprompt.simba: Batch 6: Processing bucket #1, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.8333333333333334.
2025/07/20 14:46:52 INFO dspy.teleprompt.simba: Batch 6: Invoking strategy: append_a_rule
2025/07/20 14:46:59 INFO dspy.teleprompt.simba_utils: Advice for react.predict: If the module receives a situation where a function call returns an error due to incorrect parameters, it should analyze the error message to adjust and retry with the correct parameters. For example, when using the `biographical_info_retriever` function, ensure that the `work_type` parameter matches one of the valid types (e.g., 'movie', 'series', 'documentary', 'short film') to avoid errors. Additionally, the module should be cautious about using known information as a fallback only when function calls fail completely, and ensure that such information is accur

Processed 4 / 224 examples:   2%|▏         | 4/224 [00:11<09:56,  2.71s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 39 / 224 examples:  17%|█▋        | 39/224 [01:00<03:37,  1.18s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 70 / 224 examples:  31%|███▏      | 70/224 [01:41<03:38,  1.42s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 135 / 224 examples:  60%|██████    | 135/224 [03:09<01:45,  1.19s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 166 / 224 examples:  74%|███████▍  | 166/224 [03:52<01:03,  1.09s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 198 / 224 examples:  88%|████████▊ | 198/224 [04:40<00:47,  1.84s/it]'historical_figure_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 224 / 224 examples: 100%|██████████| 224/224 [05:25<00:00,  1.46s/it]

2025/07/20 14:52:49 INFO dspy.teleprompt.simba: Scores after 6 batches: [0.375, 0.4375, 0.46875, 0.46875, 0.4375, 0.5625, 0.46875], Best: 0.5625






2025/07/20 14:52:49 INFO dspy.teleprompt.simba: Starting batch 7 of 12.
2025/07/20 14:53:50 INFO dspy.teleprompt.simba: Sampling program trajectories on 32 examples x 6 samples.


Processed 1 / 192 examples:   0%|          | 0/192 [00:00<?, ?it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 11 / 192 examples:   5%|▌         | 10/192 [00:08<00:02, 61.73it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 24 / 192 examples:  12%|█▏        | 23/192 [00:16<02:57,  1.05s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 37 / 192 examples:  19%|█▉        | 37/192 [00:29<02:59,  1.16s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 47 / 192 examples:  24%|██▍       | 47/192 [00:41<03:23,  1.40s/it]'date_calculator'
'NoneType' object has no attribute 'answer'
Processed 55 / 192 examples:  28%|██▊       | 54/192 [00:47<02:11,  1.05it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 70 / 192 examples:  36%|███▌      | 69/192 [01:03<01:59,  1.03it/s]'biographical_info_retriever'
'NoneType

2025/07/20 14:57:25 INFO dspy.teleprompt.simba: Batch 7: Baseline mini-batch score: 0.3645833333333333






2025/07/20 14:57:25 INFO dspy.teleprompt.simba: Batch 7: Processing bucket #1, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.8333333333333334.
2025/07/20 14:57:26 INFO dspy.teleprompt.simba: Batch 7: Invoking strategy: append_a_rule
2025/07/20 14:57:33 INFO dspy.teleprompt.simba_utils: Advice for react.predict: If the module receives a historical query where the death date of a figure like Mahaut, Countess of Artois, is not found in the database, it should not default to a generic date like January 1st of the known year. Instead, use specific historical knowledge to identify the exact date, such as November 27th, 1329, for Mahaut, Countess of Artois. This ensures that subsequent calculations, like subtracting years, are accurate. Additionally, ensure that the arguments for the historical query functions are as complete and precise as possible, and when errors occur, leverage external historical records as a fallback to guide the reasoning process.
2025/07/20 14:57:33 INF

Processed 7 / 224 examples:   3%|▎         | 7/224 [00:09<02:21,  1.54it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 22 / 224 examples:  10%|▉         | 22/224 [00:31<05:29,  1.63s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
'advanced_date_calculator'
'NoneType' object has no attribute 'answer'
Processed 58 / 224 examples:  26%|██▌       | 58/224 [01:16<04:34,  1.66s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 70 / 224 examples:  31%|███       | 69/224 [01:29<02:48,  1.09s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 88 / 224 examples:  39%|███▉      | 88/224 [01:51<02:59,  1.32s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 102 / 224 examples:  46%|████▌     | 102/224 [02:02<01:26,  1.41it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 109 / 224 exa

2025/07/20 15:03:02 INFO dspy.teleprompt.simba: Scores after 7 batches: [0.375, 0.34375, 0.375, 0.34375, 0.3125, 0.4375, 0.34375], Best: 0.4375






2025/07/20 15:03:02 INFO dspy.teleprompt.simba: Starting batch 8 of 12.
2025/07/20 15:04:26 INFO dspy.teleprompt.simba: Sampling program trajectories on 32 examples x 6 samples.


Processed 45 / 192 examples:  23%|██▎       | 45/192 [00:44<01:56,  1.27it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 74 / 192 examples:  39%|███▊      | 74/192 [01:24<02:59,  1.52s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 137 / 192 examples:  71%|███████▏  | 137/192 [02:40<01:20,  1.46s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 168 / 192 examples:  88%|████████▊ | 168/192 [03:14<00:23,  1.02it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 192 / 192 examples: 100%|██████████| 192/192 [03:46<00:00,  1.18s/it]


2025/07/20 15:08:17 INFO dspy.teleprompt.simba: Batch 8: Baseline mini-batch score: 0.4010416666666667

2025/07/20 15:08:17 INFO dspy.teleprompt.simba: Batch 8: Processing bucket #1, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.8333333333333334.
2025/07/20 15:08:17 INFO dspy.teleprompt.simba: Batch 8: Invoking strategy: append_a_rule
2025/07/20 15:08:27 INFO dspy.teleprompt.simba_utils: Advice for react.predict: If you encounter a query involving a familial relationship that is not directly supported by available functions, such as 'paternal grandmother,' use historical context or internal knowledge to make informed deductions. For example, if 'paternal grandmother' does not yield results, consider known historical figures or context to identify the correct individual. Additionally, ensure that when counting letters or performing calculations, you verify the accuracy of the names and numbers involved to avoid errors in the final output.
2025/07/20 15:08:27 INFO dspy.tel

Processed 12 / 224 examples:   5%|▌         | 12/224 [00:19<03:33,  1.01s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 28 / 224 examples:  12%|█▎        | 28/224 [00:42<03:45,  1.15s/it]'date_calculator'
'NoneType' object has no attribute 'answer'
Processed 39 / 224 examples:  17%|█▋        | 39/224 [00:57<03:19,  1.08s/it]'historical_figure_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 137 / 224 examples:  61%|██████    | 137/224 [03:22<03:04,  2.12s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 202 / 224 examples:  90%|█████████ | 202/224 [04:48<00:29,  1.36s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 224 / 224 examples: 100%|██████████| 224/224 [05:28<00:00,  1.47s/it]


2025/07/20 15:14:24 INFO dspy.teleprompt.simba: Scores after 8 batches: [0.4375, 0.46875, 0.4375, 0.4375, 0.375, 0.46875, 0.40625], Best: 0.46875

2025/07/20 15:14:29 INFO dspy.teleprompt.simba: Starting batch 9 of 12.
2025/07/20 15:16:27 INFO dspy.teleprompt.simba: Sampling program trajectories on 32 examples x 6 samples.


Processed 25 / 192 examples:  13%|█▎        | 25/192 [00:12<01:05,  2.53it/s]'historical_figure_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 54 / 192 examples:  28%|██▊       | 54/192 [01:11<03:14,  1.41s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 88 / 192 examples:  45%|████▌     | 87/192 [01:49<02:02,  1.16s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 96 / 192 examples:  50%|█████     | 96/192 [02:00<02:33,  1.60s/it]'date_calculator'
'NoneType' object has no attribute 'answer'
Processed 122 / 192 examples:  64%|██████▎   | 122/192 [02:28<00:41,  1.70it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 153 / 192 examples:  80%|███████▉  | 153/192 [03:00<00:22,  1.75it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 179 / 192 examples:  93%|█████████▎| 179/192 [03:30<00:11,  1.18it/s]'biographical_info

2025/07/20 15:20:22 INFO dspy.teleprompt.simba: Batch 9: Baseline mini-batch score: 0.4791666666666667

2025/07/20 15:20:22 INFO dspy.teleprompt.simba: Batch 9: Processing bucket #1, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.8333333333333334.
2025/07/20 15:20:22 INFO dspy.teleprompt.simba: Batch 9: Invoking strategy: append_a_rule
2025/07/20 15:20:27 INFO dspy.teleprompt.simba_utils: Advice for react.predict: If the module receives a question requiring genealogical information and encounters an unsupported relationship type with a function like `historical_relationship_finder`, it should switch to using factual genealogical data, such as known family trees, rather than guessing. In this case, when identifying Harriet Hemings's maternal grandfather, use established historical and genealogical resources to correctly identify John Wayles, instead of incorrectly assuming Thomas Jefferson. This ensures the correct extraction of 'Wayles' as the last name, which will lead t

Processed 24 / 224 examples:  11%|█         | 24/224 [00:32<02:38,  1.26it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 30 / 224 examples:  13%|█▎        | 30/224 [00:43<04:55,  1.52s/it]'family_relationship_finder'
'NoneType' object has no attribute 'answer'
Processed 61 / 224 examples:  27%|██▋       | 61/224 [01:23<04:16,  1.58s/it]'relationship_query'
'NoneType' object has no attribute 'answer'
Processed 80 / 224 examples:  36%|███▌      | 80/224 [01:46<03:12,  1.34s/it]'film_director_lookup'
'NoneType' object has no attribute 'answer'
Processed 88 / 224 examples:  39%|███▉      | 88/224 [01:58<03:13,  1.42s/it]'date_calculator'
'NoneType' object has no attribute 'answer'
Processed 92 / 224 examples:  41%|████      | 92/224 [02:03<03:17,  1.50s/it]'historical_figure_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 95 / 224 examples:  42%|████▏     | 95/224 [02:06<02:42,  1.26s/it]'relationship_query'
'NoneType' object ha

2025/07/20 15:25:41 INFO dspy.teleprompt.simba: Scores after 9 batches: [0.5625, 0.53125, 0.4375, 0.59375, 0.25, 0.53125, 0.5], Best: 0.59375






2025/07/20 15:25:43 INFO dspy.teleprompt.simba: Starting batch 10 of 12.
2025/07/20 15:27:48 INFO dspy.teleprompt.simba: Sampling program trajectories on 32 examples x 6 samples.


Processed 11 / 192 examples:   6%|▌         | 11/192 [00:05<01:44,  1.74it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 39 / 192 examples:  20%|██        | 39/192 [00:29<02:02,  1.25it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 73 / 192 examples:  38%|███▊      | 72/192 [01:08<02:50,  1.42s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 108 / 192 examples:  56%|█████▋    | 108/192 [01:49<01:11,  1.18it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 135 / 192 examples:  70%|███████   | 135/192 [02:13<00:35,  1.61it/s]'historical_figure_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 170 / 192 examples:  88%|████████▊ | 169/192 [02:51<00:18,  1.25it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 192 / 192 examples: 100%|██████████| 192/192 [03:19<00:00,  1.04s/it]


2025/07/20 15:31:15 INFO dspy.teleprompt.simba: Batch 10: Baseline mini-batch score: 0.3645833333333333

2025/07/20 15:31:15 INFO dspy.teleprompt.simba: Batch 10: Processing bucket #1, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.8333333333333334.
2025/07/20 15:31:15 INFO dspy.teleprompt.simba: Batch 10: Invoking strategy: append_a_rule
2025/07/20 15:31:23 INFO dspy.teleprompt.simba_utils: Advice for react.predict: If the module receives a question involving genealogical relationships not directly supported by the available functions, it should swiftly switch to using historical knowledge to fill in the gaps. For the task of identifying the paternal grandfather, if functions return errors or lack results, directly use known historical data. Specifically, after failing to identify a relationship through `genealogy_lookup` or `historical_genealogy_lookup`, verify historical records for figures like Ferdinand Maximilian and their ancestors. Once the correct individual is i

Processed 11 / 224 examples:   5%|▍         | 11/224 [00:17<03:58,  1.12s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 45 / 224 examples:  20%|██        | 45/224 [00:58<03:03,  1.03s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 69 / 224 examples:  31%|███       | 69/224 [01:28<02:59,  1.16s/it]'enhanced_timezone_converter'
'NoneType' object has no attribute 'answer'
Processed 75 / 224 examples:  33%|███▎      | 75/224 [01:35<02:56,  1.18s/it]'historical_figure_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 90 / 224 examples:  40%|████      | 90/224 [02:01<03:34,  1.60s/it]'date_calculator'
'NoneType' object has no attribute 'answer'
Processed 107 / 224 examples:  48%|████▊     | 107/224 [02:25<03:12,  1.65s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 137 / 224 examples:  61%|██████    | 137/224 [02:57<01:10,  1.24it/s]'biographical_info_r

2025/07/20 15:37:02 INFO dspy.teleprompt.simba: Scores after 10 batches: [0.34375, 0.40625, 0.21875, 0.375, 0.21875, 0.28125, 0.125], Best: 0.40625






2025/07/20 15:37:02 INFO dspy.teleprompt.simba: Starting batch 11 of 12.
2025/07/20 15:38:01 INFO dspy.teleprompt.simba: Sampling program trajectories on 32 examples x 6 samples.


Processed 26 / 192 examples:  14%|█▎        | 26/192 [00:18<02:31,  1.09it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 52 / 192 examples:  27%|██▋       | 52/192 [00:41<01:35,  1.46it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 60 / 192 examples:  31%|███       | 59/192 [00:52<01:56,  1.14it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 157 / 192 examples:  81%|████████▏ | 156/192 [02:29<00:33,  1.07it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 184 / 192 examples:  96%|█████████▌| 184/192 [03:03<00:10,  1.37s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 192 / 192 examples: 100%|██████████| 192/192 [03:17<00:00,  1.03s/it]


2025/07/20 15:41:21 INFO dspy.teleprompt.simba: Batch 11: Baseline mini-batch score: 0.4375

2025/07/20 15:41:21 INFO dspy.teleprompt.simba: Batch 11: Processing bucket #1, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.8333333333333334.
2025/07/20 15:41:21 INFO dspy.teleprompt.simba: Batch 11: Invoking strategy: append_a_demo_
2025/07/20 15:41:21 INFO dspy.teleprompt.simba_utils: Added 1 demos (one each) across all predictors.
2025/07/20 15:41:21 INFO dspy.teleprompt.simba: 

2025/07/20 15:41:21 INFO dspy.teleprompt.simba: Batch 11: Processing bucket #2, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.8333333333333334.
2025/07/20 15:41:21 INFO dspy.teleprompt.simba: Batch 11: Invoking strategy: append_a_demo_
2025/07/20 15:41:21 INFO dspy.teleprompt.simba_utils: Added 1 demos (one each) across all predictors.
2025/07/20 15:41:21 INFO dspy.teleprompt.simba: 

2025/07/20 15:41:21 INFO dspy.teleprompt.simba: Batch 11: Processing bucket #3, with max score 1.0, m

Processed 26 / 224 examples:  12%|█▏        | 26/224 [00:41<04:44,  1.43s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 51 / 224 examples:  23%|██▎       | 51/224 [01:08<01:42,  1.68it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 60 / 224 examples:  27%|██▋       | 60/224 [01:21<02:38,  1.04it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 93 / 224 examples:  42%|████▏     | 93/224 [02:02<02:55,  1.34s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 124 / 224 examples:  55%|█████▌    | 124/224 [02:51<01:31,  1.09it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 224 / 224 examples: 100%|██████████| 224/224 [05:14<00:00,  1.40s/it]

2025/07/20 15:47:00 INFO dspy.teleprompt.simba: Scores after 11 batches: [0.53125, 0.4375, 0.53125, 0.5, 0.40625, 0.4375, 0.46875], Best: 0.53125






2025/07/20 15:47:00 INFO dspy.teleprompt.simba: Starting batch 12 of 12.
2025/07/20 15:48:03 INFO dspy.teleprompt.simba: Sampling program trajectories on 32 examples x 6 samples.


  0%|          | 0/192 [00:00<?, ?it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 3 / 192 examples:   1%|          | 2/192 [00:00<00:08, 23.58it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 34 / 192 examples:  18%|█▊        | 34/192 [00:31<02:49,  1.07s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 38 / 192 examples:  20%|█▉        | 38/192 [00:35<03:08,  1.22s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 66 / 192 examples:  34%|███▍      | 66/192 [01:04<01:47,  1.17it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 72 / 192 examples:  38%|███▊      | 72/192 [01:08<01:50,  1.08it/s]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 75 / 192 examples:  39%|███▉      | 75/192 [01:14<02:46,  1.42s/it]'relationship_query'
'NoneType' object has no attribute '

2025/07/20 15:51:11 INFO dspy.teleprompt.simba: Batch 12: Baseline mini-batch score: 0.4322916666666667

2025/07/20 15:51:11 INFO dspy.teleprompt.simba: Batch 12: Processing bucket #1, with max score 1.0, max-to-min gap 1.0, and max-to-avg gap 0.8333333333333334.
2025/07/20 15:51:11 INFO dspy.teleprompt.simba: Batch 12: Invoking strategy: append_a_rule
2025/07/20 15:51:17 INFO dspy.teleprompt.simba_utils: Advice for react.predict: When using the `closest_palindrome_finder` function, if you encounter an error stating that a 'float' object cannot be interpreted as an integer, consider adjusting the parameters to ensure proper data types. Specifically, ensure that the `range_limit` parameter is set to an appropriate integer value, such as 100, to control the search space effectively and avoid type-related errors. This adjustment will help resolve the error and allow the function to return the correct palindrome number.
2025/07/20 15:51:17 INFO dspy.teleprompt.simba: 

2025/07/20 15:51:17 

Processed 1 / 224 examples:   0%|          | 1/224 [00:03<13:45,  3.70s/it]'historical_figure_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 2 / 224 examples:   1%|          | 2/224 [00:05<10:11,  2.76s/it]'date_calculator'
'NoneType' object has no attribute 'answer'
Processed 5 / 224 examples:   2%|▏         | 5/224 [00:09<05:56,  1.63s/it]'historical_figure_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 12 / 224 examples:   5%|▌         | 12/224 [00:15<03:23,  1.04it/s]'genealogy_query'
'NoneType' object has no attribute 'answer'
Processed 32 / 224 examples:  14%|█▍        | 31/224 [00:39<04:00,  1.25s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 36 / 224 examples:  16%|█▌        | 36/224 [00:46<04:27,  1.43s/it]'biographical_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 45 / 224 examples:  20%|██        | 45/224 [00:54<02:45,  1.08it/s]'biographical_info_retriever'
'NoneTy

2025/07/20 15:56:00 INFO dspy.teleprompt.simba: Scores after 12 batches: [0.5, 0.46875, 0.46875, 0.46875, 0.53125, 0.5, 0.46875], Best: 0.53125






2025/07/20 15:56:01 INFO dspy.teleprompt.simba: VALIDATION: Evaluating 7 programs on the full trainset.


Processed 58 / 700 examples:   8%|▊         | 58/700 [01:04<10:43,  1.00s/it]'{ "historical_relationship_finder": {"function_name": "historical_relationship_finder", "arguments": "(figure_name: str, relationship_type: str, context: str = \'\', output_format: str = \'text\', date_range: dict = None, include_extended_family: bool = False)", "docstring": "No docstring."} }'
'NoneType' object has no attribute 'answer'
Processed 108 / 700 examples:  15%|█▌        | 108/700 [01:46<04:19,  2.28it/s]'date_calculator'
'NoneType' object has no attribute 'answer'
Processed 110 / 700 examples:  16%|█▌        | 110/700 [01:52<12:11,  1.24s/it]'historical_figure_info_retriever'
'NoneType' object has no attribute 'answer'
Processed 129 / 700 examples:  18%|█▊        | 128/700 [02:08<13:34,  1.42s/it]'date_calculator'
'NoneType' object has no attribute 'answer'
Processed 140 / 700 examples:  20%|██        | 140/700 [02:15<07:21,  1.27it/s]'historical_figure_info_retriever'
'NoneType' object has no att

2025/07/20 16:06:09 INFO dspy.teleprompt.simba: Final trainset scores: [0.22, 0.35, 0.43, 0.44, 0.39, 0.47, 0.5], Best: 0.5 (at index 6)





In [10]:
evaluate(optimized_agent)

Average Metric: 25.00 / 45 (55.6%):  15%|█▌        | 45/300 [00:25<02:22,  1.79it/s]

2025/07/20 16:06:39 ERROR dspy.utils.parallelizer: Error for Example({'question': "What is the date 6 days after when Prince Nikolai Of Denmark's mother was born?", 'answer': '1964-07-06', 'functions': {'royal_family_info_retriever': <function royal_family_info_retriever at 0x13d52d4e0>, 'sports_figure_bio_retriever': <function sports_figure_bio_retriever at 0x13d52d580>, 'date_calculator': <function date_calculator at 0x13d52d6c0>, 'finish': <function finish at 0x13d2ea520>}}) (input_keys={'functions', 'question'}): 'biographical_info_retriever'. Set `provide_traceback=True` for traceback.


Average Metric: 28.00 / 49 (57.1%):  17%|█▋        | 50/300 [00:26<01:10,  3.54it/s]

2025/07/20 16:06:40 ERROR dspy.utils.parallelizer: Error for Example({'question': "What is the date 7 days after the director of film Kalteva Torni 's birthday?", 'answer': '1963-11-07', 'functions': {'film_info_retriever': <function film_info_retriever at 0x13d52de40>, 'sports_figure_info_retriever': <function sports_figure_info_retriever at 0x13d52dee0>, 'date_calculator': <function date_calculator at 0x13d52e0c0>, 'finish': <function finish at 0x13d2ea520>}}) (input_keys={'functions', 'question'}): 'biographical_info_retriever'. Set `provide_traceback=True` for traceback.


Average Metric: 50.00 / 82 (61.0%):  28%|██▊       | 83/300 [00:39<00:56,  3.86it/s]

2025/07/20 16:06:54 ERROR dspy.utils.parallelizer: Error for Example({'question': "What is the sum of all the digits in the day, month and year of when Princess Feodora Of Denmark's father was born?", 'answer': '31', 'functions': {'family_relationship_finder': <function family_relationship_finder at 0x13d589ee0>, 'sports_figure_info_retriever': <function sports_figure_info_retriever at 0x13d58a020>, 'date_digit_sum_enhanced': <function date_digit_sum_enhanced at 0x13d58a0c0>, 'finish': <function finish at 0x13d2ea520>}}) (input_keys={'functions', 'question'}): 'biographical_info_retriever'. Set `provide_traceback=True` for traceback.


Average Metric: 51.00 / 84 (60.7%):  29%|██▉       | 87/300 [00:41<01:14,  2.84it/s]

2025/07/20 16:06:55 ERROR dspy.utils.parallelizer: Error for Example({'question': "How many days lay between Anthony William Fairbank Edwards's father's birthday and 1997-06-15?", 'answer': '35733', 'functions': {'family_tree_lookup': <function family_tree_lookup at 0x13d58aac0>, 'sports_figure_info_retriever': <function sports_figure_info_retriever at 0x13d58ac00>, 'date_difference_calculator': <function date_difference_calculator at 0x13d58ad40>, 'finish': <function finish at 0x13d2ea520>}}) (input_keys={'functions', 'question'}): 'biographical_info_retriever'. Set `provide_traceback=True` for traceback.


Average Metric: 94.00 / 161 (58.4%):  55%|█████▌    | 165/300 [01:10<00:58,  2.32it/s]

2025/07/20 16:07:25 ERROR dspy.utils.parallelizer: Error for Example({'question': 'What is the day of the date of birth of the director of Kon-Tiki (1950 Film)?', 'answer': '6', 'functions': {'film_director_lookup': <function film_director_lookup at 0x13d75b7e0>, 'sports_figure_info_retriever': <function sports_figure_info_retriever at 0x13d75b880>, 'day_of_week_calculator': <function day_of_week_calculator at 0x13d75b9c0>, 'finish': <function finish at 0x13d2ea520>}}) (input_keys={'functions', 'question'}): 'biographical_info_retriever'. Set `provide_traceback=True` for traceback.


Average Metric: 100.00 / 170 (58.8%):  58%|█████▊    | 175/300 [01:16<01:23,  1.50it/s]

2025/07/20 16:07:30 ERROR dspy.utils.parallelizer: Error for Example({'question': "What is the date one week before when George Cochrane (Politician)'s father died?", 'answer': '1778-10-24', 'functions': {'enhanced_family_relationship_finder': <function enhanced_family_relationship_finder at 0x13d77cd60>, 'historical_biography_retriever': <function historical_biography_retriever at 0x13d77ce00>, 'date_calculator': <function date_calculator at 0x13d77cc20>, 'finish': <function finish at 0x13d2ea520>}}) (input_keys={'functions', 'question'}): 'biographical_info_retriever'. Set `provide_traceback=True` for traceback.


Average Metric: 107.00 / 181 (59.1%):  62%|██████▏   | 187/300 [01:20<00:44,  2.55it/s]

2025/07/20 16:07:35 ERROR dspy.utils.parallelizer: Error for Example({'question': "What is the total number of letters in the first name of Viacheslav I Of Kiev's maternal grandfather?", 'answer': '6', 'functions': {'historical_family_tree': <function historical_family_tree at 0x13d75a160>, 'genealogy_query': <function genealogy_query at 0x13d75a020>, 'extract_first_name': <function extract_first_name at 0x13d75a2a0>, 'string_length_calculator_v2': <function string_length_calculator_v2 at 0x13d75a3e0>, 'finish': <function finish at 0x13d2ea520>}}) (input_keys={'functions', 'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o in organization org-x03WTFMofvP4w8cBY4XBkePo on tokens per min (TPM): Limit 2000000, Used 2000000, Requested 6785. Please try again in 203ms. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 107.00 / 181 (59.1%):  63%|██████▎   | 188/300 [01:22<01:17,  1.44it/s]

2025/07/20 16:07:35 ERROR dspy.utils.parallelizer: Error for Example({'question': 'What is the sum of the (unique) prime factors of when the publisher of American Scientist was founded?', 'answer': '66', 'functions': {'publication_info_retriever': <function publication_info_retriever at 0x13d758ea0>, 'historical_data_retriever': <function historical_data_retriever at 0x13d759080>, 'prime_factorizer': <function prime_factorizer at 0x13d759120>, 'advanced_sum_calculator': <function advanced_sum_calculator at 0x13d759260>, 'finish': <function finish at 0x13d2ea520>}}) (input_keys={'functions', 'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o in organization org-x03WTFMofvP4w8cBY4XBkePo on tokens per min (TPM): Limit 2000000, Used 2000000, Requested 6764. Please try again in 202ms. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 108.00 / 182 (59.3%):  63%|██████▎   | 190/300 [01:22<00:47,  2.33it/s]

2025/07/20 16:07:36 ERROR dspy.utils.parallelizer: Error for Example({'question': 'How many additional letters does the last name of Kara Malone have compared to the first name of the maternal grandfather of Vyacheslav Yaroslavich?', 'answer': '2', 'functions': {'genealogy_lookup': <function genealogy_lookup at 0x13d77e3e0>, 'extract_last_name': <function extract_last_name at 0x13d77e520>, 'count_letters': <function count_letters at 0x13d77e5c0>, 'extract_first_name': <function extract_first_name at 0x13d77e660>, 'name_length_counter': <function name_length_counter at 0x13d77e840>, 'calculate_difference': <function calculate_difference at 0x13d77e8e0>, 'finish': <function finish at 0x13d2ea520>}}) (input_keys={'functions', 'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o in organization org-x03WTFMofvP4w8cBY4XBkePo on tokens per min (TPM): Limit 2000000, Used 2000000, Requested 6487. Please try again in 194ms. Visit https://platform.

Average Metric: 114.00 / 195 (58.5%):  68%|██████▊   | 204/300 [01:33<00:54,  1.76it/s]

2025/07/20 16:07:47 ERROR dspy.utils.parallelizer: Error for Example({'question': 'How many vowels are there in the last name of the spouse of the director of film På Solsiden?', 'answer': '2', 'functions': {'film_metadata_query': <function film_metadata_query at 0x13d7b6020>, 'spouse_finder': <function spouse_finder at 0x13d7b6160>, 'extract_last_name': <function extract_last_name at 0x13d7b6200>, 'vowel_counter': <function vowel_counter at 0x13d7b6340>, 'finish': <function finish at 0x13d2ea520>}}) (input_keys={'functions', 'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o in organization org-x03WTFMofvP4w8cBY4XBkePo on tokens per min (TPM): Limit 2000000, Used 2000000, Requested 6310. Please try again in 189ms. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 114.00 / 195 (58.5%):  68%|██████▊   | 204/300 [01:33<00:54,  1.76it/s]

2025/07/20 16:07:48 ERROR dspy.utils.parallelizer: Error for Example({'question': "What is the ASCII code of the first letter of the first name of the spouse of the director of film The Belles Of St. Trinian'S in lowercase?", 'answer': '98', 'functions': {'movie_info_retriever': <function movie_info_retriever at 0x13d7b6480>, 'relationship_query': <function relationship_query at 0x13d7b6520>, 'extract_first_name': <function extract_first_name at 0x13d7b65c0>, 'extract_first_letter': <function extract_first_letter at 0x13d7b6660>, 'ascii_lookup_plus': <function ascii_lookup_plus at 0x13d7b67a0>, 'finish': <function finish at 0x13d2ea520>}}) (input_keys={'functions', 'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o in organization org-x03WTFMofvP4w8cBY4XBkePo on tokens per min (TPM): Limit 2000000, Used 2000000, Requested 6684. Please try again in 200ms. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide

Average Metric: 114.00 / 195 (58.5%):  69%|██████▊   | 206/300 [01:34<00:45,  2.09it/s]

2025/07/20 16:07:48 ERROR dspy.utils.parallelizer: Error for Example({'question': 'What is the sum of all the digits in the day, month and year of the date of birth of the English inventor that developed the Richard Hornsby & Sons oil engine?', 'answer': '30', 'functions': {'inventor_lookup': <function inventor_lookup at 0x13d7f1260>, 'sports_figure_bio_retriever': <function sports_figure_bio_retriever at 0x13d7f13a0>, 'date_digit_sum': <function date_digit_sum at 0x13d7f1440>, 'finish': <function finish at 0x13d2ea520>}}) (input_keys={'functions', 'question'}): 'biographical_info_retriever'. Set `provide_traceback=True` for traceback.


Average Metric: 116.00 / 198 (58.6%):  70%|███████   | 210/300 [01:35<00:29,  3.03it/s]

2025/07/20 16:07:50 ERROR dspy.utils.parallelizer: Error for Example({'question': 'What is the date one week before the date of death of the composer of film Oomana Thinkal?', 'answer': '1988-03-02', 'functions': {'song_composer_finder': <function song_composer_finder at 0x13d7b71a0>, 'sports_figure_info_retriever': <function sports_figure_info_retriever at 0x13d7b7240>, 'advanced_date_calculator': <function advanced_date_calculator at 0x13d7b72e0>, 'finish': <function finish at 0x13d2ea520>}}) (input_keys={'functions', 'question'}): 'biographical_info_retriever'. Set `provide_traceback=True` for traceback.


Average Metric: 131.00 / 220 (59.5%):  78%|███████▊  | 233/300 [01:53<00:26,  2.52it/s]

2025/07/20 16:08:07 ERROR dspy.utils.parallelizer: Error for Example({'question': 'What is the date one day after the date of death of the director of film Nallavan Vazhvan?', 'answer': '1992-09-04', 'functions': {'movie_director_lookup': <function movie_director_lookup at 0x13e0168e0>, 'sports_figure_bio_info': <function sports_figure_bio_info at 0x13e016980>, 'date_calculator': <function date_calculator at 0x13e016b60>, 'finish': <function finish at 0x13d2ea520>}}) (input_keys={'functions', 'question'}): 'biographical_info_retriever'. Set `provide_traceback=True` for traceback.


Average Metric: 131.00 / 221 (59.3%):  78%|███████▊  | 235/300 [01:54<00:23,  2.76it/s]

2025/07/20 16:08:09 ERROR dspy.utils.parallelizer: Error for Example({'question': 'What is the reverse order of the letters in the first name of the spouse of the director of film The Invisible Menace?', 'answer': 'neeruam', 'functions': {'movie_info_retriever': <function movie_info_retriever at 0x13e0145e0>, 'relationship_query': <function relationship_query at 0x13e014720>, 'extract_first_name': <function extract_first_name at 0x13e014860>, 'reverse_string': <function reverse_string at 0x13e014900>, 'finish': <function finish at 0x13d2ea520>}}) (input_keys={'functions', 'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o in organization org-x03WTFMofvP4w8cBY4XBkePo on tokens per min (TPM): Limit 2000000, Used 2000000, Requested 6695. Please try again in 200ms. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 139.00 / 237 (58.6%):  84%|████████▎ | 251/300 [02:07<00:48,  1.01it/s]

2025/07/20 16:08:21 ERROR dspy.utils.parallelizer: Error for Example({'question': 'What is the combined length of the first name and last name of the founder of the political party of Dimuthu Bandara Abayakoon?', 'answer': '15', 'functions': {'political_affiliation_lookup': <function political_affiliation_lookup at 0x13e044180>, 'founder_identifier': <function founder_identifier at 0x13e0442c0>, 'extract_name_component': <function extract_name_component at 0x13e044360>, 'extract_last_name': <function extract_last_name at 0x13e044400>, 'enhanced_combined_length_calculator': <function enhanced_combined_length_calculator at 0x13e0444a0>, 'finish': <function finish at 0x13d2ea520>}}) (input_keys={'functions', 'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o in organization org-x03WTFMofvP4w8cBY4XBkePo on tokens per min (TPM): Limit 2000000, Used 2000000, Requested 6630. Please try again in 198ms. Visit https://platform.openai.com/account

Average Metric: 143.00 / 243 (58.8%):  86%|████████▋ | 259/300 [02:12<00:29,  1.40it/s]

2025/07/20 16:08:27 ERROR dspy.utils.parallelizer: Error for Example({'question': "What is the date one day before when Joachim I, Prince Of Anhalt-Dessau's father died?", 'answer': '1516-06-11', 'functions': {'genealogy_query': <function genealogy_query at 0x13e045e40>, 'historical_figure_info_retriever': <function historical_figure_info_retriever at 0x13e046020>, 'date_calculator': <function date_calculator at 0x13e0460c0>, 'finish': <function finish at 0x13d2ea520>}}) (input_keys={'functions', 'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o in organization org-x03WTFMofvP4w8cBY4XBkePo on tokens per min (TPM): Limit 2000000, Used 2000000, Requested 6507. Please try again in 195ms. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 146.00 / 246 (59.3%):  88%|████████▊ | 263/300 [02:15<00:24,  1.48it/s]

2025/07/20 16:08:29 ERROR dspy.utils.parallelizer: Error for Example({'question': "What is the binary code of the first letter of the last name of Hugh Cholmondeley, 1St Earl Of Cholmondeley's paternal grandmother in lowercase?", 'answer': '1100010', 'functions': {'genealogy_query': <function genealogy_query at 0x13e047740>, 'genealogy_lookup': <function genealogy_lookup at 0x13e047880>, 'extract_last_name': <function extract_last_name at 0x13e047920>, 'extract_first_letter': <function extract_first_letter at 0x13e047a60>, 'char_to_binary': <function char_to_binary at 0x13e047b00>, 'finish': <function finish at 0x13d2ea520>}}) (input_keys={'functions', 'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o in organization org-x03WTFMofvP4w8cBY4XBkePo on tokens per min (TPM): Limit 2000000, Used 2000000, Requested 6833. Please try again in 204ms. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=Tr

Average Metric: 148.00 / 252 (58.7%):  90%|█████████ | 270/300 [02:18<00:14,  2.11it/s]

2025/07/20 16:08:33 ERROR dspy.utils.parallelizer: Error for Example({'question': 'What is the date one day after the birthdate of this American comedian, actress, and writer, who was a supporting cast remember in Brooklyn Nine-Nine as Gina?', 'answer': '1978-02-21', 'functions': {'person_identifier': <function person_identifier at 0x13e07b560>, 'sports_figure_info_retriever': <function sports_figure_info_retriever at 0x13e07b420>, 'date_incrementer': <function date_incrementer at 0x13e07b600>, 'finish': <function finish at 0x13d2ea520>}}) (input_keys={'functions', 'question'}): 'biographical_info_retriever'. Set `provide_traceback=True` for traceback.


Average Metric: 150.00 / 254 (59.1%):  91%|█████████ | 273/300 [02:21<00:21,  1.28it/s]

2025/07/20 16:08:35 ERROR dspy.utils.parallelizer: Error for Example({'question': "How many unique letters are there in the last name of the father of Nim's actress in Return to Nim's Island?", 'answer': '4', 'functions': {'movie_cast_lookup': <function movie_cast_lookup at 0x13e016fc0>, 'enhanced_family_relationship_finder': <function enhanced_family_relationship_finder at 0x13e0171a0>, 'extract_last_name': <function extract_last_name at 0x13e017240>, 'enhanced_unique_letter_counter': <function enhanced_unique_letter_counter at 0x13e0174c0>, 'finish': <function finish at 0x13d2ea520>}}) (input_keys={'functions', 'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o in organization org-x03WTFMofvP4w8cBY4XBkePo on tokens per min (TPM): Limit 2000000, Used 2000000, Requested 6812. Please try again in 204ms. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 150.00 / 254 (59.1%):  91%|█████████▏| 274/300 [02:21<00:16,  1.55it/s]

2025/07/20 16:08:36 ERROR dspy.utils.parallelizer: Error for Example({'question': 'What is the date 10 weeks after the date of death of the director of film Cross Current?', 'answer': '2014-05-18', 'functions': {'film_info_retriever': <function film_info_retriever at 0x13e07a0c0>, 'sports_figure_bio_retriever': <function sports_figure_bio_retriever at 0x13e07a520>, 'date_calculator': <function date_calculator at 0x13e07a5c0>, 'finish': <function finish at 0x13d2ea520>}}) (input_keys={'functions', 'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o in organization org-x03WTFMofvP4w8cBY4XBkePo on tokens per min (TPM): Limit 2000000, Used 1996475, Requested 6186. Please try again in 79ms. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 152.00 / 257 (59.1%):  93%|█████████▎| 278/300 [02:23<00:09,  2.35it/s]

2025/07/20 16:08:37 ERROR dspy.utils.parallelizer: Error for Example({'question': "What is the sum of all the digits in the day, month and year of when Prince Nikolai Of Denmark's mother was born?", 'answer': '29', 'functions': {'family_relationship_finder': <function family_relationship_finder at 0x13e0459e0>, 'biographical_info_retriever': <function biographical_info_retriever at 0x13e045b20>, 'date_digit_sum': <function date_digit_sum at 0x13e045a80>, 'finish': <function finish at 0x13d2ea520>}}) (input_keys={'functions', 'question'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4o in organization org-x03WTFMofvP4w8cBY4XBkePo on tokens per min (TPM): Limit 2000000, Used 1994852, Requested 6965. Please try again in 54ms. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 166.00 / 278 (59.7%): 100%|██████████| 300/300 [02:35<00:00,  1.92it/s]

2025/07/20 16:08:49 INFO dspy.evaluate.evaluate: Average Metric: 166.0 / 300 (55.3%)





EvaluationResult(score=55.33, results=<list of 300 results>)

In [11]:
dspy.inspect_history()





[34m[2025-07-20T16:08:49.545101][0m

[31mSystem message:[0m

Your input fields are:
1. `question` (str): 
2. `trajectory` (str): 
3. `functions` (str):
Your output fields are:
1. `reasoning` (str): 
2. `next_selected_fn` (str): 
3. `args` (dict[str, Any]):
All interactions will be structured in the following way, with the appropriate values filled in.

[[ ## question ## ]]
{question}

[[ ## trajectory ## ]]
{trajectory}

[[ ## functions ## ]]
{functions}

[[ ## reasoning ## ]]
{reasoning}

[[ ## next_selected_fn ## ]]
{next_selected_fn}

[[ ## args ## ]]
{args}        # note: the value you produce must adhere to the JSON schema: {"type": "object", "additionalProperties": true}

[[ ## completed ## ]]
In adhering to this structure, your objective is: 
        For the final answer, produce short (not full sentence) answers in which you format dates as YYYY-MM-DD, names as Firstname Lastname, and numbers without leading 0s.
        
        If the module receives a question involvi

In [12]:
optimized_agent.save("optimized_gpt4o.json", save_program=False)