In [1]:
import sys
import os
import dspy 
from common.my_settings import MySettings  
from common.utils import md
from common.llm_client_factory import LlmClientFactory
from dspy_utils.dspy_helpers import md_dspy

settings = MySettings().get()

Getting keys from environment variables


In [2]:
# Smaller LLM, this is the one that we are trying to optimize for, the prompts are going to be tweaked
# to get the best out of this model
lm_gpt35 = dspy.LM('gpt-3.5-turbo', temperature=0.8, model_type='chat', cache=False, api_key=settings.OPENAI_API_KEY)
#dspy.configure(lm=lm_gpt35)

# lm_gpt41mini = dspy.LM('gpt-4.1-mini', temperature=0.8, model_type='chat', cache=False, api_key=settings.OPENAI_API_KEY)
# lm_gpt4omini = dspy.LM('gpt-4o-mini', temperature=0.8, model_type='chat', cache=False, api_key=settings.OPENAI_API_KEY)


# # Larger LLM, this is the one that we are going to use to optimize the prompts
# # It will be the helper/teach/AI Judge to assist in the optimization process
lm_gpt4 = dspy.LM('gpt-4.1', temperature=0.9, model_type='chat', cache=False, api_key=settings.OPENAI_API_KEY)
dspy.configure(lm=lm_gpt4)

In [4]:
# Create domain classes
from typing import Literal
    
class NumberPicker(dspy.Signature):
    """Guess a number"""
    number_guess: str = dspy.InputField()
    answer: Literal["one", "two"] = dspy.OutputField()

numberPickerPredict = dspy.Predict(NumberPicker)
numberPickerPredict(number_guess="even")

Prediction(
    answer='two'
)

In [5]:
trainset = [
    dspy.Example(number_guess="1", answer="one").with_inputs("number_guess"),
    dspy.Example(number_guess="2", answer="two").with_inputs("number_guess"),
    dspy.Example(number_guess="One", answer="one").with_inputs("number_guess"),
    dspy.Example(number_guess="Two", answer="two").with_inputs("number_guess"),
    dspy.Example(number_guess="The number: 1", answer="one").with_inputs("number_guess"),
    dspy.Example(number_guess="The number: 2", answer="two").with_inputs("number_guess"),
    dspy.Example(number_guess="Select the number: 1", answer="one").with_inputs("number_guess"),
    dspy.Example(number_guess="Select the number: 2", answer="two").with_inputs("number_guess"),
    dspy.Example(number_guess="Choose: 1", answer="one").with_inputs("number_guess"),
    dspy.Example(number_guess="Choose: 2", answer="two").with_inputs("number_guess"),
    dspy.Example(number_guess="Pick 1", answer="one").with_inputs("number_guess"),
    dspy.Example(number_guess="Pick 2", answer="two").with_inputs("number_guess"),
    dspy.Example(number_guess="Number one", answer="one").with_inputs("number_guess"),
    dspy.Example(number_guess="Number two", answer="two").with_inputs("number_guess"),
    dspy.Example(number_guess="Option 1", answer="one").with_inputs("number_guess"),
    dspy.Example(number_guess="Option 2", answer="two").with_inputs("number_guess"),
    dspy.Example(number_guess="Answer is 1", answer="one").with_inputs("number_guess"),
    dspy.Example(number_guess="Answer is 2", answer="two").with_inputs("number_guess"),
    dspy.Example(number_guess="Digit: 1", answer="one").with_inputs("number_guess"),
    dspy.Example(number_guess="Digit: 2", answer="two").with_inputs("number_guess"),
    dspy.Example(number_guess="Numeric value: 1", answer="one").with_inputs("number_guess"),
    dspy.Example(number_guess="Numeric value: 2", answer="two").with_inputs("number_guess"),
    dspy.Example(number_guess="Textual value: one", answer="one").with_inputs("number_guess"),
    dspy.Example(number_guess="Textual value: two", answer="two").with_inputs("number_guess"),
    dspy.Example(number_guess="Guess: 1", answer="one").with_inputs("number_guess"),
    dspy.Example(number_guess="Guess: 2", answer="two").with_inputs("number_guess"),
    dspy.Example(number_guess="1", answer="one").with_inputs("number_guess"),
    dspy.Example(number_guess="2", answer="two").with_inputs("number_guess"),
    dspy.Example(number_guess="One", answer="one").with_inputs("number_guess"),
    dspy.Example(number_guess="Two", answer="two").with_inputs("number_guess"),
    dspy.Example(number_guess="The number: 1", answer="one").with_inputs("number_guess"),
    dspy.Example(number_guess="The number: 2", answer="two").with_inputs("number_guess"),
    dspy.Example(number_guess="Select the number: 1", answer="one").with_inputs("number_guess"),
    dspy.Example(number_guess="Select the number: 2", answer="two").with_inputs("number_guess"),
    dspy.Example(number_guess="Choose: 1", answer="one").with_inputs("number_guess"),
    dspy.Example(number_guess="Choose: 2", answer="two").with_inputs("number_guess"),
    dspy.Example(number_guess="Pick 1", answer="one").with_inputs("number_guess"),
    dspy.Example(number_guess="Pick 2", answer="two").with_inputs("number_guess"),
    dspy.Example(number_guess="Number one", answer="one").with_inputs("number_guess"),
    dspy.Example(number_guess="Number two", answer="two").with_inputs("number_guess"),
    dspy.Example(number_guess="Option 1", answer="one").with_inputs("number_guess"),
    dspy.Example(number_guess="Option 2", answer="two").with_inputs("number_guess"),
    dspy.Example(number_guess="Answer is 1", answer="one").with_inputs("number_guess"),
    dspy.Example(number_guess="Answer is 2", answer="two").with_inputs("number_guess"),
    dspy.Example(number_guess="Digit: 1", answer="one").with_inputs("number_guess"),
    dspy.Example(number_guess="Digit: 2", answer="two").with_inputs("number_guess"),
    dspy.Example(number_guess="Numeric value: 1", answer="one").with_inputs("number_guess"),
    dspy.Example(number_guess="Numeric value: 2", answer="two").with_inputs("number_guess"),
    dspy.Example(number_guess="Textual value: one", answer="one").with_inputs("number_guess"),
    dspy.Example(number_guess="Textual value: two", answer="two").with_inputs("number_guess")
]


In [6]:
def validate_match(expected, actual, trace=None) -> bool:
    print()
    md("**expected**: ", expected)
    print("**actual**: ", actual)
    md("**Is match**: ", actual.answer == "two")
    print()
    return (actual.answer == "two")

from dspy.teleprompt import *

tp = dspy.MIPROv2(metric=validate_match, auto="heavy", prompt_model=lm_gpt35, task_model=lm_gpt4)
optimized_matcher = tp.compile(numberPickerPredict, trainset=trainset, requires_permission_to_run=False)

2025/09/04 14:52:51 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING HEAVY AUTO RUN SETTINGS:
num_trials: 27
minibatch: False
num_fewshot_candidates: 18
num_instruct_candidates: 9
valset size: 40

2025/09/04 14:52:51 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/09/04 14:52:51 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/09/04 14:52:51 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=18 sets of demonstrations...


Bootstrapping set 1/18
Bootstrapping set 2/18
Bootstrapping set 3/18


  0%|          | 0/10 [00:00<?, ?it/s]




**expected**: Example({'number_guess': '1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

 10%|█         | 1/10 [00:02<00:23,  2.59s/it]





**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 20%|██        | 2/10 [00:03<00:12,  1.53s/it]





**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

 30%|███       | 3/10 [00:04<00:09,  1.40s/it]





**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 40%|████      | 4/10 [00:05<00:06,  1.14s/it]





**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

 50%|█████     | 5/10 [00:06<00:04,  1.03it/s]





**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 60%|██████    | 6/10 [00:07<00:04,  1.20s/it]





**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

 70%|███████   | 7/10 [00:08<00:03,  1.06s/it]





**expected**: Example({'number_guess': 'Select the number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 80%|████████  | 8/10 [00:09<00:02,  1.15s/it]



Bootstrapped 4 full traces after 8 examples for up to 1 rounds, amounting to 8 attempts.
Bootstrapping set 4/18


  0%|          | 0/10 [00:00<?, ?it/s]




**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

 10%|█         | 1/10 [00:01<00:10,  1.16s/it]





**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 20%|██        | 2/10 [00:02<00:07,  1.02it/s]





**expected**: Example({'number_guess': 'Choose: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

 30%|███       | 3/10 [00:02<00:06,  1.11it/s]





**expected**: Example({'number_guess': 'Select the number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 40%|████      | 4/10 [00:03<00:04,  1.24it/s]





**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 50%|█████     | 5/10 [00:04<00:03,  1.31it/s]





**expected**: Example({'number_guess': '1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

 60%|██████    | 6/10 [00:05<00:03,  1.03it/s]





**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 70%|███████   | 7/10 [00:06<00:02,  1.10it/s]



Bootstrapped 4 full traces after 7 examples for up to 1 rounds, amounting to 7 attempts.
Bootstrapping set 5/18


  0%|          | 0/10 [00:00<?, ?it/s]




**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

 10%|█         | 1/10 [00:00<00:06,  1.44it/s]





**expected**: Example({'number_guess': 'Choose: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

 20%|██        | 2/10 [00:01<00:06,  1.26it/s]





**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 30%|███       | 3/10 [00:02<00:05,  1.24it/s]





**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

 40%|████      | 4/10 [00:03<00:04,  1.23it/s]





**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 50%|█████     | 5/10 [00:03<00:03,  1.27it/s]



Bootstrapped 2 full traces after 5 examples for up to 1 rounds, amounting to 5 attempts.
Bootstrapping set 6/18


  0%|          | 0/10 [00:00<?, ?it/s]




**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 10%|█         | 1/10 [00:00<00:07,  1.14it/s]





**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

 20%|██        | 2/10 [00:01<00:06,  1.30it/s]





**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

 30%|███       | 3/10 [00:02<00:05,  1.36it/s]





**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 40%|████      | 4/10 [00:02<00:04,  1.40it/s]



Bootstrapped 2 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 7/18


  0%|          | 0/10 [00:00<?, ?it/s]




**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 10%|█         | 1/10 [00:01<00:13,  1.51s/it]



Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 8/18


  0%|          | 0/10 [00:00<?, ?it/s]




**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

 10%|█         | 1/10 [00:02<00:23,  2.63s/it]





**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 20%|██        | 2/10 [00:03<00:13,  1.64s/it]





**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 30%|███       | 3/10 [00:05<00:11,  1.68s/it]



Bootstrapped 2 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 9/18


  0%|          | 0/10 [00:00<?, ?it/s]




**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

 10%|█         | 1/10 [00:00<00:07,  1.16it/s]





**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 20%|██        | 2/10 [00:01<00:06,  1.22it/s]





**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 30%|███       | 3/10 [00:02<00:05,  1.24it/s]





**expected**: Example({'number_guess': 'Select the number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 40%|████      | 4/10 [00:03<00:04,  1.29it/s]



Bootstrapped 3 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 10/18


  0%|          | 0/10 [00:00<?, ?it/s]




**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 10%|█         | 1/10 [00:01<00:09,  1.01s/it]



Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 11/18


  0%|          | 0/10 [00:00<?, ?it/s]




**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 10%|█         | 1/10 [00:00<00:06,  1.29it/s]





**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 20%|██        | 2/10 [00:01<00:05,  1.40it/s]





**expected**: Example({'number_guess': '1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

 30%|███       | 3/10 [00:02<00:04,  1.49it/s]





**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

 40%|████      | 4/10 [00:02<00:04,  1.35it/s]





**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

 50%|█████     | 5/10 [00:03<00:03,  1.38it/s]





**expected**: Example({'number_guess': 'Select the number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 60%|██████    | 6/10 [00:04<00:03,  1.33it/s]



Bootstrapped 3 full traces after 6 examples for up to 1 rounds, amounting to 6 attempts.
Bootstrapping set 12/18


  0%|          | 0/10 [00:00<?, ?it/s]




**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

 10%|█         | 1/10 [00:01<00:11,  1.25s/it]





**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

 20%|██        | 2/10 [00:02<00:08,  1.00s/it]





**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 30%|███       | 3/10 [00:03<00:06,  1.03it/s]





**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 40%|████      | 4/10 [00:03<00:05,  1.09it/s]



Bootstrapped 2 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 13/18


  0%|          | 0/10 [00:00<?, ?it/s]




**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

 10%|█         | 1/10 [00:00<00:06,  1.46it/s]





**expected**: Example({'number_guess': 'Select the number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 20%|██        | 2/10 [00:01<00:05,  1.50it/s]





**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

 30%|███       | 3/10 [00:02<00:04,  1.46it/s]





**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 40%|████      | 4/10 [00:02<00:04,  1.46it/s]





**expected**: Example({'number_guess': '1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

 50%|█████     | 5/10 [00:03<00:03,  1.33it/s]





**expected**: Example({'number_guess': 'Choose: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

 60%|██████    | 6/10 [00:04<00:03,  1.29it/s]





**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 70%|███████   | 7/10 [00:05<00:02,  1.33it/s]



Bootstrapped 3 full traces after 7 examples for up to 1 rounds, amounting to 7 attempts.
Bootstrapping set 14/18


  0%|          | 0/10 [00:00<?, ?it/s]




**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

 10%|█         | 1/10 [00:00<00:07,  1.19it/s]





**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 20%|██        | 2/10 [00:01<00:07,  1.06it/s]





**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 30%|███       | 3/10 [00:02<00:06,  1.09it/s]



Bootstrapped 2 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 15/18


  0%|          | 0/10 [00:00<?, ?it/s]




**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 10%|█         | 1/10 [00:00<00:06,  1.40it/s]



Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 16/18


  0%|          | 0/10 [00:00<?, ?it/s]




**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 10%|█         | 1/10 [00:00<00:07,  1.26it/s]



Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 17/18


  0%|          | 0/10 [00:00<?, ?it/s]




**expected**: Example({'number_guess': 'Choose: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

 10%|█         | 1/10 [00:00<00:07,  1.26it/s]





**expected**: Example({'number_guess': '1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

 20%|██        | 2/10 [00:02<00:09,  1.22s/it]





**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

 30%|███       | 3/10 [00:02<00:06,  1.05it/s]





**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 40%|████      | 4/10 [00:03<00:05,  1.13it/s]





**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

 50%|█████     | 5/10 [00:04<00:03,  1.26it/s]





**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 60%|██████    | 6/10 [00:05<00:03,  1.32it/s]





**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 70%|███████   | 7/10 [00:05<00:02,  1.21it/s]



Bootstrapped 3 full traces after 7 examples for up to 1 rounds, amounting to 7 attempts.
Bootstrapping set 18/18


  0%|          | 0/10 [00:00<?, ?it/s]




**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 10%|█         | 1/10 [00:00<00:06,  1.32it/s]





**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

 20%|██        | 2/10 [00:01<00:06,  1.30it/s]





**expected**: Example({'number_guess': 'Choose: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

 30%|███       | 3/10 [00:02<00:05,  1.25it/s]





**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True

 40%|████      | 4/10 [00:03<00:05,  1.10it/s]
2025/09/04 14:53:52 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/09/04 14:53:52 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.



Bootstrapped 2 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.


2025/09/04 14:53:54 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing N=9 instructions...

2025/09/04 14:54:26 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/09/04 14:54:26 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Guess a number

2025/09/04 14:54:26 INFO dspy.teleprompt.mipro_optimizer_v2: 1: Convert the given textual representation of a number into its numerical equivalent.

2025/09/04 14:54:26 INFO dspy.teleprompt.mipro_optimizer_v2: 2: Provide a textual representation of a number and the Language Model will predict the corresponding numerical value.

2025/09/04 14:54:26 INFO dspy.teleprompt.mipro_optimizer_v2: 3: Convert the given numerical input into its corresponding word form!

2025/09/04 14:54:26 INFO dspy.teleprompt.mipro_optimizer_v2: 4: Interpret the user input and predict the corresponding number or written form.

2025/09/04 14:54:26 INFO dspy.teleprompt.mipro_optimizer_v2: 5: You are in a high-stakes game show where every correct 

  0%|          | 0/40 [00:00<?, ?it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 0.00 / 1 (0.0%):   2%|▎         | 1/40 [00:00<00:32,  1.21it/s]


**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 1.00 / 2 (50.0%):   2%|▎         | 1/40 [00:00<00:32,  1.21it/s]


**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 2.00 / 3 (66.7%):   8%|▊         | 3/40 [00:01<00:10,  3.55it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False



Average Metric: 2.00 / 4 (50.0%):  10%|█         | 4/40 [00:01<00:07,  4.50it/s]

**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 3.00 / 5 (60.0%):  10%|█         | 4/40 [00:01<00:07,  4.50it/s]


**expected**: Example({'number_guess': 'Select the number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 4.00 / 6 (66.7%):  15%|█▌        | 6/40 [00:01<00:05,  6.67it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 5.00 / 7 (71.4%):  18%|█▊        | 7/40 [00:01<00:04,  6.87it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 5.00 / 8 (62.5%):  20%|██        | 8/40 [00:01<00:04,  6.92it/s]


**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 6.00 / 9 (66.7%):  22%|██▎       | 9/40 [00:01<00:04,  7.09it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 6.00 / 10 (60.0%):  22%|██▎       | 9/40 [00:01<00:04,  7.09it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 6.00 / 11 (54.5%):  28%|██▊       | 11/40 [00:01<00:03,  7.98it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 7.00 / 12 (58.3%):  30%|███       | 12/40 [00:02<00:04,  6.16it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 8.00 / 13 (61.5%):  30%|███       | 12/40 [00:02<00:04,  6.16it/s]


**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 8.00 / 14 (57.1%):  35%|███▌      | 14/40 [00:02<00:03,  7.01it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 9.00 / 15 (60.0%):  35%|███▌      | 14/40 [00:02<00:03,  7.01it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 9.00 / 16 (56.2%):  38%|███▊      | 15/40 [00:02<00:03,  7.01it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 9.00 / 17 (52.9%):  42%|████▎     | 17/40 [00:02<00:02,  8.92it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 10.00 / 18 (55.6%):  42%|████▎     | 17/40 [00:02<00:02,  8.92it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 11.00 / 19 (57.9%):  48%|████▊     | 19/40 [00:02<00:01, 10.72it/s]


**expected**: Example({'number_guess': 'Guess: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 11.00 / 20 (55.0%):  48%|████▊     | 19/40 [00:02<00:01, 10.72it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 12.00 / 21 (57.1%):  52%|█████▎    | 21/40 [00:02<00:01,  9.70it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 13.00 / 22 (59.1%):  52%|█████▎    | 21/40 [00:03<00:01,  9.70it/s]


**expected**: Example({'number_guess': 'Guess: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 14.00 / 23 (60.9%):  57%|█████▊    | 23/40 [00:03<00:02,  8.09it/s]


**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 24 (58.3%):  57%|█████▊    | 23/40 [00:03<00:02,  8.09it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 25 (56.0%):  62%|██████▎   | 25/40 [00:03<00:01,  8.22it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 26 (53.8%):  62%|██████▎   | 25/40 [00:03<00:01,  8.22it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 27 (51.9%):  68%|██████▊   | 27/40 [00:03<00:01,  9.36it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 15.00 / 28 (53.6%):  68%|██████▊   | 27/40 [00:03<00:01,  9.36it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 16.00 / 29 (55.2%):  72%|███████▎  | 29/40 [00:03<00:01,  8.59it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 17.00 / 30 (56.7%):  75%|███████▌  | 30/40 [00:04<00:01,  8.48it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 31 (54.8%):  78%|███████▊  | 31/40 [00:04<00:01,  7.82it/s]


**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 32 (53.1%):  78%|███████▊  | 31/40 [00:04<00:01,  7.82it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 33 (51.5%):  80%|████████  | 32/40 [00:04<00:01,  7.82it/s]


**expected**: Example({'number_guess': 'Choose: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 34 (50.0%):  85%|████████▌ | 34/40 [00:04<00:00,  9.41it/s]



**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)
**actual**:  Prediction(
    answer='two'
)


**Is match**: False

**Is match**: True


Average Metric: 17.00 / 35 (48.6%):  85%|████████▌ | 34/40 [00:04<00:00,  9.41it/s]
Average Metric: 18.00 / 36 (50.0%):  88%|████████▊ | 35/40 [00:04<00:00,  7.73it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 19.00 / 37 (51.4%):  90%|█████████ | 36/40 [00:04<00:00,  7.73it/s]


**expected**: Example({'number_guess': '1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 19.00 / 38 (50.0%):  95%|█████████▌| 38/40 [00:05<00:00,  7.08it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 19.00 / 39 (48.7%):  98%|█████████▊| 39/40 [00:05<00:00,  6.97it/s]


**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 20.00 / 40 (50.0%): 100%|██████████| 40/40 [00:05<00:00,  7.14it/s]

2025/09/04 14:54:32 INFO dspy.evaluate.evaluate: Average Metric: 20 / 40 (50.0%)
2025/09/04 14:54:32 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 50.0

2025/09/04 14:54:32 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 2 / 27 =====



  0%|          | 0/40 [00:00<?, ?it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 1.00 / 1 (100.0%):   2%|▎         | 1/40 [00:00<00:26,  1.49it/s]


**expected**: Example({'number_guess': 'Select the number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 2.00 / 2 (100.0%):   2%|▎         | 1/40 [00:00<00:26,  1.49it/s]


**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 3.00 / 3 (100.0%):   5%|▌         | 2/40 [00:00<00:25,  1.49it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 3.00 / 4 (75.0%):  10%|█         | 4/40 [00:00<00:05,  6.17it/s] 


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)



**Is match**: False

**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})


Average Metric: 3.00 / 5 (60.0%):  10%|█         | 4/40 [00:00<00:05,  6.17it/s]**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 4.00 / 6 (66.7%):  12%|█▎        | 5/40 [00:00<00:05,  6.17it/s]


**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 5.00 / 7 (71.4%):  18%|█▊        | 7/40 [00:00<00:03, 10.23it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 5.00 / 8 (62.5%):  18%|█▊        | 7/40 [00:01<00:03, 10.23it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 6.00 / 9 (66.7%):  22%|██▎       | 9/40 [00:01<00:04,  7.63it/s]


**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 7.00 / 10 (70.0%):  22%|██▎       | 9/40 [00:01<00:04,  7.63it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 7.00 / 11 (63.6%):  28%|██▊       | 11/40 [00:01<00:03,  8.42it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 8.00 / 12 (66.7%):  28%|██▊       | 11/40 [00:01<00:03,  8.42it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 9.00 / 13 (69.2%):  32%|███▎      | 13/40 [00:01<00:02,  9.43it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 9.00 / 14 (64.3%):  32%|███▎      | 13/40 [00:01<00:02,  9.43it/s]


**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 9.00 / 15 (60.0%):  38%|███▊      | 15/40 [00:01<00:02, 10.81it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 10.00 / 16 (62.5%):  38%|███▊      | 15/40 [00:02<00:02, 10.81it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 10.00 / 17 (58.8%):  42%|████▎     | 17/40 [00:02<00:03,  7.06it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 11.00 / 18 (61.1%):  42%|████▎     | 17/40 [00:02<00:03,  7.06it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 12.00 / 19 (63.2%):  45%|████▌     | 18/40 [00:02<00:03,  7.06it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 12.00 / 20 (60.0%):  50%|█████     | 20/40 [00:02<00:01, 10.08it/s]


**expected**: Example({'number_guess': 'Guess: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 12.00 / 21 (57.1%):  50%|█████     | 20/40 [00:02<00:01, 10.08it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 13.00 / 22 (59.1%):  55%|█████▌    | 22/40 [00:02<00:01,  9.42it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 13.00 / 23 (56.5%):  55%|█████▌    | 22/40 [00:03<00:01,  9.42it/s]


**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)



**Is match**: False

**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)

Average Metric: 13.00 / 24 (54.2%):  60%|██████    | 24/40 [00:03<00:02,  7.63it/s]

**Is match**: True


Average Metric: 14.00 / 25 (56.0%):  60%|██████    | 24/40 [00:03<00:02,  7.63it/s]


**expected**: Example({'number_guess': 'Guess: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 15.00 / 26 (57.7%):  62%|██████▎   | 25/40 [00:03<00:01,  7.63it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 27 (55.6%):  68%|██████▊   | 27/40 [00:03<00:01, 10.36it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 28 (53.6%):  68%|██████▊   | 27/40 [00:03<00:01, 10.36it/s]


**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 29 (51.7%):  72%|███████▎  | 29/40 [00:03<00:00, 11.24it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 30 (50.0%):  72%|███████▎  | 29/40 [00:03<00:00, 11.24it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 16.00 / 31 (51.6%):  78%|███████▊  | 31/40 [00:03<00:01,  8.56it/s]


**expected**: Example({'number_guess': 'Choose: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 32 (50.0%):  78%|███████▊  | 31/40 [00:03<00:01,  8.56it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)



**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**Is match**: False

**actual**:  Prediction(
    answer='two'
)

Average Metric: 16.00 / 33 (48.5%):  80%|████████  | 32/40 [00:03<00:00,  8.56it/s]

**Is match**: True


Average Metric: 17.00 / 34 (50.0%):  85%|████████▌ | 34/40 [00:03<00:00, 11.62it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 18.00 / 35 (51.4%):  85%|████████▌ | 34/40 [00:03<00:00, 11.62it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 36 (50.0%):  90%|█████████ | 36/40 [00:03<00:00, 11.15it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 19.00 / 37 (51.4%):  90%|█████████ | 36/40 [00:04<00:00, 11.15it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 19.00 / 38 (50.0%):  95%|█████████▌| 38/40 [00:04<00:00,  9.30it/s]


**expected**: Example({'number_guess': '1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 19.00 / 39 (48.7%):  95%|█████████▌| 38/40 [00:04<00:00,  9.30it/s]


**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 20.00 / 40 (50.0%): 100%|██████████| 40/40 [00:04<00:00,  9.25it/s]

2025/09/04 14:54:36 INFO dspy.evaluate.evaluate: Average Metric: 20 / 40 (50.0%)
2025/09/04 14:54:36 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 50.0 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 17'].
2025/09/04 14:54:36 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [50.0, 50.0]
2025/09/04 14:54:36 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 50.0


2025/09/04 14:54:36 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 3 / 27 =====



  0%|          | 0/40 [00:00<?, ?it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 1.00 / 1 (100.0%):   2%|▎         | 1/40 [00:00<00:23,  1.68it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 2.00 / 2 (100.0%):   5%|▌         | 2/40 [00:00<00:11,  3.21it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)




**Is match**: False

**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})


**actual**:  Prediction(
    answer='one'
)
Average Metric: 2.00 / 3 (66.7%):   5%|▌         | 2/40 [00:00<00:11,  3.21it/s] **actual**:  Prediction(
    answer='two'
)


**Is match**: True

**Is match**: False


Average Metric: 2.00 / 4 (50.0%):   8%|▊         | 3/40 [00:00<00:11,  3.21it/s]
Average Metric: 3.00 / 5 (60.0%):  10%|█         | 4/40 [00:00<00:11,  3.21it/s]


**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True



Average Metric: 4.00 / 6 (66.7%):  15%|█▌        | 6/40 [00:00<00:03, 10.83it/s]

**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 4.00 / 7 (57.1%):  15%|█▌        | 6/40 [00:00<00:03, 10.83it/s]


**expected**: Example({'number_guess': 'Select the number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 5.00 / 8 (62.5%):  18%|█▊        | 7/40 [00:00<00:03, 10.83it/s]


**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 6.00 / 9 (66.7%):  22%|██▎       | 9/40 [00:01<00:04,  7.23it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 7.00 / 10 (70.0%):  22%|██▎       | 9/40 [00:01<00:04,  7.23it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})


**actual**:  Prediction(
    answer='one'
)


**Is match**: False

**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})


**actual**:  Prediction(
    answer='two'
)
Average Metric: 7.00 / 11 (63.6%):  28%|██▊       | 11/40 [00:01<00:03,  8.68it/s]

**Is match**: True


Average Metric: 8.00 / 12 (66.7%):  28%|██▊       | 11/40 [00:01<00:03,  8.68it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 9.00 / 13 (69.2%):  32%|███▎      | 13/40 [00:01<00:02, 10.01it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 9.00 / 14 (64.3%):  32%|███▎      | 13/40 [00:01<00:02, 10.01it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 9.00 / 15 (60.0%):  38%|███▊      | 15/40 [00:01<00:02,  9.25it/s]


**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 9.00 / 16 (56.2%):  38%|███▊      | 15/40 [00:02<00:02,  9.25it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 10.00 / 17 (58.8%):  42%|████▎     | 17/40 [00:02<00:02,  8.36it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 11.00 / 18 (61.1%):  42%|████▎     | 17/40 [00:02<00:02,  8.36it/s]


**expected**: Example({'number_guess': 'Guess: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 11.00 / 19 (57.9%):  45%|████▌     | 18/40 [00:02<00:02,  8.36it/s]


**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 11.00 / 20 (55.0%):  50%|█████     | 20/40 [00:02<00:02,  8.13it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 12.00 / 21 (57.1%):  52%|█████▎    | 21/40 [00:02<00:02,  7.71it/s]


**expected**: Example({'number_guess': 'Guess: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 13.00 / 22 (59.1%):  55%|█████▌    | 22/40 [00:02<00:02,  7.66it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 13.00 / 23 (56.5%):  57%|█████▊    | 23/40 [00:02<00:02,  7.97it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 13.00 / 24 (54.2%):  60%|██████    | 24/40 [00:03<00:02,  7.76it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 13.00 / 25 (52.0%):  60%|██████    | 24/40 [00:03<00:02,  7.76it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False



Average Metric: 13.00 / 26 (50.0%):  65%|██████▌   | 26/40 [00:03<00:01,  7.08it/s]

**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 14.00 / 27 (51.9%):  65%|██████▌   | 26/40 [00:03<00:01,  7.08it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 15.00 / 28 (53.6%):  68%|██████▊   | 27/40 [00:03<00:01,  7.08it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 16.00 / 29 (55.2%):  72%|███████▎  | 29/40 [00:03<00:01, 10.12it/s]


**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 30 (53.3%):  72%|███████▎  | 29/40 [00:03<00:01, 10.12it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 31 (51.6%):  78%|███████▊  | 31/40 [00:03<00:00, 10.77it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 32 (50.0%):  78%|███████▊  | 31/40 [00:03<00:00, 10.77it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 17.00 / 33 (51.5%):  82%|████████▎ | 33/40 [00:03<00:00, 10.27it/s]



**expected**: Example({'number_guess': 'Choose: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)

**actual**:  Prediction(
    answer='two'
)


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**Is match**: False

**Is match**: True

**actual**:  Prediction(
    answer='one'
)

Average Metric: 17.00 / 34 (50.0%):  82%|████████▎ | 33/40 [00:04<00:00, 10.27it/s]
Average Metric: 18.00 / 35 (51.4%):  85%|████████▌ | 34/40 [00:04<00:00, 10.27it/s]

**Is match**: False


Average Metric: 18.00 / 36 (50.0%):  88%|████████▊ | 35/40 [00:04<00:00, 11.12it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 19.00 / 37 (51.4%):  92%|█████████▎| 37/40 [00:04<00:00, 10.00it/s]


**expected**: Example({'number_guess': '1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 19.00 / 38 (50.0%):  92%|█████████▎| 37/40 [00:04<00:00, 10.00it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 19.00 / 39 (48.7%):  98%|█████████▊| 39/40 [00:04<00:00,  9.83it/s]


**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 20.00 / 40 (50.0%): 100%|██████████| 40/40 [00:04<00:00,  8.66it/s]

2025/09/04 14:54:41 INFO dspy.evaluate.evaluate: Average Metric: 20 / 40 (50.0%)
2025/09/04 14:54:41 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 50.0 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 12'].
2025/09/04 14:54:41 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [50.0, 50.0, 50.0]
2025/09/04 14:54:41 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 50.0


2025/09/04 14:54:41 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 4 / 27 =====



  0%|          | 0/40 [00:00<?, ?it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 1.00 / 1 (100.0%):   2%|▎         | 1/40 [00:00<00:27,  1.40it/s]


**expected**: Example({'number_guess': 'Select the number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True



Average Metric: 2.00 / 2 (100.0%):   2%|▎         | 1/40 [00:00<00:27,  1.40it/s]

**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True



Average Metric: 3.00 / 3 (100.0%):   5%|▌         | 2/40 [00:00<00:27,  1.40it/s]

**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)



**Is match**: False

**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})


**actual**:  Prediction(
    answer='two'
)
Average Metric: 3.00 / 4 (75.0%):   8%|▊         | 3/40 [00:00<00:26,  1.40it/s] 

**Is match**: True

Average Metric: 3.00 / 4 (75.0%):  10%|█         | 4/40 [00:00<00:05,  6.12it/s]
Average Metric: 4.00 / 5 (80.0%):  10%|█         | 4/40 [00:00<00:05,  6.12it/s]


**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 5.00 / 6 (83.3%):  12%|█▎        | 5/40 [00:00<00:05,  6.12it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 5.00 / 7 (71.4%):  15%|█▌        | 6/40 [00:00<00:05,  6.12it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 5.00 / 8 (62.5%):  18%|█▊        | 7/40 [00:00<00:05,  6.12it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 5.00 / 9 (55.6%):  22%|██▎       | 9/40 [00:01<00:04,  6.65it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 5.00 / 10 (50.0%):  22%|██▎       | 9/40 [00:01<00:04,  6.65it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 6.00 / 11 (54.5%):  25%|██▌       | 10/40 [00:01<00:04,  6.65it/s]


**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 6.00 / 12 (50.0%):  30%|███       | 12/40 [00:01<00:03,  8.90it/s]


**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 7.00 / 13 (53.8%):  30%|███       | 12/40 [00:01<00:03,  8.90it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 8.00 / 14 (57.1%):  32%|███▎      | 13/40 [00:01<00:03,  8.90it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 8.00 / 15 (53.3%):  38%|███▊      | 15/40 [00:01<00:02, 10.84it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 9.00 / 16 (56.2%):  38%|███▊      | 15/40 [00:02<00:02, 10.84it/s]


**expected**: Example({'number_guess': 'Guess: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 9.00 / 17 (52.9%):  42%|████▎     | 17/40 [00:02<00:03,  6.87it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 10.00 / 18 (55.6%):  42%|████▎     | 17/40 [00:02<00:03,  6.87it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 11.00 / 19 (57.9%):  48%|████▊     | 19/40 [00:03<00:04,  4.94it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 11.00 / 20 (55.0%):  50%|█████     | 20/40 [00:08<00:18,  1.07it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 11.00 / 21 (52.4%):  52%|█████▎    | 21/40 [00:08<00:16,  1.14it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 12.00 / 22 (54.5%):  52%|█████▎    | 21/40 [00:08<00:16,  1.14it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 13.00 / 23 (56.5%):  57%|█████▊    | 23/40 [00:09<00:10,  1.55it/s]


**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 13.00 / 24 (54.2%):  60%|██████    | 24/40 [00:09<00:08,  1.78it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 13.00 / 25 (52.0%):  62%|██████▎   | 25/40 [00:09<00:07,  2.04it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 14.00 / 26 (53.8%):  62%|██████▎   | 25/40 [00:09<00:07,  2.04it/s]


**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 27 (51.9%):  68%|██████▊   | 27/40 [00:09<00:04,  3.07it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 28 (50.0%):  68%|██████▊   | 27/40 [00:09<00:04,  3.07it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 15.00 / 29 (51.7%):  72%|███████▎  | 29/40 [00:13<00:10,  1.07it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 30 (50.0%):  72%|███████▎  | 29/40 [00:13<00:10,  1.07it/s]


**expected**: Example({'number_guess': 'Guess: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 16.00 / 31 (51.6%):  78%|███████▊  | 31/40 [00:13<00:05,  1.53it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)



**Is match**: True

**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})


Average Metric: 17.00 / 32 (53.1%):  78%|███████▊  | 31/40 [00:14<00:05,  1.53it/s]**actual**:  Prediction(
    answer='one'
)
Average Metric: 17.00 / 32 (53.1%):  80%|████████  | 32/40 [00:14<00:05,  1.54it/s]

**Is match**: False


Average Metric: 17.00 / 33 (51.5%):  80%|████████  | 32/40 [00:14<00:05,  1.54it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 34 (50.0%):  82%|████████▎ | 33/40 [00:14<00:04,  1.54it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 18.00 / 35 (51.4%):  88%|████████▊ | 35/40 [00:14<00:01,  2.64it/s]


**expected**: Example({'number_guess': '1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 36 (50.0%):  88%|████████▊ | 35/40 [00:14<00:01,  2.64it/s]


**expected**: Example({'number_guess': 'Choose: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 37 (48.6%):  92%|█████████▎| 37/40 [00:14<00:00,  3.43it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 19.00 / 38 (50.0%):  95%|█████████▌| 38/40 [00:19<00:02,  1.13s/it]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 19.00 / 39 (48.7%):  98%|█████████▊| 39/40 [00:20<00:01,  1.09s/it]


**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 20.00 / 40 (50.0%): 100%|██████████| 40/40 [00:21<00:00,  1.82it/s]

2025/09/04 14:55:03 INFO dspy.evaluate.evaluate: Average Metric: 20 / 40 (50.0%)
2025/09/04 14:55:03 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 50.0 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 1'].
2025/09/04 14:55:03 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [50.0, 50.0, 50.0, 50.0]
2025/09/04 14:55:03 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 50.0


2025/09/04 14:55:03 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 5 / 27 =====



  0%|          | 0/40 [00:00<?, ?it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 0.00 / 1 (0.0%):   2%|▎         | 1/40 [00:00<00:26,  1.45it/s]


**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 1.00 / 2 (50.0%):   2%|▎         | 1/40 [00:00<00:26,  1.45it/s]
Average Metric: 1.00 / 2 (50.0%):   5%|▌         | 2/40 [00:00<00:14,  2.59it/s]

**expected**: Example({'number_guess': 'Select the number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 2.00 / 3 (66.7%):   5%|▌         | 2/40 [00:00<00:14,  2.59it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 2.00 / 4 (50.0%):   8%|▊         | 3/40 [00:00<00:14,  2.59it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)



**Is match**: False

**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})


**actual**:  Prediction(
    answer='two'
)
Average Metric: 2.00 / 5 (40.0%):  10%|█         | 4/40 [00:00<00:13,  2.59it/s]

**Is match**: True


Average Metric: 3.00 / 6 (50.0%):  12%|█▎        | 5/40 [00:00<00:13,  2.59it/s]


**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 4.00 / 7 (57.1%):  18%|█▊        | 7/40 [00:01<00:04,  7.37it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)



**Is match**: True

**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})


**actual**:  Prediction(
    answer='two'
)
Average Metric: 5.00 / 8 (62.5%):  20%|██        | 8/40 [00:01<00:07,  4.45it/s]

**Is match**: True


Average Metric: 6.00 / 9 (66.7%):  20%|██        | 8/40 [00:01<00:07,  4.45it/s]


**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 7.00 / 10 (70.0%):  22%|██▎       | 9/40 [00:01<00:06,  4.45it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 7.00 / 11 (63.6%):  28%|██▊       | 11/40 [00:01<00:04,  7.04it/s]


**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 7.00 / 12 (58.3%):  28%|██▊       | 11/40 [00:06<00:04,  7.04it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 8.00 / 13 (61.5%):  32%|███▎      | 13/40 [00:06<00:22,  1.20it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 8.00 / 14 (57.1%):  35%|███▌      | 14/40 [00:07<00:18,  1.42it/s]


**expected**: Example({'number_guess': 'Guess: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 8.00 / 15 (53.3%):  38%|███▊      | 15/40 [00:07<00:16,  1.56it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 9.00 / 16 (56.2%):  40%|████      | 16/40 [00:07<00:13,  1.80it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 10.00 / 17 (58.8%):  42%|████▎     | 17/40 [00:07<00:10,  2.20it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 11.00 / 18 (61.1%):  42%|████▎     | 17/40 [00:07<00:10,  2.20it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 11.00 / 19 (57.9%):  48%|████▊     | 19/40 [00:11<00:19,  1.07it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 12.00 / 20 (60.0%):  48%|████▊     | 19/40 [00:11<00:19,  1.07it/s]


**expected**: Example({'number_guess': 'Guess: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 13.00 / 21 (61.9%):  52%|█████▎    | 21/40 [00:11<00:12,  1.55it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 13.00 / 22 (59.1%):  52%|█████▎    | 21/40 [00:11<00:12,  1.55it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 14.00 / 23 (60.9%):  57%|█████▊    | 23/40 [00:11<00:08,  2.07it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 24 (58.3%):  60%|██████    | 24/40 [00:11<00:06,  2.29it/s]


**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 25 (56.0%):  62%|██████▎   | 25/40 [00:12<00:05,  2.69it/s]


**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 26 (53.8%):  65%|██████▌   | 26/40 [00:12<00:04,  3.04it/s]

2025/09/04 14:55:17 ERROR dspy.utils.parallelizer: Error for Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4.1 in organization org-q446iSedRzWhcp3jJDP32sfL on tokens per min (TPM): Limit 30000, Used 30000, Requested 273. Please try again in 546ms. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 14.00 / 26 (53.8%):  68%|██████▊   | 27/40 [00:13<00:07,  1.69it/s]

2025/09/04 14:55:17 ERROR dspy.utils.parallelizer: Error for Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4.1 in organization org-q446iSedRzWhcp3jJDP32sfL on tokens per min (TPM): Limit 30000, Used 30000, Requested 273. Please try again in 546ms. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 14.00 / 26 (53.8%):  70%|███████   | 28/40 [00:14<00:07,  1.58it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 27 (51.9%):  72%|███████▎  | 29/40 [00:16<00:10,  1.08it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})


**actual**:  Prediction(
    answer='one'
)


**Is match**: False

**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})


**actual**:  Prediction(
    answer='two'
)
Average Metric: 14.00 / 28 (50.0%):  75%|███████▌  | 30/40 [00:16<00:07,  1.29it/s]

**Is match**: True


Average Metric: 15.00 / 29 (51.7%):  75%|███████▌  | 30/40 [00:16<00:07,  1.29it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 30 (50.0%):  80%|████████  | 32/40 [00:16<00:03,  2.15it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 16.00 / 31 (51.6%):  82%|████████▎ | 33/40 [00:16<00:02,  2.57it/s]


**expected**: Example({'number_guess': 'Choose: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 32 (50.0%):  85%|████████▌ | 34/40 [00:17<00:02,  2.70it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True



Average Metric: 17.00 / 33 (51.5%):  88%|████████▊ | 35/40 [00:17<00:01,  2.85it/s]

**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 18.00 / 34 (52.9%):  88%|████████▊ | 35/40 [00:17<00:01,  2.85it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 35 (51.4%):  92%|█████████▎| 37/40 [00:18<00:01,  1.90it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 19.00 / 36 (52.8%):  95%|█████████▌| 38/40 [00:22<00:02,  1.18s/it]


**expected**: Example({'number_guess': '1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 19.00 / 37 (51.4%):  98%|█████████▊| 39/40 [00:23<00:01,  1.17s/it]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 19.00 / 38 (50.0%): 100%|██████████| 40/40 [00:23<00:00,  1.69it/s]

2025/09/04 14:55:27 INFO dspy.evaluate.evaluate: Average Metric: 19.0 / 40 (47.5%)
2025/09/04 14:55:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 47.5 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 12'].
2025/09/04 14:55:27 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [50.0, 50.0, 50.0, 50.0, 47.5]
2025/09/04 14:55:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 50.0


2025/09/04 14:55:27 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 6 / 27 =====



  0%|          | 0/40 [00:00<?, ?it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 1.00 / 1 (100.0%):   2%|▎         | 1/40 [00:00<00:22,  1.73it/s]


**expected**: Example({'number_guess': 'Select the number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 2.00 / 2 (100.0%):   2%|▎         | 1/40 [00:00<00:22,  1.73it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 2.00 / 3 (66.7%):   5%|▌         | 2/40 [00:00<00:21,  1.73it/s] 


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True



Average Metric: 3.00 / 4 (75.0%):  10%|█         | 4/40 [00:00<00:06,  5.75it/s]

**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 4.00 / 5 (80.0%):  10%|█         | 4/40 [00:00<00:06,  5.75it/s]


**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 5.00 / 6 (83.3%):  15%|█▌        | 6/40 [00:00<00:04,  7.80it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 5.00 / 7 (71.4%):  15%|█▌        | 6/40 [00:01<00:04,  7.80it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 6.00 / 8 (75.0%):  20%|██        | 8/40 [00:01<00:04,  7.60it/s]


**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 7.00 / 9 (77.8%):  22%|██▎       | 9/40 [00:01<00:04,  7.51it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 7.00 / 10 (70.0%):  25%|██▌       | 10/40 [00:01<00:04,  6.38it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 7.00 / 11 (63.6%):  28%|██▊       | 11/40 [00:07<00:47,  1.62s/it]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 7.00 / 12 (58.3%):  30%|███       | 12/40 [00:07<00:35,  1.25s/it]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 7.00 / 13 (53.8%):  32%|███▎      | 13/40 [00:07<00:25,  1.07it/s]


**expected**: Example({'number_guess': 'Guess: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 7.00 / 14 (50.0%):  35%|███▌      | 14/40 [00:08<00:20,  1.24it/s]


**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 7.00 / 15 (46.7%):  35%|███▌      | 14/40 [00:08<00:20,  1.24it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 8.00 / 16 (50.0%):  40%|████      | 16/40 [00:08<00:11,  2.04it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 9.00 / 17 (52.9%):  42%|████▎     | 17/40 [00:08<00:09,  2.40it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 9.00 / 18 (50.0%):  42%|████▎     | 17/40 [00:08<00:09,  2.40it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 10.00 / 19 (52.6%):  45%|████▌     | 18/40 [00:08<00:09,  2.40it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 11.00 / 20 (55.0%):  50%|█████     | 20/40 [00:08<00:04,  4.19it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 12.00 / 21 (57.1%):  52%|█████▎    | 21/40 [00:09<00:04,  4.14it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 12.00 / 22 (54.5%):  55%|█████▌    | 22/40 [00:09<00:03,  4.55it/s]


**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})


**actual**:  Prediction(
    answer='one'
)


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**Is match**: False


Average Metric: 12.00 / 23 (52.2%):  55%|█████▌    | 22/40 [00:15<00:03,  4.55it/s]**actual**:  Prediction(
    answer='two'
)
Average Metric: 12.00 / 23 (52.2%):  57%|█████▊    | 23/40 [00:15<00:29,  1.74s/it]

**Is match**: True


Average Metric: 13.00 / 24 (54.2%):  57%|█████▊    | 23/40 [00:15<00:29,  1.74s/it]



**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**expected**: Example({'number_guess': 'Guess: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)
**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 14.00 / 25 (56.0%):  60%|██████    | 24/40 [00:16<00:27,  1.74s/it]

**Is match**: True


Average Metric: 15.00 / 26 (57.7%):  62%|██████▎   | 25/40 [00:16<00:15,  1.06s/it]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 27 (55.6%):  68%|██████▊   | 27/40 [00:16<00:09,  1.36it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 28 (53.6%):  68%|██████▊   | 27/40 [00:16<00:09,  1.36it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 29 (51.7%):  72%|███████▎  | 29/40 [00:16<00:05,  1.86it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 16.00 / 30 (53.3%):  72%|███████▎  | 29/40 [00:16<00:05,  1.86it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 17.00 / 31 (54.8%):  78%|███████▊  | 31/40 [00:17<00:03,  2.29it/s]


**expected**: Example({'number_guess': '1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 32 (53.1%):  80%|████████  | 32/40 [00:22<00:10,  1.33s/it]


**expected**: Example({'number_guess': 'Choose: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 33 (51.5%):  82%|████████▎ | 33/40 [00:23<00:08,  1.20s/it]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 34 (50.0%):  85%|████████▌ | 34/40 [00:23<00:05,  1.06it/s]


**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 18.00 / 35 (51.4%):  88%|████████▊ | 35/40 [00:23<00:03,  1.26it/s]


**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 36 (50.0%):  90%|█████████ | 36/40 [00:23<00:02,  1.55it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 19.00 / 37 (51.4%):  90%|█████████ | 36/40 [00:23<00:02,  1.55it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 19.00 / 38 (50.0%):  95%|█████████▌| 38/40 [00:24<00:00,  2.12it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 20.00 / 39 (51.3%):  98%|█████████▊| 39/40 [00:24<00:00,  2.53it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 20.00 / 40 (50.0%): 100%|██████████| 40/40 [00:24<00:00,  1.64it/s]

2025/09/04 14:55:51 INFO dspy.evaluate.evaluate: Average Metric: 20 / 40 (50.0%)
2025/09/04 14:55:51 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 50.0 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 12'].
2025/09/04 14:55:51 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [50.0, 50.0, 50.0, 50.0, 47.5, 50.0]
2025/09/04 14:55:51 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 50.0


2025/09/04 14:55:51 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 27 =====



  0%|          | 0/40 [00:00<?, ?it/s]


**expected**: Example({'number_guess': 'Select the number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 1.00 / 1 (100.0%):   2%|▎         | 1/40 [00:04<02:52,  4.43s/it]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 2.00 / 2 (100.0%):   2%|▎         | 1/40 [00:04<02:52,  4.43s/it]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 2.00 / 3 (66.7%):   5%|▌         | 2/40 [00:04<02:48,  4.43s/it] 


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 2.00 / 4 (50.0%):  10%|█         | 4/40 [00:04<00:35,  1.02it/s]


**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 3.00 / 5 (60.0%):  12%|█▎        | 5/40 [00:05<00:26,  1.31it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 3.00 / 6 (50.0%):  12%|█▎        | 5/40 [00:05<00:26,  1.31it/s]


**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 4.00 / 7 (57.1%):  18%|█▊        | 7/40 [00:05<00:15,  2.15it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 4.00 / 8 (50.0%):  18%|█▊        | 7/40 [00:05<00:15,  2.15it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 4.00 / 9 (44.4%):  22%|██▎       | 9/40 [00:05<00:12,  2.53it/s]


**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 5.00 / 10 (50.0%):  25%|██▌       | 10/40 [00:06<00:14,  2.13it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 6.00 / 11 (54.5%):  28%|██▊       | 11/40 [00:10<00:33,  1.16s/it]


**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 6.00 / 12 (50.0%):  30%|███       | 12/40 [00:10<00:25,  1.09it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 7.00 / 13 (53.8%):  32%|███▎      | 13/40 [00:10<00:19,  1.40it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 8.00 / 14 (57.1%):  35%|███▌      | 14/40 [00:10<00:16,  1.61it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 9.00 / 15 (60.0%):  38%|███▊      | 15/40 [00:10<00:12,  2.05it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 10.00 / 16 (62.5%):  40%|████      | 16/40 [00:11<00:10,  2.26it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 11.00 / 17 (64.7%):  42%|████▎     | 17/40 [00:13<00:22,  1.02it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 12.00 / 18 (66.7%):  42%|████▎     | 17/40 [00:13<00:22,  1.02it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 13.00 / 19 (68.4%):  45%|████▌     | 18/40 [00:13<00:21,  1.02it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 14.00 / 20 (70.0%):  50%|█████     | 20/40 [00:13<00:09,  2.10it/s]


**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 21 (66.7%):  50%|█████     | 20/40 [00:13<00:09,  2.10it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 22 (63.6%):  55%|█████▌    | 22/40 [00:13<00:06,  2.87it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 15.00 / 23 (65.2%):  57%|█████▊    | 23/40 [00:14<00:06,  2.61it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False



Average Metric: 15.00 / 24 (62.5%):  60%|██████    | 24/40 [00:17<00:16,  1.02s/it]

**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 25 (60.0%):  60%|██████    | 24/40 [00:17<00:16,  1.02s/it]


**expected**: Example({'number_guess': 'Guess: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 26 (57.7%):  65%|██████▌   | 26/40 [00:18<00:10,  1.34it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 27 (55.6%):  65%|██████▌   | 26/40 [00:18<00:10,  1.34it/s]


**expected**: Example({'number_guess': 'Guess: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 16.00 / 28 (57.1%):  70%|███████   | 28/40 [00:18<00:06,  1.92it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 29 (55.2%):  72%|███████▎  | 29/40 [00:18<00:04,  2.23it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 30 (53.3%):  75%|███████▌  | 30/40 [00:19<00:04,  2.36it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 31 (51.6%):  78%|███████▊  | 31/40 [00:19<00:03,  2.61it/s]


**expected**: Example({'number_guess': 'Choose: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 32 (50.0%):  78%|███████▊  | 31/40 [00:19<00:03,  2.61it/s]


**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 33 (48.5%):  82%|████████▎ | 33/40 [00:20<00:02,  2.38it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 34 (47.1%):  85%|████████▌ | 34/40 [00:23<00:06,  1.08s/it]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 35 (45.7%):  88%|████████▊ | 35/40 [00:23<00:04,  1.14it/s]


**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 17.00 / 36 (47.2%):  90%|█████████ | 36/40 [00:24<00:02,  1.35it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 18.00 / 37 (48.6%):  92%|█████████▎| 37/40 [00:24<00:01,  1.64it/s]


**expected**: Example({'number_guess': '1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 38 (47.4%):  95%|█████████▌| 38/40 [00:25<00:01,  1.53it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 19.00 / 39 (48.7%):  98%|█████████▊| 39/40 [00:26<00:00,  1.13it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 20.00 / 40 (50.0%): 100%|██████████| 40/40 [00:26<00:00,  1.49it/s]

2025/09/04 14:56:18 INFO dspy.evaluate.evaluate: Average Metric: 20 / 40 (50.0%)
2025/09/04 14:56:18 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 50.0 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 16'].
2025/09/04 14:56:18 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [50.0, 50.0, 50.0, 50.0, 47.5, 50.0, 50.0]
2025/09/04 14:56:18 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 50.0


2025/09/04 14:56:18 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 8 / 27 =====



  0%|          | 0/40 [00:00<?, ?it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 1.00 / 1 (100.0%):   2%|▎         | 1/40 [00:00<00:30,  1.26it/s]


**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 2.00 / 2 (100.0%):   2%|▎         | 1/40 [00:00<00:30,  1.26it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 2.00 / 3 (66.7%):   8%|▊         | 3/40 [00:01<00:12,  2.89it/s] 


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 3.00 / 4 (75.0%):  10%|█         | 4/40 [00:01<00:12,  2.79it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 3.00 / 5 (60.0%):  10%|█         | 4/40 [00:01<00:12,  2.79it/s]


**expected**: Example({'number_guess': 'Select the number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 4.00 / 6 (66.7%):  12%|█▎        | 5/40 [00:01<00:12,  2.79it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 4.00 / 7 (57.1%):  18%|█▊        | 7/40 [00:04<00:25,  1.31it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 5.00 / 8 (62.5%):  20%|██        | 8/40 [00:04<00:20,  1.54it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 6.00 / 9 (66.7%):  22%|██▎       | 9/40 [00:05<00:19,  1.56it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 6.00 / 10 (60.0%):  22%|██▎       | 9/40 [00:05<00:19,  1.56it/s]


**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 6.00 / 11 (54.5%):  28%|██▊       | 11/40 [00:05<00:11,  2.43it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 6.00 / 12 (50.0%):  28%|██▊       | 11/40 [00:05<00:11,  2.43it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 6.00 / 13 (46.2%):  32%|███▎      | 13/40 [00:06<00:10,  2.66it/s]


**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 7.00 / 14 (50.0%):  35%|███▌      | 14/40 [00:06<00:09,  2.73it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})


**actual**:  Prediction(
    answer='one'
)


**Is match**: False

**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})


Average Metric: 7.00 / 15 (46.7%):  35%|███▌      | 14/40 [00:09<00:09,  2.73it/s]**actual**:  Prediction(
    answer='two'
)


**Is match**: True

Average Metric: 7.00 / 15 (46.7%):  38%|███▊      | 15/40 [00:09<00:24,  1.04it/s]
Average Metric: 8.00 / 16 (50.0%):  38%|███▊      | 15/40 [00:09<00:24,  1.04it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 9.00 / 17 (52.9%):  42%|████▎     | 17/40 [00:09<00:14,  1.54it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 10.00 / 18 (55.6%):  45%|████▌     | 18/40 [00:10<00:11,  1.85it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 11.00 / 19 (57.9%):  45%|████▌     | 18/40 [00:10<00:11,  1.85it/s]


**expected**: Example({'number_guess': 'Guess: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 11.00 / 20 (55.0%):  50%|█████     | 20/40 [00:10<00:09,  2.18it/s]

2025/09/04 14:56:30 ERROR dspy.utils.parallelizer: Error for Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4.1 in organization org-q446iSedRzWhcp3jJDP32sfL on tokens per min (TPM): Limit 30000, Used 29900, Requested 242. Please try again in 284ms. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 11.00 / 20 (55.0%):  52%|█████▎    | 21/40 [00:12<00:12,  1.49it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 11.00 / 21 (52.4%):  55%|█████▌    | 22/40 [00:13<00:13,  1.36it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 11.00 / 22 (50.0%):  55%|█████▌    | 22/40 [00:13<00:13,  1.36it/s]



**expected**: Example({'number_guess': 'Guess: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)
**actual**:  Prediction(
    answer='two'
)


**Is match**: True

**Is match**: True



Average Metric: 13.00 / 24 (54.2%):  60%|██████    | 24/40 [00:13<00:07,  2.15it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 14.00 / 25 (56.0%):  62%|██████▎   | 25/40 [00:13<00:06,  2.15it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 26 (53.8%):  65%|██████▌   | 26/40 [00:13<00:06,  2.15it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 15.00 / 27 (55.6%):  70%|███████   | 28/40 [00:14<00:03,  3.03it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 28 (53.6%):  72%|███████▎  | 29/40 [00:17<00:08,  1.30it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 16.00 / 29 (55.2%):  72%|███████▎  | 29/40 [00:17<00:08,  1.30it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 17.00 / 30 (56.7%):  78%|███████▊  | 31/40 [00:17<00:05,  1.69it/s]


**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 31 (54.8%):  80%|████████  | 32/40 [00:17<00:04,  1.92it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 18.00 / 32 (56.2%):  80%|████████  | 32/40 [00:17<00:04,  1.92it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 33 (54.5%):  85%|████████▌ | 34/40 [00:18<00:02,  2.62it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 34 (52.9%):  88%|████████▊ | 35/40 [00:18<00:01,  2.81it/s]


**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 35 (51.4%):  90%|█████████ | 36/40 [00:21<00:04,  1.03s/it]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 36 (50.0%):  92%|█████████▎| 37/40 [00:22<00:02,  1.16it/s]


**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 19.00 / 37 (51.4%):  95%|█████████▌| 38/40 [00:22<00:01,  1.28it/s]


**expected**: Example({'number_guess': '1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 19.00 / 38 (50.0%):  95%|█████████▌| 38/40 [00:22<00:01,  1.28it/s]


**expected**: Example({'number_guess': 'Choose: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 19.00 / 39 (48.7%): 100%|██████████| 40/40 [00:23<00:00,  1.72it/s]

2025/09/04 14:56:41 INFO dspy.evaluate.evaluate: Average Metric: 19.0 / 40 (47.5%)
2025/09/04 14:56:41 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 47.5 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 13'].
2025/09/04 14:56:41 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [50.0, 50.0, 50.0, 50.0, 47.5, 50.0, 50.0, 47.5]
2025/09/04 14:56:41 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 50.0


2025/09/04 14:56:41 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 9 / 27 =====



  0%|          | 0/40 [00:00<?, ?it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 0.00 / 1 (0.0%):   2%|▎         | 1/40 [00:00<00:28,  1.37it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})


**actual**:  Prediction(
    answer='one'
)


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**Is match**: False

**actual**:  Prediction(
    answer='two'
)

Average Metric: 0.00 / 2 (0.0%):   2%|▎         | 1/40 [00:00<00:28,  1.37it/s]

**Is match**: True


Average Metric: 1.00 / 3 (33.3%):   5%|▌         | 2/40 [00:00<00:27,  1.37it/s]


**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 2.00 / 4 (50.0%):  10%|█         | 4/40 [00:01<00:08,  4.12it/s]


**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)



**Is match**: True

**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})


Average Metric: 3.00 / 5 (60.0%):  10%|█         | 4/40 [00:01<00:08,  4.12it/s]**actual**:  Prediction(
    answer='two'
)
Average Metric: 3.00 / 5 (60.0%):  12%|█▎        | 5/40 [00:01<00:09,  3.82it/s]

**Is match**: True


Average Metric: 4.00 / 6 (66.7%):  12%|█▎        | 5/40 [00:01<00:09,  3.82it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 5.00 / 7 (71.4%):  18%|█▊        | 7/40 [00:01<00:06,  5.32it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 5.00 / 8 (62.5%):  20%|██        | 8/40 [00:05<00:35,  1.10s/it]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 5.00 / 9 (55.6%):  22%|██▎       | 9/40 [00:05<00:26,  1.16it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 5.00 / 10 (50.0%):  25%|██▌       | 10/40 [00:06<00:22,  1.35it/s]


**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 6.00 / 11 (54.5%):  25%|██▌       | 10/40 [00:06<00:22,  1.35it/s]


**expected**: Example({'number_guess': 'Select the number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 7.00 / 12 (58.3%):  30%|███       | 12/40 [00:06<00:12,  2.24it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 7.00 / 13 (53.8%):  30%|███       | 12/40 [00:06<00:12,  2.24it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 8.00 / 14 (57.1%):  32%|███▎      | 13/40 [00:06<00:12,  2.24it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 9.00 / 15 (60.0%):  35%|███▌      | 14/40 [00:06<00:11,  2.24it/s]


**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 9.00 / 16 (56.2%):  40%|████      | 16/40 [00:06<00:05,  4.42it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 10.00 / 17 (58.8%):  40%|████      | 16/40 [00:06<00:05,  4.42it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 10.00 / 18 (55.6%):  45%|████▌     | 18/40 [00:11<00:20,  1.09it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 11.00 / 19 (57.9%):  48%|████▊     | 19/40 [00:12<00:17,  1.23it/s]


**expected**: Example({'number_guess': 'Guess: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 12.00 / 20 (60.0%):  50%|█████     | 20/40 [00:12<00:15,  1.32it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 12.00 / 21 (57.1%):  52%|█████▎    | 21/40 [00:12<00:12,  1.56it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 12.00 / 22 (54.5%):  55%|█████▌    | 22/40 [00:13<00:10,  1.78it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 13.00 / 23 (56.5%):  57%|█████▊    | 23/40 [00:13<00:08,  2.09it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 13.00 / 24 (54.2%):  57%|█████▊    | 23/40 [00:13<00:08,  2.09it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)



**Is match**: True


Average Metric: 14.00 / 25 (56.0%):  62%|██████▎   | 25/40 [00:13<00:04,  3.05it/s]

**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 15.00 / 26 (57.7%):  62%|██████▎   | 25/40 [00:13<00:04,  3.05it/s]


**expected**: Example({'number_guess': 'Guess: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 27 (55.6%):  65%|██████▌   | 26/40 [00:13<00:04,  3.05it/s]


**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 28 (53.6%):  70%|███████   | 28/40 [00:17<00:10,  1.19it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 29 (51.7%):  72%|███████▎  | 29/40 [00:18<00:07,  1.40it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 30 (50.0%):  75%|███████▌  | 30/40 [00:18<00:05,  1.70it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 16.00 / 31 (51.6%):  78%|███████▊  | 31/40 [00:18<00:04,  2.01it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 17.00 / 32 (53.1%):  80%|████████  | 32/40 [00:18<00:03,  2.34it/s]


**expected**: Example({'number_guess': 'Choose: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 33 (51.5%):  80%|████████  | 32/40 [00:18<00:03,  2.34it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 34 (50.0%):  85%|████████▌ | 34/40 [00:18<00:01,  3.65it/s]

2025/09/04 14:57:01 ERROR dspy.utils.parallelizer: Error for Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4.1 in organization org-q446iSedRzWhcp3jJDP32sfL on tokens per min (TPM): Limit 30000, Used 30000, Requested 245. Please try again in 489ms. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 17.00 / 34 (50.0%):  88%|████████▊ | 35/40 [00:19<00:02,  2.48it/s]


**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 18.00 / 35 (51.4%):  90%|█████████ | 36/40 [00:22<00:03,  1.07it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 19.00 / 36 (52.8%):  90%|█████████ | 36/40 [00:22<00:03,  1.07it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 19.00 / 37 (51.4%):  95%|█████████▌| 38/40 [00:22<00:01,  1.37it/s]


**expected**: Example({'number_guess': '1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 19.00 / 38 (50.0%):  98%|█████████▊| 39/40 [00:23<00:00,  1.39it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 20.00 / 39 (51.3%): 100%|██████████| 40/40 [00:24<00:00,  1.63it/s]

2025/09/04 14:57:06 INFO dspy.evaluate.evaluate: Average Metric: 20.0 / 40 (50.0%)
2025/09/04 14:57:06 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 50.0 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 12'].
2025/09/04 14:57:06 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [50.0, 50.0, 50.0, 50.0, 47.5, 50.0, 50.0, 47.5, 50.0]
2025/09/04 14:57:06 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 50.0


2025/09/04 14:57:06 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 10 / 27 =====



  0%|          | 0/40 [00:00<?, ?it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 1.00 / 1 (100.0%):   2%|▎         | 1/40 [00:00<00:29,  1.33it/s]


**expected**: Example({'number_guess': 'Select the number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 2.00 / 2 (100.0%):   2%|▎         | 1/40 [00:00<00:29,  1.33it/s]


**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 3.00 / 3 (100.0%):   5%|▌         | 2/40 [00:00<00:28,  1.33it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 3.00 / 4 (75.0%):   8%|▊         | 3/40 [00:00<00:27,  1.33it/s] 


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 4.00 / 5 (80.0%):  12%|█▎        | 5/40 [00:01<00:06,  5.39it/s]


**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 5.00 / 6 (83.3%):  15%|█▌        | 6/40 [00:01<00:07,  4.41it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 5.00 / 7 (71.4%):  18%|█▊        | 7/40 [00:01<00:08,  4.00it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 6.00 / 8 (75.0%):  20%|██        | 8/40 [00:04<00:32,  1.02s/it]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 6.00 / 9 (66.7%):  22%|██▎       | 9/40 [00:05<00:30,  1.02it/s]


**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 7.00 / 10 (70.0%):  25%|██▌       | 10/40 [00:06<00:28,  1.04it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 8.00 / 11 (72.7%):  28%|██▊       | 11/40 [00:07<00:25,  1.15it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 8.00 / 12 (66.7%):  30%|███       | 12/40 [00:07<00:20,  1.35it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 8.00 / 13 (61.5%):  32%|███▎      | 13/40 [00:08<00:15,  1.71it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 8.00 / 14 (57.1%):  32%|███▎      | 13/40 [00:08<00:15,  1.71it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 9.00 / 15 (60.0%):  38%|███▊      | 15/40 [00:08<00:09,  2.53it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 10.00 / 16 (62.5%):  40%|████      | 16/40 [00:08<00:07,  3.02it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 11.00 / 17 (64.7%):  42%|████▎     | 17/40 [00:09<00:09,  2.38it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 12.00 / 18 (66.7%):  45%|████▌     | 18/40 [00:09<00:08,  2.65it/s]


**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 12.00 / 19 (63.2%):  48%|████▊     | 19/40 [00:12<00:23,  1.13s/it]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 12.00 / 20 (60.0%):  50%|█████     | 20/40 [00:12<00:17,  1.17it/s]


**expected**: Example({'number_guess': 'Guess: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 13.00 / 21 (61.9%):  52%|█████▎    | 21/40 [00:13<00:13,  1.40it/s]

2025/09/04 14:57:19 ERROR dspy.utils.parallelizer: Error for Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4.1 in organization org-q446iSedRzWhcp3jJDP32sfL on tokens per min (TPM): Limit 30000, Used 29954, Requested 273. Please try again in 454ms. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 13.00 / 21 (61.9%):  55%|█████▌    | 22/40 [00:13<00:11,  1.63it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 14.00 / 22 (63.6%):  57%|█████▊    | 23/40 [00:13<00:08,  1.90it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 23 (60.9%):  60%|██████    | 24/40 [00:13<00:06,  2.33it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 24 (58.3%):  62%|██████▎   | 25/40 [00:14<00:06,  2.21it/s]


**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 25 (56.0%):  65%|██████▌   | 26/40 [00:14<00:05,  2.48it/s]


**expected**: Example({'number_guess': 'Guess: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 26 (53.8%):  68%|██████▊   | 27/40 [00:17<00:12,  1.01it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 15.00 / 27 (55.6%):  70%|███████   | 28/40 [00:17<00:08,  1.38it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 16.00 / 28 (57.1%):  72%|███████▎  | 29/40 [00:19<00:13,  1.24s/it]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)



**Is match**: True

**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})


**actual**:  Prediction(
    answer='one'
)
Average Metric: 17.00 / 29 (58.6%):  72%|███████▎  | 29/40 [00:20<00:13,  1.24s/it]

**Is match**: False

Average Metric: 17.00 / 29 (58.6%):  75%|███████▌  | 30/40 [00:20<00:11,  1.16s/it]
Average Metric: 17.00 / 30 (56.7%):  75%|███████▌  | 30/40 [00:20<00:11,  1.16s/it]


**expected**: Example({'number_guess': 'Choose: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 31 (54.8%):  78%|███████▊  | 31/40 [00:20<00:10,  1.16s/it]


**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 32 (53.1%):  82%|████████▎ | 33/40 [00:21<00:04,  1.68it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 33 (51.5%):  85%|████████▌ | 34/40 [00:22<00:04,  1.36it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 34 (50.0%):  85%|████████▌ | 34/40 [00:22<00:04,  1.36it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 18.00 / 35 (51.4%):  90%|█████████ | 36/40 [00:22<00:01,  2.02it/s]


**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 19.00 / 36 (52.8%):  92%|█████████▎| 37/40 [00:24<00:02,  1.21it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 20.00 / 37 (54.1%):  95%|█████████▌| 38/40 [00:25<00:01,  1.21it/s]


**expected**: Example({'number_guess': '1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 20.00 / 38 (52.6%):  98%|█████████▊| 39/40 [00:28<00:01,  1.38s/it]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 20.00 / 39 (51.3%): 100%|██████████| 40/40 [00:28<00:00,  1.39it/s]

2025/09/04 14:57:35 INFO dspy.evaluate.evaluate: Average Metric: 20.0 / 40 (50.0%)
2025/09/04 14:57:35 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 50.0 with parameters ['Predictor 0: Instruction 8', 'Predictor 0: Few-Shot Set 10'].
2025/09/04 14:57:35 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [50.0, 50.0, 50.0, 50.0, 47.5, 50.0, 50.0, 47.5, 50.0, 50.0]
2025/09/04 14:57:35 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 50.0


2025/09/04 14:57:35 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 11 / 27 =====



  0%|          | 0/40 [00:00<?, ?it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 0.00 / 1 (0.0%):   2%|▎         | 1/40 [00:00<00:25,  1.52it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 0.00 / 2 (0.0%):   5%|▌         | 2/40 [00:00<00:13,  2.85it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 1.00 / 3 (33.3%):   5%|▌         | 2/40 [00:00<00:13,  2.85it/s]


**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 2.00 / 4 (50.0%):   8%|▊         | 3/40 [00:00<00:12,  2.85it/s]


**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 3.00 / 5 (60.0%):  12%|█▎        | 5/40 [00:00<00:04,  7.71it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 4.00 / 6 (66.7%):  12%|█▎        | 5/40 [00:00<00:04,  7.71it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 4.00 / 7 (57.1%):  18%|█▊        | 7/40 [00:01<00:05,  6.33it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 5.00 / 8 (62.5%):  18%|█▊        | 7/40 [00:01<00:05,  6.33it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 5.00 / 9 (55.6%):  22%|██▎       | 9/40 [00:01<00:03,  8.06it/s]


**expected**: Example({'number_guess': 'Select the number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 6.00 / 10 (60.0%):  22%|██▎       | 9/40 [00:01<00:03,  8.06it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 6.00 / 11 (54.5%):  28%|██▊       | 11/40 [00:01<00:03,  9.48it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 6.00 / 12 (50.0%):  28%|██▊       | 11/40 [00:01<00:03,  9.48it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 7.00 / 13 (53.8%):  32%|███▎      | 13/40 [00:01<00:02, 11.39it/s]


**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 8.00 / 14 (57.1%):  32%|███▎      | 13/40 [00:02<00:02, 11.39it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 9.00 / 15 (60.0%):  38%|███▊      | 15/40 [00:06<00:20,  1.21it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 9.00 / 16 (56.2%):  38%|███▊      | 15/40 [00:06<00:20,  1.21it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 10.00 / 17 (58.8%):  42%|████▎     | 17/40 [00:06<00:13,  1.66it/s]


**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 10.00 / 18 (55.6%):  45%|████▌     | 18/40 [00:06<00:11,  1.84it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 11.00 / 19 (57.9%):  48%|████▊     | 19/40 [00:07<00:12,  1.65it/s]



**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**expected**: Example({'number_guess': 'Guess: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 12.00 / 20 (60.0%):  48%|████▊     | 19/40 [00:09<00:12,  1.65it/s]

**Is match**: True

Average Metric: 12.00 / 20 (60.0%):  50%|█████     | 20/40 [00:09<00:16,  1.21it/s]
Average Metric: 13.00 / 21 (61.9%):  50%|█████     | 20/40 [00:09<00:16,  1.21it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 13.00 / 22 (59.1%):  55%|█████▌    | 22/40 [00:09<00:09,  1.84it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 13.00 / 23 (56.5%):  57%|█████▊    | 23/40 [00:09<00:07,  2.17it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 14.00 / 24 (58.3%):  60%|██████    | 24/40 [00:10<00:07,  2.27it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 15.00 / 25 (60.0%):  62%|██████▎   | 25/40 [00:12<00:13,  1.08it/s]


**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 26 (57.7%):  65%|██████▌   | 26/40 [00:13<00:13,  1.03it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 16.00 / 27 (59.3%):  65%|██████▌   | 26/40 [00:13<00:13,  1.03it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 28 (57.1%):  70%|███████   | 28/40 [00:14<00:07,  1.52it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 29 (55.2%):  72%|███████▎  | 29/40 [00:14<00:06,  1.80it/s]


**expected**: Example({'number_guess': 'Guess: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 30 (53.3%):  72%|███████▎  | 29/40 [00:14<00:06,  1.80it/s]


**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 31 (51.6%):  78%|███████▊  | 31/40 [00:14<00:04,  2.23it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 17.00 / 32 (53.1%):  80%|████████  | 32/40 [00:15<00:03,  2.13it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 33 (51.5%):  82%|████████▎ | 33/40 [00:15<00:03,  2.30it/s]


**expected**: Example({'number_guess': 'Choose: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 34 (50.0%):  85%|████████▌ | 34/40 [00:18<00:05,  1.10it/s]


**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 18.00 / 35 (51.4%):  88%|████████▊ | 35/40 [00:18<00:03,  1.44it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 36 (50.0%):  90%|█████████ | 36/40 [00:18<00:02,  1.81it/s]


**expected**: Example({'number_guess': '1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 37 (48.6%):  90%|█████████ | 36/40 [00:18<00:02,  1.81it/s]
Average Metric: 18.00 / 37 (48.6%):  92%|█████████▎| 37/40 [00:18<00:01,  2.14it/s]

**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 38 (47.4%):  92%|█████████▎| 37/40 [00:18<00:01,  2.14it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 19.00 / 39 (48.7%):  98%|█████████▊| 39/40 [00:21<00:00,  1.18it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 20.00 / 40 (50.0%): 100%|██████████| 40/40 [00:21<00:00,  1.86it/s]

2025/09/04 14:57:56 INFO dspy.evaluate.evaluate: Average Metric: 20 / 40 (50.0%)
2025/09/04 14:57:56 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 50.0 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 8'].
2025/09/04 14:57:56 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [50.0, 50.0, 50.0, 50.0, 47.5, 50.0, 50.0, 47.5, 50.0, 50.0, 50.0]
2025/09/04 14:57:56 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 50.0


2025/09/04 14:57:56 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 12 / 27 =====



  0%|          | 0/40 [00:00<?, ?it/s]


**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True



Average Metric: 1.00 / 1 (100.0%):   0%|          | 0/40 [00:00<?, ?it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)
Average Metric: 1.00 / 1 (100.0%):   2%|▎         | 1/40 [00:00<00:26,  1.49it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)



**expected**: Example({'number_guess': 'Select the number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**Is match**: True

**actual**:  Prediction(
    answer='two'
)



**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)
Average Metric: 2.00 / 2 (100.0%):   2%|▎         | 1/40 [00:00<00:26,  1.49it/s]

**Is match**: True

**Is match**: True


Average Metric: 3.00 / 3 (100.0%):   5%|▌         | 2/40 [00:00<00:25,  1.49it/s]

Average Metric: 4.00 / 4 (100.0%):   8%|▊         | 3/40 [00:00<00:24,  1.49it/s]


**Is match**: False




**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

Average Metric: 4.00 / 5 (80.0%):  10%|█         | 4/40 [00:00<00:24,  1.49it/s] **actual**:  Prediction(
    answer='two'
)


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

**Is match**: True


Average Metric: 4.00 / 6 (66.7%):  12%|█▎        | 5/40 [00:00<00:23,  1.49it/s]
Average Metric: 5.00 / 7 (71.4%):  15%|█▌        | 6/40 [00:00<00:22,  1.49it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})


**actual**:  Prediction(
    answer='two'
)


**Is match**: True

**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)

Average Metric: 6.00 / 8 (75.0%):  20%|██        | 8/40 [00:01<00:04,  6.73it/s]

**Is match**: False


Average Metric: 6.00 / 9 (66.7%):  20%|██        | 8/40 [00:01<00:04,  6.73it/s]


**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 7.00 / 10 (70.0%):  25%|██▌       | 10/40 [00:04<00:17,  1.71it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 8.00 / 11 (72.7%):  28%|██▊       | 11/40 [00:04<00:14,  1.96it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 9.00 / 12 (75.0%):  28%|██▊       | 11/40 [00:05<00:14,  1.96it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 9.00 / 13 (69.2%):  32%|███▎      | 13/40 [00:05<00:10,  2.48it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 10.00 / 14 (71.4%):  35%|███▌      | 14/40 [00:05<00:10,  2.51it/s]


**expected**: Example({'number_guess': 'Guess: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 10.00 / 15 (66.7%):  38%|███▊      | 15/40 [00:05<00:08,  2.96it/s]


**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 10.00 / 16 (62.5%):  38%|███▊      | 15/40 [00:05<00:08,  2.96it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 11.00 / 17 (64.7%):  42%|████▎     | 17/40 [00:05<00:05,  4.07it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 12.00 / 18 (66.7%):  45%|████▌     | 18/40 [00:06<00:05,  4.39it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 13.00 / 19 (68.4%):  48%|████▊     | 19/40 [00:06<00:06,  3.16it/s]


**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 13.00 / 20 (65.0%):  50%|█████     | 20/40 [00:09<00:16,  1.25it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 13.00 / 21 (61.9%):  52%|█████▎    | 21/40 [00:09<00:12,  1.50it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 13.00 / 22 (59.1%):  52%|█████▎    | 21/40 [00:09<00:12,  1.50it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 13.00 / 23 (56.5%):  55%|█████▌    | 22/40 [00:09<00:12,  1.50it/s]


**expected**: Example({'number_guess': 'Guess: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 14.00 / 24 (58.3%):  60%|██████    | 24/40 [00:09<00:05,  2.96it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 25 (56.0%):  62%|██████▎   | 25/40 [00:09<00:04,  3.26it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 15.00 / 26 (57.7%):  65%|██████▌   | 26/40 [00:10<00:04,  3.09it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 16.00 / 27 (59.3%):  68%|██████▊   | 27/40 [00:10<00:04,  3.21it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 28 (57.1%):  70%|███████   | 28/40 [00:10<00:03,  3.67it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 17.00 / 29 (58.6%):  72%|███████▎  | 29/40 [00:12<00:07,  1.53it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 30 (56.7%):  75%|███████▌  | 30/40 [00:12<00:05,  1.67it/s]


**expected**: Example({'number_guess': 'Choose: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 31 (54.8%):  78%|███████▊  | 31/40 [00:13<00:05,  1.76it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 18.00 / 32 (56.2%):  80%|████████  | 32/40 [00:13<00:03,  2.25it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 19.00 / 33 (57.6%):  80%|████████  | 32/40 [00:13<00:03,  2.25it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 19.00 / 34 (55.9%):  85%|████████▌ | 34/40 [00:13<00:01,  3.31it/s]


**expected**: Example({'number_guess': '1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 19.00 / 35 (54.3%):  88%|████████▊ | 35/40 [00:13<00:01,  3.16it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 19.00 / 36 (52.8%):  88%|████████▊ | 35/40 [00:13<00:01,  3.16it/s]



**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)
**actual**:  Prediction(
    answer='two'
)


**Is match**: True

**Is match**: False



Average Metric: 20.00 / 38 (52.6%):  92%|█████████▎| 37/40 [00:14<00:00,  4.02it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 20.00 / 39 (51.3%):  98%|█████████▊| 39/40 [00:14<00:00,  4.36it/s]


**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 20.00 / 40 (50.0%): 100%|██████████| 40/40 [00:17<00:00,  2.30it/s]

2025/09/04 14:58:14 INFO dspy.evaluate.evaluate: Average Metric: 20 / 40 (50.0%)
2025/09/04 14:58:14 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 50.0 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 0'].
2025/09/04 14:58:14 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [50.0, 50.0, 50.0, 50.0, 47.5, 50.0, 50.0, 47.5, 50.0, 50.0, 50.0, 50.0]
2025/09/04 14:58:14 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 50.0


2025/09/04 14:58:14 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 13 / 27 =====



  0%|          | 0/40 [00:00<?, ?it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 1.00 / 1 (100.0%):   0%|          | 0/40 [00:00<?, ?it/s]

Average Metric: 1.00 / 1 (100.0%):   2%|▎         | 1/40 [00:00<00:27,  1.41it/s]

**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 1.00 / 2 (50.0%):   2%|▎         | 1/40 [00:00<00:27,  1.41it/s] 

**Is match**: False


Average Metric: 1.00 / 3 (33.3%):   5%|▌         | 2/40 [00:00<00:26,  1.41it/s]


**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 2.00 / 4 (50.0%):   8%|▊         | 3/40 [00:00<00:26,  1.41it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 2.00 / 5 (40.0%):  10%|█         | 4/40 [00:00<00:25,  1.41it/s]


**expected**: Example({'number_guess': 'Select the number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 3.00 / 6 (50.0%):  15%|█▌        | 6/40 [00:00<00:04,  7.81it/s]


**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 4.00 / 7 (57.1%):  15%|█▌        | 6/40 [00:01<00:04,  7.81it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 4.00 / 8 (50.0%):  20%|██        | 8/40 [00:01<00:05,  6.14it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 4.00 / 9 (44.4%):  22%|██▎       | 9/40 [00:01<00:05,  5.79it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 4.00 / 10 (40.0%):  22%|██▎       | 9/40 [00:01<00:05,  5.79it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 5.00 / 11 (45.5%):  28%|██▊       | 11/40 [00:06<00:29,  1.03s/it]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 6.00 / 12 (50.0%):  30%|███       | 12/40 [00:06<00:23,  1.17it/s]


**expected**: Example({'number_guess': 'Guess: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 6.00 / 13 (46.2%):  32%|███▎      | 13/40 [00:07<00:22,  1.22it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 7.00 / 14 (50.0%):  35%|███▌      | 14/40 [00:07<00:17,  1.48it/s]


**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 8.00 / 15 (53.3%):  38%|███▊      | 15/40 [00:08<00:14,  1.72it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 9.00 / 16 (56.2%):  40%|████      | 16/40 [00:08<00:11,  2.07it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 10.00 / 17 (58.8%):  40%|████      | 16/40 [00:08<00:11,  2.07it/s]


**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 10.00 / 18 (55.6%):  45%|████▌     | 18/40 [00:08<00:07,  3.09it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 11.00 / 19 (57.9%):  45%|████▌     | 18/40 [00:08<00:07,  3.09it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 11.00 / 20 (55.0%):  50%|█████     | 20/40 [00:08<00:04,  4.21it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 12.00 / 21 (57.1%):  52%|█████▎    | 21/40 [00:13<00:21,  1.13s/it]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 13.00 / 22 (59.1%):  55%|█████▌    | 22/40 [00:13<00:16,  1.12it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 13.00 / 23 (56.5%):  55%|█████▌    | 22/40 [00:13<00:16,  1.12it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 14.00 / 24 (58.3%):  60%|██████    | 24/40 [00:13<00:08,  1.78it/s]


**expected**: Example({'number_guess': 'Guess: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 15.00 / 25 (60.0%):  62%|██████▎   | 25/40 [00:13<00:07,  2.07it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 16.00 / 26 (61.5%):  65%|██████▌   | 26/40 [00:13<00:06,  2.28it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 17.00 / 27 (63.0%):  68%|██████▊   | 27/40 [00:17<00:14,  1.11s/it]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 28 (60.7%):  68%|██████▊   | 27/40 [00:17<00:14,  1.11s/it]




**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})


**actual**:  Prediction(
    answer='one'
)
Average Metric: 17.00 / 29 (58.6%):  70%|███████   | 28/40 [00:17<00:13,  1.11s/it]**actual**:  Prediction(
    answer='one'
)


**Is match**: False

**Is match**: False



Average Metric: 17.00 / 31 (54.8%):  75%|███████▌  | 30/40 [00:17<00:11,  1.11s/it]


**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 32 (53.1%):  80%|████████  | 32/40 [00:17<00:03,  2.22it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 18.00 / 33 (54.5%):  82%|████████▎ | 33/40 [00:17<00:02,  2.47it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 34 (52.9%):  82%|████████▎ | 33/40 [00:17<00:02,  2.47it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 35 (51.4%):  88%|████████▊ | 35/40 [00:17<00:01,  3.28it/s]


**expected**: Example({'number_guess': 'Choose: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 36 (50.0%):  90%|█████████ | 36/40 [00:22<00:04,  1.17s/it]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 19.00 / 37 (51.4%):  92%|█████████▎| 37/40 [00:24<00:03,  1.21s/it]


**expected**: Example({'number_guess': '1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 19.00 / 38 (50.0%):  92%|█████████▎| 37/40 [00:24<00:03,  1.21s/it]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 19.00 / 39 (48.7%):  98%|█████████▊| 39/40 [00:24<00:00,  1.17it/s]


**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 20.00 / 40 (50.0%): 100%|██████████| 40/40 [00:25<00:00,  1.58it/s]

2025/09/04 14:58:39 INFO dspy.evaluate.evaluate: Average Metric: 20 / 40 (50.0%)
2025/09/04 14:58:39 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 50.0 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 15'].
2025/09/04 14:58:39 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [50.0, 50.0, 50.0, 50.0, 47.5, 50.0, 50.0, 47.5, 50.0, 50.0, 50.0, 50.0, 50.0]
2025/09/04 14:58:39 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 50.0


2025/09/04 14:58:39 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 14 / 27 =====



  0%|          | 0/40 [00:00<?, ?it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 1.00 / 1 (100.0%):   2%|▎         | 1/40 [00:00<00:24,  1.62it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 2.00 / 2 (100.0%):   2%|▎         | 1/40 [00:00<00:24,  1.62it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 2.00 / 3 (66.7%):   8%|▊         | 3/40 [00:00<00:07,  4.80it/s] 



**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: True

**Is match**: False



Average Metric: 3.00 / 5 (60.0%):  10%|█         | 4/40 [00:00<00:07,  4.80it/s]


**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 4.00 / 6 (66.7%):  12%|█▎        | 5/40 [00:00<00:07,  4.80it/s]


**expected**: Example({'number_guess': 'Select the number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 5.00 / 7 (71.4%):  15%|█▌        | 6/40 [00:00<00:07,  4.80it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 5.00 / 8 (62.5%):  20%|██        | 8/40 [00:01<00:03,  9.24it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)



**Is match**: False


Average Metric: 5.00 / 9 (55.6%):  20%|██        | 8/40 [00:01<00:03,  9.24it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)
**actual**:  Prediction(
    answer='two'
)



**Is match**: True

**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**Is match**: True


**actual**:  Prediction(
    answer='one'
)

Average Metric: 7.00 / 11 (63.6%):  25%|██▌       | 10/40 [00:01<00:03,  8.11it/s]

**Is match**: False


Average Metric: 7.00 / 12 (58.3%):  28%|██▊       | 11/40 [00:01<00:03,  8.11it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 8.00 / 13 (61.5%):  32%|███▎      | 13/40 [00:01<00:02, 10.41it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 8.00 / 14 (57.1%):  32%|███▎      | 13/40 [00:01<00:02, 10.41it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 9.00 / 15 (60.0%):  38%|███▊      | 15/40 [00:05<00:15,  1.62it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 9.00 / 16 (56.2%):  40%|████      | 16/40 [00:05<00:12,  1.87it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 10.00 / 17 (58.8%):  42%|████▎     | 17/40 [00:05<00:10,  2.21it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 11.00 / 18 (61.1%):  45%|████▌     | 18/40 [00:05<00:08,  2.57it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 12.00 / 19 (63.2%):  48%|████▊     | 19/40 [00:06<00:07,  2.65it/s]


**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 12.00 / 20 (60.0%):  48%|████▊     | 19/40 [00:06<00:07,  2.65it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 12.00 / 21 (57.1%):  52%|█████▎    | 21/40 [00:06<00:05,  3.53it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 12.00 / 22 (54.5%):  55%|█████▌    | 22/40 [00:08<00:11,  1.59it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 13.00 / 23 (56.5%):  55%|█████▌    | 22/40 [00:08<00:11,  1.59it/s]


**expected**: Example({'number_guess': 'Guess: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 14.00 / 24 (58.3%):  60%|██████    | 24/40 [00:08<00:06,  2.36it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 15.00 / 25 (60.0%):  60%|██████    | 24/40 [00:08<00:06,  2.36it/s]


**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 26 (57.7%):  65%|██████▌   | 26/40 [00:08<00:04,  3.42it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 27 (55.6%):  68%|██████▊   | 27/40 [00:09<00:03,  3.55it/s]



**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)
**actual**:  Prediction(
    answer='one'
)


**Is match**: False

**Is match**: True



Average Metric: 16.00 / 28 (57.1%):  68%|██████▊   | 27/40 [00:09<00:03,  3.55it/s]
Average Metric: 16.00 / 28 (57.1%):  70%|███████   | 28/40 [00:09<00:03,  3.48it/s]

**expected**: Example({'number_guess': 'Guess: 1', 'answer': 'one'}) (input_keys={'number_guess'})

Average Metric: 16.00 / 29 (55.2%):  70%|███████   | 28/40 [00:09<00:03,  3.48it/s]**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 30 (53.3%):  72%|███████▎  | 29/40 [00:09<00:03,  3.48it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 17.00 / 31 (54.8%):  75%|███████▌  | 30/40 [00:09<00:02,  3.48it/s]


**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 32 (53.1%):  78%|███████▊  | 31/40 [00:09<00:02,  3.48it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 33 (51.5%):  82%|████████▎ | 33/40 [00:13<00:04,  1.66it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 18.00 / 34 (52.9%):  82%|████████▎ | 33/40 [00:13<00:04,  1.66it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 35 (51.4%):  88%|████████▊ | 35/40 [00:13<00:02,  2.15it/s]


**expected**: Example({'number_guess': '1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 36 (50.0%):  90%|█████████ | 36/40 [00:13<00:01,  2.34it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 37 (48.6%):  90%|█████████ | 36/40 [00:13<00:01,  2.34it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)



**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**Is match**: True

**actual**: 
 Prediction(
    answer='two'
)
Average Metric: 19.00 / 38 (50.0%):  92%|█████████▎| 37/40 [00:13<00:01,  2.34it/s]

**Is match**: True

Average Metric: 19.00 / 38 (50.0%):  95%|█████████▌| 38/40 [00:13<00:00,  2.94it/s]
Average Metric: 20.00 / 39 (51.3%):  95%|█████████▌| 38/40 [00:13<00:00,  2.94it/s]


**expected**: Example({'number_guess': 'Choose: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 20.00 / 40 (50.0%): 100%|██████████| 40/40 [00:15<00:00,  2.53it/s]

2025/09/04 14:58:55 INFO dspy.evaluate.evaluate: Average Metric: 20 / 40 (50.0%)
2025/09/04 14:58:55 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 50.0 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 0'].
2025/09/04 14:58:55 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [50.0, 50.0, 50.0, 50.0, 47.5, 50.0, 50.0, 47.5, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0]
2025/09/04 14:58:55 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 50.0


2025/09/04 14:58:55 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 15 / 27 =====



  0%|          | 0/40 [00:00<?, ?it/s]


**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 1.00 / 1 (100.0%):   2%|▎         | 1/40 [00:00<00:23,  1.65it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 1.00 / 2 (50.0%):   2%|▎         | 1/40 [00:00<00:23,  1.65it/s] 


**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 2.00 / 3 (66.7%):   5%|▌         | 2/40 [00:00<00:23,  1.65it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 2.00 / 4 (50.0%):  10%|█         | 4/40 [00:00<00:05,  6.38it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 3.00 / 5 (60.0%):  10%|█         | 4/40 [00:00<00:05,  6.38it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 3.00 / 6 (50.0%):  15%|█▌        | 6/40 [00:00<00:04,  8.15it/s]


**expected**: Example({'number_guess': 'Select the number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 4.00 / 7 (57.1%):  15%|█▌        | 6/40 [00:00<00:04,  8.15it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 5.00 / 8 (62.5%):  20%|██        | 8/40 [00:01<00:03,  9.87it/s]


**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 6.00 / 9 (66.7%):  20%|██        | 8/40 [00:01<00:03,  9.87it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 7.00 / 10 (70.0%):  25%|██▌       | 10/40 [00:07<00:37,  1.26s/it]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 7.00 / 11 (63.6%):  28%|██▊       | 11/40 [00:07<00:30,  1.04s/it]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 8.00 / 12 (66.7%):  28%|██▊       | 11/40 [00:08<00:30,  1.04s/it]


**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 8.00 / 13 (61.5%):  32%|███▎      | 13/40 [00:08<00:18,  1.45it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 8.00 / 14 (57.1%):  32%|███▎      | 13/40 [00:08<00:18,  1.45it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 9.00 / 15 (60.0%):  38%|███▊      | 15/40 [00:08<00:12,  2.03it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 10.00 / 16 (62.5%):  38%|███▊      | 15/40 [00:08<00:12,  2.03it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 10.00 / 17 (58.8%):  40%|████      | 16/40 [00:08<00:11,  2.03it/s]


**expected**: Example({'number_guess': 'Guess: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 10.00 / 18 (55.6%):  45%|████▌     | 18/40 [00:08<00:07,  3.04it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 11.00 / 19 (57.9%):  48%|████▊     | 19/40 [00:08<00:06,  3.19it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 12.00 / 20 (60.0%):  50%|█████     | 20/40 [00:09<00:06,  3.28it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 12.00 / 21 (57.1%):  52%|█████▎    | 21/40 [00:14<00:25,  1.33s/it]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 13.00 / 22 (59.1%):  55%|█████▌    | 22/40 [00:14<00:20,  1.12s/it]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 14.00 / 23 (60.9%):  55%|█████▌    | 22/40 [00:14<00:20,  1.12s/it]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 15.00 / 24 (62.5%):  60%|██████    | 24/40 [00:14<00:11,  1.38it/s]


**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 25 (60.0%):  62%|██████▎   | 25/40 [00:15<00:09,  1.54it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 16.00 / 26 (61.5%):  65%|██████▌   | 26/40 [00:15<00:08,  1.74it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 27 (59.3%):  65%|██████▌   | 26/40 [00:15<00:08,  1.74it/s]


**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 28 (57.1%):  70%|███████   | 28/40 [00:18<00:11,  1.02it/s]


**expected**: Example({'number_guess': 'Choose: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 29 (55.2%):  72%|███████▎  | 29/40 [00:19<00:08,  1.23it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 30 (53.3%):  72%|███████▎  | 29/40 [00:19<00:08,  1.23it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 31 (51.6%):  78%|███████▊  | 31/40 [00:19<00:04,  1.92it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 32 (50.0%):  80%|████████  | 32/40 [00:19<00:03,  2.20it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 33 (48.5%):  82%|████████▎ | 33/40 [00:19<00:03,  2.30it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 17.00 / 34 (50.0%):  82%|████████▎ | 33/40 [00:19<00:03,  2.30it/s]

2025/09/04 14:59:17 ERROR dspy.utils.parallelizer: Error for Example({'number_guess': 'Guess: 2', 'answer': 'two'}) (input_keys={'number_guess'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4.1 in organization org-q446iSedRzWhcp3jJDP32sfL on tokens per min (TPM): Limit 30000, Used 30000, Requested 281. Please try again in 562ms. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 17.00 / 34 (50.0%):  88%|████████▊ | 35/40 [00:21<00:03,  1.59it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 35 (48.6%):  90%|█████████ | 36/40 [00:22<00:03,  1.28it/s]


**expected**: Example({'number_guess': '1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 36 (47.2%):  92%|█████████▎| 37/40 [00:23<00:02,  1.34it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 18.00 / 37 (48.6%):  92%|█████████▎| 37/40 [00:23<00:02,  1.34it/s]


**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 19.00 / 38 (50.0%):  98%|█████████▊| 39/40 [00:24<00:00,  1.71it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 19.00 / 39 (48.7%): 100%|██████████| 40/40 [00:24<00:00,  1.63it/s]

2025/09/04 14:59:19 INFO dspy.evaluate.evaluate: Average Metric: 19.0 / 40 (47.5%)
2025/09/04 14:59:20 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 47.5 with parameters ['Predictor 0: Instruction 7', 'Predictor 0: Few-Shot Set 17'].
2025/09/04 14:59:20 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [50.0, 50.0, 50.0, 50.0, 47.5, 50.0, 50.0, 47.5, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 47.5]
2025/09/04 14:59:20 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 50.0


2025/09/04 14:59:20 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 16 / 27 =====



  0%|          | 0/40 [00:00<?, ?it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 1.00 / 1 (100.0%):   2%|▎         | 1/40 [00:00<00:21,  1.84it/s]


**expected**: Example({'number_guess': 'Select the number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 2.00 / 2 (100.0%):   2%|▎         | 1/40 [00:00<00:21,  1.84it/s]


**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 3.00 / 3 (100.0%):   8%|▊         | 3/40 [00:00<00:06,  5.38it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 3.00 / 4 (75.0%):   8%|▊         | 3/40 [00:00<00:06,  5.38it/s] 


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 3.00 / 5 (60.0%):  12%|█▎        | 5/40 [00:03<00:31,  1.11it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 3.00 / 6 (50.0%):  15%|█▌        | 6/40 [00:04<00:25,  1.35it/s]


**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 4.00 / 7 (57.1%):  15%|█▌        | 6/40 [00:04<00:25,  1.35it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 5.00 / 8 (62.5%):  20%|██        | 8/40 [00:04<00:16,  1.96it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 5.00 / 9 (55.6%):  20%|██        | 8/40 [00:04<00:16,  1.96it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 5.00 / 10 (50.0%):  22%|██▎       | 9/40 [00:04<00:15,  1.96it/s]


**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 6.00 / 11 (54.5%):  28%|██▊       | 11/40 [00:04<00:09,  3.21it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)



**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**Is match**: True

**actual**:  Prediction(
    answer='two'
)

Average Metric: 7.00 / 12 (58.3%):  28%|██▊       | 11/40 [00:04<00:09,  3.21it/s]

**Is match**: True


Average Metric: 8.00 / 13 (61.5%):  30%|███       | 12/40 [00:04<00:08,  3.21it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 9.00 / 14 (64.3%):  35%|███▌      | 14/40 [00:05<00:06,  3.89it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 9.00 / 15 (60.0%):  38%|███▊      | 15/40 [00:05<00:05,  4.31it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 9.00 / 16 (56.2%):  40%|████      | 16/40 [00:10<00:29,  1.23s/it]


**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 9.00 / 17 (52.9%):  42%|████▎     | 17/40 [00:11<00:24,  1.05s/it]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 10.00 / 18 (55.6%):  45%|████▌     | 18/40 [00:11<00:19,  1.14it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 11.00 / 19 (57.9%):  48%|████▊     | 19/40 [00:12<00:16,  1.30it/s]


**expected**: Example({'number_guess': 'Guess: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 11.00 / 20 (55.0%):  50%|█████     | 20/40 [00:12<00:12,  1.60it/s]


**expected**: Example({'number_guess': 'Guess: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 12.00 / 21 (57.1%):  50%|█████     | 20/40 [00:12<00:12,  1.60it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 13.00 / 22 (59.1%):  52%|█████▎    | 21/40 [00:12<00:11,  1.60it/s]
Average Metric: 13.00 / 22 (59.1%):  55%|█████▌    | 22/40 [00:12<00:06,  2.59it/s]

**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 13.00 / 23 (56.5%):  55%|█████▌    | 22/40 [00:12<00:06,  2.59it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 13.00 / 24 (54.2%):  60%|██████    | 24/40 [00:12<00:04,  3.54it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 14.00 / 25 (56.0%):  62%|██████▎   | 25/40 [00:16<00:15,  1.02s/it]


**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 26 (53.8%):  65%|██████▌   | 26/40 [00:16<00:11,  1.17it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 27 (51.9%):  68%|██████▊   | 27/40 [00:16<00:09,  1.43it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 15.00 / 28 (53.6%):  70%|███████   | 28/40 [00:17<00:06,  1.83it/s]


**expected**: Example({'number_guess': 'Choose: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 29 (51.7%):  72%|███████▎  | 29/40 [00:17<00:05,  2.07it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 30 (50.0%):  75%|███████▌  | 30/40 [00:17<00:04,  2.36it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 16.00 / 31 (51.6%):  78%|███████▊  | 31/40 [00:17<00:03,  2.97it/s]

2025/09/04 14:59:38 ERROR dspy.utils.parallelizer: Error for Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4.1 in organization org-q446iSedRzWhcp3jJDP32sfL on tokens per min (TPM): Limit 30000, Used 30000, Requested 254. Please try again in 508ms. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 16.00 / 31 (51.6%):  80%|████████  | 32/40 [00:18<00:02,  3.22it/s]


**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 32 (50.0%):  82%|████████▎ | 33/40 [00:20<00:06,  1.04it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 33 (48.5%):  82%|████████▎ | 33/40 [00:20<00:06,  1.04it/s]


**expected**: Example({'number_guess': '1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 34 (47.1%):  88%|████████▊ | 35/40 [00:20<00:02,  1.80it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 35 (45.7%):  90%|█████████ | 36/40 [00:21<00:01,  2.02it/s]


**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 17.00 / 36 (47.2%):  92%|█████████▎| 37/40 [00:21<00:01,  2.49it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 18.00 / 37 (48.6%):  95%|█████████▌| 38/40 [00:21<00:00,  2.79it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 38 (47.4%):  98%|█████████▊| 39/40 [00:24<00:01,  1.16s/it]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 19.00 / 39 (48.7%): 100%|██████████| 40/40 [00:24<00:00,  1.61it/s]

2025/09/04 14:59:44 INFO dspy.evaluate.evaluate: Average Metric: 19.0 / 40 (47.5%)
2025/09/04 14:59:44 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 47.5 with parameters ['Predictor 0: Instruction 6', 'Predictor 0: Few-Shot Set 17'].
2025/09/04 14:59:44 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [50.0, 50.0, 50.0, 50.0, 47.5, 50.0, 50.0, 47.5, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 47.5, 47.5]
2025/09/04 14:59:44 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 50.0


2025/09/04 14:59:44 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 17 / 27 =====



  0%|          | 0/40 [00:00<?, ?it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 1.00 / 1 (100.0%):   2%|▎         | 1/40 [00:00<00:22,  1.77it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 2.00 / 2 (100.0%):   2%|▎         | 1/40 [00:00<00:22,  1.77it/s]



**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: True

**Is match**: False



Average Metric: 3.00 / 3 (100.0%):   8%|▊         | 3/40 [00:00<00:06,  5.41it/s]
Average Metric: 3.00 / 4 (75.0%):   8%|▊         | 3/40 [00:00<00:06,  5.41it/s] 

**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})


**actual**:  Prediction(
    answer='two'
)


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: True

**Is match**: False



Average Metric: 4.00 / 6 (66.7%):  12%|█▎        | 5/40 [00:00<00:06,  5.41it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 4.00 / 7 (57.1%):  18%|█▊        | 7/40 [00:01<00:04,  7.15it/s]


**expected**: Example({'number_guess': 'Select the number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 5.00 / 8 (62.5%):  18%|█▊        | 7/40 [00:01<00:04,  7.15it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 5.00 / 9 (55.6%):  22%|██▎       | 9/40 [00:01<00:04,  6.85it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 5.00 / 10 (50.0%):  22%|██▎       | 9/40 [00:01<00:04,  6.85it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 5.00 / 11 (45.5%):  28%|██▊       | 11/40 [00:07<00:31,  1.08s/it]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 6.00 / 12 (50.0%):  30%|███       | 12/40 [00:07<00:25,  1.11it/s]


**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 7.00 / 13 (53.8%):  30%|███       | 12/40 [00:07<00:25,  1.11it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 8.00 / 14 (57.1%):  32%|███▎      | 13/40 [00:08<00:24,  1.11it/s]
Average Metric: 8.00 / 14 (57.1%):  35%|███▌      | 14/40 [00:08<00:17,  1.50it/s]

**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 9.00 / 15 (60.0%):  35%|███▌      | 14/40 [00:08<00:17,  1.50it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 10.00 / 16 (62.5%):  38%|███▊      | 15/40 [00:08<00:16,  1.50it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 11.00 / 17 (64.7%):  42%|████▎     | 17/40 [00:08<00:09,  2.47it/s]


**expected**: Example({'number_guess': 'Guess: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 11.00 / 18 (61.1%):  42%|████▎     | 17/40 [00:08<00:09,  2.47it/s]


**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 11.00 / 19 (57.9%):  48%|████▊     | 19/40 [00:08<00:07,  2.89it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 11.00 / 20 (55.0%):  48%|████▊     | 19/40 [00:08<00:07,  2.89it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 12.00 / 21 (57.1%):  52%|█████▎    | 21/40 [00:08<00:04,  3.83it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 13.00 / 22 (59.1%):  52%|█████▎    | 21/40 [00:15<00:04,  3.83it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True



Average Metric: 14.00 / 23 (60.9%):  57%|█████▊    | 23/40 [00:15<00:19,  1.15s/it]

**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 24 (58.3%):  57%|█████▊    | 23/40 [00:15<00:19,  1.15s/it]


**expected**: Example({'number_guess': 'Guess: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 15.00 / 25 (60.0%):  60%|██████    | 24/40 [00:15<00:18,  1.15s/it]
Average Metric: 15.00 / 25 (60.0%):  62%|██████▎   | 25/40 [00:15<00:12,  1.20it/s]

**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 26 (57.7%):  62%|██████▎   | 25/40 [00:15<00:12,  1.20it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 27 (55.6%):  68%|██████▊   | 27/40 [00:15<00:08,  1.55it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 28 (53.6%):  68%|██████▊   | 27/40 [00:16<00:08,  1.55it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 16.00 / 29 (55.2%):  72%|███████▎  | 29/40 [00:16<00:05,  2.13it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 30 (53.3%):  72%|███████▎  | 29/40 [00:16<00:05,  2.13it/s]


**expected**: Example({'number_guess': 'Choose: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 31 (51.6%):  78%|███████▊  | 31/40 [00:16<00:03,  2.81it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 32 (50.0%):  78%|███████▊  | 31/40 [00:16<00:03,  2.81it/s]


**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 33 (48.5%):  82%|████████▎ | 33/40 [00:16<00:02,  3.41it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 17.00 / 34 (50.0%):  85%|████████▌ | 34/40 [00:23<00:08,  1.40s/it]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 35 (48.6%):  85%|████████▌ | 34/40 [00:23<00:08,  1.40s/it]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)



**Is match**: True


Average Metric: 18.00 / 36 (50.0%):  88%|████████▊ | 35/40 [00:23<00:06,  1.40s/it]

**expected**: Example({'number_guess': '1', 'answer': 'one'}) (input_keys={'number_guess'})

Average Metric: 18.00 / 36 (50.0%):  90%|█████████ | 36/40 [00:23<00:03,  1.05it/s]**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 37 (48.6%):  90%|█████████ | 36/40 [00:23<00:03,  1.05it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 19.00 / 38 (50.0%):  92%|█████████▎| 37/40 [00:23<00:02,  1.05it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 19.00 / 39 (48.7%):  95%|█████████▌| 38/40 [00:23<00:01,  1.05it/s]


**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 20.00 / 40 (50.0%): 100%|██████████| 40/40 [00:23<00:00,  1.68it/s]

2025/09/04 15:00:08 INFO dspy.evaluate.evaluate: Average Metric: 20 / 40 (50.0%)
2025/09/04 15:00:08 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 50.0 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 17'].
2025/09/04 15:00:08 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [50.0, 50.0, 50.0, 50.0, 47.5, 50.0, 50.0, 47.5, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 47.5, 47.5, 50.0]
2025/09/04 15:00:08 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 50.0


2025/09/04 15:00:08 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 18 / 27 =====



  0%|          | 0/40 [00:00<?, ?it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 1.00 / 1 (100.0%):   2%|▎         | 1/40 [00:00<00:33,  1.16it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 1.00 / 2 (50.0%):   5%|▌         | 2/40 [00:03<01:02,  1.65s/it] 


**expected**: Example({'number_guess': 'Select the number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 2.00 / 3 (66.7%):   5%|▌         | 2/40 [00:03<01:02,  1.65s/it]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 3.00 / 4 (75.0%):  10%|█         | 4/40 [00:03<00:23,  1.52it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 4.00 / 5 (80.0%):  10%|█         | 4/40 [00:03<00:23,  1.52it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 4.00 / 6 (66.7%):  15%|█▌        | 6/40 [00:03<00:12,  2.66it/s]


**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 5.00 / 7 (71.4%):  15%|█▌        | 6/40 [00:03<00:12,  2.66it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 5.00 / 8 (62.5%):  20%|██        | 8/40 [00:03<00:08,  3.90it/s]


**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 6.00 / 9 (66.7%):  20%|██        | 8/40 [00:06<00:08,  3.90it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 6.00 / 10 (60.0%):  25%|██▌       | 10/40 [00:07<00:29,  1.03it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 6.00 / 11 (54.5%):  25%|██▌       | 10/40 [00:07<00:29,  1.03it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})


**actual**:  Prediction(
    answer='one'
)


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**Is match**: False


**actual**:  Prediction(
    answer='two'
)
Average Metric: 6.00 / 12 (50.0%):  28%|██▊       | 11/40 [00:08<00:28,  1.03it/s]

**Is match**: True


Average Metric: 7.00 / 13 (53.8%):  30%|███       | 12/40 [00:08<00:18,  1.49it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 8.00 / 14 (57.1%):  35%|███▌      | 14/40 [00:08<00:12,  2.08it/s]


**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 9.00 / 15 (60.0%):  35%|███▌      | 14/40 [00:08<00:12,  2.08it/s]


**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 9.00 / 16 (56.2%):  40%|████      | 16/40 [00:08<00:08,  2.68it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 10.00 / 17 (58.8%):  42%|████▎     | 17/40 [00:12<00:23,  1.02s/it]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 10.00 / 18 (55.6%):  45%|████▌     | 18/40 [00:12<00:18,  1.20it/s]


**expected**: Example({'number_guess': 'Guess: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 10.00 / 19 (52.6%):  45%|████▌     | 18/40 [00:12<00:18,  1.20it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 11.00 / 20 (55.0%):  50%|█████     | 20/40 [00:12<00:11,  1.80it/s]


**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 11.00 / 21 (52.4%):  52%|█████▎    | 21/40 [00:13<00:10,  1.88it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 11.00 / 22 (50.0%):  52%|█████▎    | 21/40 [00:13<00:10,  1.88it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 12.00 / 23 (52.2%):  57%|█████▊    | 23/40 [00:13<00:06,  2.76it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)



**Is match**: True


Average Metric: 13.00 / 24 (54.2%):  57%|█████▊    | 23/40 [00:13<00:06,  2.76it/s]

**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)
Average Metric: 13.00 / 24 (54.2%):  60%|██████    | 24/40 [00:13<00:05,  3.11it/s]

**Is match**: False


Average Metric: 13.00 / 25 (52.0%):  60%|██████    | 24/40 [00:13<00:05,  3.11it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 14.00 / 26 (53.8%):  65%|██████▌   | 26/40 [00:17<00:14,  1.04s/it]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 27 (51.9%):  65%|██████▌   | 26/40 [00:18<00:14,  1.04s/it]
Average Metric: 14.00 / 27 (51.9%):  68%|██████▊   | 27/40 [00:18<00:11,  1.16it/s]

**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 28 (50.0%):  68%|██████▊   | 27/40 [00:18<00:11,  1.16it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 29 (48.3%):  72%|███████▎  | 29/40 [00:18<00:06,  1.78it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 15.00 / 30 (50.0%):  75%|███████▌  | 30/40 [00:18<00:04,  2.00it/s]


**expected**: Example({'number_guess': 'Choose: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 31 (48.4%):  78%|███████▊  | 31/40 [00:18<00:03,  2.30it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 16.00 / 32 (50.0%):  80%|████████  | 32/40 [00:21<00:08,  1.00s/it]


**expected**: Example({'number_guess': 'Guess: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 17.00 / 33 (51.5%):  80%|████████  | 32/40 [00:21<00:08,  1.00s/it]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 18.00 / 34 (52.9%):  82%|████████▎ | 33/40 [00:21<00:07,  1.00s/it]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 35 (51.4%):  88%|████████▊ | 35/40 [00:21<00:02,  1.98it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 36 (50.0%):  88%|████████▊ | 35/40 [00:21<00:02,  1.98it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 19.00 / 37 (51.4%):  90%|█████████ | 36/40 [00:21<00:02,  1.98it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 19.00 / 38 (50.0%):  95%|█████████▌| 38/40 [00:21<00:00,  3.05it/s]


**expected**: Example({'number_guess': '1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 19.00 / 39 (48.7%):  95%|█████████▌| 38/40 [00:22<00:00,  3.05it/s]


**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 20.00 / 40 (50.0%): 100%|██████████| 40/40 [00:28<00:00,  1.40it/s]

2025/09/04 15:00:37 INFO dspy.evaluate.evaluate: Average Metric: 20 / 40 (50.0%)
2025/09/04 15:00:37 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 50.0 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 7'].
2025/09/04 15:00:37 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [50.0, 50.0, 50.0, 50.0, 47.5, 50.0, 50.0, 47.5, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 47.5, 47.5, 50.0, 50.0]
2025/09/04 15:00:37 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 50.0


2025/09/04 15:00:37 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 19 / 27 =====



  0%|          | 0/40 [00:00<?, ?it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 0.00 / 1 (0.0%):   2%|▎         | 1/40 [00:00<00:28,  1.38it/s]


**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 1.00 / 2 (50.0%):   2%|▎         | 1/40 [00:00<00:28,  1.38it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 2.00 / 3 (66.7%):   5%|▌         | 2/40 [00:00<00:27,  1.38it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 2.00 / 4 (50.0%):  10%|█         | 4/40 [00:00<00:06,  5.88it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 3.00 / 5 (60.0%):  10%|█         | 4/40 [00:00<00:06,  5.88it/s]


**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 4.00 / 6 (66.7%):  15%|█▌        | 6/40 [00:00<00:04,  8.28it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 4.00 / 7 (57.1%):  15%|█▌        | 6/40 [00:01<00:04,  8.28it/s]


**expected**: Example({'number_guess': 'Select the number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 5.00 / 8 (62.5%):  20%|██        | 8/40 [00:01<00:03,  8.50it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 6.00 / 9 (66.7%):  20%|██        | 8/40 [00:01<00:03,  8.50it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 7.00 / 10 (70.0%):  25%|██▌       | 10/40 [00:01<00:05,  5.38it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 7.00 / 11 (63.6%):  25%|██▌       | 10/40 [00:01<00:05,  5.38it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 7.00 / 12 (58.3%):  28%|██▊       | 11/40 [00:01<00:05,  5.38it/s]


**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 8.00 / 13 (61.5%):  32%|███▎      | 13/40 [00:01<00:03,  7.91it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 8.00 / 14 (57.1%):  32%|███▎      | 13/40 [00:02<00:03,  7.91it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 9.00 / 15 (60.0%):  38%|███▊      | 15/40 [00:07<00:22,  1.12it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 10.00 / 16 (62.5%):  40%|████      | 16/40 [00:07<00:18,  1.30it/s]


**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 10.00 / 17 (58.8%):  42%|████▎     | 17/40 [00:08<00:17,  1.31it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 11.00 / 18 (61.1%):  42%|████▎     | 17/40 [00:08<00:17,  1.31it/s]


**expected**: Example({'number_guess': 'Guess: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 11.00 / 19 (57.9%):  48%|████▊     | 19/40 [00:09<00:13,  1.61it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 12.00 / 20 (60.0%):  48%|████▊     | 19/40 [00:09<00:13,  1.61it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 13.00 / 21 (61.9%):  50%|█████     | 20/40 [00:09<00:12,  1.61it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 13.00 / 22 (59.1%):  55%|█████▌    | 22/40 [00:09<00:06,  2.71it/s]


**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 13.00 / 23 (56.5%):  55%|█████▌    | 22/40 [00:10<00:06,  2.71it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 13.00 / 24 (54.2%):  60%|██████    | 24/40 [00:11<00:09,  1.71it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 14.00 / 25 (56.0%):  62%|██████▎   | 25/40 [00:13<00:13,  1.11it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 26 (53.8%):  62%|██████▎   | 25/40 [00:13<00:13,  1.11it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 27 (51.9%):  68%|██████▊   | 27/40 [00:14<00:08,  1.60it/s]


**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 28 (50.0%):  70%|███████   | 28/40 [00:14<00:06,  1.76it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 15.00 / 29 (51.7%):  72%|███████▎  | 29/40 [00:14<00:05,  1.88it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 30 (50.0%):  72%|███████▎  | 29/40 [00:14<00:05,  1.88it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 31 (48.4%):  75%|███████▌  | 30/40 [00:14<00:05,  1.88it/s]


**expected**: Example({'number_guess': 'Choose: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 32 (46.9%):  80%|████████  | 32/40 [00:16<00:04,  1.78it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 16.00 / 33 (48.5%):  82%|████████▎ | 33/40 [00:18<00:06,  1.16it/s]


**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 17.00 / 34 (50.0%):  85%|████████▌ | 34/40 [00:18<00:04,  1.37it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 35 (48.6%):  88%|████████▊ | 35/40 [00:19<00:03,  1.45it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 18.00 / 36 (50.0%):  90%|█████████ | 36/40 [00:19<00:02,  1.68it/s]


**expected**: Example({'number_guess': 'Guess: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 19.00 / 37 (51.4%):  92%|█████████▎| 37/40 [00:21<00:02,  1.13it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 20.00 / 38 (52.6%):  92%|█████████▎| 37/40 [00:21<00:02,  1.13it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 20.00 / 39 (51.3%):  98%|█████████▊| 39/40 [00:22<00:00,  1.49it/s]


**expected**: Example({'number_guess': '1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 20.00 / 40 (50.0%): 100%|██████████| 40/40 [00:22<00:00,  1.76it/s]

2025/09/04 15:01:00 INFO dspy.evaluate.evaluate: Average Metric: 20 / 40 (50.0%)
2025/09/04 15:01:00 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 50.0 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 5'].
2025/09/04 15:01:00 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [50.0, 50.0, 50.0, 50.0, 47.5, 50.0, 50.0, 47.5, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 47.5, 47.5, 50.0, 50.0, 50.0]
2025/09/04 15:01:00 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 50.0


2025/09/04 15:01:00 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 20 / 27 =====



  0%|          | 0/40 [00:00<?, ?it/s]


**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 1.00 / 1 (100.0%):   2%|▎         | 1/40 [00:00<00:28,  1.35it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 2.00 / 2 (100.0%):   2%|▎         | 1/40 [00:00<00:28,  1.35it/s]


**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 3.00 / 3 (100.0%):   5%|▌         | 2/40 [00:00<00:28,  1.35it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 3.00 / 4 (75.0%):  10%|█         | 4/40 [00:01<00:08,  4.00it/s] 


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 3.00 / 5 (60.0%):  12%|█▎        | 5/40 [00:01<00:08,  4.19it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 3.00 / 6 (50.0%):  15%|█▌        | 6/40 [00:01<00:07,  4.64it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 3.00 / 7 (42.9%):  18%|█▊        | 7/40 [00:04<00:37,  1.15s/it]


**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 4.00 / 8 (50.0%):  20%|██        | 8/40 [00:05<00:28,  1.12it/s]


**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 4.00 / 9 (44.4%):  22%|██▎       | 9/40 [00:06<00:26,  1.15it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 5.00 / 10 (50.0%):  22%|██▎       | 9/40 [00:06<00:26,  1.15it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 6.00 / 11 (54.5%):  25%|██▌       | 10/40 [00:06<00:26,  1.15it/s]

Average Metric: 6.00 / 11 (54.5%):  28%|██▊       | 11/40 [00:06<00:17,  1.63it/s]

**expected**: Example({'number_guess': 'Select the number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

**Is match**: True



Average Metric: 7.00 / 13 (53.8%):  30%|███       | 12/40 [00:06<00:17,  1.63it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 8.00 / 14 (57.1%):  35%|███▌      | 14/40 [00:06<00:08,  2.97it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 9.00 / 15 (60.0%):  38%|███▊      | 15/40 [00:10<00:22,  1.13it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 9.00 / 16 (56.2%):  38%|███▊      | 15/40 [00:10<00:22,  1.13it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 10.00 / 17 (58.8%):  42%|████▎     | 17/40 [00:10<00:14,  1.59it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 11.00 / 18 (61.1%):  42%|████▎     | 17/40 [00:10<00:14,  1.59it/s]


**expected**: Example({'number_guess': 'Guess: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 11.00 / 19 (57.9%):  45%|████▌     | 18/40 [00:10<00:13,  1.59it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 12.00 / 20 (60.0%):  50%|█████     | 20/40 [00:10<00:07,  2.63it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 13.00 / 21 (61.9%):  50%|█████     | 20/40 [00:10<00:07,  2.63it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 13.00 / 22 (59.1%):  55%|█████▌    | 22/40 [00:14<00:14,  1.23it/s]


**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 13.00 / 23 (56.5%):  57%|█████▊    | 23/40 [00:14<00:13,  1.25it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)



**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})

**Is match**: False

**actual**:  Prediction(
    answer='one'
)

Average Metric: 13.00 / 24 (54.2%):  60%|██████    | 24/40 [00:15<00:12,  1.33it/s]

**Is match**: False


Average Metric: 13.00 / 25 (52.0%):  60%|██████    | 24/40 [00:15<00:12,  1.33it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 14.00 / 26 (53.8%):  62%|██████▎   | 25/40 [00:15<00:11,  1.33it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 27 (51.9%):  68%|██████▊   | 27/40 [00:17<00:08,  1.45it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 15.00 / 28 (53.6%):  70%|███████   | 28/40 [00:17<00:07,  1.71it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 29 (51.7%):  72%|███████▎  | 29/40 [00:17<00:05,  2.02it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 16.00 / 30 (53.3%):  75%|███████▌  | 30/40 [00:17<00:04,  2.37it/s]


**expected**: Example({'number_guess': 'Guess: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 17.00 / 31 (54.8%):  75%|███████▌  | 30/40 [00:17<00:04,  2.37it/s]


**expected**: Example({'number_guess': 'Choose: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 32 (53.1%):  80%|████████  | 32/40 [00:18<00:02,  2.78it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 33 (51.5%):  80%|████████  | 32/40 [00:18<00:02,  2.78it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 18.00 / 34 (52.9%):  85%|████████▌ | 34/40 [00:22<00:05,  1.09it/s]


**expected**: Example({'number_guess': '1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 35 (51.4%):  85%|████████▌ | 34/40 [00:22<00:05,  1.09it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 19.00 / 36 (52.8%):  90%|█████████ | 36/40 [00:22<00:02,  1.53it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 19.00 / 37 (51.4%):  92%|█████████▎| 37/40 [00:23<00:02,  1.42it/s]


**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 20.00 / 38 (52.6%):  92%|█████████▎| 37/40 [00:23<00:02,  1.42it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 20.00 / 39 (51.3%):  98%|█████████▊| 39/40 [00:25<00:00,  1.25it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 20.00 / 40 (50.0%): 100%|██████████| 40/40 [00:25<00:00,  1.57it/s]

2025/09/04 15:01:25 INFO dspy.evaluate.evaluate: Average Metric: 20 / 40 (50.0%)
2025/09/04 15:01:25 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 50.0 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 4'].
2025/09/04 15:01:25 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [50.0, 50.0, 50.0, 50.0, 47.5, 50.0, 50.0, 47.5, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 47.5, 47.5, 50.0, 50.0, 50.0, 50.0]
2025/09/04 15:01:25 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 50.0


2025/09/04 15:01:25 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 21 / 27 =====



  0%|          | 0/40 [00:00<?, ?it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 1.00 / 1 (100.0%):   2%|▎         | 1/40 [00:02<01:19,  2.03s/it]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 2.00 / 2 (100.0%):   2%|▎         | 1/40 [00:02<01:19,  2.03s/it]


**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 3.00 / 3 (100.0%):   8%|▊         | 3/40 [00:02<00:20,  1.77it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 3.00 / 4 (75.0%):   8%|▊         | 3/40 [00:02<00:20,  1.77it/s] 


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 3.00 / 5 (60.0%):  12%|█▎        | 5/40 [00:02<00:11,  3.01it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 3.00 / 6 (50.0%):  12%|█▎        | 5/40 [00:02<00:11,  3.01it/s]


**expected**: Example({'number_guess': 'Select the number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 4.00 / 7 (57.1%):  15%|█▌        | 6/40 [00:02<00:11,  3.01it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 4.00 / 8 (50.0%):  20%|██        | 8/40 [00:05<00:20,  1.53it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 5.00 / 9 (55.6%):  22%|██▎       | 9/40 [00:05<00:16,  1.84it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 6.00 / 10 (60.0%):  22%|██▎       | 9/40 [00:05<00:16,  1.84it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 6.00 / 11 (54.5%):  28%|██▊       | 11/40 [00:05<00:10,  2.72it/s]


**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 7.00 / 12 (58.3%):  28%|██▊       | 11/40 [00:05<00:10,  2.72it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 7.00 / 13 (53.8%):  32%|███▎      | 13/40 [00:06<00:09,  2.73it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 8.00 / 14 (57.1%):  32%|███▎      | 13/40 [00:06<00:09,  2.73it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 8.00 / 15 (53.3%):  38%|███▊      | 15/40 [00:09<00:19,  1.29it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 9.00 / 16 (56.2%):  40%|████      | 16/40 [00:09<00:17,  1.39it/s]


**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 10.00 / 17 (58.8%):  40%|████      | 16/40 [00:09<00:17,  1.39it/s]


**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 10.00 / 18 (55.6%):  45%|████▌     | 18/40 [00:10<00:11,  1.86it/s]


**expected**: Example({'number_guess': 'Guess: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 10.00 / 19 (52.6%):  48%|████▊     | 19/40 [00:10<00:09,  2.14it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 11.00 / 20 (55.0%):  50%|█████     | 20/40 [00:11<00:13,  1.48it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 12.00 / 21 (57.1%):  52%|█████▎    | 21/40 [00:13<00:15,  1.26it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 13.00 / 22 (59.1%):  55%|█████▌    | 22/40 [00:13<00:11,  1.52it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 13.00 / 23 (56.5%):  57%|█████▊    | 23/40 [00:13<00:08,  1.97it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 14.00 / 24 (58.3%):  60%|██████    | 24/40 [00:13<00:06,  2.29it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 25 (56.0%):  62%|██████▎   | 25/40 [00:14<00:06,  2.29it/s]


**expected**: Example({'number_guess': 'Guess: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 15.00 / 26 (57.7%):  62%|██████▎   | 25/40 [00:14<00:06,  2.29it/s]


**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 27 (55.6%):  68%|██████▊   | 27/40 [00:16<00:10,  1.20it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 16.00 / 28 (57.1%):  70%|███████   | 28/40 [00:16<00:08,  1.47it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 29 (55.2%):  70%|███████   | 28/40 [00:16<00:08,  1.47it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 30 (53.3%):  72%|███████▎  | 29/40 [00:16<00:07,  1.47it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 31 (51.6%):  78%|███████▊  | 31/40 [00:17<00:03,  2.26it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 17.00 / 32 (53.1%):  80%|████████  | 32/40 [00:17<00:03,  2.37it/s]


**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 33 (51.5%):  82%|████████▎ | 33/40 [00:17<00:02,  2.84it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 34 (50.0%):  85%|████████▌ | 34/40 [00:18<00:02,  2.46it/s]


**expected**: Example({'number_guess': 'Choose: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 35 (48.6%):  88%|████████▊ | 35/40 [00:21<00:05,  1.13s/it]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 36 (47.2%):  88%|████████▊ | 35/40 [00:21<00:05,  1.13s/it]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 18.00 / 37 (48.6%):  92%|█████████▎| 37/40 [00:22<00:02,  1.33it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 19.00 / 38 (50.0%):  95%|█████████▌| 38/40 [00:22<00:01,  1.40it/s]


**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 20.00 / 39 (51.3%):  98%|█████████▊| 39/40 [00:23<00:00,  1.55it/s]


**expected**: Example({'number_guess': '1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 20.00 / 40 (50.0%): 100%|██████████| 40/40 [00:24<00:00,  1.61it/s]

2025/09/04 15:01:50 INFO dspy.evaluate.evaluate: Average Metric: 20 / 40 (50.0%)
2025/09/04 15:01:50 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 50.0 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 9'].
2025/09/04 15:01:50 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [50.0, 50.0, 50.0, 50.0, 47.5, 50.0, 50.0, 47.5, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 47.5, 47.5, 50.0, 50.0, 50.0, 50.0, 50.0]
2025/09/04 15:01:50 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 50.0


2025/09/04 15:01:50 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 22 / 27 =====



  0%|          | 0/40 [00:00<?, ?it/s]


**expected**: Example({'number_guess': 'Select the number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)



**Is match**: True

**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})


Average Metric: 1.00 / 1 (100.0%):   0%|          | 0/40 [00:00<?, ?it/s]**actual**:  Prediction(
    answer='two'
)
Average Metric: 1.00 / 1 (100.0%):   2%|▎         | 1/40 [00:00<00:26,  1.45it/s]

**Is match**: True


Average Metric: 2.00 / 2 (100.0%):   2%|▎         | 1/40 [00:00<00:26,  1.45it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 3.00 / 3 (100.0%):   8%|▊         | 3/40 [00:00<00:08,  4.57it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 4.00 / 4 (100.0%):   8%|▊         | 3/40 [00:00<00:08,  4.57it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 4.00 / 5 (80.0%):  10%|█         | 4/40 [00:00<00:07,  4.57it/s] 


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

Average Metric: 4.00 / 5 (80.0%):  12%|█▎        | 5/40 [00:00<00:05,  6.94it/s]**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 4.00 / 6 (66.7%):  12%|█▎        | 5/40 [00:00<00:05,  6.94it/s]


**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 5.00 / 7 (71.4%):  18%|█▊        | 7/40 [00:01<00:05,  6.00it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 5.00 / 8 (62.5%):  20%|██        | 8/40 [00:05<00:36,  1.14s/it]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 6.00 / 9 (66.7%):  22%|██▎       | 9/40 [00:05<00:27,  1.12it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 7.00 / 10 (70.0%):  22%|██▎       | 9/40 [00:05<00:27,  1.12it/s]


**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 8.00 / 11 (72.7%):  25%|██▌       | 10/40 [00:05<00:26,  1.12it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 8.00 / 12 (66.7%):  30%|███       | 12/40 [00:05<00:12,  2.20it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 9.00 / 13 (69.2%):  30%|███       | 12/40 [00:06<00:12,  2.20it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 10.00 / 14 (71.4%):  35%|███▌      | 14/40 [00:09<00:23,  1.12it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 11.00 / 15 (73.3%):  35%|███▌      | 14/40 [00:09<00:23,  1.12it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 11.00 / 16 (68.8%):  40%|████      | 16/40 [00:09<00:15,  1.57it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 12.00 / 17 (70.6%):  42%|████▎     | 17/40 [00:09<00:12,  1.85it/s]


**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 12.00 / 18 (66.7%):  45%|████▌     | 18/40 [00:10<00:11,  1.98it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 13.00 / 19 (68.4%):  48%|████▊     | 19/40 [00:10<00:10,  2.03it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 13.00 / 20 (65.0%):  48%|████▊     | 19/40 [00:10<00:10,  2.03it/s]

2025/09/04 15:02:04 ERROR dspy.utils.parallelizer: Error for Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4.1 in organization org-q446iSedRzWhcp3jJDP32sfL on tokens per min (TPM): Limit 30000, Used 29780, Requested 299. Please try again in 158ms. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 13.00 / 20 (65.0%):  52%|█████▎    | 21/40 [00:13<00:17,  1.10it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 13.00 / 21 (61.9%):  55%|█████▌    | 22/40 [00:14<00:16,  1.11it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 13.00 / 22 (59.1%):  57%|█████▊    | 23/40 [00:14<00:12,  1.35it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 14.00 / 23 (60.9%):  60%|██████    | 24/40 [00:15<00:09,  1.60it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 24 (58.3%):  60%|██████    | 24/40 [00:15<00:09,  1.60it/s]


**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False



Average Metric: 14.00 / 25 (56.0%):  62%|██████▎   | 25/40 [00:15<00:09,  1.60it/s]

**expected**: Example({'number_guess': 'Guess: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 26 (53.8%):  65%|██████▌   | 26/40 [00:15<00:08,  1.60it/s]


**expected**: Example({'number_guess': 'Guess: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 15.00 / 27 (55.6%):  70%|███████   | 28/40 [00:18<00:09,  1.30it/s]


**expected**: Example({'number_guess': 'Choose: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 28 (53.6%):  72%|███████▎  | 29/40 [00:18<00:07,  1.53it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 16.00 / 29 (55.2%):  72%|███████▎  | 29/40 [00:18<00:07,  1.53it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 30 (53.3%):  78%|███████▊  | 31/40 [00:18<00:04,  2.13it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 31 (51.6%):  80%|████████  | 32/40 [00:20<00:04,  1.62it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 32 (50.0%):  82%|████████▎ | 33/40 [00:22<00:06,  1.13it/s]


**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 33 (48.5%):  82%|████████▎ | 33/40 [00:22<00:06,  1.13it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 17.00 / 34 (50.0%):  88%|████████▊ | 35/40 [00:22<00:02,  1.68it/s]


**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 18.00 / 35 (51.4%):  88%|████████▊ | 35/40 [00:22<00:02,  1.68it/s]


**expected**: Example({'number_guess': '1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 36 (50.0%):  92%|█████████▎| 37/40 [00:22<00:01,  2.39it/s]

2025/09/04 15:02:13 ERROR dspy.utils.parallelizer: Error for Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4.1 in organization org-q446iSedRzWhcp3jJDP32sfL on tokens per min (TPM): Limit 30000, Used 30000, Requested 299. Please try again in 598ms. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 18.00 / 36 (50.0%):  95%|█████████▌| 38/40 [00:23<00:00,  2.21it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 37 (48.6%):  98%|█████████▊| 39/40 [00:23<00:00,  2.24it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 19.00 / 38 (50.0%): 100%|██████████| 40/40 [00:26<00:00,  1.53it/s]

2025/09/04 15:02:16 INFO dspy.evaluate.evaluate: Average Metric: 19.0 / 40 (47.5%)
2025/09/04 15:02:16 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 47.5 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 3'].
2025/09/04 15:02:16 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [50.0, 50.0, 50.0, 50.0, 47.5, 50.0, 50.0, 47.5, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 47.5, 47.5, 50.0, 50.0, 50.0, 50.0, 50.0, 47.5]
2025/09/04 15:02:16 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 50.0


2025/09/04 15:02:16 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 23 / 27 =====



  0%|          | 0/40 [00:00<?, ?it/s]


**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 1.00 / 1 (100.0%):   2%|▎         | 1/40 [00:00<00:23,  1.64it/s]


**expected**: Example({'number_guess': 'Select the number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 2.00 / 2 (100.0%):   2%|▎         | 1/40 [00:00<00:23,  1.64it/s]


**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 3.00 / 3 (100.0%):   5%|▌         | 2/40 [00:00<00:23,  1.64it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 3.00 / 4 (75.0%):  10%|█         | 4/40 [00:00<00:05,  6.61it/s] 


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 4.00 / 5 (80.0%):  10%|█         | 4/40 [00:00<00:05,  6.61it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 4.00 / 6 (66.7%):  15%|█▌        | 6/40 [00:01<00:05,  6.57it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 5.00 / 7 (71.4%):  15%|█▌        | 6/40 [00:01<00:05,  6.57it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 6.00 / 8 (75.0%):  20%|██        | 8/40 [00:05<00:29,  1.07it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 6.00 / 9 (66.7%):  20%|██        | 8/40 [00:05<00:29,  1.07it/s]


**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 7.00 / 10 (70.0%):  22%|██▎       | 9/40 [00:05<00:29,  1.07it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 7.00 / 11 (63.6%):  28%|██▊       | 11/40 [00:05<00:15,  1.86it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 8.00 / 12 (66.7%):  28%|██▊       | 11/40 [00:05<00:15,  1.86it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 9.00 / 13 (69.2%):  32%|███▎      | 13/40 [00:05<00:11,  2.39it/s]


**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 9.00 / 14 (64.3%):  35%|███▌      | 14/40 [00:06<00:10,  2.42it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 9.00 / 15 (60.0%):  38%|███▊      | 15/40 [00:06<00:09,  2.54it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 9.00 / 16 (56.2%):  38%|███▊      | 15/40 [00:06<00:09,  2.54it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 9.00 / 17 (52.9%):  42%|████▎     | 17/40 [00:10<00:23,  1.02s/it]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 10.00 / 18 (55.6%):  45%|████▌     | 18/40 [00:11<00:19,  1.13it/s]


**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 10.00 / 19 (52.6%):  48%|████▊     | 19/40 [00:11<00:15,  1.35it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 11.00 / 20 (55.0%):  50%|█████     | 20/40 [00:11<00:11,  1.68it/s]


**expected**: Example({'number_guess': 'Guess: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 12.00 / 21 (57.1%):  52%|█████▎    | 21/40 [00:11<00:09,  1.94it/s]


**expected**: Example({'number_guess': 'Guess: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 12.00 / 22 (54.5%):  55%|█████▌    | 22/40 [00:12<00:10,  1.65it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 13.00 / 23 (56.5%):  57%|█████▊    | 23/40 [00:12<00:08,  2.05it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 13.00 / 24 (54.2%):  60%|██████    | 24/40 [00:14<00:12,  1.31it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 14.00 / 25 (56.0%):  62%|██████▎   | 25/40 [00:15<00:12,  1.15it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 15.00 / 26 (57.7%):  65%|██████▌   | 26/40 [00:15<00:09,  1.45it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 16.00 / 27 (59.3%):  68%|██████▊   | 27/40 [00:15<00:06,  1.93it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 28 (57.1%):  70%|███████   | 28/40 [00:16<00:05,  2.36it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 29 (55.2%):  72%|███████▎  | 29/40 [00:18<00:10,  1.01it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 17.00 / 30 (56.7%):  72%|███████▎  | 29/40 [00:18<00:10,  1.01it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 31 (54.8%):  78%|███████▊  | 31/40 [00:18<00:05,  1.54it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 18.00 / 32 (56.2%):  80%|████████  | 32/40 [00:19<00:04,  1.86it/s]


**expected**: Example({'number_guess': 'Choose: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 33 (54.5%):  80%|████████  | 32/40 [00:19<00:04,  1.86it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 34 (52.9%):  82%|████████▎ | 33/40 [00:19<00:03,  1.86it/s]
Average Metric: 18.00 / 34 (52.9%):  85%|████████▌ | 34/40 [00:19<00:02,  2.53it/s]

**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 35 (51.4%):  85%|████████▌ | 34/40 [00:19<00:02,  2.53it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 19.00 / 36 (52.8%):  90%|█████████ | 36/40 [00:22<00:03,  1.12it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 19.00 / 37 (51.4%):  90%|█████████ | 36/40 [00:22<00:03,  1.12it/s]


**expected**: Example({'number_guess': '1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 19.00 / 38 (50.0%):  95%|█████████▌| 38/40 [00:23<00:01,  1.56it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 19.00 / 39 (48.7%):  98%|█████████▊| 39/40 [00:23<00:00,  1.85it/s]


**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 20.00 / 40 (50.0%): 100%|██████████| 40/40 [00:23<00:00,  1.67it/s]

2025/09/04 15:02:40 INFO dspy.evaluate.evaluate: Average Metric: 20 / 40 (50.0%)
2025/09/04 15:02:40 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 50.0 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 11'].
2025/09/04 15:02:40 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [50.0, 50.0, 50.0, 50.0, 47.5, 50.0, 50.0, 47.5, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 47.5, 47.5, 50.0, 50.0, 50.0, 50.0, 50.0, 47.5, 50.0]
2025/09/04 15:02:40 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 50.0


2025/09/04 15:02:40 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 24 / 27 =====



  0%|          | 0/40 [00:00<?, ?it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 0.00 / 1 (0.0%):   2%|▎         | 1/40 [00:01<00:56,  1.45s/it]



**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)
**actual**:  Prediction(
    answer='two'
)


**Is match**: True

**Is match**: True



Average Metric: 2.00 / 3 (66.7%):   5%|▌         | 2/40 [00:01<00:55,  1.45s/it]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 2.00 / 4 (50.0%):  10%|█         | 4/40 [00:02<00:16,  2.21it/s]


**expected**: Example({'number_guess': 'Select the number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 3.00 / 5 (60.0%):  12%|█▎        | 5/40 [00:02<00:13,  2.54it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 3.00 / 6 (50.0%):  15%|█▌        | 6/40 [00:05<00:40,  1.20s/it]


**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 4.00 / 7 (57.1%):  15%|█▌        | 6/40 [00:05<00:40,  1.20s/it]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 4.00 / 8 (50.0%):  18%|█▊        | 7/40 [00:05<00:39,  1.20s/it]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 5.00 / 9 (55.6%):  22%|██▎       | 9/40 [00:06<00:22,  1.36it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 6.00 / 10 (60.0%):  25%|██▌       | 10/40 [00:07<00:19,  1.51it/s]


**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 7.00 / 11 (63.6%):  28%|██▊       | 11/40 [00:07<00:16,  1.76it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 8.00 / 12 (66.7%):  30%|███       | 12/40 [00:07<00:12,  2.16it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 8.00 / 13 (61.5%):  32%|███▎      | 13/40 [00:10<00:31,  1.15s/it]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 9.00 / 14 (64.3%):  32%|███▎      | 13/40 [00:10<00:31,  1.15s/it]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 9.00 / 15 (60.0%):  35%|███▌      | 14/40 [00:10<00:30,  1.15s/it]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 10.00 / 16 (62.5%):  40%|████      | 16/40 [00:10<00:13,  1.73it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 11.00 / 17 (64.7%):  42%|████▎     | 17/40 [00:11<00:12,  1.91it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 12.00 / 18 (66.7%):  45%|████▌     | 18/40 [00:11<00:10,  2.16it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 13.00 / 19 (68.4%):  45%|████▌     | 18/40 [00:11<00:10,  2.16it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 13.00 / 20 (65.0%):  50%|█████     | 20/40 [00:11<00:06,  2.86it/s]


**expected**: Example({'number_guess': 'Guess: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 14.00 / 21 (66.7%):  50%|█████     | 20/40 [00:15<00:06,  2.86it/s]
Average Metric: 14.00 / 21 (66.7%):  52%|█████▎    | 21/40 [00:15<00:22,  1.16s/it]

**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 15.00 / 22 (68.2%):  52%|█████▎    | 21/40 [00:15<00:22,  1.16s/it]


**expected**: Example({'number_guess': 'Guess: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 23 (65.2%):  57%|█████▊    | 23/40 [00:16<00:13,  1.22it/s]


**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 24 (62.5%):  60%|██████    | 24/40 [00:17<00:14,  1.09it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 25 (60.0%):  62%|██████▎   | 25/40 [00:17<00:11,  1.29it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 26 (57.7%):  65%|██████▌   | 26/40 [00:18<00:12,  1.15it/s]


**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 27 (55.6%):  68%|██████▊   | 27/40 [00:19<00:09,  1.42it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 28 (53.6%):  68%|██████▊   | 27/40 [00:19<00:09,  1.42it/s]

2025/09/04 15:03:00 ERROR dspy.utils.parallelizer: Error for Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4.1 in organization org-q446iSedRzWhcp3jJDP32sfL on tokens per min (TPM): Limit 30000, Used 30000, Requested 303. Please try again in 606ms. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 15.00 / 28 (53.6%):  72%|███████▎  | 29/40 [00:19<00:04,  2.25it/s]


**expected**: Example({'number_guess': 'Choose: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 29 (51.7%):  75%|███████▌  | 30/40 [00:21<00:08,  1.18it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 16.00 / 30 (53.3%):  75%|███████▌  | 30/40 [00:21<00:08,  1.18it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 17.00 / 31 (54.8%):  80%|████████  | 32/40 [00:21<00:04,  1.74it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 32 (53.1%):  82%|████████▎ | 33/40 [00:22<00:03,  2.06it/s]


**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)



**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**Is match**: True

**actual**:  Prediction(
    answer='two'
)

Average Metric: 18.00 / 33 (54.5%):  85%|████████▌ | 34/40 [00:24<00:05,  1.12it/s]

**Is match**: True


Average Metric: 19.00 / 34 (55.9%):  85%|████████▌ | 34/40 [00:24<00:05,  1.12it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 19.00 / 35 (54.3%):  90%|█████████ | 36/40 [00:24<00:02,  1.74it/s]


**expected**: Example({'number_guess': '1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 19.00 / 36 (52.8%):  92%|█████████▎| 37/40 [00:24<00:01,  2.11it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 20.00 / 37 (54.1%):  95%|█████████▌| 38/40 [00:24<00:00,  2.44it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 20.00 / 38 (52.6%):  98%|█████████▊| 39/40 [00:28<00:01,  1.37s/it]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 20.00 / 39 (51.3%): 100%|██████████| 40/40 [00:42<00:00,  1.06s/it]

2025/09/04 15:03:23 INFO dspy.evaluate.evaluate: Average Metric: 20.0 / 40 (50.0%)
2025/09/04 15:03:23 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 50.0 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 2'].
2025/09/04 15:03:23 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [50.0, 50.0, 50.0, 50.0, 47.5, 50.0, 50.0, 47.5, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 47.5, 47.5, 50.0, 50.0, 50.0, 50.0, 50.0, 47.5, 50.0, 50.0]
2025/09/04 15:03:23 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 50.0


2025/09/04 15:03:23 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 25 / 27 =====



  0%|          | 0/40 [00:00<?, ?it/s]


**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True



Average Metric: 1.00 / 1 (100.0%):   0%|          | 0/40 [00:00<?, ?it/s]

**expected**: Example({'number_guess': 'Select the number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)
Average Metric: 1.00 / 1 (100.0%):   2%|▎         | 1/40 [00:00<00:31,  1.26it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})


**actual**:  Prediction(
    answer='one'
)


**Is match**: False

**Is match**: True


Average Metric: 1.00 / 2 (50.0%):   2%|▎         | 1/40 [00:00<00:31,  1.26it/s] 



**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)
Average Metric: 2.00 / 3 (66.7%):   5%|▌         | 2/40 [00:00<00:30,  1.26it/s]

**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 3.00 / 4 (75.0%):   8%|▊         | 3/40 [00:00<00:29,  1.26it/s]

**Is match**: True


Average Metric: 4.00 / 5 (80.0%):  10%|█         | 4/40 [00:00<00:28,  1.26it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 4.00 / 6 (66.7%):  12%|█▎        | 5/40 [00:00<00:27,  1.26it/s]



**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)
**actual**:  Prediction(
    answer='one'
)


**Is match**: False

**Is match**: True


Average Metric: 5.00 / 7 (71.4%):  15%|█▌        | 6/40 [00:01<00:27,  1.26it/s]
Average Metric: 5.00 / 8 (62.5%):  18%|█▊        | 7/40 [00:01<00:04,  7.21it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False



Average Metric: 5.00 / 9 (55.6%):  22%|██▎       | 9/40 [00:01<00:04,  6.48it/s]

**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 6.00 / 10 (60.0%):  22%|██▎       | 9/40 [00:01<00:04,  6.48it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 6.00 / 11 (54.5%):  28%|██▊       | 11/40 [00:01<00:03,  7.75it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 7.00 / 12 (58.3%):  28%|██▊       | 11/40 [00:01<00:03,  7.75it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 7.00 / 13 (53.8%):  30%|███       | 12/40 [00:01<00:03,  7.75it/s]


**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 7.00 / 14 (50.0%):  35%|███▌      | 14/40 [00:02<00:03,  8.29it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 8.00 / 15 (53.3%):  38%|███▊      | 15/40 [00:02<00:03,  7.68it/s]


**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 9.00 / 16 (56.2%):  40%|████      | 16/40 [00:02<00:03,  7.93it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 10.00 / 17 (58.8%):  40%|████      | 16/40 [00:02<00:03,  7.93it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 11.00 / 18 (61.1%):  45%|████▌     | 18/40 [00:02<00:02,  8.61it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 12.00 / 19 (63.2%):  45%|████▌     | 18/40 [00:02<00:02,  8.61it/s]


**expected**: Example({'number_guess': 'Guess: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 12.00 / 20 (60.0%):  50%|█████     | 20/40 [00:02<00:02,  8.65it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 12.00 / 21 (57.1%):  50%|█████     | 20/40 [00:02<00:02,  8.65it/s]


**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 12.00 / 22 (54.5%):  55%|█████▌    | 22/40 [00:03<00:02,  7.17it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 13.00 / 23 (56.5%):  55%|█████▌    | 22/40 [00:03<00:02,  7.17it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 13.00 / 24 (54.2%):  60%|██████    | 24/40 [00:03<00:03,  5.11it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 14.00 / 25 (56.0%):  62%|██████▎   | 25/40 [00:03<00:02,  5.53it/s]


**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)



**expected**: Example({'number_guess': 'Guess: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**Is match**: False

**actual**: 
Average Metric: 14.00 / 26 (53.8%):  62%|██████▎   | 25/40 [00:03<00:02,  5.53it/s] Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 15.00 / 27 (55.6%):  65%|██████▌   | 26/40 [00:03<00:02,  5.53it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 28 (53.6%):  70%|███████   | 28/40 [00:04<00:01,  7.20it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 29 (51.7%):  70%|███████   | 28/40 [00:04<00:01,  7.20it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 30 (50.0%):  75%|███████▌  | 30/40 [00:04<00:01,  6.11it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 31 (48.4%):  75%|███████▌  | 30/40 [00:04<00:01,  6.11it/s]



**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 16.00 / 32 (50.0%):  78%|███████▊  | 31/40 [00:04<00:01,  6.11it/s]

**Is match**: True

Average Metric: 16.00 / 32 (50.0%):  80%|████████  | 32/40 [00:04<00:01,  7.40it/s]
Average Metric: 17.00 / 33 (51.5%):  80%|████████  | 32/40 [00:04<00:01,  7.40it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 18.00 / 34 (52.9%):  85%|████████▌ | 34/40 [00:04<00:00,  8.51it/s]


**expected**: Example({'number_guess': 'Choose: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 35 (51.4%):  85%|████████▌ | 34/40 [00:04<00:00,  8.51it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 36 (50.0%):  88%|████████▊ | 35/40 [00:04<00:00,  8.51it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 19.00 / 37 (51.4%):  92%|█████████▎| 37/40 [00:05<00:00, 10.34it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 19.00 / 38 (50.0%):  92%|█████████▎| 37/40 [00:05<00:00, 10.34it/s]


**expected**: Example({'number_guess': '1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 19.00 / 39 (48.7%):  98%|█████████▊| 39/40 [00:05<00:00,  8.10it/s]


**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 20.00 / 40 (50.0%): 100%|██████████| 40/40 [00:05<00:00,  7.18it/s]

2025/09/04 15:03:29 INFO dspy.evaluate.evaluate: Average Metric: 20 / 40 (50.0%)
2025/09/04 15:03:29 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 50.0 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 0'].
2025/09/04 15:03:29 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [50.0, 50.0, 50.0, 50.0, 47.5, 50.0, 50.0, 47.5, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 47.5, 47.5, 50.0, 50.0, 50.0, 50.0, 50.0, 47.5, 50.0, 50.0, 50.0]
2025/09/04 15:03:29 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 50.0


2025/09/04 15:03:29 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 26 / 27 =====



  0%|          | 0/40 [00:00<?, ?it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 1.00 / 1 (100.0%):   2%|▎         | 1/40 [00:00<00:24,  1.62it/s]


**expected**: Example({'number_guess': 'Select the number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 2.00 / 2 (100.0%):   5%|▌         | 2/40 [00:00<00:14,  2.67it/s]


**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)



**Is match**: True

**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})


**actual**:  Prediction(
    answer='one'
)
Average Metric: 3.00 / 3 (100.0%):   5%|▌         | 2/40 [00:00<00:14,  2.67it/s]

**Is match**: False


Average Metric: 3.00 / 4 (75.0%):   8%|▊         | 3/40 [00:00<00:09,  3.82it/s] 


**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 4.00 / 5 (80.0%):  12%|█▎        | 5/40 [00:01<00:05,  6.51it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 4.00 / 6 (66.7%):  12%|█▎        | 5/40 [00:01<00:05,  6.51it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 4.00 / 7 (57.1%):  15%|█▌        | 6/40 [00:01<00:05,  6.51it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 5.00 / 8 (62.5%):  20%|██        | 8/40 [00:01<00:03, 10.56it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 6.00 / 9 (66.7%):  20%|██        | 8/40 [00:01<00:03, 10.56it/s]


**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})


**actual**:  Prediction(
    answer='two'
)


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**Is match**: True

**actual**:  Prediction(
    answer='one'
)

Average Metric: 7.00 / 10 (70.0%):  25%|██▌       | 10/40 [00:01<00:03,  7.66it/s]

**Is match**: False


Average Metric: 7.00 / 11 (63.6%):  25%|██▌       | 10/40 [00:01<00:03,  7.66it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 8.00 / 12 (66.7%):  30%|███       | 12/40 [00:01<00:03,  7.90it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 8.00 / 13 (61.5%):  30%|███       | 12/40 [00:01<00:03,  7.90it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 9.00 / 14 (64.3%):  35%|███▌      | 14/40 [00:02<00:02,  9.20it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 10.00 / 15 (66.7%):  35%|███▌      | 14/40 [00:02<00:02,  9.20it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 10.00 / 16 (62.5%):  40%|████      | 16/40 [00:02<00:03,  7.18it/s]


**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 10.00 / 17 (58.8%):  42%|████▎     | 17/40 [00:02<00:04,  5.08it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 10.00 / 18 (55.6%):  45%|████▌     | 18/40 [00:07<00:26,  1.21s/it]


**expected**: Example({'number_guess': 'Guess: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 11.00 / 19 (57.9%):  48%|████▊     | 19/40 [00:08<00:23,  1.13s/it]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 12.00 / 20 (60.0%):  50%|█████     | 20/40 [00:08<00:17,  1.13it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 12.00 / 21 (57.1%):  50%|█████     | 20/40 [00:09<00:17,  1.13it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 13.00 / 22 (59.1%):  55%|█████▌    | 22/40 [00:09<00:10,  1.71it/s]


**expected**: Example({'number_guess': 'Guess: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 13.00 / 23 (56.5%):  57%|█████▊    | 23/40 [00:09<00:08,  2.04it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 14.00 / 24 (58.3%):  57%|█████▊    | 23/40 [00:09<00:08,  2.04it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 25 (56.0%):  62%|██████▎   | 25/40 [00:09<00:05,  2.77it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 15.00 / 26 (57.7%):  62%|██████▎   | 25/40 [00:09<00:05,  2.77it/s]


**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 27 (55.6%):  65%|██████▌   | 26/40 [00:09<00:05,  2.77it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 28 (53.6%):  70%|███████   | 28/40 [00:09<00:02,  4.59it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 16.00 / 29 (55.2%):  70%|███████   | 28/40 [00:09<00:02,  4.59it/s]


**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 30 (53.3%):  75%|███████▌  | 30/40 [00:10<00:01,  5.77it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 31 (51.6%):  75%|███████▌  | 30/40 [00:16<00:01,  5.77it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 17.00 / 32 (53.1%):  80%|████████  | 32/40 [00:16<00:09,  1.16s/it]


**expected**: Example({'number_guess': 'Choose: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 33 (51.5%):  82%|████████▎ | 33/40 [00:17<00:07,  1.03s/it]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 34 (50.0%):  85%|████████▌ | 34/40 [00:17<00:05,  1.18it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 18.00 / 35 (51.4%):  88%|████████▊ | 35/40 [00:17<00:03,  1.45it/s]


**expected**: Example({'number_guess': '1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 36 (50.0%):  90%|█████████ | 36/40 [00:17<00:02,  1.76it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 37 (48.6%):  92%|█████████▎| 37/40 [00:17<00:01,  2.17it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 19.00 / 38 (50.0%):  95%|█████████▌| 38/40 [00:19<00:01,  1.30it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 19.00 / 39 (48.7%):  98%|█████████▊| 39/40 [00:19<00:00,  1.49it/s]


**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 20.00 / 40 (50.0%): 100%|██████████| 40/40 [00:24<00:00,  1.65it/s]

2025/09/04 15:03:53 INFO dspy.evaluate.evaluate: Average Metric: 20 / 40 (50.0%)
2025/09/04 15:03:53 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 50.0 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 6'].
2025/09/04 15:03:53 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [50.0, 50.0, 50.0, 50.0, 47.5, 50.0, 50.0, 47.5, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 47.5, 47.5, 50.0, 50.0, 50.0, 50.0, 50.0, 47.5, 50.0, 50.0, 50.0, 50.0]
2025/09/04 15:03:53 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 50.0


2025/09/04 15:03:53 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 27 / 27 =====



  0%|          | 0/40 [00:00<?, ?it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 0.00 / 1 (0.0%):   0%|          | 0/40 [00:00<?, ?it/s]
Average Metric: 0.00 / 1 (0.0%):   2%|▎         | 1/40 [00:00<00:27,  1.40it/s]

**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 1.00 / 2 (50.0%):   2%|▎         | 1/40 [00:00<00:27,  1.40it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 1.00 / 3 (33.3%):   5%|▌         | 2/40 [00:00<00:27,  1.40it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True



Average Metric: 2.00 / 4 (50.0%):  10%|█         | 4/40 [00:00<00:05,  6.01it/s]

**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 3.00 / 5 (60.0%):  10%|█         | 4/40 [00:00<00:05,  6.01it/s]



**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**Is match**: False

**actual**:  Prediction(
    answer='two'
)

Average Metric: 3.00 / 6 (50.0%):  12%|█▎        | 5/40 [00:00<00:05,  6.01it/s]

**Is match**: True


Average Metric: 4.00 / 7 (57.1%):  15%|█▌        | 6/40 [00:00<00:05,  6.01it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 5.00 / 8 (62.5%):  20%|██        | 8/40 [00:01<00:05,  6.24it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 5.00 / 9 (55.6%):  20%|██        | 8/40 [00:01<00:05,  6.24it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 5.00 / 10 (50.0%):  25%|██▌       | 10/40 [00:01<00:04,  6.88it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 6.00 / 11 (54.5%):  25%|██▌       | 10/40 [00:01<00:04,  6.88it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 6.00 / 12 (50.0%):  30%|███       | 12/40 [00:02<00:07,  3.82it/s]


**expected**: Example({'number_guess': 'Select the number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 7.00 / 13 (53.8%):  32%|███▎      | 13/40 [00:04<00:14,  1.83it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 8.00 / 14 (57.1%):  35%|███▌      | 14/40 [00:07<00:26,  1.04s/it]


**expected**: Example({'number_guess': 'Guess: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 8.00 / 15 (53.3%):  35%|███▌      | 14/40 [00:07<00:26,  1.04s/it]


**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 9.00 / 16 (56.2%):  40%|████      | 16/40 [00:07<00:17,  1.41it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 10.00 / 17 (58.8%):  42%|████▎     | 17/40 [00:08<00:14,  1.60it/s]


**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 10.00 / 18 (55.6%):  45%|████▌     | 18/40 [00:08<00:11,  1.91it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 11.00 / 19 (57.9%):  48%|████▊     | 19/40 [00:08<00:08,  2.39it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 11.00 / 20 (55.0%):  50%|█████     | 20/40 [00:09<00:10,  1.97it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 11.00 / 21 (52.4%):  52%|█████▎    | 21/40 [00:09<00:08,  2.33it/s]


**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 11.00 / 22 (50.0%):  55%|█████▌    | 22/40 [00:11<00:16,  1.08it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 11.00 / 23 (47.8%):  57%|█████▊    | 23/40 [00:12<00:17,  1.02s/it]


**expected**: Example({'number_guess': 'Guess: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 12.00 / 24 (50.0%):  60%|██████    | 24/40 [00:13<00:12,  1.31it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 13.00 / 25 (52.0%):  62%|██████▎   | 25/40 [00:13<00:10,  1.45it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 13.00 / 26 (50.0%):  62%|██████▎   | 25/40 [00:13<00:10,  1.45it/s]


**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 13.00 / 27 (48.1%):  68%|██████▊   | 27/40 [00:13<00:05,  2.43it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 14.00 / 28 (50.0%):  68%|██████▊   | 27/40 [00:13<00:05,  2.43it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 29 (48.3%):  72%|███████▎  | 29/40 [00:14<00:03,  3.01it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 30 (46.7%):  75%|███████▌  | 30/40 [00:14<00:03,  2.86it/s]

2025/09/04 15:04:08 ERROR dspy.utils.parallelizer: Error for Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4.1 in organization org-q446iSedRzWhcp3jJDP32sfL on tokens per min (TPM): Limit 30000, Used 30000, Requested 256. Please try again in 512ms. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 14.00 / 30 (46.7%):  78%|███████▊  | 31/40 [00:14<00:03,  2.76it/s]

2025/09/04 15:04:08 ERROR dspy.utils.parallelizer: Error for Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4.1 in organization org-q446iSedRzWhcp3jJDP32sfL on tokens per min (TPM): Limit 30000, Used 30000, Requested 256. Please try again in 512ms. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 14.00 / 30 (46.7%):  80%|████████  | 32/40 [00:15<00:02,  2.98it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 31 (45.2%):  82%|████████▎ | 33/40 [00:17<00:06,  1.02it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 32 (43.8%):  85%|████████▌ | 34/40 [00:18<00:04,  1.28it/s]


**expected**: Example({'number_guess': '1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 33 (42.4%):  88%|████████▊ | 35/40 [00:18<00:02,  1.69it/s]


**expected**: Example({'number_guess': 'Choose: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 14.00 / 34 (41.2%):  90%|█████████ | 36/40 [00:18<00:01,  2.17it/s]


**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 15.00 / 35 (42.9%):  92%|█████████▎| 37/40 [00:18<00:01,  2.34it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 16.00 / 36 (44.4%):  92%|█████████▎| 37/40 [00:18<00:01,  2.34it/s]

2025/09/04 15:04:14 ERROR dspy.utils.parallelizer: Error for Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'}): litellm.RateLimitError: RateLimitError: OpenAIException - Rate limit reached for gpt-4.1 in organization org-q446iSedRzWhcp3jJDP32sfL on tokens per min (TPM): Limit 30000, Used 29929, Requested 257. Please try again in 372ms. Visit https://platform.openai.com/account/rate-limits to learn more.. Set `provide_traceback=True` for traceback.


Average Metric: 16.00 / 36 (44.4%):  98%|█████████▊| 39/40 [00:20<00:00,  1.45it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 17.00 / 37 (45.9%): 100%|██████████| 40/40 [00:21<00:00,  1.84it/s]

2025/09/04 15:04:15 INFO dspy.evaluate.evaluate: Average Metric: 17.0 / 40 (42.5%)
2025/09/04 15:04:15 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 42.5 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 17'].
2025/09/04 15:04:15 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [50.0, 50.0, 50.0, 50.0, 47.5, 50.0, 50.0, 47.5, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 47.5, 47.5, 50.0, 50.0, 50.0, 50.0, 50.0, 47.5, 50.0, 50.0, 50.0, 50.0, 42.5]
2025/09/04 15:04:15 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 50.0


2025/09/04 15:04:15 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 28 / 27 =====



  0%|          | 0/40 [00:00<?, ?it/s]


**expected**: Example({'number_guess': '2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 1.00 / 1 (100.0%):   2%|▎         | 1/40 [00:00<00:23,  1.70it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 2.00 / 2 (100.0%):   2%|▎         | 1/40 [00:00<00:23,  1.70it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 2.00 / 3 (66.7%):   8%|▊         | 3/40 [00:00<00:07,  5.13it/s] 


**expected**: Example({'number_guess': 'The number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 3.00 / 4 (75.0%):   8%|▊         | 3/40 [00:00<00:07,  5.13it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 3.00 / 5 (60.0%):  10%|█         | 4/40 [00:00<00:07,  5.13it/s]


**expected**: Example({'number_guess': 'Select the number: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 4.00 / 6 (66.7%):  12%|█▎        | 5/40 [00:00<00:06,  5.13it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 4.00 / 7 (57.1%):  18%|█▊        | 7/40 [00:00<00:02, 11.43it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 5.00 / 8 (62.5%):  18%|█▊        | 7/40 [00:01<00:02, 11.43it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 6.00 / 9 (66.7%):  22%|██▎       | 9/40 [00:05<00:25,  1.22it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 6.00 / 10 (60.0%):  22%|██▎       | 9/40 [00:05<00:25,  1.22it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 7.00 / 11 (63.6%):  28%|██▊       | 11/40 [00:05<00:17,  1.70it/s]


**expected**: Example({'number_guess': 'Two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 8.00 / 12 (66.7%):  28%|██▊       | 11/40 [00:05<00:17,  1.70it/s]


**expected**: Example({'number_guess': 'Numeric value: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 9.00 / 13 (69.2%):  32%|███▎      | 13/40 [00:06<00:14,  1.85it/s]


**expected**: Example({'number_guess': 'Select the number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 9.00 / 14 (64.3%):  35%|███▌      | 14/40 [00:08<00:20,  1.24it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 9.00 / 15 (60.0%):  38%|███▊      | 15/40 [00:08<00:17,  1.45it/s]


**expected**: Example({'number_guess': 'Option 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 10.00 / 16 (62.5%):  38%|███▊      | 15/40 [00:08<00:17,  1.45it/s]


**expected**: Example({'number_guess': 'Number one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 10.00 / 17 (58.8%):  42%|████▎     | 17/40 [00:09<00:11,  1.97it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 10.00 / 18 (55.6%):  45%|████▌     | 18/40 [00:09<00:09,  2.33it/s]


**expected**: Example({'number_guess': 'One', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 10.00 / 19 (52.6%):  45%|████▌     | 18/40 [00:09<00:09,  2.33it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 11.00 / 20 (55.0%):  50%|█████     | 20/40 [00:09<00:07,  2.74it/s]


**expected**: Example({'number_guess': 'Textual value: one', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 11.00 / 21 (52.4%):  52%|█████▎    | 21/40 [00:09<00:05,  3.19it/s]


**expected**: Example({'number_guess': 'Answer is 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 12.00 / 22 (54.5%):  55%|█████▌    | 22/40 [00:13<00:20,  1.12s/it]


**expected**: Example({'number_guess': 'Guess: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 12.00 / 23 (52.2%):  55%|█████▌    | 22/40 [00:13<00:20,  1.12s/it]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 13.00 / 24 (54.2%):  60%|██████    | 24/40 [00:13<00:11,  1.42it/s]


**expected**: Example({'number_guess': 'Guess: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 14.00 / 25 (56.0%):  60%|██████    | 24/40 [00:13<00:11,  1.42it/s]


**expected**: Example({'number_guess': 'Textual value: two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 15.00 / 26 (57.7%):  65%|██████▌   | 26/40 [00:14<00:06,  2.13it/s]


**expected**: Example({'number_guess': 'Answer is 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 15.00 / 27 (55.6%):  68%|██████▊   | 27/40 [00:14<00:06,  2.08it/s]


**expected**: Example({'number_guess': 'Pick 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 16.00 / 28 (57.1%):  68%|██████▊   | 27/40 [00:14<00:06,  2.08it/s]


**expected**: Example({'number_guess': 'Numeric value: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 29 (55.2%):  72%|███████▎  | 29/40 [00:18<00:10,  1.04it/s]


**expected**: Example({'number_guess': 'Choose: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 16.00 / 30 (53.3%):  72%|███████▎  | 29/40 [00:18<00:10,  1.04it/s]



**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**expected**: Example({'number_guess': 'The number: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False

**Is match**: True



Average Metric: 17.00 / 32 (53.1%):  78%|███████▊  | 31/40 [00:18<00:08,  1.04it/s]


**expected**: Example({'number_guess': 'Pick 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 17.00 / 33 (51.5%):  82%|████████▎ | 33/40 [00:18<00:03,  1.87it/s]


**expected**: Example({'number_guess': 'Digit: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 18.00 / 34 (52.9%):  82%|████████▎ | 33/40 [00:18<00:03,  1.87it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 18.00 / 35 (51.4%):  88%|████████▊ | 35/40 [00:20<00:03,  1.43it/s]


**expected**: Example({'number_guess': 'Choose: 2', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 19.00 / 36 (52.8%):  90%|█████████ | 36/40 [00:21<00:02,  1.64it/s]


**expected**: Example({'number_guess': '1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 19.00 / 37 (51.4%):  92%|█████████▎| 37/40 [00:21<00:01,  1.59it/s]


**expected**: Example({'number_guess': 'Number two', 'answer': 'two'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='two'
)


**Is match**: True


Average Metric: 20.00 / 38 (52.6%):  95%|█████████▌| 38/40 [00:23<00:01,  1.31it/s]


**expected**: Example({'number_guess': 'Digit: 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 20.00 / 39 (51.3%):  95%|█████████▌| 38/40 [00:23<00:01,  1.31it/s]


**expected**: Example({'number_guess': 'Option 1', 'answer': 'one'}) (input_keys={'number_guess'})

**actual**:  Prediction(
    answer='one'
)


**Is match**: False


Average Metric: 20.00 / 40 (50.0%): 100%|██████████| 40/40 [00:23<00:00,  1.72it/s]

2025/09/04 15:04:38 INFO dspy.evaluate.evaluate: Average Metric: 20 / 40 (50.0%)





2025/09/04 15:04:38 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 50.0 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 14'].
2025/09/04 15:04:38 INFO dspy.teleprompt.mipro_optimizer_v2: Scores so far: [50.0, 50.0, 50.0, 50.0, 47.5, 50.0, 50.0, 47.5, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 47.5, 47.5, 50.0, 50.0, 50.0, 50.0, 50.0, 47.5, 50.0, 50.0, 50.0, 50.0, 42.5, 50.0]
2025/09/04 15:04:38 INFO dspy.teleprompt.mipro_optimizer_v2: Best score so far: 50.0


2025/09/04 15:04:38 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 50.0!


In [None]:
###########################################################################################

In [None]:
optimized_matcher.save("./saved_files/", save_program=True)

In [19]:
optimized_matcher.save("./saved_files/test.json")

In [18]:
optimized_matcher(number_guess="The biggest numer")


Prediction(
    answer='two'
)

In [14]:

if hasattr(optimized_matcher, "teleprompter_state"):
    print("=== Teleprompter State ===")
    print(optimized_matcher.teleprompter_state)
else:
    print("No teleprompter_state found.")


No teleprompter_state found.


In [13]:
print("=== Examples in Optimized Matcher ===")
print(hasattr(optimized_matcher, "examples"))
if hasattr(optimized_matcher, "examples"):
    print(len(optimized_matcher.examples), " examples:")
    for ex in optimized_matcher.examples:
        print(ex)


=== Examples in Optimized Matcher ===
False


In [10]:

print("=== Signature ===")
print(getattr(optimized_matcher, "signature", None))

print("\n=== Examples ===")
if hasattr(optimized_matcher, "examples"):
    for ex in optimized_matcher.examples:
        print(ex)
else:
    print("No examples attribute found.")


=== Signature ===
NumberPicker(number_guess -> answer
    instructions='Guess a number'
    number_guess = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Number Guess:', 'desc': '${number_guess}'})
    answer = Field(annotation=Literal['one', 'two'] required=True json_schema_extra={'__dspy_field_type': 'output', 'prefix': 'Answer:', 'desc': '${answer}'})
)

=== Examples ===
No examples attribute found.


In [9]:
from dspy.teleprompt import render_prompt

# Render the optimized prompt for a sample input
sample_input = {"number_guess": "Pick 2"}
optimized_prompt_text = render_prompt(optimized_matcher, **sample_input)

print("=== Optimized Prompt ===")
print(optimized_prompt_text)


ImportError: cannot import name 'render_prompt' from 'dspy.teleprompt' (/usr/local/python/3.12.1/lib/python3.12/site-packages/dspy/teleprompt/__init__.py)

In [None]:

optimized_prompt = getattr(optimized_matcher, "prompt", None) \
    or getattr(optimized_matcher, "instructions", None) \
    or getattr(optimized_matcher, "template", None)

print("**Optimized Prompt:**", optimized_prompt)

**Optimized Prompt:** None


In [None]:
from dspy.teleprompt import render_prompt

print(render_prompt(optimized_matcher, number_guess="Pick 2"))


ImportError: cannot import name 'render_prompt' from 'dspy.teleprompt' (/usr/local/python/3.12.1/lib/python3.12/site-packages/dspy/teleprompt/__init__.py)

In [None]:
if hasattr(optimized_matcher, "examples"):
    print("Few-shot examples:", optimized_matcher.examples)
elif hasattr(optimized_matcher, "demos"):
    print("Few-shot demos:", optimized_matcher.demos)

Few-shot demos: []


In [None]:
print(getattr(optimized_matcher, "compiled_template", None))

None


In [None]:

# Quick peeks (if available in your version)
try:
    print("=== optimized_matcher.inspect() ===")
    print(optimized_matcher.inspect())
except Exception:
    pass

try:
    print("=== tp.inspect() ===")
    print(tp.inspect())
except Exception:
    pass


=== optimized_matcher.inspect() ===
=== tp.inspect() ===


In [None]:
from typing import Any

def _safe(obj: Any, attr: str, default=None):
    return getattr(obj, attr, default)

def _print_section(title: str, content):
    if content:
        print(f"\n===== {title} =====")
        print(content)

def print_dspy_program(program: Any):
    print(">>> Repr:", repr(program))

    # Signature + fields
    sig = _safe(program, "signature")
    _print_section("Signature", sig)
    if sig:
        # Some versions expose .instructions or a docstring
        _print_section("Signature Doc", getattr(sig, "__doc__", None))
        _print_section("Signature Instructions", _safe(sig, "instructions"))
        _print_section("Signature Inputs", _safe(sig, "inputs", _safe(sig, "input_fields", None)))
        _print_section("Signature Outputs", _safe(sig, "outputs", _safe(sig, "output_fields", None)))

    # Where different versions might store the optimized prompt/template
    for k in ("prompt", "instructions", "system_prompt", "template", "compiled_template"):
        _print_section(k, _safe(program, k))

    # Few-shot examples (these keys vary by version/teleprompter)
    examples = None
    for k in ("examples", "fewshot", "demos", "shots", "train_examples", "demo_store"):
        v = _safe(program, k)
        if v:
            _print_section(f"{k} (container)", f"type={type(v)}, len={getattr(v, '__len__', lambda: 'n/a')() if hasattr(v, '__len__') else 'n/a'}")
            examples = v
            break

    # Try to print examples if iterable
    if examples and hasattr(examples, "__iter__"):
        try:
            for i, ex in enumerate(list(examples)[:20]):  # cap to avoid huge logs
                print(f"\n--- Few-shot Example #{i+1} ---")
                if hasattr(ex, "inputs") and callable(ex.inputs):
                    print("inputs:", ex.inputs())
                elif hasattr(ex, "inputs"):
                    print("inputs:", ex.inputs)
                else:
                    print("inputs: <unknown>")

                if hasattr(ex, "outputs") and callable(ex.outputs):
                    print("outputs:", ex.outputs())
                elif hasattr(ex, "outputs"):
                    print("outputs:", ex.outputs)
                else:
                    print("outputs: <unknown>")
        except Exception as e:
            print(f"(could not iterate examples cleanly: {e})")

print("\n================= OPTIMIZED PROGRAM =================")
print_dspy_program(optimized_matcher)


>>> Repr: Predict(NumberPicker(number_guess -> answer
    instructions='Guess a number'
    number_guess = Field(annotation=Literal['one', 'two'] required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Number Guess:', 'desc': '${number_guess}'})
    answer = Field(annotation=Literal['one', 'two'] required=True json_schema_extra={'__dspy_field_type': 'output', 'prefix': 'Answer:', 'desc': '${answer}'})
))

===== Signature =====
NumberPicker(number_guess -> answer
    instructions='Guess a number'
    number_guess = Field(annotation=Literal['one', 'two'] required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Number Guess:', 'desc': '${number_guess}'})
    answer = Field(annotation=Literal['one', 'two'] required=True json_schema_extra={'__dspy_field_type': 'output', 'prefix': 'Answer:', 'desc': '${answer}'})
)

===== Signature Doc =====
Guess a number

===== Signature Instructions =====
Guess a number

===== Signature Inputs =====
{'number_guess': Fie