# DSPy Agent Example with Azure OpenAI

In [1]:
import logging
import os
import random

import dspy
import dspy.datasets
import dspy.evaluate
import dspy.retrievers
import ujson
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from datasets import load_dataset
from dotenv import load_dotenv

In [28]:
# Load environment variables from .env file.
# The LM setup cell reads AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_API_VERSION
# from os.environ, so both must be defined (in .env or the shell).
load_dotenv()

True

## Set up tracing with MLflow

In [3]:
import mlflow

# Point MLflow at the tracking server on localhost:5000 and group all runs
# from this notebook under one experiment.
# NOTE(review): this presumably requires the server to be running already — confirm.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("DSPy-Azure-Eval-Example")

# Enable tracing.
mlflow.dspy.autolog()

## Set up the language models

In [32]:
# Token provider built from DefaultAzureCredential for the Cognitive Services
# scope; it is handed to each LM as `azure_ad_token_provider` instead of an API key.
token_provider = get_bearer_token_provider(DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default")


def make_azure_lm(deployment: str) -> dspy.LM:
    """Build a `dspy.LM` for the Azure OpenAI model `azure/<deployment>`.

    Both models in this notebook share the same endpoint, API version and
    token provider, so only the name after the `azure/` prefix varies.

    Args:
        deployment: Name placed after the `azure/` prefix (e.g. "gpt-4.1").

    Returns:
        The configured `dspy.LM` instance.

    Raises:
        KeyError: If AZURE_OPENAI_ENDPOINT or AZURE_OPENAI_API_VERSION is not
            set in the environment.
    """
    return dspy.LM(
        model=f"azure/{deployment}",
        base_url=os.environ["AZURE_OPENAI_ENDPOINT"],
        api_version=os.environ["AZURE_OPENAI_API_VERSION"],
        azure_ad_token_provider=token_provider,
    )


gpt41 = make_azure_lm("gpt-4.1")
gpt4omini = make_azure_lm("gpt-4o-mini")

# Use the mini model as the default for DSPy.
dspy.configure(lm=gpt4omini)

In [5]:
# Smoke test: send one prompt straight to the GPT-4.1 LM to confirm the
# endpoint and credentials work.
gpt41("Say hello to the world!")

['Hello, world! 🌍']

In [33]:
# Smoke test: same prompt against the GPT-4o-mini LM (the DSPy default).
gpt4omini("Say hello to the world!")

['Hello, world! 🌍✨ How can I assist you today?']

## Load the HoVer multi-hop claim-verification dataset

In [None]:
# Load HoVer dataset using parquet files (avoiding deprecated script).
# `load_dataset` is already imported in the imports cell at the top.
try:
    # Try loading using parquet files which should be available
    dataset = load_dataset("parquet", data_files={
        "train": "hf://datasets/hover-nlp/hover/train-00000-of-00001.parquet"
    })
except Exception as exc:
    # Catch Exception, not a bare `except:`, so that KeyboardInterrupt and
    # SystemExit still interrupt the cell; report why the first attempt failed.
    print(f"Direct parquet load failed ({exc!r}); falling back to the converted-parquet revision.")
    # Fallback: Load using the dataset name with revision
    dataset = load_dataset("hover-nlp/hover", revision="refs/convert/parquet")

hover_train = dataset["train"]

# Convert to DSPy examples: keep only 3-hop claims, and at most one claim per
# `hpqa_id` (the first one encountered wins).
hover = []
hpqa_ids = set()

for x in hover_train:
    if x["num_hops"] == 3 and x["hpqa_id"] not in hpqa_ids:
        hpqa_ids.add(x["hpqa_id"])
        # De-duplicate the supporting-fact titles while keeping first-seen order,
        # so the example contents are identical across kernel restarts
        # (iteration order of a set of strings is not).
        titles = list(dict.fromkeys(y["key"] for y in x["supporting_facts"]))
        hover.append(
            dspy.Example(claim=x["claim"], titles=titles).with_inputs("claim")
        )

# Fixed seed so the split below is the same on every run.
random.Random(0).shuffle(hover)
# 100 train / 100 dev examples; the test set starts at index 650, so
# indices 200-649 are left unused.
trainset, devset, testset = hover[:100], hover[100:200], hover[650:]

0000.parquet:   0%|          | 0.00/2.25M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/751k [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/552k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [8]:
# Sanity-check the sizes of the three splits.
len(trainset), len(devset), len(testset)

(100, 100, 1216)

In [9]:
# Inspect one training example: the input `claim` plus its gold `titles`.
trainset[0]

Example({'claim': 'This director is known for his work on Miss Potter. The Academy of Motion Picture Arts and Sciences presents the award in which he was nominated for his work in "Babe".', 'titles': ['Academy Award for Best Director', 'Chris Noonan', 'Miss Potter']}) (input_keys={'claim'})

## Tools for the agent

In [10]:
# Cache of passage text keyed by page title; filled as a side effect of search().
DOCS = {}

# ColBERTv2 endpoint queried by search().
COLBERT_URL = 'http://20.102.90.50:2017/wiki17_abstracts'


def search(query: str, k: int) -> list[str]:
    """Retrieve the top-`k` passages for `query` and cache them in `DOCS`.

    Each passage that contains the " | " separator is split into
    "<title> | <text>" and stored as `DOCS[title] = text`. A passage without
    the separator is still returned but is not cached, so one malformed
    passage cannot fail the whole search.

    Args:
        query: Free-text search query.
        k: Number of passages to request from the retriever.

    Returns:
        The raw passage strings, in the order the retriever returned them.
    """
    results = dspy.ColBERTv2(url=COLBERT_URL)(query, k=k)
    results = [x['text'] for x in results]

    for result in results:
        # partition() never raises; `separator` is empty when " | " is absent.
        title, separator, text = result.partition(" | ")
        if separator:
            DOCS[title] = text

    return results

In [11]:
def search_wikipedia(query: str) -> list[str]:
    """Returns top-5 results and then the titles of the top-5 to top-30 results."""
    # The docstring above is left verbatim: the function is registered as an
    # agent tool, where the docstring may be shown to the model.

    hits = search(query, 30)
    full_passages = hits[:5]
    remaining = hits[5:30]

    # Only the title (the part before the first " | ") is kept for the remaining hits.
    quoted_titles = []
    for hit in remaining:
        quoted_titles.append(f"`{hit.split(' | ')[0]}`")

    summary = f"Other retrieved pages have titles: {', '.join(quoted_titles)}."
    return full_passages + [summary]

def lookup_wikipedia(title: str) -> str:
    """Returns the text of the Wikipedia page, if it exists."""
    # The docstring above is left verbatim: the function is registered as an
    # agent tool, where the docstring may be shown to the model.

    # Fast path: the page text was cached by an earlier search.
    if title in DOCS:
        return DOCS[title]

    # Otherwise search for the title and accept only an exact title match.
    prefix = title + " | "
    for hit in search(title, 10):
        if hit.startswith(prefix):
            # Strip the "<title> | " prefix so a cache miss returns the same
            # text-only value that a cache hit returns.
            return hit[len(prefix):]

    return f"No Wikipedia page found for title: {title}"

In [12]:
# Try the search tool by hand; this also caches the retrieved passages in DOCS.
search_wikipedia("Albert Einstein")

['Albert Einstein | Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist. Einstein developed the theory of relativity, one of the two pillars of modern physics (alongside quantum mechanics). Einstein\'s work is also known for its influence on the philosophy of science. Einstein is best known by the general public for his mass–energy equivalence formula "E" = "mc" (which has been dubbed "the world\'s most famous equation"). He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect", a pivotal step in the evolution of quantum theory.',
 'Einstein (disambiguation) | Albert Einstein (1879–1955) was a German-born theoretical physicist.',
 'Hans Albert Einstein | Hans Albert Einstein ( ; May 14, 1904 – July 26, 1973) was a Swiss-American engineer and educator, the second child and first son of Albert Einstein and Mileva Marić. Hans A. Einstein was a professor

In [13]:
# Try the lookup tool by hand on a title that the search above just retrieved.
lookup_wikipedia("Albert Einstein")

'Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist. Einstein developed the theory of relativity, one of the two pillars of modern physics (alongside quantum mechanics). Einstein\'s work is also known for its influence on the philosophy of science. Einstein is best known by the general public for his mass–energy equivalence formula "E" = "mc" (which has been dubbed "the world\'s most famous equation"). He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect", a pivotal step in the evolution of quantum theory.'

## Define DSPy agent using `dspy.ReAct`

In [34]:
# Task description attached to the signature.
instructions = "Find all Wikipedia titles relevant to verifying (or refuting) the claim."
# One input field (`claim`) and one output field (`titles`, a list of strings).
signature = dspy.Signature("claim -> titles: list[str]", instructions)
# ReAct agent with the two Wikipedia tools defined above and max_iters=20.
react = dspy.ReAct(signature, tools=[search_wikipedia, lookup_wikipedia], max_iters=20)

In [35]:
# Run the un-optimized agent on one claim and show its first three predicted titles.
react(claim="David Gregory was born in 1625.").titles[:3]

['David Gregory (physician)',
 'David Gregory (historian)',
 'David Gregory (mathematician)']

## Evaluate the agent

In [36]:
def top5_recall(example, pred, trace=None):
    """Fraction of the gold titles found among the first five predicted titles.

    Args:
        example: Object whose `titles` attribute holds the gold titles.
        pred: Object whose `titles` attribute holds the predicted titles;
            a missing or `None` value is treated as no predictions.
        trace: `None` during plain evaluation; any other value switches the
            metric to the strict pass/fail mode used when bootstrapping.

    Returns:
        When `trace` is `None`, the recall as a float in [0, 1].
        Otherwise a bool that is True only when the recall is perfect.
        An example with no gold titles scores 0.0 (False when bootstrapping)
        instead of raising ZeroDivisionError.
    """
    gold_titles = example.titles
    # Slice once rather than once per gold title; tolerate absent predictions.
    top5 = (getattr(pred, "titles", None) or [])[:5]

    if gold_titles:
        recall = sum(title in top5 for title in gold_titles) / len(gold_titles)
    else:
        # Recall is undefined without gold titles; score it as a miss so such
        # an example can never be accepted as a bootstrapped demonstration.
        recall = 0.0

    # If we're "bootstrapping" for optimization, return True if and only if the recall is perfect.
    if trace is not None:
        return recall >= 1.0

    # If we're just doing inference, just measure the recall.
    return recall

# Evaluator over the dev split, scored with top5_recall using 16 threads;
# shows a progress bar and a 5-row results table.
evaluate = dspy.Evaluate(devset=devset, metric=top5_recall, num_threads=16, display_progress=True, display_table=5)

In [37]:
# Wrapper to bypass exceptions during evaluation.
def safe_react(claim: str):
    """Run `react` on `claim`, falling back to an empty prediction on failure.

    Any exception raised by the agent is logged as a warning and replaced by
    a prediction with no titles, so one failing example cannot abort the
    evaluation. The warning keeps systematic failures visible; otherwise they
    would only show up as a lower score.

    Args:
        claim: The claim to gather relevant Wikipedia titles for.

    Returns:
        The agent's prediction, or `dspy.Prediction(titles=[])` if it raised.
    """
    try:
        return react(claim=claim)
    except Exception as exc:
        logging.getLogger(__name__).warning("react failed for claim %r: %r", claim, exc)
        return dspy.Prediction(titles=[])

# Baseline: score the un-optimized agent (through the exception-safe wrapper) on the dev split.
evaluate(safe_react)

Average Metric: 77.00 / 100 (77.0%): 100%|██████████| 100/100 [02:39<00:00,  1.59s/it]

2025/07/13 19:26:10 INFO dspy.evaluate.evaluate: Average Metric: 77.0 / 100 (77.0%)





Unnamed: 0,claim,example_titles,trajectory,reasoning,pred_titles,top5_recall
0,The Church of England's movement that inspired the Trinity Episcop...,"[Oxford Movement, Trinity Episcopal Church (Houghton, Michigan), S...","{'thought_0': ""I need to gather information about the Church of En...",The claim states that the Church of England's movement that inspir...,"[Trinity Episcopal Church (Houghton, Michigan), Oxford Movement, S...",✔️ [1.000]
1,"Red, White & Crüe and this athlete both fight. The french fighter ...","[Red, White &amp; Crüe, Mike Tyson, Bobby Stewart]",{'thought_0': 'I need to find information about the athlete mentio...,"The claim connects the album ""Red, White & Crüe"" by Mötley Crüe to...","[Bobby Stewart, Mike Tyson, Mötley Crüe, Red, White & Crüe]",✔️ [0.667]
2,The writer/director/actor from Glen or Glenda and Fernand Rivers s...,"[Glen or Glenda, Ed Wood, Fernand Rivers]",{'thought_0': 'I need to verify the claim about the writer/directo...,"The claim states that the writer/director/actor from ""Glen or Glen...","[Glen or Glenda, Ed Wood, Fernand Rivers]",✔️ [1.000]
3,The film by Sandi Sissel was released before The End of Suburbia.,"[Chicken Ranch (film), The End of Suburbia, Sandi Sissel]",{'thought_0': 'I need to verify the release date of the film by Sa...,"The claim states that a film by Sandi Sissel was released before ""...","[The End of Suburbia, Sandi Sissel]",✔️ [0.667]
4,The actor who played captain hook in the live production with Tayl...,"[Taylor Louderman, Peter Pan Live!, Christopher Walken]",{'thought_0': 'I need to verify the claim about the actor who play...,The claim states that the actor who played Captain Hook in the liv...,"[Peter Pan Live!, Christopher Walken, The Deer Hunter]",✔️ [0.667]


77.0

## Optimize the agent

In [38]:
# GPT-4.1 is used both as the teacher LM and as the prompt model.
# max_errors=999 is set very high, presumably so individual failures do not
# stop the optimizer — confirm.
kwargs = dict(teacher_settings=dict(lm=gpt41), prompt_model=gpt41, max_errors=999)

# MIPROv2 with the "medium" auto preset, scored by top5_recall.
tp = dspy.MIPROv2(metric=top5_recall, auto="medium", num_threads=16, **kwargs)
# Compile against the training split with up to 3 bootstrapped demos and no labeled demos.
# NOTE: this is a long-running cell that makes many LM calls (see the projection in its log output).
optimized_react = tp.compile(react, trainset=trainset, max_bootstrapped_demos=3, max_labeled_demos=0)

2025/07/13 19:27:21 INFO dspy.teleprompt.mipro_optimizer_v2: 
RUNNING WITH THE FOLLOWING MEDIUM AUTO RUN SETTINGS:
num_trials: 28
minibatch: True
num_fewshot_candidates: 12
num_instruct_candidates: 6
valset size: 80



[93m[1mProjected Language Model (LM) Calls[0m

Based on the parameters you have set, the maximum number of LM calls is projected as follows:

[93m- Prompt Generation: [94m[1m10[0m[93m data summarizer calls + [94m[1m6[0m[93m * [94m[1m2[0m[93m lm calls in program + ([94m[1m3[0m[93m) lm calls in program-aware proposer = [94m[1m25[0m[93m prompt model calls[0m
[93m- Program Evaluation: [94m[1m35[0m[93m examples in minibatch * [94m[1m28[0m[93m batches + [94m[1m80[0m[93m examples in val set * [94m[1m6[0m[93m full evals = [94m[1m1460[0m[93m LM Program calls[0m

[93m[1mEstimated Cost Calculation:[0m

[93mTotal Cost = (Number of calls to task model * (Avg Input Token Length per Call * Task Model Price per Input Token + Avg Output Token Length per Call * Task Model Price per Output Token)
            + (Number of program calls * (Avg Input Token Length per Call * Task Prompt Price per Input Token + Avg Output Token Length per Call * Prompt Model 

2025/07/13 19:27:41 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 1: BOOTSTRAP FEWSHOT EXAMPLES <==
2025/07/13 19:27:41 INFO dspy.teleprompt.mipro_optimizer_v2: These will be used as few-shot example candidates for our program and for creating instructions.

2025/07/13 19:27:41 INFO dspy.teleprompt.mipro_optimizer_v2: Bootstrapping N=12 sets of demonstrations...



No input received within 20 seconds. Proceeding with execution...
Bootstrapping set 1/12
Bootstrapping set 2/12


 55%|█████▌    | 11/20 [01:47<01:27,  9.74s/it]


Bootstrapped 3 full traces after 11 examples for up to 1 rounds, amounting to 11 attempts.
Bootstrapping set 3/12


 15%|█▌        | 3/20 [00:13<01:16,  4.52s/it]


Bootstrapped 3 full traces after 3 examples for up to 1 rounds, amounting to 3 attempts.
Bootstrapping set 4/12


 10%|█         | 2/20 [00:00<00:00, 63.35it/s]


Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 5/12


 50%|█████     | 10/20 [00:29<00:29,  2.99s/it]


Bootstrapped 3 full traces after 10 examples for up to 1 rounds, amounting to 10 attempts.
Bootstrapping set 6/12


 35%|███▌      | 7/20 [00:14<00:26,  2.03s/it]


Bootstrapped 3 full traces after 7 examples for up to 1 rounds, amounting to 7 attempts.
Bootstrapping set 7/12


 20%|██        | 4/20 [00:15<01:03,  3.96s/it]


Bootstrapped 3 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Bootstrapping set 8/12


 10%|█         | 2/20 [00:00<00:00, 49.77it/s]


Bootstrapped 1 full traces after 2 examples for up to 1 rounds, amounting to 2 attempts.
Bootstrapping set 9/12


 35%|███▌      | 7/20 [00:11<00:21,  1.68s/it]


Bootstrapped 3 full traces after 7 examples for up to 1 rounds, amounting to 7 attempts.
Bootstrapping set 10/12


 50%|█████     | 10/20 [00:00<00:00, 58.79it/s]


Bootstrapped 3 full traces after 10 examples for up to 1 rounds, amounting to 10 attempts.
Bootstrapping set 11/12


  5%|▌         | 1/20 [00:00<00:00, 47.06it/s]


Bootstrapped 1 full traces after 1 examples for up to 1 rounds, amounting to 1 attempts.
Bootstrapping set 12/12


 20%|██        | 4/20 [00:06<00:26,  1.64s/it]
2025/07/13 19:31:04 INFO dspy.teleprompt.mipro_optimizer_v2: 
==> STEP 2: PROPOSE INSTRUCTION CANDIDATES <==
2025/07/13 19:31:04 INFO dspy.teleprompt.mipro_optimizer_v2: We will use the few-shot examples from the previous step, a generated dataset summary, a summary of the program code, and a randomly selected prompting tip to propose instructions.


Bootstrapped 1 full traces after 4 examples for up to 1 rounds, amounting to 4 attempts.
Error getting source code: unhashable type: 'dict'.

Running without program aware proposer.


2025/07/13 19:31:12 INFO dspy.teleprompt.mipro_optimizer_v2: 
Proposing N=6 instructions...

2025/07/13 19:31:37 INFO dspy.teleprompt.mipro_optimizer_v2: Proposed Instructions for Predictor 0:

2025/07/13 19:31:37 INFO dspy.teleprompt.mipro_optimizer_v2: 0: Find all Wikipedia titles relevant to verifying (or refuting) the claim.

You are an Agent. In each episode, you will be given the fields `claim` as input. And you can see your past trajectory so far.
Your goal is to use one or more of the supplied tools to collect any necessary information for producing `titles`.

To do this, you will interleave next_thought, next_tool_name, and next_tool_args in each turn, and also when finishing the task.
After each tool call, you receive a resulting observation, which gets appended to your trajectory.

When writing next_thought, you may reason about the current situation and plan for future steps.
When selecting the next_tool_name and its next_tool_args, the tool must be one of:

(1) search_wiki

Average Metric: 61.33 / 80 (76.7%): 100%|██████████| 80/80 [01:40<00:00,  1.25s/it]

2025/07/13 19:33:17 INFO dspy.evaluate.evaluate: Average Metric: 61.33333333333333 / 80 (76.7%)
2025/07/13 19:33:17 INFO dspy.teleprompt.mipro_optimizer_v2: Default program score: 76.67






2025/07/13 19:33:18 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 2 / 35 - Minibatch ==


Average Metric: 23.00 / 35 (65.7%): 100%|██████████| 35/35 [01:15<00:00,  2.16s/it]

2025/07/13 19:34:34 INFO dspy.evaluate.evaluate: Average Metric: 23.0 / 35 (65.7%)





2025/07/13 19:34:34 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 65.71 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 6', 'Predictor 1: Instruction 4', 'Predictor 1: Few-Shot Set 2'].
2025/07/13 19:34:34 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.71]
2025/07/13 19:34:34 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67]
2025/07/13 19:34:34 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67


2025/07/13 19:34:34 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 3 / 35 - Minibatch ==


Average Metric: 25.33 / 35 (72.4%): 100%|██████████| 35/35 [01:14<00:00,  2.11s/it]

2025/07/13 19:35:48 INFO dspy.evaluate.evaluate: Average Metric: 25.333333333333332 / 35 (72.4%)





2025/07/13 19:35:49 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 72.38 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 6', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 4'].
2025/07/13 19:35:49 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.71, 72.38]
2025/07/13 19:35:49 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67]
2025/07/13 19:35:49 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67


2025/07/13 19:35:49 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 4 / 35 - Minibatch ==


Average Metric: 22.00 / 35 (62.9%): 100%|██████████| 35/35 [02:00<00:00,  3.44s/it]

2025/07/13 19:37:49 INFO dspy.evaluate.evaluate: Average Metric: 22.0 / 35 (62.9%)





2025/07/13 19:37:50 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 62.86 on minibatch of size 35 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 5', 'Predictor 1: Instruction 4', 'Predictor 1: Few-Shot Set 6'].
2025/07/13 19:37:50 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.71, 72.38, 62.86]
2025/07/13 19:37:50 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67]
2025/07/13 19:37:50 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67


2025/07/13 19:37:50 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 5 / 35 - Minibatch ==


Average Metric: 23.67 / 35 (67.6%): 100%|██████████| 35/35 [01:27<00:00,  2.49s/it]

2025/07/13 19:39:18 INFO dspy.evaluate.evaluate: Average Metric: 23.666666666666664 / 35 (67.6%)





2025/07/13 19:39:18 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 67.62 on minibatch of size 35 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 1', 'Predictor 1: Instruction 3', 'Predictor 1: Few-Shot Set 3'].
2025/07/13 19:39:18 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.71, 72.38, 62.86, 67.62]
2025/07/13 19:39:18 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67]
2025/07/13 19:39:18 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67


2025/07/13 19:39:18 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 6 / 35 - Minibatch ==


Average Metric: 24.33 / 35 (69.5%): 100%|██████████| 35/35 [01:19<00:00,  2.27s/it]

2025/07/13 19:40:38 INFO dspy.evaluate.evaluate: Average Metric: 24.333333333333332 / 35 (69.5%)





2025/07/13 19:40:39 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 69.52 on minibatch of size 35 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 10', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 9'].
2025/07/13 19:40:39 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.71, 72.38, 62.86, 67.62, 69.52]
2025/07/13 19:40:39 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67]
2025/07/13 19:40:39 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67


2025/07/13 19:40:39 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 7 / 35 - Full Evaluation =====
2025/07/13 19:40:39 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 72.38) from minibatch trials...


Average Metric: 59.67 / 80 (74.6%): 100%|██████████| 80/80 [01:33<00:00,  1.16s/it]

2025/07/13 19:42:12 INFO dspy.evaluate.evaluate: Average Metric: 59.666666666666664 / 80 (74.6%)
2025/07/13 19:42:12 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67, 74.58]
2025/07/13 19:42:12 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67
2025/07/13 19:42:12 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/07/13 19:42:12 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 8 / 35 - Minibatch ==



Average Metric: 24.33 / 35 (69.5%): 100%|██████████| 35/35 [01:03<00:00,  1.81s/it]

2025/07/13 19:43:16 INFO dspy.evaluate.evaluate: Average Metric: 24.333333333333332 / 35 (69.5%)
2025/07/13 19:43:16 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 69.52 on minibatch of size 35 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 3', 'Predictor 1: Instruction 5', 'Predictor 1: Few-Shot Set 6'].
2025/07/13 19:43:16 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.71, 72.38, 62.86, 67.62, 69.52, 69.52]
2025/07/13 19:43:16 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67, 74.58]
2025/07/13 19:43:16 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67


2025/07/13 19:43:16 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 9 / 35 - Minibatch ==



Average Metric: 27.00 / 35 (77.1%): 100%|██████████| 35/35 [00:51<00:00,  1.48s/it]

2025/07/13 19:44:09 INFO dspy.evaluate.evaluate: Average Metric: 27.0 / 35 (77.1%)
2025/07/13 19:44:09 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 77.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 10', 'Predictor 1: Instruction 1', 'Predictor 1: Few-Shot Set 6'].
2025/07/13 19:44:09 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.71, 72.38, 62.86, 67.62, 69.52, 69.52, 77.14]
2025/07/13 19:44:09 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67, 74.58]
2025/07/13 19:44:09 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67


2025/07/13 19:44:09 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 10 / 35 - Minibatch ==



Average Metric: 26.33 / 35 (75.2%): 100%|██████████| 35/35 [01:40<00:00,  2.88s/it]

2025/07/13 19:45:50 INFO dspy.evaluate.evaluate: Average Metric: 26.333333333333332 / 35 (75.2%)
2025/07/13 19:45:50 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 75.24 on minibatch of size 35 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 1', 'Predictor 1: Instruction 5', 'Predictor 1: Few-Shot Set 1'].
2025/07/13 19:45:50 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.71, 72.38, 62.86, 67.62, 69.52, 69.52, 77.14, 75.24]
2025/07/13 19:45:50 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67, 74.58]
2025/07/13 19:45:50 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67


2025/07/13 19:45:50 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 11 / 35 - Minibatch ==



Average Metric: 24.67 / 35 (70.5%): 100%|██████████| 35/35 [01:21<00:00,  2.33s/it]

2025/07/13 19:47:13 INFO dspy.evaluate.evaluate: Average Metric: 24.666666666666664 / 35 (70.5%)
2025/07/13 19:47:13 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 70.48 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 10', 'Predictor 1: Instruction 1', 'Predictor 1: Few-Shot Set 1'].
2025/07/13 19:47:13 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.71, 72.38, 62.86, 67.62, 69.52, 69.52, 77.14, 75.24, 70.48]
2025/07/13 19:47:13 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67, 74.58]
2025/07/13 19:47:13 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67


2025/07/13 19:47:13 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 12 / 35 - Minibatch ==



Average Metric: 27.00 / 35 (77.1%): 100%|██████████| 35/35 [00:29<00:00,  1.19it/s]

2025/07/13 19:47:42 INFO dspy.evaluate.evaluate: Average Metric: 27.0 / 35 (77.1%)
2025/07/13 19:47:43 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 77.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 0', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 6'].
2025/07/13 19:47:43 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.71, 72.38, 62.86, 67.62, 69.52, 69.52, 77.14, 75.24, 70.48, 77.14]
2025/07/13 19:47:43 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67, 74.58]
2025/07/13 19:47:43 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67


2025/07/13 19:47:43 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 13 / 35 - Full Evaluation =====
2025/07/13 19:47:43 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 77.14) from minibatch trials...



Average Metric: 57.67 / 80 (72.1%): 100%|██████████| 80/80 [00:39<00:00,  2.04it/s]

2025/07/13 19:48:22 INFO dspy.evaluate.evaluate: Average Metric: 57.666666666666664 / 80 (72.1%)
2025/07/13 19:48:22 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67, 74.58, 72.08]
2025/07/13 19:48:22 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67
2025/07/13 19:48:22 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/07/13 19:48:22 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 14 / 35 - Minibatch ==



Average Metric: 25.33 / 35 (72.4%): 100%|██████████| 35/35 [00:01<00:00, 23.58it/s]

2025/07/13 19:48:26 INFO dspy.evaluate.evaluate: Average Metric: 25.333333333333332 / 35 (72.4%)





2025/07/13 19:48:26 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 72.38 on minibatch of size 35 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 10', 'Predictor 1: Instruction 1', 'Predictor 1: Few-Shot Set 6'].
2025/07/13 19:48:26 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.71, 72.38, 62.86, 67.62, 69.52, 69.52, 77.14, 75.24, 70.48, 77.14, 72.38]
2025/07/13 19:48:26 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67, 74.58, 72.08]
2025/07/13 19:48:26 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67


2025/07/13 19:48:26 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 15 / 35 - Minibatch ==


Average Metric: 27.67 / 35 (79.0%): 100%|██████████| 35/35 [01:11<00:00,  2.05s/it]

2025/07/13 19:49:38 INFO dspy.evaluate.evaluate: Average Metric: 27.666666666666664 / 35 (79.0%)
2025/07/13 19:49:38 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 79.05 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 1', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 6'].
2025/07/13 19:49:38 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.71, 72.38, 62.86, 67.62, 69.52, 69.52, 77.14, 75.24, 70.48, 77.14, 72.38, 79.05]
2025/07/13 19:49:38 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67, 74.58, 72.08]
2025/07/13 19:49:38 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67


2025/07/13 19:49:38 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 16 / 35 - Minibatch ==



Average Metric: 26.33 / 35 (75.2%): 100%|██████████| 35/35 [00:47<00:00,  1.35s/it]

2025/07/13 19:50:26 INFO dspy.evaluate.evaluate: Average Metric: 26.333333333333332 / 35 (75.2%)
2025/07/13 19:50:26 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 75.24 on minibatch of size 35 with parameters ['Predictor 0: Instruction 1', 'Predictor 0: Few-Shot Set 1', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 6'].
2025/07/13 19:50:26 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.71, 72.38, 62.86, 67.62, 69.52, 69.52, 77.14, 75.24, 70.48, 77.14, 72.38, 79.05, 75.24]
2025/07/13 19:50:26 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67, 74.58, 72.08]
2025/07/13 19:50:26 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67


2025/07/13 19:50:26 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 17 / 35 - Minibatch ==



Average Metric: 25.00 / 35 (71.4%): 100%|██████████| 35/35 [01:18<00:00,  2.24s/it]

2025/07/13 19:51:46 INFO dspy.evaluate.evaluate: Average Metric: 25.0 / 35 (71.4%)
2025/07/13 19:51:46 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 71.43 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 6', 'Predictor 1: Instruction 3', 'Predictor 1: Few-Shot Set 10'].
2025/07/13 19:51:46 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.71, 72.38, 62.86, 67.62, 69.52, 69.52, 77.14, 75.24, 70.48, 77.14, 72.38, 79.05, 75.24, 71.43]
2025/07/13 19:51:46 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67, 74.58, 72.08]
2025/07/13 19:51:46 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67


2025/07/13 19:51:46 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 18 / 35 - Minibatch ==



Average Metric: 24.67 / 35 (70.5%): 100%|██████████| 35/35 [00:54<00:00,  1.55s/it]

2025/07/13 19:52:41 INFO dspy.evaluate.evaluate: Average Metric: 24.666666666666664 / 35 (70.5%)
2025/07/13 19:52:41 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 70.48 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 1', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 3'].
2025/07/13 19:52:41 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.71, 72.38, 62.86, 67.62, 69.52, 69.52, 77.14, 75.24, 70.48, 77.14, 72.38, 79.05, 75.24, 71.43, 70.48]
2025/07/13 19:52:41 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67, 74.58, 72.08]
2025/07/13 19:52:41 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67


2025/07/13 19:52:41 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 19 / 35 - Full Evaluation =====
2025/07/13 19:52:41 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 79.05) from minibatch trials...



Average Metric: 56.67 / 80 (70.8%): 100%|██████████| 80/80 [01:33<00:00,  1.17s/it]

2025/07/13 19:54:15 INFO dspy.evaluate.evaluate: Average Metric: 56.666666666666664 / 80 (70.8%)
2025/07/13 19:54:15 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67, 74.58, 72.08, 70.83]
2025/07/13 19:54:15 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67
2025/07/13 19:54:15 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/07/13 19:54:15 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 20 / 35 - Minibatch ==



Average Metric: 27.00 / 35 (77.1%): 100%|██████████| 35/35 [01:35<00:00,  2.74s/it]

2025/07/13 19:55:51 INFO dspy.evaluate.evaluate: Average Metric: 27.0 / 35 (77.1%)
2025/07/13 19:55:51 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 77.14 on minibatch of size 35 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 1', 'Predictor 1: Instruction 1', 'Predictor 1: Few-Shot Set 8'].
2025/07/13 19:55:51 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.71, 72.38, 62.86, 67.62, 69.52, 69.52, 77.14, 75.24, 70.48, 77.14, 72.38, 79.05, 75.24, 71.43, 70.48, 77.14]
2025/07/13 19:55:51 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67, 74.58, 72.08, 70.83]
2025/07/13 19:55:51 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67


2025/07/13 19:55:51 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 21 / 35 - Minibatch ==



Average Metric: 24.67 / 35 (70.5%): 100%|██████████| 35/35 [00:56<00:00,  1.61s/it]

2025/07/13 19:56:48 INFO dspy.evaluate.evaluate: Average Metric: 24.666666666666668 / 35 (70.5%)
2025/07/13 19:56:48 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 70.48 on minibatch of size 35 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 11', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 7'].
2025/07/13 19:56:48 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.71, 72.38, 62.86, 67.62, 69.52, 69.52, 77.14, 75.24, 70.48, 77.14, 72.38, 79.05, 75.24, 71.43, 70.48, 77.14, 70.48]
2025/07/13 19:56:48 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67, 74.58, 72.08, 70.83]
2025/07/13 19:56:48 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67


2025/07/13 19:56:48 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 22 / 35 - Minibatch ==



Average Metric: 22.00 / 35 (62.9%): 100%|██████████| 35/35 [01:35<00:00,  2.74s/it]

2025/07/13 19:58:24 INFO dspy.evaluate.evaluate: Average Metric: 22.0 / 35 (62.9%)
2025/07/13 19:58:24 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 62.86 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 4', 'Predictor 1: Instruction 3', 'Predictor 1: Few-Shot Set 6'].
2025/07/13 19:58:24 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.71, 72.38, 62.86, 67.62, 69.52, 69.52, 77.14, 75.24, 70.48, 77.14, 72.38, 79.05, 75.24, 71.43, 70.48, 77.14, 70.48, 62.86]
2025/07/13 19:58:24 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67, 74.58, 72.08, 70.83]
2025/07/13 19:58:24 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67


2025/07/13 19:58:24 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 23 / 35 - Minibatch ==



Average Metric: 24.00 / 35 (68.6%): 100%|██████████| 35/35 [01:09<00:00,  1.98s/it]

2025/07/13 19:59:34 INFO dspy.evaluate.evaluate: Average Metric: 24.0 / 35 (68.6%)
2025/07/13 19:59:34 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.57 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 9', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 6'].
2025/07/13 19:59:34 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.71, 72.38, 62.86, 67.62, 69.52, 69.52, 77.14, 75.24, 70.48, 77.14, 72.38, 79.05, 75.24, 71.43, 70.48, 77.14, 70.48, 62.86, 68.57]
2025/07/13 19:59:34 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67, 74.58, 72.08, 70.83]
2025/07/13 19:59:34 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67


2025/07/13 19:59:34 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 24 / 35 - Minibatch ==



Average Metric: 26.00 / 35 (74.3%): 100%|██████████| 35/35 [01:17<00:00,  2.22s/it]

2025/07/13 20:00:52 INFO dspy.evaluate.evaluate: Average Metric: 26.0 / 35 (74.3%)
2025/07/13 20:00:52 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 74.29 on minibatch of size 35 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 11', 'Predictor 1: Instruction 2', 'Predictor 1: Few-Shot Set 6'].
2025/07/13 20:00:52 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.71, 72.38, 62.86, 67.62, 69.52, 69.52, 77.14, 75.24, 70.48, 77.14, 72.38, 79.05, 75.24, 71.43, 70.48, 77.14, 70.48, 62.86, 68.57, 74.29]
2025/07/13 20:00:52 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67, 74.58, 72.08, 70.83]
2025/07/13 20:00:52 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67


2025/07/13 20:00:52 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 25 / 35 - Full Evaluation =====
2025/07/13 20:00:52 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 77.14) from minibatch trials...


Average Metric: 58.67 / 80 (73.3%): 100%|██████████| 80/80 [00:11<00:00,  6.78it/s]

2025/07/13 20:01:04 INFO dspy.evaluate.evaluate: Average Metric: 58.666666666666664 / 80 (73.3%)
2025/07/13 20:01:04 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67, 74.58, 72.08, 70.83, 73.33]
2025/07/13 20:01:04 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67
2025/07/13 20:01:04 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/07/13 20:01:04 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 26 / 35 - Minibatch ==



Average Metric: 25.00 / 35 (71.4%): 100%|██████████| 35/35 [00:07<00:00,  4.91it/s]

2025/07/13 20:01:12 INFO dspy.evaluate.evaluate: Average Metric: 25.0 / 35 (71.4%)





2025/07/13 20:01:12 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 71.43 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 1', 'Predictor 1: Instruction 4', 'Predictor 1: Few-Shot Set 6'].
2025/07/13 20:01:12 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.71, 72.38, 62.86, 67.62, 69.52, 69.52, 77.14, 75.24, 70.48, 77.14, 72.38, 79.05, 75.24, 71.43, 70.48, 77.14, 70.48, 62.86, 68.57, 74.29, 71.43]
2025/07/13 20:01:12 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67, 74.58, 72.08, 70.83, 73.33]
2025/07/13 20:01:12 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67


2025/07/13 20:01:12 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 27 / 35 - Minibatch ==


Average Metric: 23.67 / 35 (67.6%): 100%|██████████| 35/35 [01:20<00:00,  2.31s/it]

2025/07/13 20:02:33 INFO dspy.evaluate.evaluate: Average Metric: 23.666666666666668 / 35 (67.6%)
2025/07/13 20:02:33 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 67.62 on minibatch of size 35 with parameters ['Predictor 0: Instruction 2', 'Predictor 0: Few-Shot Set 2', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 6'].
2025/07/13 20:02:33 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.71, 72.38, 62.86, 67.62, 69.52, 69.52, 77.14, 75.24, 70.48, 77.14, 72.38, 79.05, 75.24, 71.43, 70.48, 77.14, 70.48, 62.86, 68.57, 74.29, 71.43, 67.62]
2025/07/13 20:02:33 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67, 74.58, 72.08, 70.83, 73.33]
2025/07/13 20:02:33 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67


2025/07/13 20:02:33 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 28 / 35 - Minibatch ==



Average Metric: 26.00 / 35 (74.3%): 100%|██████████| 35/35 [01:09<00:00,  1.99s/it]

2025/07/13 20:03:43 INFO dspy.evaluate.evaluate: Average Metric: 26.0 / 35 (74.3%)
2025/07/13 20:03:43 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 74.29 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 8', 'Predictor 1: Instruction 1', 'Predictor 1: Few-Shot Set 5'].
2025/07/13 20:03:43 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.71, 72.38, 62.86, 67.62, 69.52, 69.52, 77.14, 75.24, 70.48, 77.14, 72.38, 79.05, 75.24, 71.43, 70.48, 77.14, 70.48, 62.86, 68.57, 74.29, 71.43, 67.62, 74.29]
2025/07/13 20:03:43 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67, 74.58, 72.08, 70.83, 73.33]
2025/07/13 20:03:43 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67


2025/07/13 20:03:43 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 29 / 35 - Minibatch ==



Average Metric: 24.00 / 35 (68.6%): 100%|██████████| 35/35 [01:06<00:00,  1.91s/it]

2025/07/13 20:04:50 INFO dspy.evaluate.evaluate: Average Metric: 24.0 / 35 (68.6%)
2025/07/13 20:04:50 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 68.57 on minibatch of size 35 with parameters ['Predictor 0: Instruction 0', 'Predictor 0: Few-Shot Set 7', 'Predictor 1: Instruction 1', 'Predictor 1: Few-Shot Set 6'].
2025/07/13 20:04:50 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.71, 72.38, 62.86, 67.62, 69.52, 69.52, 77.14, 75.24, 70.48, 77.14, 72.38, 79.05, 75.24, 71.43, 70.48, 77.14, 70.48, 62.86, 68.57, 74.29, 71.43, 67.62, 74.29, 68.57]
2025/07/13 20:04:50 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67, 74.58, 72.08, 70.83, 73.33]
2025/07/13 20:04:50 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67


2025/07/13 20:04:50 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 30 / 35 - Minibatch ==



Average Metric: 24.67 / 35 (70.5%): 100%|██████████| 35/35 [01:23<00:00,  2.39s/it]

2025/07/13 20:06:14 INFO dspy.evaluate.evaluate: Average Metric: 24.666666666666664 / 35 (70.5%)





2025/07/13 20:06:15 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 70.48 on minibatch of size 35 with parameters ['Predictor 0: Instruction 3', 'Predictor 0: Few-Shot Set 0', 'Predictor 1: Instruction 0', 'Predictor 1: Few-Shot Set 6'].
2025/07/13 20:06:15 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.71, 72.38, 62.86, 67.62, 69.52, 69.52, 77.14, 75.24, 70.48, 77.14, 72.38, 79.05, 75.24, 71.43, 70.48, 77.14, 70.48, 62.86, 68.57, 74.29, 71.43, 67.62, 74.29, 68.57, 70.48]
2025/07/13 20:06:15 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67, 74.58, 72.08, 70.83, 73.33]
2025/07/13 20:06:15 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67


2025/07/13 20:06:15 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 31 / 35 - Full Evaluation =====
2025/07/13 20:06:15 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 77.14) from minibatch trials...


Average Metric: 58.00 / 80 (72.5%): 100%|██████████| 80/80 [01:35<00:00,  1.19s/it]

2025/07/13 20:07:50 INFO dspy.evaluate.evaluate: Average Metric: 58.0 / 80 (72.5%)
2025/07/13 20:07:50 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67, 74.58, 72.08, 70.83, 73.33, 72.5]
2025/07/13 20:07:50 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67
2025/07/13 20:07:50 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/07/13 20:07:50 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 32 / 35 - Minibatch ==



Average Metric: 22.33 / 35 (63.8%): 100%|██████████| 35/35 [01:12<00:00,  2.06s/it]

2025/07/13 20:09:02 INFO dspy.evaluate.evaluate: Average Metric: 22.333333333333332 / 35 (63.8%)
2025/07/13 20:09:03 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 63.81 on minibatch of size 35 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 11', 'Predictor 1: Instruction 1', 'Predictor 1: Few-Shot Set 8'].
2025/07/13 20:09:03 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.71, 72.38, 62.86, 67.62, 69.52, 69.52, 77.14, 75.24, 70.48, 77.14, 72.38, 79.05, 75.24, 71.43, 70.48, 77.14, 70.48, 62.86, 68.57, 74.29, 71.43, 67.62, 74.29, 68.57, 70.48, 63.81]
2025/07/13 20:09:03 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67, 74.58, 72.08, 70.83, 73.33, 72.5]
2025/07/13 20:09:03 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67


2025/07/13 20:09:03 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 33 / 35 - Minibatch ==



Average Metric: 24.67 / 35 (70.5%): 100%|██████████| 35/35 [00:00<00:00, 48.11it/s]

2025/07/13 20:09:04 INFO dspy.evaluate.evaluate: Average Metric: 24.666666666666664 / 35 (70.5%)





2025/07/13 20:09:04 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 70.48 on minibatch of size 35 with parameters ['Predictor 0: Instruction 4', 'Predictor 0: Few-Shot Set 1', 'Predictor 1: Instruction 1', 'Predictor 1: Few-Shot Set 8'].
2025/07/13 20:09:04 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.71, 72.38, 62.86, 67.62, 69.52, 69.52, 77.14, 75.24, 70.48, 77.14, 72.38, 79.05, 75.24, 71.43, 70.48, 77.14, 70.48, 62.86, 68.57, 74.29, 71.43, 67.62, 74.29, 68.57, 70.48, 63.81, 70.48]
2025/07/13 20:09:04 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67, 74.58, 72.08, 70.83, 73.33, 72.5]
2025/07/13 20:09:04 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67


2025/07/13 20:09:04 INFO dspy.teleprompt.mipro_optimizer_v2: == Trial 34 / 35 - Minibatch ==


Average Metric: 26.00 / 35 (74.3%): 100%|██████████| 35/35 [01:19<00:00,  2.28s/it]

2025/07/13 20:10:24 INFO dspy.evaluate.evaluate: Average Metric: 26.0 / 35 (74.3%)





2025/07/13 20:10:27 INFO dspy.teleprompt.mipro_optimizer_v2: Score: 74.29 on minibatch of size 35 with parameters ['Predictor 0: Instruction 5', 'Predictor 0: Few-Shot Set 6', 'Predictor 1: Instruction 5', 'Predictor 1: Few-Shot Set 8'].
2025/07/13 20:10:27 INFO dspy.teleprompt.mipro_optimizer_v2: Minibatch scores so far: [65.71, 72.38, 62.86, 67.62, 69.52, 69.52, 77.14, 75.24, 70.48, 77.14, 72.38, 79.05, 75.24, 71.43, 70.48, 77.14, 70.48, 62.86, 68.57, 74.29, 71.43, 67.62, 74.29, 68.57, 70.48, 63.81, 70.48, 74.29]
2025/07/13 20:10:27 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67, 74.58, 72.08, 70.83, 73.33, 72.5]
2025/07/13 20:10:27 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67


2025/07/13 20:10:27 INFO dspy.teleprompt.mipro_optimizer_v2: ===== Trial 35 / 35 - Full Evaluation =====
2025/07/13 20:10:27 INFO dspy.teleprompt.mipro_optimizer_v2: Doing full eval on next top averaging program (Avg Score: 75.24) from minibatch trials...


Average Metric: 56.67 / 80 (70.8%): 100%|██████████| 80/80 [02:15<00:00,  1.69s/it]

2025/07/13 20:12:42 INFO dspy.evaluate.evaluate: Average Metric: 56.666666666666664 / 80 (70.8%)
2025/07/13 20:12:42 INFO dspy.teleprompt.mipro_optimizer_v2: Full eval scores so far: [76.67, 74.58, 72.08, 70.83, 73.33, 72.5, 70.83]
2025/07/13 20:12:42 INFO dspy.teleprompt.mipro_optimizer_v2: Best full score so far: 76.67
2025/07/13 20:12:42 INFO dspy.teleprompt.mipro_optimizer_v2: 

2025/07/13 20:12:42 INFO dspy.teleprompt.mipro_optimizer_v2: Returning best identified program with score 76.67!





In [None]:
# Spot-check the optimized program on one multi-hop claim; the cell's value is
# the `titles` field of the returned prediction.
spot_check_claim = (
    "The author of the 1960s unproduced script written for The Beatles, "
    "Up Against It, and Bernard-Marie Koltès are both playwrights."
)
optimized_react(claim=spot_check_claim).titles

['Joe Orton', 'Bernard-Marie Koltès', 'Up Against It']

In [40]:
# Round trip: write the optimized program's state to disk, restore it into a
# freshly constructed ReAct module, then re-run the same claim on the copy.
saved_state_path = "optimized_react.json"
optimized_react.save(saved_state_path)

# The fresh module is built with the signature, tools and iteration cap given
# here before the saved state is loaded into it.
loaded_react = dspy.ReAct(
    "claim -> titles: list[str]",
    tools=[search_wikipedia, lookup_wikipedia],
    max_iters=20,
)
loaded_react.load(saved_state_path)

reloaded_prediction = loaded_react(
    claim="The author of the 1960s unproduced script written for The Beatles, Up Against It, and Bernard-Marie Koltès are both playwrights."
)
reloaded_prediction.titles

['Joe Orton', 'Bernard-Marie Koltès', 'Up Against It']