# DSPy Agent Example with Azure OpenAI and Azure Evaluation SDK

In [1]:
import os
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
import dspy
import ujson
import random
import dspy.evaluate
import dspy.retrievers
import dspy.datasets
from datasets import load_dataset
from azure.ai.evaluation import AzureOpenAIModelConfiguration

In [2]:
# Load environment variables from .env file.
# Later cells read AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_API_VERSION from os.environ.
load_dotenv()

True

## Set up tracing with MLflow

In [3]:
import mlflow

# Point MLflow at the tracking server. The URI can be overridden through the
# MLFLOW_TRACKING_URI environment variable (for example from the .env file);
# when it is unset, the original local default on port 5000 is used.
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI", "http://localhost:5000"))
mlflow.set_experiment("DSPy-Azure-Eval-Example")

# Enable tracing.
mlflow.dspy.autolog()

## Set up the language models

In [4]:
# Bearer-token provider built from DefaultAzureCredential; it is handed to
# every LM below instead of an API key.
token_provider = get_bearer_token_provider(DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default")


def _azure_lm(deployment: str) -> dspy.LM:
    """Build a ``dspy.LM`` for one Azure OpenAI deployment.

    Args:
        deployment: Azure deployment name; it is prefixed with ``azure/`` to
            form the model string.

    Returns:
        A ``dspy.LM`` configured with the endpoint and API version taken from
        ``AZURE_OPENAI_ENDPOINT`` / ``AZURE_OPENAI_API_VERSION`` and the shared
        ``token_provider``.

    Raises:
        KeyError: if either environment variable is missing.
    """
    return dspy.LM(
        model=f"azure/{deployment}",
        base_url=os.environ["AZURE_OPENAI_ENDPOINT"],
        api_version=os.environ["AZURE_OPENAI_API_VERSION"],
        azure_ad_token_provider=token_provider,
    )


gpt41 = _azure_lm("gpt-4.1-2")
gpt4omini = _azure_lm("gpt-4o-mini")

# gpt4omini is the default LM for DSPy modules; gpt41 is passed explicitly where needed.
dspy.configure(lm=gpt4omini)

In [5]:
# Smoke test: send a single prompt straight to the gpt41 LM.
gpt41("Say hello to the world!")

['Hello, world! 🌍']

In [6]:
# Smoke test: send the same prompt straight to the gpt4omini LM.
gpt4omini("Say hello to the world!")

['Hello, world! 🌍✨ How can I assist you today?']

## Load the HoVer multi-hop claim-verification dataset

In [7]:
# Load HoVer dataset using parquet files (avoiding deprecated script)
from datasets import load_dataset

try:
    # Try loading using parquet files which should be available
    dataset = load_dataset("parquet", data_files={
        "train": "hf://datasets/hover-nlp/hover/train-00000-of-00001.parquet"
    })
except Exception as load_error:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit still stop the cell.
    print(f"Direct parquet load failed ({load_error!r}); falling back to the refs/convert/parquet revision.")
    # Fallback: Load using the dataset name with revision
    dataset = load_dataset("hover-nlp/hover", revision="refs/convert/parquet")

hover_train = dataset["train"]

# Convert to DSPy examples: keep only rows with num_hops == 3 and at most one
# example per hpqa_id.
hover = []
hpqa_ids = set()

for x in hover_train:
    if x["num_hops"] == 3 and x["hpqa_id"] not in hpqa_ids:
        hpqa_ids.add(x["hpqa_id"])
        # De-duplicate titles in first-seen order. list(set(...)) would yield an
        # order that can differ between kernel restarts (string hash randomization).
        titles = list(dict.fromkeys(y["key"] for y in x["supporting_facts"]))
        hover.append(
            dspy.Example(claim=x["claim"], titles=titles).with_inputs("claim")
        )

# Fixed seed so the shuffle, and therefore the splits, are reproducible.
random.Random(0).shuffle(hover)
# NOTE(review): examples 200-649 are not assigned to any split here; confirm this gap is intentional.
trainset, devset, testset = hover[:100], hover[100:200], hover[650:]

In [8]:
# Sanity check: sizes of the train / dev / test splits.
len(trainset), len(devset), len(testset)

(100, 100, 1216)

In [9]:
# Inspect one training example (input key: claim; label: titles).
trainset[0]

Example({'claim': 'This director is known for his work on Miss Potter. The Academy of Motion Picture Arts and Sciences presents the award in which he was nominated for his work in "Babe".', 'titles': ['Miss Potter', 'Academy Award for Best Director', 'Chris Noonan']}) (input_keys={'claim'})

## Tools for agent

In [10]:
# Cache of passage text keyed by page title; filled as a side effect of search().
DOCS = {}

# Retrieval endpoint queried by search().
# NOTE(review): plain-HTTP URL addressed by raw IP; confirm it is still the intended server.
COLBERT_URL = 'http://20.102.90.50:2017/wiki17_abstracts'


def search(query: str, k: int) -> list[str]:
    """Retrieve passages for ``query`` and remember them in ``DOCS``.

    Args:
        query: Free-text search query.
        k: Number of passages requested from the retriever.

    Returns:
        The ``text`` field of each retrieved result, in retriever order.
        Results are expected to look like ``"<title> | <body>"``.

    Side effects:
        For every result containing the ``" | "`` separator, stores
        ``DOCS[title] = body``.
    """
    results = dspy.ColBERTv2(url=COLBERT_URL)(query, k=k)
    results = [x['text'] for x in results]

    for result in results:
        # partition() never raises; a passage without the separator is simply
        # not cached (split-and-unpack would raise ValueError on it).
        title, separator, text = result.partition(" | ")
        if separator:
            DOCS[title] = text

    return results

In [11]:
def search_wikipedia(query: str) -> list[str]:
    """Returns top-5 results and then the titles of the top-5 to top-30 results."""
    # The docstring above is kept verbatim: this function is registered as an
    # agent tool, and the docstring is presumably what the LM sees as its description.

    passages = search(query, 30)
    # Full text for the first five hits; only back-quoted titles for hits 6-30.
    leading = passages[:5]
    remaining_titles = ", ".join(f"`{passage.split(' | ')[0]}`" for passage in passages[5:30])
    return leading + [f"Other retrieved pages have titles: {remaining_titles}."]

def lookup_wikipedia(title: str) -> str:
    """Returns the text of the Wikipedia page, if it exists."""
    # The docstring above is kept verbatim: this function is registered as an
    # agent tool, and the docstring is presumably what the LM sees as its description.

    # Fast path: page already cached by an earlier search() call.
    if title in DOCS:
        return DOCS[title]

    prefix = title + " | "
    for passage in search(title, 10):
        if passage.startswith(prefix):
            # Strip the "<title> | " prefix so the uncached path returns the same
            # bare page text as the cached path (previously it returned the
            # title-prefixed passage).
            return passage[len(prefix):]
    return f"No Wikipedia page found for title: {title}"

In [12]:
# Try the search tool; this call also fills the DOCS cache via search().
search_wikipedia("Albert Einstein")

['Albert Einstein | Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist. Einstein developed the theory of relativity, one of the two pillars of modern physics (alongside quantum mechanics). Einstein\'s work is also known for its influence on the philosophy of science. Einstein is best known by the general public for his mass–energy equivalence formula "E" = "mc" (which has been dubbed "the world\'s most famous equation"). He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect", a pivotal step in the evolution of quantum theory.',
 'Einstein (disambiguation) | Albert Einstein (1879–1955) was a German-born theoretical physicist.',
 'Hans Albert Einstein | Hans Albert Einstein ( ; May 14, 1904 – July 26, 1973) was a Swiss-American engineer and educator, the second child and first son of Albert Einstein and Mileva Marić. Hans A. Einstein was a professor

In [13]:
# Try the lookup tool on a single title.
lookup_wikipedia("Albert Einstein")

'Albert Einstein (14 March 1879 – 18 April 1955) was a German-born theoretical physicist. Einstein developed the theory of relativity, one of the two pillars of modern physics (alongside quantum mechanics). Einstein\'s work is also known for its influence on the philosophy of science. Einstein is best known by the general public for his mass–energy equivalence formula "E" = "mc" (which has been dubbed "the world\'s most famous equation"). He received the 1921 Nobel Prize in Physics "for his services to theoretical physics, and especially for his discovery of the law of the photoelectric effect", a pivotal step in the evolution of quantum theory.'

## Define DSPy agent using `dspy.ReAct`

In [14]:
# Task description passed to the signature as its instructions.
instructions = "Find all Wikipedia titles relevant to verifying (or refuting) the claim."
# One input field (`claim`) and one output field (`titles`, typed list[str]).
signature = dspy.Signature("claim -> titles: list[str]", instructions)
# ReAct agent with the two Wikipedia tools; max_iters=20 caps the number of agent iterations.
react = dspy.ReAct(signature, tools=[search_wikipedia, lookup_wikipedia], max_iters=20)

In [15]:
# Run the un-optimized agent on one claim and show at most its first three titles.
react(claim="David Gregory was born in 1625.").titles[:3]

['David Gregory (physician)',
 'David Gregory (historian)',
 'David Gregory (mathematician)']

## Evaluate the agent using metrics from the Azure AI Evaluation SDK

In [18]:
from azure.ai.evaluation import RetrievalEvaluator
from azure.ai.evaluation import AzureOpenAIModelConfiguration

class Relevance(dspy.Module):
    """Metric that scores an agent prediction with Azure's ``RetrievalEvaluator``.

    The pages named by the first five predicted titles are fetched with
    ``lookup_wikipedia`` and joined into one context string. The evaluator then
    judges that context against the claim. The gold titles on the example are
    not used.

    Args:
        model_config: Model configuration forwarded to ``RetrievalEvaluator``.
        threshold: Minimum raw evaluator score counted as a pass. It is
            forwarded to the evaluator and also used for the pass/fail result
            returned during bootstrapping.
    """

    def __init__(self, model_config, threshold=3):
        # Initialize dspy.Module state explicitly rather than relying on the framework.
        super().__init__()
        self.threshold = threshold
        self.relevance = RetrievalEvaluator(model_config=model_config, threshold=threshold)

    def forward(self, example, pred, trace=None):
        """Score ``pred.titles`` for ``example.claim``.

        Returns:
            ``0.0`` when the prediction has no titles. Otherwise the raw
            ``"retrieval"`` score divided by 5.0 (assumes the evaluator's
            documented 1-5 scale). When ``trace`` is not ``None`` (the
            optimizer is bootstrapping demos), returns a bool instead:
            raw score >= ``threshold``.
        """
        titles = (pred.titles or [])[:5]
        if not titles:
            # Nothing was retrieved (e.g. the fallback prediction of a failed
            # agent run); skip the evaluator call instead of sending it an
            # empty context.
            return 0.0

        context = "\n\n".join(lookup_wikipedia(title) for title in titles)
        scores = self.relevance(
            query=example.claim,
            context=context,
        )
        raw_score = scores["retrieval"]

        if trace is not None:
            # A float in (0, 1] is always truthy, so used as a bootstrapping
            # filter it would accept every demo; return an explicit pass/fail.
            return raw_score >= self.threshold
        return raw_score / 5.0
    
# Judge model for the metric: the gpt-4.1-2 deployment, with endpoint and API
# version read from the AZURE_OPENAI_* environment variables.
judge_model_config = AzureOpenAIModelConfiguration(
    azure_deployment="gpt-4.1-2",
    azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_version=os.environ["AZURE_OPENAI_API_VERSION"],
)
relevance = Relevance(model_config=judge_model_config, threshold=3)

# Evaluator over the dev set: 16 worker threads, progress bar, first 5 rows shown as a table.
evaluate = dspy.Evaluate(
    devset=devset,
    metric=relevance,
    num_threads=16,
    display_progress=True,
    display_table=5,
)

In [19]:
# Run the agent on the first dev example, passing only its input fields.
pred = react(**devset[0].inputs())

# Score that single prediction with the Relevance metric (called as metric(example, pred)).
score = relevance(devset[0], pred)

print(f"Relevance score for the first example: {score}")

Relevance score for the first example: 1.0


In [20]:
# Wrapper to bypass exceptions during evaluation.
def safe_react(claim: str):
    """Run ``react`` on ``claim``, falling back to an empty prediction on failure.

    Args:
        claim: The claim to find supporting Wikipedia titles for.

    Returns:
        The agent's prediction, or ``dspy.Prediction(titles=[])`` if the agent
        raised. The failure is logged as a warning rather than silently
        discarded, so it stays visible in the evaluation output.
    """
    try:
        return react(claim=claim)
    except Exception as exc:
        # Local import keeps this cell self-contained.
        import logging

        logging.getLogger(__name__).warning("react failed for claim %r: %r", claim, exc)
        return dspy.Prediction(titles=[])

# Evaluate the wrapped agent on the dev set with the Relevance metric.
evaluate(safe_react)

Average Metric: 83.60 / 100 (83.6%): 100%|██████████| 100/100 [00:44<00:00,  2.25it/s]

2025/07/13 21:29:22 INFO dspy.evaluate.evaluate: Average Metric: 83.6 / 100 (83.6%)





Unnamed: 0,claim,example_titles,trajectory,reasoning,pred_titles,Relevance
0,The Church of England's movement that inspired the Trinity Episcop...,"[Samuel Rickards, Oxford Movement, Trinity Episcopal Church (Hough...","{'thought_0': ""I need to gather information about the Church of En...",The claim states that the Church of England's movement that inspir...,"[Trinity Episcopal Church (Houghton, Michigan), Oxford Movement, S...",✔️ [1.000]
1,"Red, White & Crüe and this athlete both fight. The french fighter ...","[Red, White &amp; Crüe, Mike Tyson, Bobby Stewart]",{'thought_0': 'I need to find information about the athlete mentio...,"The claim connects the album ""Red, White & Crüe"" by Mötley Crüe to...","[Bobby Stewart, Mike Tyson, Mötley Crüe, Red, White & Crüe]",✔️ [0.400]
2,The writer/director/actor from Glen or Glenda and Fernand Rivers s...,"[Ed Wood, Fernand Rivers, Glen or Glenda]",{'thought_0': 'I need to verify the claim about the writer/directo...,"The claim states that the writer/director/actor from ""Glen or Glen...","[Glen or Glenda, Ed Wood, Fernand Rivers]",✔️ [1.000]
3,The film by Sandi Sissel was released before The End of Suburbia.,"[Chicken Ranch (film), Sandi Sissel, The End of Suburbia]",{'thought_0': 'I need to verify the release date of the film by Sa...,"The claim states that a film by Sandi Sissel was released before ""...","[The End of Suburbia, Sandi Sissel]",✔️ [0.400]
4,The actor who played captain hook in the live production with Tayl...,"[Peter Pan Live!, Christopher Walken, Taylor Louderman]",{'thought_0': 'I need to verify the claim about the actor who play...,The claim states that the actor who played Captain Hook in the liv...,"[Peter Pan Live!, Christopher Walken, The Deer Hunter]",✔️ [1.000]


83.6

## Optimize the agent

In [None]:
# Optimizer settings: gpt41 serves as both the teacher LM and the prompt model.
# NOTE(review): max_errors=999 presumably keeps a few failing examples from aborting the run; confirm.
kwargs = dict(teacher_settings=dict(lm=gpt41), prompt_model=gpt41, max_errors=999)

# MIPROv2 optimizer driven by the Relevance metric, with auto="medium" and 16 threads.
tp = dspy.MIPROv2(metric=relevance, auto="medium", num_threads=16, **kwargs)
# Compile the agent against trainset with up to 3 bootstrapped demos and no labeled demos.
optimized_react = tp.compile(react, trainset=trainset, max_bootstrapped_demos=3, max_labeled_demos=0)

In [None]:
# Try the optimized agent on a sample claim and show the titles it returns.
optimized_react(claim="The author of the 1960s unproduced script written for The Beatles, Up Against It, and Bernard-Marie Koltès are both playwrights.").titles

In [None]:
# Persist the optimized agent to a JSON file in the working directory.
optimized_react.save("optimized_react.json")

# Rebuild an agent with the same tools and iteration cap, then load the saved state into it.
# NOTE(review): this instance is created from the bare signature string, without the
# `instructions` used for `react`; presumably load() restores the saved instructions. Confirm.
loaded_react = dspy.ReAct("claim -> titles: list[str]", tools=[search_wikipedia, lookup_wikipedia], max_iters=20)
loaded_react.load("optimized_react.json")

# Check the reloaded agent on the same sample claim.
loaded_react(claim="The author of the 1960s unproduced script written for The Beatles, Up Against It, and Bernard-Marie Koltès are both playwrights.").titles