# DSPy Agent Example with Azure OpenAI and Azure Evaluation SDK

In [None]:
import os
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
import dspy
import ujson
import random
import dspy.evaluate
import dspy.retrievers
import dspy.datasets
from datasets import load_dataset
from azure.ai.evaluation import AzureOpenAIModelConfiguration

In [None]:
# Load environment variables from a .env file. Later cells read
# AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_API_VERSION from os.environ.
load_dotenv()

## Set up tracing with MLflow

In [None]:
import mlflow

# Point MLflow at the tracking server address used by this notebook
# (localhost:5000) and select the experiment to record under.
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("DSPy-Azure-Eval-Example")

# Enable MLflow's DSPy autologging so DSPy calls are traced.
mlflow.dspy.autolog()

## Set up the language models

In [None]:
# Bearer-token provider for the Cognitive Services scope, built from
# DefaultAzureCredential and shared by both LM clients below.
token_provider = get_bearer_token_provider(
    DefaultAzureCredential(),
    "https://cognitiveservices.azure.com/.default",
)

# Client for the "gpt-4.1-2" Azure deployment; later passed to the optimizer
# as its teacher / prompt model.
gpt41 = dspy.LM(
    model="azure/gpt-4.1-2",
    base_url=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    azure_ad_token_provider=token_provider,
)

# Client for the "gpt-4o-mini" Azure deployment.
gpt4omini = dspy.LM(
    model="azure/gpt-4o-mini",
    base_url=os.environ["AZURE_OPENAI_ENDPOINT"],
    api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    azure_ad_token_provider=token_provider,
)

# Make gpt-4o-mini the LM that DSPy uses by default.
dspy.configure(lm=gpt4omini)

In [None]:
# Quick sanity check: send one prompt directly to the gpt41 LM client.
gpt41("Say hello to the world!")

In [None]:
# Quick sanity check: send one prompt directly to the gpt4omini LM client.
gpt4omini("Say hello to the world!")

## Load the HoVer multi-hop claim-verification dataset

In [None]:
# Load HoVer dataset using parquet files (avoiding deprecated script)
from datasets import load_dataset

try:
    # Try loading using parquet files which should be available
    dataset = load_dataset(
        "parquet",
        data_files={
            "train": "hf://datasets/hover-nlp/hover/train-00000-of-00001.parquet"
        },
    )
except Exception as err:
    # Catch Exception instead of a bare `except:` so KeyboardInterrupt and
    # SystemExit still stop the cell, and report why the fallback is used.
    print(f"Direct parquet load failed ({err!r}); falling back to the parquet revision.")
    # Fallback: Load using the dataset name with revision
    dataset = load_dataset("hover-nlp/hover", revision="refs/convert/parquet")

# Both loading paths expose the training split under the "train" key.
hover_train = dataset["train"]

# Convert to DSPy examples
hover = []
hpqa_ids = set()

for x in hover_train:
    # Keep only 3-hop claims, and at most one claim per hpqa_id.
    if x["num_hops"] != 3 or x["hpqa_id"] in hpqa_ids:
        continue
    hpqa_ids.add(x["hpqa_id"])
    # De-duplicated titles of the supporting facts.
    titles = list({y["key"] for y in x["supporting_facts"]})
    # `claim` is the only input field; `titles` stays as the label.
    hover.append(
        dspy.Example(claim=x["claim"], titles=titles).with_inputs("claim")
    )

# Fixed seed so the shuffle, and therefore the splits, are reproducible.
random.Random(0).shuffle(hover)
# NOTE: examples at indices 200-649 are not assigned to any split here.
trainset, devset, testset = hover[:100], hover[100:200], hover[650:]

In [None]:
# Show the sizes of the three splits.
len(trainset), len(devset), len(testset)

In [None]:
# Display the first training example.
trainset[0]

## Tools for the agent

In [None]:
# Cache of page text keyed by page title, filled as a side effect of search().
DOCS: dict[str, str] = {}

def search(query: str, k: int) -> list[str]:
    """Query the ColBERTv2 wiki17_abstracts endpoint and cache what comes back.

    Args:
        query: Search query passed to the retriever.
        k: Number of passages to request.

    Returns:
        The ``text`` field of every retrieved passage, in retrieval order.
        Passages are expected to look like ``"title | text"``.
    """
    results = dspy.ColBERTv2(url='http://20.102.90.50:2017/wiki17_abstracts')(query, k=k)
    results = [x['text'] for x in results]

    for result in results:
        # partition() never raises: a passage without the " | " separator is
        # skipped for caching instead of aborting the whole search with a
        # ValueError from tuple unpacking.
        title, separator, text = result.partition(" | ")
        if separator:
            DOCS[title] = text

    return results

In [None]:
def search_wikipedia(query: str) -> list[str]:
    """Returns top-5 results and then the titles of the top-5 to top-30 results."""
    # NOTE(review): this function is handed to dspy.ReAct as a tool, so the
    # docstring above is presumably shown to the LM as the tool description —
    # keep its wording stable.

    hits = search(query, 30)

    # The first five passages are returned in full.
    leading = hits[:5]

    # The remaining passages are reduced to their back-quoted titles.
    other_titles = ", ".join(f"`{hit.split(' | ')[0]}`" for hit in hits[5:30])

    return leading + [f"Other retrieved pages have titles: {other_titles}."]

def lookup_wikipedia(title: str) -> str:
    """Returns the text of the Wikipedia page, if it exists."""
    # NOTE(review): this function is handed to dspy.ReAct as a tool, so the
    # docstring above is presumably shown to the LM as the tool description —
    # keep its wording stable.

    # Cache hit: DOCS stores the page text without the "title | " prefix.
    if title in DOCS:
        return DOCS[title]

    # Cache miss: search for the title and accept only an exact-title passage.
    prefix = title + " | "
    for passage in search(title, 10):
        if passage.startswith(prefix):
            # Strip the prefix so the result has the same shape as a cache
            # hit; previously this path returned "title | text" instead.
            return passage[len(prefix):]

    return f"No Wikipedia page found for title: {title}"

In [None]:
# Try the search tool on a sample query.
search_wikipedia("Albert Einstein")

In [None]:
# Try the lookup tool on a sample title.
lookup_wikipedia("Albert Einstein")

## Define the DSPy agent using `dspy.ReAct`

In [None]:
# Task instructions attached to the signature.
instructions = "Find all Wikipedia titles relevant to verifying (or refuting) the claim."
# Signature with input field `claim` and output field `titles` (list[str]).
signature = dspy.Signature("claim -> titles: list[str]", instructions)
# ReAct agent given the two Wikipedia tools, with max_iters set to 20.
react = dspy.ReAct(signature, tools=[search_wikipedia, lookup_wikipedia], max_iters=20)

In [None]:
# Run the agent on one claim and show at most the first three predicted titles.
react(claim="David Gregory was born in 1625.").titles[:3]

## Evaluate the agent using metrics from Azure Evaluation SDK

In [None]:
from azure.ai.evaluation import RetrievalEvaluator
from azure.ai.evaluation import AzureOpenAIModelConfiguration

class Relevance(dspy.Module):
    """Metric module that scores predicted titles with ``RetrievalEvaluator``.

    The score is reference-free: it is computed from the claim and the text
    looked up for the predicted titles. The gold titles on the example are
    not consulted.
    """

    def __init__(self, model_config, threshold=3):
        """Create the underlying evaluator.

        Args:
            model_config: Model configuration forwarded to ``RetrievalEvaluator``.
            threshold: Threshold forwarded to ``RetrievalEvaluator``.
        """
        # Initialise dspy.Module's own state before attaching attributes.
        super().__init__()
        self.relevance = RetrievalEvaluator(model_config=model_config, threshold=threshold)

    def forward(self, example, pred, trace=None):
        """Score one prediction.

        Args:
            example: Object exposing the ``claim`` being verified.
            pred: Object exposing ``titles``, the predicted page titles.
            trace: Unused; accepted so the module can be called as a DSPy metric.

        Returns:
            The value stored under the ``"retrieval"`` key of the evaluator output.
        """
        # Only the first five predicted titles are looked up and judged.
        titles = pred.titles[:5]
        context = "\n\n".join(lookup_wikipedia(title) for title in titles)
        scores = self.relevance(query=example.claim, context=context)
        return scores["retrieval"]
    
# Metric instance: the evaluator is configured against the "gpt-4.1-mini"
# Azure deployment, with endpoint and API version read from the environment.
relevance = Relevance(
    model_config=AzureOpenAIModelConfiguration(
        azure_deployment="gpt-4.1-mini",
        azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
        api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    ),
    threshold=3
)

# Evaluator over the dev set using the metric above, run with 16 threads.
evaluate = dspy.Evaluate(devset=devset, metric=relevance, num_threads=16, display_progress=True, display_table=5)

In [None]:
# Run the agent on the inputs of the first dev example.
pred = react(**devset[0].inputs())

# Score that single prediction with the relevance metric.
score = relevance(devset[0], pred)

print(f"Relevance score for the first example: {score}")

In [None]:
# Wrapper to bypass exceptions during evaluation.
def safe_react(claim: str):
    """Run ``react`` on ``claim``, falling back to an empty prediction on error.

    Args:
        claim: The claim passed to the agent.

    Returns:
        The agent's prediction, or ``dspy.Prediction(titles=[])`` if the
        agent raised.
    """
    # Local import keeps this cell self-contained.
    import logging

    try:
        return react(claim=claim)
    except Exception as err:
        # Deliberately best-effort: one failing example must not abort the
        # evaluation run, but the failure is logged rather than hidden.
        logging.getLogger(__name__).warning(
            "react failed for claim %r: %r; returning empty titles", claim, err
        )
        return dspy.Prediction(titles=[])

# Evaluate the exception-tolerant wrapper on the dev set.
evaluate(safe_react)

## Optimize the agent

In [None]:
# Optimizer settings: gpt41 acts as both the teacher LM and the prompt model,
# and max_errors is set to 999.
kwargs = {
    "teacher_settings": {"lm": gpt41},
    "prompt_model": gpt41,
    "max_errors": 999,
}

# MIPROv2 optimizer driven by the relevance metric.
tp = dspy.MIPROv2(metric=relevance, auto="medium", num_threads=16, **kwargs)

# Compile the agent on the training set with up to 3 bootstrapped demos and
# no labeled demos.
optimized_react = tp.compile(
    react,
    trainset=trainset,
    max_bootstrapped_demos=3,
    max_labeled_demos=0,
)

In [None]:
# Run the optimized agent on a sample claim and show the predicted titles.
optimized_react(claim="The author of the 1960s unproduced script written for The Beatles, Up Against It, and Bernard-Marie Koltès are both playwrights.").titles

In [None]:
# Save the optimized agent to a JSON file.
optimized_react.save("optimized_react.json")

# Build a fresh ReAct agent with the same signature string, tools and
# max_iters, then load the saved file into it.
# NOTE(review): unlike `react`, this agent is constructed without the
# `instructions` text — presumably load() restores it; confirm.
loaded_react = dspy.ReAct("claim -> titles: list[str]", tools=[search_wikipedia, lookup_wikipedia], max_iters=20)
loaded_react.load("optimized_react.json")

# Run the reloaded agent on the same sample claim.
loaded_react(claim="The author of the 1960s unproduced script written for The Beatles, Up Against It, and Bernard-Marie Koltès are both playwrights.").titles