In [1]:
import mlflow
import dspy

from dotenv import load_dotenv

# Load environment variables from a file one directory above the notebook's
# working directory. Later cells read TAVILY_API_KEY via os.getenv.
load_dotenv(dotenv_path="../.env.local")

# Enable MLflow autologging for DSPy compiles, evals, and compile-time traces.
# (Eval-time traces are switched on separately inside the eval runners.)
mlflow.dspy.autolog(
    log_compiles=True,  # Track optimization process
    log_evals=True,  # Track evaluation results
    log_traces_from_compile=True,  # Track program traces during optimization
)

# Configure MLflow tracking: every run in this notebook is recorded on the
# local server under a single experiment.
mlflow.set_tracking_uri("http://127.0.0.1:5000/")  # Use local MLflow server
mlflow.set_experiment("deep_leads_dspy_test")

<Experiment: artifact_location='mlflow-artifacts:/2', creation_time=1751239700003, experiment_id='2', last_update_time=1751239700003, lifecycle_stage='active', name='deep_leads_dspy_test', tags={}>

## Program definition


### Tools


In [2]:
import os

from tavily import TavilyClient


# Shared Tavily client used by every tool function below.
# os.getenv returns None when TAVILY_API_KEY is unset; how TavilyClient reacts
# to a None key is not visible here -- TODO confirm it fails loudly.
tavily_client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))


# Agent tool: Tavily web search limited to 5 results.
# The docstring is presumably surfaced to the agent as the tool description,
# so it is kept verbatim. On success the raw Tavily response is returned
# (its concrete type is not visible here, despite the `str` annotation).
def browse_web(query: str) -> str:
    """browse the web for information"""
    try:
        return tavily_client.search(query, max_results=5)
    except Exception as search_error:
        # Best-effort tool: surface the failure to the agent as plain text
        # instead of aborting the whole agent run.
        print(f"Error browsing the web: {search_error}")
        return "Error browsing the web"


# Agent tool: ask Tavily for the map of the site at `url`.
# The docstring is presumably surfaced to the agent as the tool description,
# so it is kept verbatim. On success the raw Tavily response is returned
# (its concrete type is not visible here, despite the `str` annotation).
def get_website_map(url: str) -> str:
    """get the website map"""
    try:
        return tavily_client.map(url)
    except Exception as map_error:
        # Best-effort tool: report the failure as text rather than raising.
        print(f"Error getting the website map: {map_error}")
        return "Error getting the website map"


# Agent tool: ask Tavily to extract the content of the page at `url`.
# The docstring is presumably surfaced to the agent as the tool description,
# so it is kept verbatim. On success the raw Tavily response is returned
# (its concrete type is not visible here, despite the `str` annotation).
def get_website_content(url: str) -> str:
    """get the website content"""
    try:
        return tavily_client.extract(url)
    except Exception as extract_error:
        # Best-effort tool: report the failure as text rather than raising.
        print(f"Error getting the website content: {extract_error}")
        return "Error getting the website content"

### Single Agent


In [3]:
from src.types import LeadResults

# Set the default LM for every dspy module in this notebook (gpt-4.1, capped at
# 10000 output tokens) and enable usage tracking. No cache-related option is
# passed here, so this call does not itself configure caching.
dspy.settings.configure(
    lm=dspy.LM("openai/gpt-4.1", max_tokens=10000), track_usage=True
)


# Signature of the single-agent lead researcher: one free-text query in, one
# structured `LeadResults` out.
# NOTE(review): the class docstring is presumably used by dspy as the agent's
# instructions, i.e. it is prompt text rather than documentation. Editing it
# changes agent behaviour.
class SingleAgentSig(dspy.Signature):
    """
    You are an expert lead research agent specializing in finding high-quality contact information for specific professionals,
    researchers, and business contacts. Your mission is to conduct thorough, systematic research to identify leads that precisely
    match the user's criteria.
    """

    # Input: the fully assembled research query.
    user_query: str = dspy.InputField()
    # Output: leads parsed into the project's LeadResults type.
    leads: LeadResults = dspy.OutputField()


# Single ReAct agent with the three Tavily-backed tools; it may take up to
# 20 reasoning/tool iterations per query.
single_agent = dspy.ReAct(
    SingleAgentSig,
    tools=[browse_web, get_website_map, get_website_content],
    max_iters=20,
)

In [None]:
from rich import print as rprint
from src.agents.utils.build_final_query import build_final_query
from src.types import ResearchParams


# Smoke test: build one query from structured parameters and run the single
# agent on it. Top-level `await` only works inside a notebook / IPython kernel.
user_query = build_final_query(
    ResearchParams(
        who_query="researchers",
        what_query="Human Nutrition",
        where_query="Edmonton",
        context_query="",
    )
)
result = await single_agent.acall(user_query=user_query)

# Show the whole prediction returned by the agent.
rprint(result)

### Multi Agent


In [None]:
from rich import print as rprint
from src.agents.utils.build_final_query import build_final_query
from src.types import ResearchParams

dspy.enable_logging()


# Signature of the orchestrating agent. It has the same fields as
# SingleAgentSig, and its instructions additionally mention delegating to
# sub-agents.
# NOTE(review): the class docstring is presumably used by dspy as the agent's
# instructions (prompt text), so it is left untouched, typos included.
class MultiAgentSig(dspy.Signature):
    """
    You are an expert lead research agent specializing in finding high-quality contact information for specific professionals,
    researchers, and business contacts. Your mission is to conduct thorough, systematic research to identify leads that precisely
    match the user's criteria.

    You can use parallel tools calls and deploy a research agent to explore specfic branches of research.
    """

    # Input: the fully assembled research query.
    user_query: str = dspy.InputField()
    # Output: leads parsed into the project's LeadResults type.
    leads: LeadResults = dspy.OutputField()


# Agent tool: delegate one research branch to `single_agent`.
# The docstring is presumably surfaced to the orchestrating agent as the tool
# description, so it is kept verbatim.
async def deploy_search_agent(search_query: str) -> LeadResults:
    """
    Deploy a search a research agent that you can use to explore specfic branches of research. This should be used as parallel tool calls.
    """
    prediction = await single_agent.acall(user_query=search_query)
    # Return only the `leads` output field. This matches the declared return
    # type and keeps the sub-agent's own trajectory out of the parent agent's
    # observation.
    return prediction.leads


# Orchestrating agent: the three Tavily tools plus `deploy_search_agent`, which
# hands a sub-query to `single_agent`. No max_iters is passed, so dspy.ReAct's
# own default applies.
multi_agent = dspy.ReAct(
    MultiAgentSig,
    tools=[browse_web, get_website_map, get_website_content, deploy_search_agent],
)

# Smoke test: the same query as the single-agent run, executed through the
# orchestrating agent. Top-level `await` only works inside a notebook kernel.
user_query = build_final_query(
    ResearchParams(
        who_query="researchers",
        what_query="Human Nutrition",
        where_query="Edmonton",
        context_query="",
    )
)
result = await multi_agent.acall(user_query=user_query)

# Show the whole prediction returned by the agent.
rprint(result)

In [None]:
rprint(result.leads)

## Evaluation


#### Loading and converting trainset


In [4]:
import json
import random
from typing import List, Tuple
from rich import print as rprint
from src.types import Sample
from sklearn.model_selection import train_test_split


# Load and convert the JSON data to Sample objects
def load_eval_samples(
    path: str = "checkpoints/eval_leads_v4_800.json",
) -> List[Sample]:
    """Load evaluation data from a JSON file and convert it to Sample objects.

    Args:
        path: JSON file holding a list of sample dictionaries. Defaults to the
            checkpoint this notebook was written against.

    Returns:
        The samples that validated successfully, in file order. Items that fail
        validation are reported on stdout and skipped.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(path, "r", encoding="utf-8") as f:
        eval_data = json.load(f)

    # Convert each dictionary to a Sample object using Pydantic validation
    samples: List[Sample] = []
    skipped = 0
    for item in eval_data:
        try:
            samples.append(Sample.model_validate(item))
        except Exception as e:
            # Deliberately best-effort: one malformed record must not abort the load.
            print(f"Failed to parse sample: {e}")
            print(f"Problematic item: {item}")
            skipped += 1

    if skipped:
        # Make partial loads visible even when the per-item messages scroll away.
        print(f"Skipped {skipped} of {len(eval_data)} items that failed validation")

    return samples


# Load the samples
eval_samples = load_eval_samples()
print(f"Successfully loaded {len(eval_samples)} samples")

# Converting data to training and test sets using DSPy Example abstraction


def convert_sample_to_dspy_example(sample: Sample) -> dspy.Example:
    """Wrap one Sample as a dspy.Example.

    The sample's query string becomes `user_query` (the only field marked as an
    input) and its expected leads become `leads`.
    """
    example = dspy.Example(
        user_query=sample.query_string,
        leads=sample.expected_results.leads,
    )
    return example.with_inputs("user_query")


def train_test_split_list(lst, train_frac=0.8, seed=None):
    """Shuffle a copy of `lst` and cut it into (train, test) parts.

    Args:
        lst: Items to split; the caller's list is left untouched.
        train_frac: Fraction of items placed in the first (train) part; the
            cut index is truncated towards zero.
        seed: Seed for the private random generator used for shuffling.

    Returns:
        A `(train, test)` tuple of lists.
    """
    shuffled = lst[:]  # work on a copy so the caller's ordering survives
    random.Random(seed).shuffle(shuffled)
    cut = int(len(shuffled) * train_frac)
    train_part = shuffled[:cut]
    test_part = shuffled[cut:]
    return train_part, test_part


def get_train_test_split(
    samples: List[Sample], test_size: float = 0.8, random_state=None
) -> Tuple[List[dspy.Example], List[dspy.Example]]:
    """Convert samples to dspy Examples and split them into train and test lists.

    Args:
        samples: Validated evaluation samples.
        test_size: Fraction of examples placed in the TEST list, so the default
            0.8 keeps only 20% for training.
        random_state: Seed forwarded to scikit-learn's `train_test_split`.
            `None` (the default) yields a different split on every call.

    Returns:
        `(train_examples, test_examples)`.
    """
    examples = [convert_sample_to_dspy_example(sample) for sample in samples]
    train_examples, test_examples = train_test_split(
        examples, test_size=test_size, random_state=random_state
    )
    return train_examples, test_examples


# Fixed seed so `test_examples[:3]` and friends refer to the same samples after
# every kernel restart; otherwise eval runs are not comparable.
SPLIT_SEED = 42

train_examples, test_examples = get_train_test_split(
    eval_samples, random_state=SPLIT_SEED
)

Successfully loaded 800 samples


#### Metrics


In [5]:
import numpy as np
from scipy.spatial.distance import cdist
from scipy.optimize import linear_sum_assignment
from typing import Literal


def cosine_similarity_matrix(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    """
    Compute pairwise cosine similarities between rows of A and rows of B
    using SciPy's cdist (which returns cosine distances = 1 - cosine_similarity).

    Args:
        A: (M, D) array or array-like of row vectors.
        B: (N, D) array or array-like of row vectors.

    Returns:
        (M, N) array whose entry [i, j] is the cosine similarity of A[i] and
        B[j]. Any pair that involves an all-zero row is reported as 0.0. If
        either input is empty, an all-zero (M, N) array is returned.
    """
    # Accept plain lists as well as arrays.
    A = np.asarray(A, dtype=float)
    B = np.asarray(B, dtype=float)

    # if either is empty, return an empty (M, N) matrix
    if A.size == 0 or B.size == 0:
        return np.zeros((A.shape[0], B.shape[0]))

    sim = 1.0 - cdist(A, B, metric="cosine")  # distance -> similarity, shape (M, N)

    # Cosine similarity is undefined for a zero-norm vector (0/0). Pin those
    # pairs to 0.0 so no NaN can reach the assignment step downstream.
    sim[~A.any(axis=1), :] = 0.0
    sim[:, ~B.any(axis=1)] = 0.0
    return sim


def score_sample(
    true_names: list[str], pred_names: list[str], threshold: float = 0.7
) -> tuple[float, float, float]:
    """
    Score one sample (one list of true names, one list of predicted names):
      - embed both sets,
      - build the similarity matrix,
      - run Hungarian matching to pair names one-to-one,
      - count a pair as a true positive when its similarity >= threshold,
      - derive recall, precision and F1 from the TP / FP / FN counts.

    Args:
        true_names: Expected lead names.
        pred_names: Predicted lead names.
        threshold: Minimum cosine similarity for a matched pair to count.

    Returns:
        (recall, precision, f1_score), each in [0, 1]. All three are 0.0 when
        either list is empty.
    """
    # With nothing to match there are no true positives, so every score is 0.
    # Returning early also avoids two pointless embedding requests.
    if not true_names or not pred_names:
        return 0.0, 0.0, 0.0

    # 1. embed
    embedder = dspy.Embedder("openai/text-embedding-3-small", batch_size=100)
    E_true = embedder(true_names)  # shape (M, D)
    E_pred = embedder(pred_names)  # shape (N, D)

    # 2. similarity matrix
    sim = cosine_similarity_matrix(E_true, E_pred)  # (M, N)

    # 3. Hungarian matching on negative sim to maximize total similarity
    #    If M != N, linear_sum_assignment will match min(M, N) pairs.
    row_idx, col_idx = linear_sum_assignment(-sim)

    # 4. filter by threshold
    matched_sims = sim[row_idx, col_idx]
    good = matched_sims >= threshold

    TP = int(good.sum())
    FP = len(pred_names) - TP
    FN = len(true_names) - TP

    # Precision: Of all predicted leads, how many were correct?
    precision = (TP / (TP + FP)) if (TP + FP) > 0 else 0.0

    # Recall: Of all expected leads, how many were found?
    recall = (TP / (TP + FN)) if (TP + FN) > 0 else 0.0

    # F1 Score: Harmonic mean of precision and recall
    f1_score = (
        (2 * precision * recall) / (precision + recall)
        if (precision + recall) > 0
        else 0.0
    )

    return recall, precision, f1_score


def validate_leads_recall(
    example: dspy.Example, pred: dspy.Prediction, trace=None
) -> float:
    """Metric: recall of predicted lead names against the example's expected leads.

    `trace` is accepted but not used.
    """
    expected = [lead.name for lead in example.leads]
    predicted = [lead.name for lead in pred.leads.leads]

    # score_sample yields (recall, precision, f1); recall is the first entry.
    return score_sample(expected, predicted)[0]


def validate_leads_precision(
    example: dspy.Example, pred: dspy.Prediction, trace=None
) -> float:
    """Metric: precision of predicted lead names against the example's expected leads.

    `trace` is accepted but not used.
    """
    expected = [lead.name for lead in example.leads]
    predicted = [lead.name for lead in pred.leads.leads]

    # score_sample yields (recall, precision, f1); precision is the second entry.
    return score_sample(expected, predicted)[1]


def validate_leads_f1_score(
    example: dspy.Example, pred: dspy.Prediction, trace=None
) -> float:
    """Metric: F1 of predicted lead names against the example's expected leads.

    `trace` is accepted but not used.
    """
    expected = [lead.name for lead in example.leads]
    predicted = [lead.name for lead in pred.leads.leads]

    # score_sample yields (recall, precision, f1); F1 is the third entry.
    return score_sample(expected, predicted)[2]

#### Evaluation Executors


In [None]:
import time
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
from tqdm import tqdm
import mlflow


def print_eval_results(
    scores, tavily_cost, llm_cost, total_cost, dataset_size, num_threads=None
):
    """Print evaluation results as a rich table followed by a score histogram.

    Args:
        scores: Per-sample metric values of the samples that completed.
        tavily_cost: Tavily spend for the run, in dollars.
        llm_cost: LLM spend for the run, in dollars.
        total_cost: Sum of all costs, in dollars.
        dataset_size: Number of samples that were submitted for evaluation.
        num_threads: Worker count; a "Parallelization" row is added when truthy.
    """
    from rich.console import Console
    from rich.table import Table
    from rich.panel import Panel
    from rich.text import Text

    console = Console()

    # Calculate statistics; everything falls back to 0 when no sample completed.
    completed = len(scores)
    successes = sum(1 for s in scores if s > 0)
    average_score = sum(scores) / completed if scores else 0
    max_score = max(scores) if scores else 0
    min_score = min(scores) if scores else 0
    success_rate = successes / completed * 100 if scores else 0

    # 0.008 is the per-call price the eval runners multiply usage by. round()
    # rather than int(): the float quotient can land just below the true call
    # count and truncation would then under-report by one.
    tavily_calls = round(tavily_cost / 0.008)

    # Create main results table
    table = Table(title="🔍 Evaluation Results", title_style="bold blue")
    table.add_column("Metric", style="cyan", no_wrap=True)
    table.add_column("Value", style="magenta")
    table.add_column("Details", style="green")

    # Performance metrics
    table.add_row(
        "📊 Average Score",
        f"{average_score:.4f}",
        f"Range: {min_score:.4f} - {max_score:.4f}",
    )
    table.add_row(
        "✅ Success Rate",
        f"{success_rate:.1f}%",
        f"{successes}/{completed} samples",
    )
    table.add_row("📈 Total Samples", f"{dataset_size}", f"Completed: {completed}")

    if num_threads:
        table.add_row(
            "🔄 Parallelization", f"{num_threads} threads", "Concurrent execution"
        )

    # Cost breakdown
    table.add_row("", "", "")  # Separator
    table.add_row("💰 Tavily Cost", f"${tavily_cost:.4f}", f"{tavily_calls} API calls")
    table.add_row("🤖 LLM Cost", f"${llm_cost:.4f}", "Language model usage")
    table.add_row("💸 Total Cost", f"${total_cost:.4f}", "All services combined")

    # Cost per sample
    cost_per_sample = total_cost / completed if scores else 0
    table.add_row(
        "📋 Cost/Sample", f"${cost_per_sample:.4f}", "Average cost per evaluation"
    )

    console.print(table)

    # Score distribution
    if scores:
        score_text = Text()
        score_text.append("Score Distribution: ", style="bold")

        # Simple histogram over half-open buckets; the last bucket also takes 1.0.
        score_ranges = [
            (0.0, 0.2, "🔴"),
            (0.2, 0.4, "🟠"),
            (0.4, 0.6, "🟡"),
            (0.6, 0.8, "🟢"),
            (0.8, 1.0, "🟦"),
        ]

        for low, high, emoji in score_ranges:
            count = len(
                [s for s in scores if low <= s < high or (high == 1.0 and s == 1.0)]
            )
            if count > 0:
                score_text.append(
                    f"{emoji} {low:.1f}-{high:.1f}: {count} ", style="white"
                )

        console.print(
            Panel(score_text, title="📊 Score Breakdown", border_style="blue")
        )


# Alternative simpler version without rich dependency
def print_eval_results_simple(
    scores, tavily_cost, llm_cost, total_cost, dataset_size, num_threads=None
):
    """Print evaluation results as plain text (no rich dependency).

    Args:
        scores: Per-sample metric values of the samples that completed.
        tavily_cost: Tavily spend for the run, in dollars.
        llm_cost: LLM spend for the run, in dollars.
        total_cost: Sum of all costs, in dollars.
        dataset_size: Number of samples that were submitted for evaluation.
        num_threads: Worker count; a "Threads Used" line is printed when truthy.
    """

    # Calculate statistics; everything falls back to 0 when no sample completed.
    completed = len(scores)
    successes = sum(1 for s in scores if s > 0)
    average_score = sum(scores) / completed if scores else 0
    max_score = max(scores) if scores else 0
    min_score = min(scores) if scores else 0
    success_rate = successes / completed * 100 if scores else 0
    cost_per_sample = total_cost / completed if scores else 0

    # 0.008 is the per-call price the eval runners multiply usage by. round()
    # rather than int(): the float quotient can land just below the true call
    # count and truncation would then under-report by one.
    tavily_calls = round(tavily_cost / 0.008)

    print("=" * 60)
    print("🔍 EVALUATION RESULTS")
    print("=" * 60)

    print("📊 PERFORMANCE METRICS")
    print(f"   Average Score:     {average_score:.4f}")
    print(f"   Score Range:       {min_score:.4f} - {max_score:.4f}")
    print(
        f"   Success Rate:      {success_rate:.1f}% ({successes}/{completed} samples)"
    )
    print(f"   Total Samples:     {dataset_size}")
    if num_threads:
        print(f"   Threads Used:      {num_threads}")

    print("\n💰 COST BREAKDOWN")
    print(f"   Tavily API:        ${tavily_cost:.4f} ({tavily_calls} calls)")
    print(f"   LLM Usage:         ${llm_cost:.4f}")
    print(f"   Total Cost:        ${total_cost:.4f}")
    print(f"   Cost per Sample:   ${cost_per_sample:.4f}")

    print("\n📊 SCORE DISTRIBUTION")
    # Half-open buckets; the last bucket also takes a score of exactly 1.0.
    score_ranges = [
        (0.0, 0.2, "Poor"),
        (0.2, 0.4, "Fair"),
        (0.4, 0.6, "Good"),
        (0.6, 0.8, "Very Good"),
        (0.8, 1.0, "Excellent"),
    ]
    for low, high, label in score_ranges:
        count = len(
            [s for s in scores if low <= s < high or (high == 1.0 and s == 1.0)]
        )
        if count > 0:
            print(f"   {label:10} ({low:.1f}-{high:.1f}): {count:3d} samples")

    print("=" * 60)


def get_tavily_usage(previous_usage: int = 0, timeout: float = 10.0):
    """Fetch the Tavily usage counter for the configured API key.

    Args:
        previous_usage: Earlier counter reading to subtract from the current one.
        timeout: Seconds to wait for the HTTP request, so an unresponsive
            endpoint cannot hang an evaluation run indefinitely.

    Returns:
        Current usage minus `previous_usage`, or None when the request or the
        response parsing fails for any reason (the error is printed).
    """
    url = "https://api.tavily.com/usage"
    headers = {"Authorization": f"Bearer {os.getenv('TAVILY_API_KEY')}"}

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        # Report HTTP errors (e.g. rate limiting) as such, rather than as a
        # confusing KeyError raised while digging into the error payload.
        response.raise_for_status()
        last_usage = response.json()["key"]["usage"]
        return last_usage - previous_usage
    except Exception as e:
        # Best-effort: callers treat None as "usage unavailable".
        print(f"Error getting Tavily usage: {e}")
        return None


def get_llm_cost(program):
    """Return the cost recorded on the most recent entry of `program.history`.

    Only the last history entry is read, so this is the cost of one LM call,
    not of every call made while answering a sample.

    Returns:
        The recorded cost, or 0.0 when the history is empty or the entry holds
        no cost (missing or None), so callers can always add the result up.
    """
    history = program.history
    if not history:
        return 0.0

    cost = history[-1].get("cost")
    return cost if cost is not None else 0.0


def run_evals_sequentially(
    model, dataset, metric, program, delay_seconds=1, experiment_name="Test_eval_1"
):
    """Evaluate `program` on `dataset` one sample at a time inside an MLflow run.

    Args:
        model: LM identifier passed to `dspy.LM` for every sample.
        dataset: Examples exposing `.inputs()`.
        metric: Callable `(example, prediction) -> score`.
        program: The dspy program under evaluation.
        delay_seconds: Pause before each sample (simple rate limiting).
        experiment_name: Used as the MLflow run name.

    Returns:
        `(scores, tavily_cost, llm_cost, total_cost)`. The Tavily cost is
        reported as 0.0 when the usage endpoint could not be read.
    """
    tavily_start_usage = get_tavily_usage()
    if tavily_start_usage is None:
        # One retry after a pause; the endpoint may be rate limited.
        time.sleep(60)
        tavily_start_usage = get_tavily_usage()

    mlflow.dspy.autolog(log_traces_from_eval=True)

    scores = []
    llm_cost = 0
    with mlflow.start_run(run_name=experiment_name):
        for step, sample in enumerate(tqdm(dataset), start=1):
            time.sleep(delay_seconds)
            with dspy.context(lm=dspy.LM(model, max_tokens=10000)):
                pred = program(**sample.inputs())
            # `or 0` keeps the running total numeric if no cost was recorded.
            llm_cost += get_llm_cost(program) or 0
            score = metric(sample, pred)
            scores.append(score)
            average_score = sum(scores) / len(scores)
            print(f"Latest score: {score}, average score: {average_score}")

            # Log the running average; `step` gives each point its own x-position
            # instead of stacking every value on the same step.
            mlflow.log_metric("Recall_average_score", average_score, step=step)

    tavily_usage_end_of_run = get_tavily_usage()
    if tavily_start_usage is None or tavily_usage_end_of_run is None:
        # Without both readings the difference is meaningless; do not crash
        # after an otherwise complete (and paid-for) evaluation.
        print("Tavily usage unavailable; reporting Tavily cost as 0")
        tavily_cost = 0.0
    else:
        tavily_usage = tavily_usage_end_of_run - tavily_start_usage
        tavily_cost = tavily_usage * 0.008
    total_cost = tavily_cost + llm_cost

    print_eval_results(scores, tavily_cost, llm_cost, total_cost, len(dataset))

    return scores, tavily_cost, llm_cost, total_cost


def run_evals_parallel(
    model,
    dataset,
    metric,
    program,
    num_threads=4,
    delay_seconds=3,
    experiment_name="Test_eval_1",
):
    """Evaluate `program` on `dataset` with a thread pool inside an MLflow run.

    Args:
        model: LM identifier passed to `dspy.LM` for every sample.
        dataset: Examples exposing `.inputs()`.
        metric: Callable `(example, prediction) -> score`.
        program: The dspy program under evaluation (shared by all workers).
        num_threads: Size of the thread pool.
        delay_seconds: Pause at the start of each sample (simple rate limiting).
        experiment_name: Used as the MLflow run name.

    Returns:
        `(scores, tavily_cost, llm_cost, total_cost)`. `scores` holds only the
        samples that completed, in completion order. The Tavily cost is
        reported as 0.0 when the usage endpoint could not be read.
    """
    scores = []
    scores_lock = threading.Lock()

    # Cost tracking variables
    tavily_start_usage = get_tavily_usage()
    if tavily_start_usage is None:
        # One retry after a pause; the endpoint may be rate limited.
        time.sleep(60)
        tavily_start_usage = get_tavily_usage()

    llm_cost = 0
    cost_lock = threading.Lock()

    def evaluate_sample(sample):
        nonlocal llm_cost

        time.sleep(delay_seconds)
        with dspy.context(lm=dspy.LM(model, max_tokens=10000)):
            pred = program(**sample.inputs())

        # NOTE(review): `program` is shared by every worker, so its latest
        # history entry may belong to another thread's call; treat the LLM cost
        # as an approximation. `or 0` keeps the total numeric if no cost was
        # recorded.
        sample_llm_cost = get_llm_cost(program) or 0

        score = metric(sample, pred)

        # Thread-safe updates
        with cost_lock:
            llm_cost += sample_llm_cost

        with scores_lock:
            scores.append(score)
            average_score = sum(scores) / len(scores)
            print(
                f"Latest score: {score}, average score: {average_score:.4f}, completed: {len(scores)}/{len(dataset)}"
            )

        return score

    mlflow.dspy.autolog(log_traces_from_eval=True)
    with mlflow.start_run(run_name=experiment_name):
        with ThreadPoolExecutor(max_workers=num_threads) as executor:
            # Submit all tasks
            futures = [executor.submit(evaluate_sample, sample) for sample in dataset]

            # Process completed tasks; a failing sample is reported and skipped.
            for future in tqdm(as_completed(futures), total=len(futures)):
                try:
                    future.result()  # This will raise any exceptions that occurred
                except Exception as e:
                    print(f"Error evaluating sample: {e}")

        # Log the final average; guard against every sample having failed.
        if scores:
            mlflow.log_metric("Recall_average_score", sum(scores) / len(scores))
        else:
            print("No sample completed successfully; skipping the average-score metric")

    tavily_usage_end_of_run = get_tavily_usage()
    if tavily_usage_end_of_run is None:
        sleep_time = 60 * 4
        print(f"Tavily usage is None, waiting {sleep_time} seconds")
        time.sleep(sleep_time)
        tavily_usage_end_of_run = get_tavily_usage()

    if tavily_start_usage is None or tavily_usage_end_of_run is None:
        # Without both readings the difference is meaningless; do not crash
        # after an otherwise complete (and paid-for) evaluation.
        print("Tavily usage unavailable; reporting Tavily cost as 0")
        tavily_cost = 0.0
    else:
        tavily_usage = tavily_usage_end_of_run - tavily_start_usage
        tavily_cost = tavily_usage * 0.008
    total_cost = tavily_cost + llm_cost

    print_eval_results(
        scores, tavily_cost, llm_cost, total_cost, len(dataset), num_threads
    )

    return scores, tavily_cost, llm_cost, total_cost

## Eval Runs


##### Single Agent


In [7]:
# Small parallel eval: the first three test examples, scored by recall, with
# the single agent running on gpt-4.1-mini. num_threads and delay_seconds are
# left at the runner's defaults.
results = run_evals_parallel(
    model="openai/gpt-4.1-mini",
    dataset=test_examples[:3],
    metric=validate_leads_recall,
    program=single_agent,
)

# `results` is the (scores, tavily_cost, llm_cost, total_cost) tuple.
rprint(results)

  0%|          | 0/3 [00:00<?, ?it/s]

Error getting the website map: 422 Client Error: Unprocessable Entity for url: https://api.tavily.com/map
Error getting the website content: Your request has been blocked due to excessive requests. Please reduce the rate of requests.
Error getting the website map: 422 Client Error: Unprocessable Entity for url: https://api.tavily.com/map
Error getting the website map: 422 Client Error: Unprocessable Entity for url: https://api.tavily.com/map
Error browsing the web: Your request has been blocked due to excessive requests. Please reduce the rate of requests.
Error browsing the web: Your request has been blocked due to excessive requests. Please reduce the rate of requests.


 33%|███▎      | 1/3 [01:12<02:24, 72.27s/it]

Latest score: 0.07462686567164178, average score: 0.0746, completed: 1/3
Error browsing the web: Your request has been blocked due to excessive requests. Please reduce the rate of requests.
Error getting the website content: Your request has been blocked due to excessive requests. Please reduce the rate of requests.


 67%|██████▋   | 2/3 [01:28<00:39, 39.36s/it]

Latest score: 0.6666666666666666, average score: 0.3706, completed: 2/3
Error browsing the web: Your request has been blocked due to excessive requests. Please reduce the rate of requests.
Error browsing the web: Your request has been blocked due to excessive requests. Please reduce the rate of requests.


100%|██████████| 3/3 [01:48<00:00, 36.19s/it]

Latest score: 0.0, average score: 0.2471, completed: 3/3
🏃 View run Test_eval_1 at: http://127.0.0.1:5000/#/experiments/2/runs/5ff81073a25b4b9eaba563944db1406d
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/2





TypeError: float() argument must be a string or a real number, not 'list'

In [None]:
from dspy.evaluate import Evaluate

mlflow.dspy.autolog(log_traces_from_eval=True)

# One devset drives both the evaluation and the logged table. Taking the
# question/answer columns from a different slice than the evaluated devset
# gives columns of unequal length that do not line up with the outputs/scores.
eval_devset = test_examples

with mlflow.start_run(run_name="Test_eval_1"):
    # Set up the evaluator, which can be re-used in your code.
    evaluator = Evaluate(
        devset=eval_devset,
        num_threads=3,
        display_progress=True,
        display_table=5,
        show_progress=True,
        provide_traceback=True,
        return_all_scores=True,
        return_outputs=True,
    )

    aggregated_score, outputs, all_scores = evaluator(
        single_agent, metric=validate_leads_recall
    )
    # Log the aggregated score under the name of the metric actually used.
    mlflow.log_metric("recall", aggregated_score)
    # Log the detailed evaluation results as a table (one row per devset example)
    mlflow.log_table(
        {
            "question": [example.user_query for example in eval_devset],
            "answer": [example.leads for example in eval_devset],
            "output": outputs,
            "recall": all_scores,
        },
        artifact_file="eval_results.json",
    )

# Launch evaluation.

##### Multi Agent


### Parallel test


In [None]:
import asyncio
import logging
from typing import TYPE_CHECKING, Any, Callable, List, Literal, Type

from litellm import ContextWindowExceededError

import dspy
from dspy.adapters.types.tool import Tool
from dspy.signatures.signature import ensure_signature
from rich import print as rprint

logger = logging.getLogger(__name__)

if TYPE_CHECKING:
    from dspy.signatures.signature import Signature


class ReAct(dspy.Module):
    """ReAct agent whose react step may request several tool calls per turn.

    The react signature declares `next_tool_name` as a list of tool names and
    `next_tool_args` as a list of argument dicts. Each turn therefore yields a
    batch of calls. `forward` runs the batch one call after another, while
    `aforward` awaits the whole batch together.
    """

    def __init__(
        self,
        signature: Type["Signature"],
        tools: list[Callable],
        max_iters: int = 10,
        verbose: bool = True,
    ):
        """
        ReAct stands for "Reasoning and Acting," a popular paradigm for building tool-using agents.
        In this approach, the language model is iteratively provided with a list of tools and has
        to reason about the current situation. The model decides whether to call a tool to gather more
        information or to finish the task based on its reasoning process. This version of ReAct is
        generalized to work over any signature, thanks to signature polymorphism.

        Args:
            signature: The signature of the module, which defines the input and output of the react module.
            tools (list[Callable]): A list of functions, callable objects, or `dspy.Tool` instances.
            max_iters (Optional[int]): The maximum number of iterations to run. Defaults to 10.
            verbose (bool): Print each react-step prediction as it arrives. Defaults to True.

        Example:

        ```python
        def get_weather(city: str) -> str:
            return f"The weather in {city} is sunny."

        react = ReAct(signature="question->answer", tools=[get_weather])
        pred = react(question="What is the weather in Tokyo?")
        ```
        """
        super().__init__()
        self.signature = signature = ensure_signature(signature)
        self.max_iters = max_iters
        self.verbose = verbose

        tools = [t if isinstance(t, Tool) else Tool(t) for t in tools]
        tools = {tool.name: tool for tool in tools}

        inputs = ", ".join([f"`{k}`" for k in signature.input_fields.keys()])
        outputs = ", ".join([f"`{k}`" for k in signature.output_fields.keys()])
        instr = [f"{signature.instructions}\n"] if signature.instructions else []

        instr.extend(
            [
                f"You are an Agent. In each episode, you will be given the fields {inputs} as input. And you can see your past trajectory so far.",
                f"Your goal is to use one or more of the supplied tools to collect any necessary information for producing {outputs}.\n",
                "To do this, you will interleave next_thought, next_tool_name, and next_tool_args in each turn, and also when finishing the task.",
                "You can use multiple tools in each turn, and you can use the same tool multiple times in the same turn.",
                "After each tool call, you receive a resulting observation, which gets appended to your trajectory.\n",
                "When writing next_thought, you may reason about the current situation and plan for future steps.",
                "When selecting next_tool_name and its next_tool_args, the tools must be on the following list:\n",
            ]
        )

        # Built-in pseudo-tool the model selects to signal that it is done.
        tools["finish"] = Tool(
            func=lambda: "Completed.",
            name="finish",
            desc=f"Marks the task as complete. That is, signals that all information for producing the outputs, i.e. {outputs}, are now available to be extracted.",
            args={},
        )

        for idx, tool in enumerate(tools.values()):
            instr.append(f"({idx + 1}) {tool}")
        instr.append(
            "When providing `next_tool_args`, the value inside the field must be in JSON format"
        )

        # Both tool fields are lists: position i of `next_tool_args` holds the
        # arguments for position i of `next_tool_name`.
        react_signature = (
            dspy.Signature({**signature.input_fields}, "\n".join(instr))
            .append("trajectory", dspy.InputField(), type_=str)
            .append("next_thought", dspy.OutputField(), type_=str)
            .append(
                "next_tool_name",
                dspy.OutputField(),
                type_=List[Literal[tuple(tools.keys())]],
            )
            .append("next_tool_args", dspy.OutputField(), type_=List[dict[str, Any]])
        )

        # Used after the loop to turn the collected trajectory into final outputs.
        fallback_signature = dspy.Signature(
            {**signature.input_fields, **signature.output_fields},
            signature.instructions,
        ).append("trajectory", dspy.InputField(), type_=str)

        self.tools = tools
        self.react = dspy.Predict(react_signature)
        self.extract = dspy.ChainOfThought(fallback_signature)

    def _format_trajectory(self, trajectory: dict[str, Any]):
        """Render the trajectory dict as message content via the active adapter."""
        adapter = dspy.settings.adapter or dspy.ChatAdapter()
        trajectory_signature = dspy.Signature(f"{', '.join(trajectory.keys())} -> x")
        return adapter.format_user_message_content(trajectory_signature, trajectory)

    @staticmethod
    def _pair_tool_calls(tool_names, tool_args) -> list[tuple[str, dict[str, Any]]]:
        """Normalise one react prediction into a list of `(tool_name, kwargs)` pairs.

        A bare string / dict is accepted as a one-element batch. Tool names
        without a matching argument entry are paired with `{}`, and surplus
        argument entries are dropped.
        """
        names = [tool_names] if isinstance(tool_names, str) else list(tool_names or [])
        args = [tool_args] if isinstance(tool_args, dict) else list(tool_args or [])
        args += [{}] * (len(names) - len(args))
        return [(name, kwargs or {}) for name, kwargs in zip(names, args)]

    @staticmethod
    def _is_finished(calls: list[tuple[str, dict[str, Any]]]) -> bool:
        """True when the turn selected `finish` or selected no tool at all."""
        return not calls or any(name == "finish" for name, _ in calls)

    def _record_turn(self, trajectory, idx, pred, calls, observations):
        """Store one turn under the four per-turn keys `truncate_trajectory` relies on."""
        trajectory[f"thought_{idx}"] = pred.next_thought
        trajectory[f"tool_name_{idx}"] = [name for name, _ in calls]
        trajectory[f"tool_args_{idx}"] = [kwargs for _, kwargs in calls]
        trajectory[f"observation_{idx}"] = list(observations)

    def _run_tool(self, name: str, kwargs: dict[str, Any]):
        """Call one tool synchronously; failures become an error-string observation."""
        try:
            return self.tools[name](**kwargs)
        except Exception as err:
            return f"Execution error in {name}: {_fmt_exc(err)}"

    async def _arun_tool(self, name: str, kwargs: dict[str, Any]):
        """Call one tool through `acall`; failures become an error-string observation."""
        try:
            return await self.tools[name].acall(**kwargs)
        except Exception as err:
            return f"Execution error in {name}: {_fmt_exc(err)}"

    def forward(self, **input_args):
        """Run the react loop synchronously, then extract the final outputs.

        `max_iters` may be passed as a keyword to override the instance value.
        Tool calls requested in the same turn are executed one after another.
        """
        trajectory = {}
        max_iters = input_args.pop("max_iters", self.max_iters)
        for idx in range(max_iters):
            try:
                pred = self._call_with_potential_trajectory_truncation(
                    self.react, trajectory, **input_args
                )
            except ValueError as err:
                logger.warning(
                    f"Ending the trajectory: Agent failed to select a valid tool: {_fmt_exc(err)}"
                )
                break

            if self.verbose:
                rprint(pred)

            calls = self._pair_tool_calls(pred.next_tool_name, pred.next_tool_args)
            observations = [self._run_tool(name, kwargs) for name, kwargs in calls]
            self._record_turn(trajectory, idx, pred, calls, observations)

            if self._is_finished(calls):
                break

        extract = self._call_with_potential_trajectory_truncation(
            self.extract, trajectory, **input_args
        )
        return dspy.Prediction(trajectory=trajectory, **extract)

    async def aforward(self, **input_args):
        """Async counterpart of `forward`.

        All tool calls requested in one turn are awaited together with
        `asyncio.gather`; observations keep the order of the requested calls.
        """
        trajectory = {}
        max_iters = input_args.pop("max_iters", self.max_iters)
        for idx in range(max_iters):
            try:
                pred = await self._async_call_with_potential_trajectory_truncation(
                    self.react, trajectory, **input_args
                )
            except ValueError as err:
                logger.warning(
                    f"Ending the trajectory: Agent failed to select a valid tool: {_fmt_exc(err)}"
                )
                break

            if self.verbose:
                rprint(pred)

            calls = self._pair_tool_calls(pred.next_tool_name, pred.next_tool_args)
            # _arun_tool never raises, so one failing tool cannot cancel the batch.
            observations = await asyncio.gather(
                *(self._arun_tool(name, kwargs) for name, kwargs in calls)
            )
            self._record_turn(trajectory, idx, pred, calls, observations)

            if self._is_finished(calls):
                break

        extract = await self._async_call_with_potential_trajectory_truncation(
            self.extract, trajectory, **input_args
        )
        return dspy.Prediction(trajectory=trajectory, **extract)

    def _call_with_potential_trajectory_truncation(
        self, module, trajectory, **input_args
    ):
        """Call `module`, dropping the oldest turn and retrying on context overflow.

        Raises:
            ValueError: if the context window is still exceeded after three
                attempts, or if the trajectory is too short to truncate.
        """
        for _ in range(3):
            try:
                return module(
                    **input_args,
                    trajectory=self._format_trajectory(trajectory),
                )
            except ContextWindowExceededError:
                logger.warning(
                    "Trajectory exceeded the context window, truncating the oldest tool call information."
                )
                trajectory = self.truncate_trajectory(trajectory)
        # Falling out of the loop used to return None, which crashed callers
        # later with an unrelated error; fail here with the real cause.
        raise ValueError(
            "The context window was exceeded even after 3 attempts to truncate the trajectory."
        )

    async def _async_call_with_potential_trajectory_truncation(
        self, module, trajectory, **input_args
    ):
        """Async counterpart of `_call_with_potential_trajectory_truncation`."""
        for _ in range(3):
            try:
                return await module.acall(
                    **input_args,
                    trajectory=self._format_trajectory(trajectory),
                )
            except ContextWindowExceededError:
                logger.warning(
                    "Trajectory exceeded the context window, truncating the oldest tool call information."
                )
                trajectory = self.truncate_trajectory(trajectory)
        raise ValueError(
            "The context window was exceeded even after 3 attempts to truncate the trajectory."
        )

    def truncate_trajectory(self, trajectory):
        """Truncates the trajectory so that it fits in the context window.

        Removes the four oldest keys (one turn) in place and returns the same dict.
        Users can override this method to implement their own truncation logic.
        """
        keys = list(trajectory.keys())
        if len(keys) < 4:
            # Every turn has 4 keys: thought, tool_name, tool_args, and observation.
            raise ValueError(
                "The trajectory is too long so your prompt exceeded the context window, but the trajectory cannot be "
                "truncated because it only has one tool call."
            )

        for key in keys[:4]:
            trajectory.pop(key)

        return trajectory


def _fmt_exc(err: BaseException, *, limit: int = 5) -> str:
    """
    Return a one-string traceback summary.
    * `limit` - how many stack frames to keep (from the innermost outwards).
    """

    import traceback

    return (
        "\n"
        + "".join(
            traceback.format_exception(type(err), err, err.__traceback__, limit=limit)
        ).strip()
    )

In [None]:
from src.types import LeadResults

# Re-configure the default LM (gpt-4.1) with usage tracking for the parallel
# test. Unlike the earlier configure call, no max_tokens is passed here, and no
# cache-related option is set.
dspy.settings.configure(lm=dspy.LM("openai/gpt-4.1"), track_usage=True)


# NOTE: this redefines the `SingleAgentSig` declared earlier in this notebook;
# after this cell runs, the name refers to this variant, whose instructions
# additionally mention multiple / parallel tool use.
# NOTE(review): the class docstring is presumably used by dspy as the agent's
# instructions (prompt text), so it is left untouched.
class SingleAgentSig(dspy.Signature):
    """
    You are an expert lead research agent specializing in finding high-quality contact information for specific professionals,
    researchers, and business contacts. Your mission is to conduct thorough, systematic research to identify leads that precisely
    match the user's criteria.

    You can use multiple tools in each turn, having them ran in parallel on the same search.
    You can also use the same tool multiple times in the same turn.
    """

    # Input: the fully assembled research query.
    user_query: str = dspy.InputField()
    # Output: leads parsed into the project's LeadResults type.
    leads: LeadResults = dspy.OutputField()


# NOTE: rebinds `single_agent` to the notebook-local `ReAct` class (not
# dspy.ReAct). Anything that references `single_agent` afterwards, including
# the eval cells if they are re-run, uses this agent and the class's default
# max_iters.
single_agent = ReAct(
    SingleAgentSig, tools=[browse_web, get_website_map, get_website_content]
)

In [None]:
from rich import print as rprint
from src.agents.utils.build_final_query import build_final_query
from src.types import ResearchParams


# Smoke test for the notebook-local ReAct agent, using the same query as the
# earlier runs. Top-level `await` only works inside a notebook / IPython kernel.
user_query = build_final_query(
    ResearchParams(
        who_query="researchers",
        what_query="Human Nutrition",
        where_query="Edmonton",
        context_query="",
    )
)
result = await single_agent.acall(user_query=user_query)

# Show the whole prediction returned by the agent.
rprint(result)