### How to test MCP 

In [1]:
from langchain_ollama import ChatOllama
from langchain_openai import ChatOpenAI
from langchain.tools import tool
from langchain.agents import initialize_agent, AgentType
from langchain_community.tools import DuckDuckGoSearchRun
from langchain.agents import AgentExecutor, create_tool_calling_agent
from langchain_core.prompts import ChatPromptTemplate

llm = ChatOllama(
    base_url="http://localhost:11434",
    model = "qwen3:latest",
    temperature=0.5,
    max_tokens = 250
)

model = ChatOpenAI(model="gpt-4o")

search_tool = DuckDuckGoSearchRun()

@tool
def add_numbers(a: int, b: int) -> int:
    "Add two numbers + 20 and return results."
    return int(a) + int(b) + 20

@tool
def subtract_numbers(a: int, b: int) -> int:
    "Subtract two numbers and return results."
    return int(a) - int(b)

tools = [add_numbers, subtract_numbers, search_tool]

agent = initialize_agent(
    tools= tools,
    llm=llm,
    agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
    return_intermediate_steps=True
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a helpful assistant"),
        ("human", "{input}"),
        # Placeholders fill up a **list** of messages
        ("placeholder", "{agent_scratchpad}"),
    ]
)

agent = create_tool_calling_agent(llm, tools, prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools,return_intermediate_steps=True)
query="Who is the current president of USA in 2025, just give the name"
agent_executor.invoke({"input": query})

response = agent.invoke("Who is the current president of USA in 2025, just give the name")

print(response)

def query_ai_agent(question):
    response = agent_executor.invoke({"input": question})
    intermediate_steps = response['intermediate_steps']
    agent_action, results = intermediate_steps[0]
    tool = agent_action.tool
    tool_input = agent_action.tool_input
    
    # Clean up the output by removing <think> tags
    output = response['output']
    if '<think>' in output:
        # Remove everything between <think> and </think> tags
        import re
        output = re.sub(r'<think>.*?</think>\s*', '', output, flags=re.DOTALL)
        response['output'] = output.strip()
    
    return response, tool, tool_input

response,tool, tool_input = query_ai_agent("What is the sum of 20 and 40")
print(response)

print(tool)

print(tool_input)


  agent = initialize_agent(


UnboundLocalError: cannot access local variable 'results' where it is not associated with a value

## Testing

In [3]:
from deepeval.test_case import ToolCall
from deepeval.test_case import LLMTestCase
from deepeval.metrics import ToolCorrectnessMetric
from deepeval.dataset import EvaluationDataset
from deepeval.metrics import AnswerRelevancyMetric
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCaseParams

# Custom GEval metric that understands your tool's behavior
custom_correctness_metric = GEval(
    name="Custom Tool Output Correctness",
    criteria="""Check if both outputs contain the same numerical answer.
    
    TASK: Extract the number from each output and compare them.
    
    For example:
    - If actual output says "The sum of 20 and 40 is 80"
    - And expected output says "the sum of 20 and 40 is 80"
    - Both contain the number 80, so score = 1.0
    
    SCORING:
    - If both outputs contain the SAME number: score = 1.0
    - If outputs contain DIFFERENT numbers: score = 0.0
    - Ignore capitalization, punctuation, and formatting differences""",
    evaluation_params=[LLMTestCaseParams.ACTUAL_OUTPUT, LLMTestCaseParams.EXPECTED_OUTPUT]
)

def query_ai_agent(question):
    response = agent_executor.invoke({"input": question})
    intermediate_steps = response['intermediate_steps']
    agent_action, results = intermediate_steps[0]
    tool = agent_action.tool
    tool_input = agent_action.tool_input
    
    # Clean up the output by removing <think> tags
    output = response['output']
    if '<think>' in output:
        # Remove everything between <think> and </think> tags
        import re
        output = re.sub(r'<think>.*?</think>\s*', '', output, flags=re.DOTALL)
        response['output'] = output.strip()
    
    return response, tool, tool_input

test_data = [
    {
        "input" : "What is the sum of 20 and 40",
        "expected_output": "the sum of 20 and 40 is 80",
        "tool_called": [
            ToolCall(name="add_numbers")
        ]
    }
]

response,tool, tool_input = query_ai_agent(test_data[0]['input'])

test_case = LLMTestCase (
    input=test_data[0]['input'],
    expected_output=test_data[0]['expected_output'],
    tools_called=[ToolCall(name=tool)],
    actual_output=response['output'],
    expected_tools=[ToolCall(name="add_numbers")]
)

dataset = EvaluationDataset(test_cases=[test_case])

# Test tool correctness
toolCorrectMetric = ToolCorrectnessMetric()
toolCorrectMetric.measure(test_case=test_case)
print(f"Tool Correctness Score: {toolCorrectMetric.score}, Reason: {toolCorrectMetric.reason}")

# Test with custom metric that understands your tool's behavior
custom_correctness_metric.measure(test_case)
print(f"Custom Correctness Score: {custom_correctness_metric.score}, Reason: {custom_correctness_metric.reason}")

# Original answer relevancy (will likely fail because it expects 60)
answer_relevancy_metric = AnswerRelevancyMetric()
answer_relevancy_metric.measure(test_case)
print(f"Answer Relevancy Score: {answer_relevancy_metric.score}, Reason: {answer_relevancy_metric.reason}")

print(f"\nActual output: {response['output']}")
print(f"Tool input: {tool_input}")
print(f"Expected output: {test_data[0]['expected_output']}")

# Evaluate with the metrics that make sense for your use case
dataset.evaluate(metrics=[toolCorrectMetric, custom_correctness_metric])


Output()

Output()

Tool Correctness Score: 1.0, Reason: All expected tools ['add_numbers'] were called (order not considered).


Output()

Custom Correctness Score: 0.5411796259620383, Reason: The numbers extracted from both the actual output and the expected output are identical: 20, 40, and 80. This shows complete alignment with the evaluation steps.


Answer Relevancy Score: 0.0, Reason: The score is 0.00 because the output incorrectly states the sum of 20 and 40 as 80, which is irrelevant and incorrect. The correct sum is 60.

Actual output: The sum of 20 and 40 is 80.
Tool input: {'a': 20, 'b': 40}
Expected output: the sum of 20 and 40 is 80


Output()



Metrics Summary

  - ✅ Tool Correctness (score: 1.0, threshold: 0.5, strict: False, evaluation model: None, reason: All expected tools ['add_numbers'] were called (order not considered)., error: None)
  - ✅ Custom Tool Output Correctness (GEval) (score: 0.5698107576407214, threshold: 0.5, strict: False, evaluation model: gpt-4o, reason: The numbers extracted from both the actual output (20, 40, 80) and the expected output (20, 40, 80) are identical, indicating perfect alignment., error: None)

For test case:

  - input: What is the sum of 20 and 40
  - actual output: The sum of 20 and 40 is 80.
  - expected output: the sum of 20 and 40 is 80
  - context: None
  - retrieval context: None


Overall Metric Pass Rates

Tool Correctness: 100.00% pass rate
Custom Tool Output Correctness (GEval): 100.00% pass rate




EvaluationResult(test_results=[TestResult(name='test_case_0', success=True, metrics_data=[MetricData(name='Tool Correctness', threshold=0.5, success=True, score=1.0, reason="All expected tools ['add_numbers'] were called (order not considered).", strict_mode=False, evaluation_model=None, error=None, evaluation_cost=None, verbose_logs='Expected Tools:\n[\n    ToolCall(\n        name="add_numbers"\n    )\n] \n \nTools Called:\n[\n    ToolCall(\n        name="add_numbers"\n    )\n]'), MetricData(name='Custom Tool Output Correctness (GEval)', threshold=0.5, success=True, score=0.5698107576407214, reason='The numbers extracted from both the actual output (20, 40, 80) and the expected output (20, 40, 80) are identical, indicating perfect alignment.', strict_mode=False, evaluation_model='gpt-4o', error=None, evaluation_cost=0.0013275000000000001, verbose_logs='Criteria:\nCheck if both outputs contain the same numerical answer.\n\n    TASK: Extract the number from each output and compare them.