# Automatic Evaluation of the DraCor MCP  

In [None]:
import json
import os
from pydracor import DraCorAPI, Corpus
import anthropic
from jsonschema import validate, ValidationError, Draft7Validator
from datetime import datetime

In [None]:
from typing import Union

### Get corpora hashes for documentation

In [None]:
# Create the pydracor API client and call get_info(); in a notebook the
# returned value is shown as the cell output.
dracor = DraCorAPI()
dracor.get_info()

In [None]:
# Print the name and commit of every corpus so that the corpus versions
# this evaluation was run against are documented.
corpora = dracor.get_corpora()
for corpus in corpora:
    print(f"{corpus.name}: {corpus.commit}")

### Data Type Schemas

In [None]:
# JSON Schema for numeric answers; "number" accepts integers and floats alike.
NUMBER_SCHEMA = {
    "type": "number"
}

In [None]:
# JSON Schema for textual answers; minLength 1 rejects the empty string.
STRING_SCHEMA = {
    "type": "string",
    "minLength": 1 
}

In [None]:
# Smoke test: a float must be accepted by NUMBER_SCHEMA
# (jsonschema.validate raises if the instance does not match).
validate(instance=5.0, schema=NUMBER_SCHEMA)

### Read in questions

In [None]:
# Read in question data
# The encoding is given explicitly so that non-ASCII text in the questions
# is decoded the same way on every platform (the default is locale-dependent).
with open("../../../2025-DH-Paper-Automatic-Evaluation/preliminary_work/MCP_Evaluation_Question_Experiments.json", encoding="utf-8") as question_json:
    questions = json.load(question_json)

In [None]:
# Read in question data with notes
# The encoding is given explicitly so that non-ASCII text in the notes
# is decoded the same way on every platform (the default is locale-dependent).
with open("../../../2025-DH-Paper-Automatic-Evaluation/preliminary_work/mcp-evaluation-experiments.json", encoding="utf-8") as question_json:
    question_notes = json.load(question_json)

In [None]:
# Write question_notes to the file that is later re-read as `questions`.
# NOTE(review): this cell is placed before the cell that copies
# "Response Type" into question_notes; presumably it has to be executed
# after that cell for the written file to contain the annotation - confirm.
with open("../../../2025-DH-Paper-Automatic-Evaluation/preliminary_work/mcp-evaluation-experiments-with-manual-results-data-type.json", 'w') as question_json_out:
    json.dump(question_notes, question_json_out)

In [None]:
# Copy the "Response Type" annotation of each question onto the entry of
# question_notes at the same position. The two lists are paired purely by
# index, and zip() silently stops at the shorter one, so a length mismatch
# would leave entries without annotation; fail loudly instead.
if len(questions) != len(question_notes):
    raise ValueError(
        f"Cannot merge by position: {len(questions)} questions vs. "
        f"{len(question_notes)} question notes"
    )
for question_a, question_b in zip(questions, question_notes):
    question_b["Response Type"] = question_a["Response Type"]

In [None]:
# Read in question data with ID and data type annotations
# Use this one!
# The encoding is given explicitly so that non-ASCII text is decoded the
# same way on every platform (the default is locale-dependent).
with open("../../../2025-DH-Paper-Automatic-Evaluation/preliminary_work/mcp-evaluation-experiments-with-manual-results-data-type.json", encoding="utf-8") as question_json:
    questions = json.load(question_json)

In [None]:
# Display one loaded question entry for visual inspection.
questions[9]

### Create Validation Data and Examples Mapping

In [None]:
# Maps a question's "Response Type" to an example answer (quoted in the
# prompt) and the JSON Schema the model's answer is validated against.
# NOTE(review): "int" and "float" share NUMBER_SCHEMA, so a non-integer
# number also validates for an "int" question - confirm this is intended.
validation_data = {
    "int": {
        "example": 5,
        "schema": NUMBER_SCHEMA
    },
    "str": {
        "example": "Emma",
        "schema": STRING_SCHEMA
    },
    "float": {
        "example": 5.0,
        "schema": NUMBER_SCHEMA
    }
}

In [None]:
# Sanity check: every example value must satisfy its own schema.
# A failing entry is printed instead of aborting, so all mismatches are
# listed in one pass. Only ValidationError is caught; anything else
# (e.g. an interrupt or a typo in a key) should surface, not be swallowed.
for key, values in validation_data.items():
    try:
        validate(instance=values["example"], schema=values["schema"])
    except ValidationError:
        print(values)

### Prepare requests

In [None]:
# Your server URL (replace with your actual URL)
# Base URL of the deployment; the MCP endpoint path is appended to it when
# the server info is built. Note that this value ends with a slash.
url = 'https://dev.dracor.org/'

In [None]:
# set client
# No arguments are passed, so the SDK has to obtain its credentials on its
# own (presumably from the environment - confirm for your setup).
client = anthropic.Anthropic()

# set server info
# Trailing slashes are stripped from the base URL before the path is
# appended; a base URL ending in "/" would otherwise produce ".../" + "/mcp/",
# i.e. a double slash in the endpoint.
server_info = {
    "type": "url",
    "url": f"{url.rstrip('/')}/mcp/",
    "name": "dracor-mcp",
}

#### Create Messages

In [None]:
# Create message content with prompt for data type 
def create_content_schema(question_info: dict[str, Union[str, int]], 
                   validation_data: dict[str, Union[int, str, float, dict[str, Union[str,int]]]]) -> str:
    content = question_info["Question"]
    response_type = question_info["Response Type"]
    example = validation_data[response_type]["example"]
    schema = validation_data[response_type]["schema"]
    restriction = f"CRITICAL: The answer should be of the data type {response_type}. \
    Do not include any additional explanation, markdown formatting, or text."
    example = f"Example for an answer is: {example}"
    return " ".join([content, restriction, example]), schema

In [None]:
# Create message content without validation
def create_content(question_info: dict[str, Union[str, int]]) -> str:
    """Build the prompt for a question without a data type restriction.

    Args:
        question_info: Question entry; only the key "Question" is read.

    Returns:
        The question text followed by a request for a short, precise answer,
        separated by a single space.
    """
    brevity_hint = "CRITICAL: The answer should be as short and as precise as possible."
    return " ".join((question_info["Question"], brevity_hint))

For 'open questions' (response types without an entry in `validation_data`) we skip the data type prompt and the schema validation:

In [None]:
# Create messages
# One entry per question: the user message to send, the schema to validate
# the answer against (None if the response type has no entry in
# validation_data), and the question ID for naming the result files.
messages_info = []
for entry in questions:
    if entry["Response Type"] in validation_data:
        # Typed question: prompt names the data type and gives an example.
        content, schema = create_content_schema(entry, validation_data)
    else:
        # No schema known for this response type: generic prompt, no validation.
        content = create_content(entry)
        schema = None
    message = {
        "role": "user",
        "content": content
    }
    messages_info.append({"message":message, "schema":schema, "id":entry["ID"]})

In [None]:
# set here or directly?
# NOTE(review): this dict is not referenced anywhere else in the visible
# notebook; perform_request brings its own model name and a token limit of
# 1000 (not 1024) - confirm which values are meant to apply.
hyperparameter = {
    "model": "claude-sonnet-4-20250514",
    "max_tokens": 1024,
}

In [None]:
# Display one prepared message entry for visual inspection.
messages_info[9]

### Perform Request
* types:
  * mcp_tool_result
  * mcp_tool_use
  * text  

In [None]:
def isfloat(i: str) -> bool:
    """Return True if ``float(i)`` succeeds, False otherwise.

    Everything ``float()`` accepts counts as a float, which includes
    integer literals, exponent notation and surrounding whitespace.
    Values that ``float()`` rejects because of their type (e.g. ``None``)
    yield False instead of raising.

    Args:
        i: The value to test, normally the model's answer text.

    Returns:
        Whether the value can be converted with ``float()``.
    """
    try:
        float(i)
    except (TypeError, ValueError):
        return False
    return True

In [None]:
def perform_request(client: anthropic.Anthropic,
                    message: dict[str, str],
                    server_info: dict,
                    model="claude-sonnet-4-20250514",
                    max_tokens: int = 1000):
    """Send one user message to the model with the MCP server attached.

    Args:
        client: Anthropic client used for the request.
        message: A single message dict; it is sent as the only message of
            the conversation.
        server_info: MCP server definition; passed as the only element of
            ``mcp_servers``.
        model: Model identifier.
        max_tokens: Upper bound for generated tokens. Defaults to 1000, the
            value that used to be hard-coded here.

    Returns:
        The response object returned by ``client.beta.messages.create``.
    """
    response = client.beta.messages.create(
        model=model,
        max_tokens=max_tokens,
        messages=[message],
        mcp_servers=[
            server_info
        ],
        # Header sent with the request to opt in to the MCP client beta.
        extra_headers={
            "anthropic-beta": "mcp-client-2025-04-04"
        }
    )
    return response

# Schema "type" values for which the answer text is converted to a number
# before validation.
_NUMERIC_SCHEMA_TYPES = ("number", "integer")


def _coerce_number(text: str):
    """Return *text* as int, else as float, else unchanged if neither parses."""
    for cast in (int, float):
        try:
            return cast(text)
        except ValueError:
            continue
    return text


def process_response(response, schema: Union[dict, None]):
    """Extract answer, tool usage and token usage from a model response.

    Args:
        response: Response object with a ``content`` list of blocks (the
            block types "text", "mcp_tool_use" and "mcp_tool_result" are
            evaluated) and a ``usage`` attribute.
        schema: JSON Schema to validate the answer against, or a falsy
            value (e.g. None) to skip validation.

    Returns:
        A dict with the keys
        "valid" (True/False, or None if no schema was given),
        "response" (the stripped text of the last text block; converted to
        int/float if the schema expects a number and the text parses as
        one; an empty string if the response has no text block),
        "tools_used" (one dict per tool call with name, input, id and, if a
        matching result block exists, is_error),
        "tool_chain" (tool names in call order) and
        "usage_infos" (token counts copied from ``response.usage``).
    """
    # All text blocks are collected, but only the last one is the answer.
    text_blocks = [block.text for block in response.content if block.type == "text"]
    # A response may contain no text block at all; report an empty answer
    # instead of failing with an IndexError.
    response_text = text_blocks[-1].strip() if text_blocks else ""

    tools_used = []
    tools_result = {}
    for block in response.content:
        if block.type == "mcp_tool_result":
            # tool use id mapped to the error flag of its result
            tools_result[block.tool_use_id] = block.is_error
        elif block.type == "mcp_tool_use":
            # keep the id so the call can be matched with its result below
            tools_used.append({"name": block.name, "input": block.input, "id": block.id})

    for entry in tools_used:
        if entry["id"] in tools_result:
            entry["is_error"] = tools_result[entry["id"]]

    tool_chain = [entry["name"] for entry in tools_used]

    # Get usage info
    usage = response.usage
    usage_infos = {
        "cache_creation_input_tokens": usage.cache_creation_input_tokens,
        "cache_read_input_tokens": usage.cache_read_input_tokens,
        "input_tokens": usage.input_tokens,
        "output_tokens": usage.output_tokens
    }

    if schema:
        # Convert only when the schema expects a number. Converting
        # unconditionally would turn a numeric-looking string answer into an
        # int and make it fail a string schema.
        if schema.get("type") in _NUMERIC_SCHEMA_TYPES:
            response_text = _coerce_number(response_text)

        # Validate against schema
        try:
            validate(instance=response_text, schema=schema)
        except ValidationError:
            valid = False
        else:
            valid = True
    else:
        valid = None

    return {
        "valid": valid,
        "response": response_text,
        "tools_used": tools_used,
        "tool_chain": tool_chain,
        "usage_infos": usage_infos
    }

### Test

In [None]:
 # Display one prepared message entry before running the test request.
 # NOTE(review): this line starts with a stray space; presumably the notebook
 # tolerates it, but a plain .py script would reject it - confirm and remove.
 messages_info[1]

In [None]:
# Manual test: send a single prepared question (index 2) to the API.
response =  perform_request(client, messages_info[2]["message"], server_info)

In [None]:
# Post-process the test response with the schema belonging to the same question.
extracted_info = process_response(response,messages_info[2]["schema"])

In [None]:
# Display the extracted information of the test request.
extracted_info

### Apply to all

Goal: ten iterations; if the results vary a lot, increase to 50 (or 100).

In [None]:
# set number of runs
# Runs are numbered start .. end - 1 (range semantics); the run number is
# part of the output file names.
start = 1
end = 2
output_dir = "../results/"

# open(..., 'w') does not create missing directories, so make sure the
# target directory exists before the first (paid) request is sent.
os.makedirs(output_dir, exist_ok=True)

for i in range(start, end):
    # NOTE(review): only messages_info[1:2] (a single question) is processed
    # here although this section is titled "Apply to all" - widen the slice
    # for a full run.
    for message_info in messages_info[1:2]:
        # create output files for raw and extracted data
        fp_extracted = os.path.join(output_dir, f"{message_info['id']}_{i}_extracted-info.json")
        fp = os.path.join(output_dir, f"{message_info['id']}_{i}_raw.json")
        print(fp)

        response = perform_request(client, message_info["message"],server_info)

        # Store the raw response first: if post-processing fails, the API
        # response is still on disk and does not have to be requested again.
        with open(fp, 'w', encoding="utf-8") as result_out:
            json.dump(response.to_dict(), result_out)

        extracted_response = process_response(response, message_info["schema"])
        # add metadata
        extracted_response["id"] = message_info["id"]
        extracted_response["run"] = i
        extracted_response["datetime"] = datetime.now().isoformat()

        with open(fp_extracted, 'w', encoding="utf-8") as result_out:
            json.dump(extracted_response, result_out)