In [10]:
prompt_template = '''
You are an expert in converting natural language descriptions of events into structured annotations using a predefined Hierarchical Event Descriptor (HED) vocabulary. Your task is to extract relevant concepts from the input description and represent them as a comma-separated list of HED tags, strictly adhering to the provided vocabulary.

**Crucial Constraints:**
* **Vocabulary Adherence:** You MUST ONLY use tags (single terms) found in the provided HED vocabulary schema. Do not introduce any new words, synonyms, or full hierarchical paths in the annotation. When a concept maps to a hierarchical tag, use only the **most specific (leaf) tag** from the hierarchy.
* **Output Format:** Your output must be a comma-separated list of HED tags. Each tag or nested group of tags should be enclosed in parentheses `()`. For tags that take a value (indicated by `#` in the schema), replace `#` with the appropriate value from the description.
* **Completeness:** Capture all relevant information from the input description using the vocabulary.
* **Conciseness:** Avoid redundant tags.
* **No Explanations:** Do not provide any conversational filler, explanations, or additional text beyond the annotations themselves, *except for the reasoning process requested below*.
* **Handling Unmappable Concepts:** If a concept in the description cannot be mapped to any tag in the vocabulary, omit it. Do not invent tags or try to approximate.

**Reasoning Process (Output this first, then the Annotation):**
Before providing the final annotation, detail your thought process in the following steps:
1.  **Breakdown:** Identify the key elements or phrases in the description that represent distinct concepts.
2.  **Mapping:** For each key element, find the most specific corresponding tag in the provided HED vocabulary schema. If a concept cannot be mapped, note it as unmappable.
3.  **Structuring:** Determine how the identified tags should be grouped or nested according to HED conventions (e.g., (Object, Action, Object) or (Main_Event, (Property, Value))).

**HED Vocabulary Schema:**
{{hed_vocab}}

**Examples:**

Description: The foreground view consists of a large number of ingestible objects, indicating a high quantity. The background view includes an adult human body, outdoors in a setting that includes furnishings, natural features such as the sky, and man-made objects in an urban environment

--- REASONING PROCESS START ---
1.  **Breakdown**: The description includes elements about visual perspective (foreground/background), quantity, types of objects, human characteristics, environmental setting, and urban features.
2.  **Mapping**:
    * "foreground view": `Foreground-view`
    * "large number": `Item-count`, `High`
    * "ingestible objects": `Ingestible-object`
    * "background view": `Background-view`
    * "adult human body": `Human`, `Body`
    * "outdoors": `Outdoors`
    * "furnishings": `Furnishing`
    * "natural features": `Natural-feature` (since "sky" isn't a leaf tag, use parent)
    * "man-made objects": `Man-made-object`
    * "urban environment": `Urban`
3.  **Structuring**: Group related visual elements, quantify, specify human attributes, and categorize environmental aspects.
--- REASONING PROCESS END ---

--- ANNOTATION START ---
(Foreground-view, (Item-count, High), Ingestible-object), (Background-view, (Human, Body, Outdoors, Furnishing, Natural-feature, Urban, Man-made-object)
--- ANNOTATION END ---

Description from user:
{{description}}

--- REASONING PROCESS START ---
--- REASONING PROCESS END ---

--- ANNOTATION START ---
--- ANNOTATION END ---
'''

In [None]:
from ollama import chat
from ollama import ChatResponse
from google import genai

from jinja2 import Template
from dotenv import load_dotenv
load_dotenv()
import os
# load the HED vocabulary from HED_vocab_reformatted.xml into a string
hed_vocab = ''
with open('HED_vocab_reformatted.xml', 'r') as file:
    hed_vocab = file.read()

# render the jinja template from prompt_template variable
template = Template(prompt_template)

query = 'A participant is looking at a computer screen displaying an image of a red car.'

prompt = template.render(hed_vocab=hed_vocab, description=query)

# The client gets the API key from the environment variable `GEMINI_API_KEY`.
client = genai.Client()

model = 'qwen3:8b'
# call the chat function with the model and the prompt
response: ChatResponse = chat(model=model, messages=[
  {
    'role': 'user',
    'content': prompt,
  },
])
print(response['message']['content'])

# save the prompt template and model response to a json file
import json
i = 0
filename = 'prompt_experiments/experiment_'
while os.path.exists(f'{filename}{i}.json'):
    i += 1

with open(f'{filename}{i}.json', 'w') as f:
    json.dump({
        'model': model,
        'prompt_template': prompt_template,
        'model_response': response['message']['content']
    }, f)


<think>
Okay, let's tackle this annotation task. The user provided a description: "A participant is looking at a computer screen displaying an image of a red car." I need to break this down using the given tags.

First, I'll identify the key elements. The main components here are the participant, the computer screen, and the red car. Let's start with the participant. The tag for the participant's action might be under Task-property or maybe under Task-event-role. Wait, looking back at the tags, there's a Task-event-role with "Participant-response" and "Task-activity". But the participant here is the one observing, so maybe "Participant-response" isn't the right fit. Alternatively, maybe it's under the Task-action-type, like "Appropriate-action" or something else. Hmm, maybe not. Let me check the tags again.

Wait, the tags include "Task-event-role" with "Participant-response" and "Task-activity". The participant's action here is observing, so maybe "Task-activity" could be relevant. Bu

In [None]:
from hed import HedString, load_schema_version
from hed.errors import ErrorHandler, get_printable_issue_string
from hed.validator import HedValidator

def validate_hed_string(hed_string: str, schema_name='standard', schema_version='8.4.0') -> str:
    '''
    Validate a HED string and return validation issues if any'''
    if schema_name != 'standard':
        schema = load_schema_version(f'{schema_name}_{schema_version}')
    else:
        schema = load_schema_version(f'{schema_version}')
    check_for_warnings = True
    data = hed_string
    hedObj = HedString(data, schema)
    short_string = hedObj.get_as_form('short_tag')

    # Validate the string
    error_handler = ErrorHandler(check_for_warnings=check_for_warnings)
    # validator = HedValidator(schema, def_dict)
    validator = HedValidator(schema)
    issues = validator.validate(hedObj, allow_placeholders=False, error_handler=error_handler)
    
    if issues:
        # issues = get_printable_issue_string(issues, 'Validation issues').strip('\n')
        # return issues
        return len(issues)
    else:
        # return 'No issue found'
        return 0

In [27]:
validate_hed_string('(Experimental-stimulus, Visual-presentation, (Square, (Color, Blue), (Left-side-of, Computer-screen)))')

0


'No issue found'

In [28]:
validate_hed_string('(Visual-presentation, (2D-view, (Color, Blue), (Spatial-relation, Left-side-of))) ')

0


'No issue found'

In [31]:
print(validate_hed_string('(Experiment-participant, ((hed_tags: See), ((Item-count, High), Computer-screen, (Image, (Red, Car)))))'))

3
Validation issues
TAG_NAMESPACE_PREFIX_INVALID: Prefixes can only contain alpha characters. - 'hed_tags:'
CHARACTER_INVALID: Invalid character ' ' in tag 'hed_tags: See'
TAG_NAMESPACE_PREFIX_INVALID: Tag 'hed_tags: See has unknown namespace 'hed_tags:'.  Valid prefixes: ['']


In [56]:
def grade_annotation_quality(description, annotation, grader_model='mistral:latest'):
    """
    Grade the quality of an annotation using an LLM grader.
    Returns a score from 0-10 based on clarity and how well the original description can be inferred.
    """
    print("annotation:", annotation)
    try:
        # Quality grading prompt template
        grading_prompt = f"""
Here's the requested description:
{description}

Here's the generated annotation:
{annotation}

Evaluate the quality of the annotation on a scale of 0 to 10.
- If the annotation is completely empty or contains only whitespace, the score must be 0.
- Otherwise, the score should reflect how accurately and comprehensively the original description can be inferred *solely* from the annotation. A score of 10 means the original description can be perfectly understood from the annotation. A score of 0 (besides the empty case) means no meaningful information about the description can be inferred.
"""
        
        # Use Ollama API to grade the annotation
        response = chat(model=grader_model, messages=[
            {
                'role': 'user',
                'content': grading_prompt,
            },
        ])
        
        grader_response = response['message']['content']
        
        # Extract numeric score from response
        score = extract_quality_score(grader_response)
        
        return {
            'score': score,
            'full_response': grader_response,
            'grader_model': grader_model
        }
        
    except Exception as e:
        print(f"Quality grading error: {e}")
        return {
            'score': None,
            'full_response': f"Error: {str(e)}",
            'grader_model': grader_model
        }

def extract_quality_score(response_text):
    """Extract numeric score from grader response"""
    import re
    
    # Look for patterns like "8/10", "score: 7", "rating of 9", etc.
    patterns = [
        r'(\d+(?:\.\d+)?)/10',  # X/10 format
        r'(?:score|rating|grade)(?:\s*:?\s*|\s+(?:of|is)\s+)(\d+(?:\.\d+)?)',  # score: X, rating of X, etc.
        r'(\d+(?:\.\d+)?)\s*(?:out of 10|/10)',  # X out of 10
        r'(?:^|\s)(\d+(?:\.\d+)?)(?:\s*$|\s)',  # standalone number
    ]
    
    for pattern in patterns:
        matches = re.findall(pattern, response_text, re.IGNORECASE)
        if matches:
            try:
                score = float(matches[0])
                # Ensure score is within 0-10 range
                if 0 <= score <= 10:
                    return score
            except ValueError:
                continue
    
    return None

In [57]:
# load the experiment data from the json files 
for i in range(6):
    with open(f'prompt_experiments/experiment_{i}.json', 'r') as f:
        experiment_data = json.load(f)
        print(experiment_data)
        annotation = experiment_data['annotation']
        if len(annotation) == 0:
            annotation = 'No annotation provided' 
        query = experiment_data['description']
        print(f"Experiment {i} - Description: {query}")
        # print(f"Experiment {i} - Annotation: {annotation}")
        quality_score = grade_annotation_quality(query, annotation)
        print(f"Quality score for experiment {i}: {quality_score}")
        # save the quality score to the experiment data
        experiment_data['quality_grade'] = quality_score
        # save the updated experiment data to the json file
        with open(f'prompt_experiments/experiment_{i}.json', 'w') as f:
            json.dump(experiment_data, f, indent=4)
        print(f"Updated experiment {i} saved with quality grade.")

{'model': 'deepseek-r1:8b', 'experiment_id': 0, 'annotation': '(Experiment-participant, ((hed_tags: See), ((Item-count, High), Computer-screen, (Image, (Red, Car)))))', 'validation_issues': 3, 'description': 'A participant is looking at a computer screen displaying an image of a red car.', 'prompt_template': '\nYou are an expert in converting natural language descriptions of events into structured annotations using a predefined Hierarchical Event Descriptor (HED) vocabulary. Your task is to extract relevant concepts from the input description and represent them as a comma-separated list of HED tags, strictly adhering to the provided vocabulary.\n\n**Crucial Constraints:**\n* **Vocabulary Adherence:** You MUST ONLY use tags (single terms) found in the provided HED vocabulary schema. Do not introduce any new words, synonyms, or full hierarchical paths in the annotation. When a concept maps to a hierarchical tag, use only the **most specific (leaf) tag** from the hierarchy.\n* **Output Fo