In [9]:
#%pip install pandas
#%pip install matplotlib
#%pip install openpyxl

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import json

In [3]:
import xml.etree.ElementTree as ET
import re

def parse_state_relationship(state):
    """Break an annotation state string into its relationship parts.

    Recognised shapes (matched from the start of the string only):
      * ``type(subject, object)`` -> ``(type, subject, object)``
      * ``type(subject)``         -> ``(type, subject, None)``
      * anything else             -> ``(state, None, None)``

    Args:
        state: The annotation value, or None.

    Returns:
        A 3-tuple ``(relationship_type, subject, object_entity)``. An empty or
        None state, or the literal ``'unknown'``, yields ``(None, None, None)``.
    """
    if not state or state == 'unknown':
        return None, None, None

    # Try the two-entity form first, then fall back to the single-entity form.
    patterns = (
        r'(\w+)\(([\w_]+),\s*([\w_]+)\)',
        r'(\w+)\(([\w_]+)\)',
    )
    for pattern in patterns:
        found = re.match(pattern, state)
        if found:
            parts = found.groups()
            # Pad with None so every result is a 3-tuple.
            return parts + (None,) * (3 - len(parts))

    # No parenthesised form: the whole string is the relationship type.
    return state, None, None

def parse_elan_xml(xml_file):
    """Parse an ELAN (.eaf) XML file into annotation and relationship tables.

    Only tiers whose ``TIER_ID`` has the form ``category(entity)`` are read;
    tiers without an id or without both parentheses are ignored. Every
    ``ALIGNABLE_ANNOTATION`` in such a tier becomes one row, provided both of
    its time-slot references resolve to a slot that carries a ``TIME_VALUE``.
    Annotations that reference unknown or unaligned (value-less) slots are
    skipped rather than being given a made-up time of 0.

    Args:
        xml_file: Path or file object accepted by
            ``xml.etree.ElementTree.parse``.

    Returns:
        A tuple ``(df_annotations, df_relationships)``:
          * ``df_annotations`` has the columns ``category``, ``entity``,
            ``start_time``, ``end_time``, ``duration`` (times are TIME_VALUE
            divided by 1000), ``state``, ``relationship_type``,
            ``subject_entity`` and ``object_entity``.
          * ``df_relationships`` is an independent copy of the rows whose
            ``object_entity`` is not null.
        Both frames always carry these columns, even when no rows were found.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        ValueError: If a ``TIME_VALUE`` attribute is not numeric.
    """
    columns = [
        'category', 'entity', 'start_time', 'end_time', 'duration',
        'state', 'relationship_type', 'subject_entity', 'object_entity',
    ]

    root = ET.parse(xml_file).getroot()

    # Map TIME_SLOT_ID -> time (TIME_VALUE / 1000). Slots without a TIME_VALUE
    # are left out so that annotations using them are skipped below.
    time_slots = {}
    for slot in root.findall('.//TIME_SLOT'):
        time_value = slot.get('TIME_VALUE')
        if time_value is None:
            continue
        time_slots[slot.get('TIME_SLOT_ID')] = float(time_value) / 1000.0

    annotations_data = []

    for tier in root.findall('.//TIER'):
        tier_id = tier.get('TIER_ID')

        # Skip tiers without an id or without category/entity information.
        if not tier_id or '(' not in tier_id or ')' not in tier_id:
            continue

        # Split on the first '(' only, so ids with nested parentheses do not
        # break the unpacking.
        category, _, entity = tier_id.partition('(')
        # Drop trailing ')' that have no matching '(' inside the entity. For an
        # entity without '(' this is exactly ``entity.rstrip(')')``.
        while entity.endswith(')') and entity.count(')') > entity.count('('):
            entity = entity[:-1]

        for annotation in tier.findall('.//ALIGNABLE_ANNOTATION'):
            start_slot = annotation.get('TIME_SLOT_REF1')
            end_slot = annotation.get('TIME_SLOT_REF2')
            if start_slot not in time_slots or end_slot not in time_slots:
                continue

            # A missing ANNOTATION_VALUE element is treated like an empty one.
            value_node = annotation.find('ANNOTATION_VALUE')
            state = value_node.text if value_node is not None else None

            relationship_type, subject, object_entity = parse_state_relationship(state)

            start_time = time_slots[start_slot]
            end_time = time_slots[end_slot]
            annotations_data.append({
                'category': category,
                'entity': entity,
                'start_time': start_time,
                'end_time': end_time,
                'duration': end_time - start_time,
                'state': state,
                'relationship_type': relationship_type,
                # Fall back to the tier's entity when the state names no subject.
                'subject_entity': subject if subject else entity,
                'object_entity': object_entity
            })

    # Main dataframe with annotations. Passing the columns explicitly keeps the
    # schema intact when there are no rows, so the filter below cannot KeyError.
    df_annotations = pd.DataFrame(annotations_data, columns=columns)

    # Interaction dataframe subset
    df_relationships = df_annotations[df_annotations['object_entity'].notna()].copy()

    return df_annotations, df_relationships

In [5]:
# Parse the sample ELAN file into the full annotation table and the subset of
# rows that have a non-null object entity.
df_annotations, df_relationships = parse_elan_xml('./data/sample_sport_data.eaf')


In [10]:
# Export every parsed annotation to an Excel workbook (sheet "data").
# Writing .xlsx through pandas needs an Excel engine such as openpyxl.
df_annotations.to_excel("./data/output_data.xlsx", sheet_name='data')