In [11]:

import xml.etree.ElementTree as ET
import json
import pandas as pd
from pathlib import Path

def parse_state_relationship(state):
    """Split an annotation value into a state name and its list of objects.

    Recognised shapes::

        "out_of_frame"                   -> ("out_of_frame", [])
        "visible(car1)"                  -> ("visible", [])
        "occluded_by(car1, car2)"        -> ("occluded_by", ["car2"])
        "moving_towards(a, [b, c])"      -> ("moving_towards", ["b", "c"])

    Args:
        state: Raw annotation text. May be ``None``, empty or blank.

    Returns:
        A ``(state_name, objects)`` tuple. ``state_name`` is ``None`` when the
        input is missing or blank. ``objects`` is always a list (possibly
        empty) and never contains the subject, i.e. the first argument.
    """
    if not state:
        return None, []

    # Surrounding whitespace/newlines would otherwise stop the closing ')'
    # from being stripped and leak into the last object name.
    state = state.strip()
    if not state:
        return None, []

    # Plain state without arguments, e.g. "out_of_frame".
    if '(' not in state or ')' not in state:
        return state, []

    action, _, remainder = state.partition('(')
    action = action.strip()
    content = remainder.rstrip(')').strip()

    # List notation: everything between the brackets is the object list,
    # whether or not a subject precedes it.
    list_start = content.find('[')
    list_end = content.rfind(']')
    if list_start != -1 and list_end > list_start:
        inner = content[list_start + 1:list_end]
        objects = [item.strip() for item in inner.split(',') if item.strip()]
        return action, objects

    # Comma-separated entities: the last one is the object. Empty pieces
    # (e.g. from a trailing comma) are ignored rather than returned as ''.
    entities = [part.strip() for part in content.split(',') if part.strip()]
    if len(entities) > 1:
        return action, [entities[-1]]  # list for consistency with list notation

    # Single entity case like "visible(car1)": a subject but no object.
    return action, []

def get_video_metadata(json_data):
    """Derive duration, frame count and FPS from parsed annotation JSON.

    Only the first result of the first annotation is inspected.

    Args:
        json_data: Parsed JSON providing ``annotations[0].result[0].value``
            with a ``duration`` and a ``sequence`` of dicts that each carry
            ``frame`` and ``time``.

    Returns:
        dict with ``duration``, ``total_frames`` (the highest ``frame`` number
        found in that sequence) and ``fps``.

    Raises:
        ValueError: If the sequence is empty or no frame rate can be derived.
        KeyError, IndexError: If the expected structure is missing.
    """
    first_annotation = json_data['annotations'][0]
    first_result = first_annotation['result'][0]
    value = first_result['value']

    sequence = value['sequence']
    duration = value['duration']

    if not sequence:
        raise ValueError(
            "Cannot derive video metadata: first result has an empty sequence"
        )

    # Get total frames
    total_frames = max(entry['frame'] for entry in sequence)

    # FPS = frame gap / time gap. Dividing by the time gap alone is only
    # correct when the two entries are on consecutive frames, which is not
    # guaranteed (entries may be sparse keyframes -- TODO confirm with the
    # exporter). The first pair with a usable gap is taken.
    fps = None
    for earlier, later in zip(sequence, sequence[1:]):
        elapsed = later['time'] - earlier['time']
        frame_gap = later['frame'] - earlier['frame']
        if elapsed > 0 and frame_gap > 0:
            fps = frame_gap / elapsed
            break

    if fps is None:
        # Fallback for single-entry sequences: use the overall frame count if
        # the export happens to provide one (read only when present).
        frames_count = value.get('framesCount')
        if frames_count and duration:
            fps = frames_count / duration

    if fps is None:
        raise ValueError(
            "Cannot derive FPS: need two sequence entries with increasing "
            "frame and time values"
        )

    return {
        'duration': duration,
        'total_frames': total_frames,
        'fps': fps
    }

def parse_label_studio_json(file_path):
    """Parse a JSON annotation file into per-frame bounding box rows.

    Every result of type ``videorectangle`` contributes one row per entry of
    its ``value.sequence``. The entity name is the first element of
    ``meta.text``, or ``'unknown'`` when that is missing or empty.

    Args:
        file_path: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        ``(bbox_df, metadata)`` where ``bbox_df`` has the columns ``frame``,
        ``timestamp``, ``entity``, ``bbox_x``, ``bbox_y``, ``bbox_width``,
        ``bbox_height`` and ``bbox_enabled`` (present even when no rows were
        found), and ``metadata`` is the dict from ``get_video_metadata``.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(file_path, encoding='utf-8') as f:
        json_data = json.load(f)

    metadata = get_video_metadata(json_data)
    # Fixed column list so an export without any video rectangles still yields
    # a frame with the expected columns instead of a column-less one.
    columns = [
        'frame', 'timestamp', 'entity',
        'bbox_x', 'bbox_y', 'bbox_width', 'bbox_height', 'bbox_enabled',
    ]
    bbox_data = []

    print("Parsing JSON annotations...")
    for annotation in json_data.get('annotations', []):
        for result in annotation.get('result', []):
            if result.get('type') != 'videorectangle':
                continue

            # `or` also covers a present-but-empty/None meta or text entry.
            labels = (result.get('meta') or {}).get('text') or ['unknown']
            entity = labels[0]
            print(f"Found entity in JSON: {entity}")
            sequence = result.get('value', {}).get('sequence', [])

            for frame_data in sequence:
                bbox_data.append({
                    'frame': frame_data.get('frame'),
                    'timestamp': frame_data.get('time'),
                    'entity': entity,
                    'bbox_x': frame_data.get('x'),
                    'bbox_y': frame_data.get('y'),
                    'bbox_width': frame_data.get('width'),
                    'bbox_height': frame_data.get('height'),
                    'bbox_enabled': frame_data.get('enabled', True)
                })

    return pd.DataFrame(bbox_data, columns=columns), metadata

def expand_annotation_to_frames(row, fps, total_frames):
    """Turn one timed annotation into one record per covered frame.

    The covered frames run from ``int(timestamp_start * fps) + 1`` (at least
    1) up to, but excluding, ``int(timestamp_end * fps) + 1`` (capped at
    ``total_frames + 1``). When ``row['objects']`` is non-empty the frame
    range is repeated once per object; otherwise each frame gets a single
    record whose ``object`` is ``None``.

    Args:
        row: Mapping with ``timestamp_start``, ``timestamp_end``, ``entity``,
            ``category``, ``state`` and ``objects`` (a list).
        fps: Frames per second used to convert between time and frame number.
        total_frames: Highest frame number that may be emitted.

    Returns:
        DataFrame with the columns ``frame``, ``timestamp``, ``entity``,
        ``category``, ``state`` and ``object``; empty when no frame is covered.
    """
    first_frame = max(1, int(row['timestamp_start'] * fps) + 1)
    stop_frame = min(int(row['timestamp_end'] * fps) + 1, total_frames + 1)
    frame_numbers = range(first_frame, stop_frame)

    # No objects: still emit the frames once, with a None placeholder.
    targets = row['objects'] if row['objects'] else [None]

    records = [
        {
            'frame': frame_no,
            'timestamp': frame_no/fps,
            'entity': row['entity'],
            'category': row['category'],
            'state': row['state'],
            'object': target,
        }
        for target in targets
        for frame_no in frame_numbers
    ]

    return pd.DataFrame(records)

def parse_elan_xml(file_path, fps, total_frames):
    """Parse an annotation XML file into per-frame annotation rows.

    Only tiers whose ``TIER_ID`` looks like ``Category(entity)`` are used.
    Each ``ALIGNABLE_ANNOTATION`` whose two time slots both have a known time
    is parsed with ``parse_state_relationship`` and expanded with
    ``expand_annotation_to_frames``.

    Args:
        file_path: Path (or file object) accepted by ``ET.parse``.
        fps: Frames per second, forwarded to the frame expansion.
        total_frames: Highest frame number, forwarded to the frame expansion.

    Returns:
        Concatenation of all expanded annotations, or an empty DataFrame when
        nothing was found.

    Raises:
        Exception: Any error raised while processing an annotation is
            re-raised after the offending value and tier have been printed.
    """
    tree = ET.parse(file_path)
    root = tree.getroot()

    # Slot id -> seconds (TIME_VALUE is divided by 1000). Slots without a
    # TIME_VALUE are left out so that annotations referencing them are skipped
    # by the membership check below instead of being placed at 0 s.
    time_slots = {}
    for slot in root.findall('.//TIME_SLOT'):
        raw_time = slot.get('TIME_VALUE')
        if raw_time is None:
            continue
        time_slots[slot.get('TIME_SLOT_ID')] = float(raw_time) / 1000.0

    annotations = []

    for tier in root.findall('.//TIER'):
        tier_id = tier.get('TIER_ID') or ''
        if '(' not in tier_id or ')' not in tier_id:
            continue

        # Split on the first '(' only, so an entity that itself contains '('
        # no longer breaks tuple unpacking.
        category, _, entity = tier_id.partition('(')
        entity = entity.rstrip(')')

        for annotation in tier.findall('.//ALIGNABLE_ANNOTATION'):
            start_slot = annotation.get('TIME_SLOT_REF1')
            end_slot = annotation.get('TIME_SLOT_REF2')
            # findtext yields None for a missing element rather than raising.
            value = annotation.findtext('ANNOTATION_VALUE')

            try:
                if start_slot in time_slots and end_slot in time_slots:
                    state, objects = parse_state_relationship(value)

                    row = {
                        'timestamp_start': time_slots[start_slot],
                        'timestamp_end': time_slots[end_slot],
                        'entity': entity,
                        'category': category,
                        'state': state,
                        'objects': objects  # list of objects
                    }

                    frame_rows = expand_annotation_to_frames(row, fps, total_frames)
                    annotations.append(frame_rows)
            except Exception as e:
                # Report which annotation failed, then let the error propagate.
                print(f"Error processing annotation: {value}")
                print(f"In tier: {tier_id}")
                print(f"Error: {str(e)}")
                raise

    return pd.concat(annotations, ignore_index=True) if annotations else pd.DataFrame()

def create_base_dataframe(metadata, bbox_df):
    """Build the full frame x entity grid that later merges attach to.

    Args:
        metadata: Dict providing ``total_frames`` and ``fps``.
        bbox_df: DataFrame with an ``entity`` column; its distinct values
            define the entities of the grid.

    Returns:
        DataFrame with one row per (frame, entity) pair for frames
        ``1..total_frames``, ordered by frame first, with the columns
        ``frame``, ``entity`` and ``timestamp`` (``frame / fps``).
    """
    distinct_entities = bbox_df['entity'].unique()
    last_frame = metadata['total_frames']

    grid_rows = []
    for frame_no in range(1, last_frame + 1):
        grid_rows.extend((frame_no, name) for name in distinct_entities)

    grid = pd.DataFrame(grid_rows, columns=['frame', 'entity'])
    grid['timestamp'] = grid['frame'] / metadata['fps']

    return grid

def process_files(xml_path, json_path):
    """Combine bounding boxes (JSON) and annotations (XML) per frame/entity.

    Steps: parse the JSON, build the frame x entity grid, parse the XML, then
    left-merge first the boxes and then the annotations onto the grid using
    ``frame`` and ``entity`` as keys. After each merge the timestamp of the
    merged-in table wins wherever it is present. Progress is printed along
    the way.

    Args:
        xml_path: Path of the annotation XML file.
        json_path: Path of the bounding box JSON file.

    Returns:
        ``(final_df, metadata)``. ``final_df`` carries a boolean
        ``has_annotation`` column that is True where ``state`` is not null.
    """
    join_keys = ['frame', 'entity']  # timestamp is deliberately not a key

    def _take_merged_timestamp(merged):
        # Prefer the right-hand table's timestamp, fall back to the grid's.
        merged['timestamp'] = merged['timestamp_y'].fillna(merged['timestamp_x'])
        return merged.drop(columns=['timestamp_x', 'timestamp_y'])

    # Bounding boxes and video metadata
    bbox_df, metadata = parse_label_studio_json(json_path)
    print("\nJSON DataFrame shape:", bbox_df.shape)
    print("JSON first few rows:")
    print(bbox_df.head())

    # Grid covering every frame for every entity
    base_df = create_base_dataframe(metadata, bbox_df)
    print("\nBase DataFrame shape:", base_df.shape)

    # Frame-expanded annotations
    xml_df = parse_elan_xml(xml_path, metadata['fps'], metadata['total_frames'])
    print("\nXML DataFrame shape:", xml_df.shape)
    print("XML first few rows:")
    print(xml_df.head())

    # Round timestamps to smooth over floating point differences
    bbox_df['timestamp'] = bbox_df['timestamp'].round(3)
    base_df['timestamp'] = base_df['timestamp'].round(3)

    print("\nAttempting merge with bounding box data...")
    with_bbox = base_df.merge(bbox_df, on=join_keys, how='left')
    print("\nAfter bbox merge shape:", with_bbox.shape)

    with_bbox = _take_merged_timestamp(with_bbox)

    if xml_df.empty:
        # Nothing to merge: add the annotation columns as all-null.
        final_df = with_bbox
        final_df['category'] = None
        final_df['state'] = None
        final_df['object'] = None
    else:
        xml_df['timestamp'] = xml_df['timestamp'].round(3)

        print("\nAttempting merge with XML annotations...")
        final_df = with_bbox.merge(xml_df, on=join_keys, how='left')
        final_df = _take_merged_timestamp(final_df)

    # Flag rows that received an annotation
    final_df['has_annotation'] = final_df['state'].notna()

    print("\nFinal DataFrame shape:", final_df.shape)
    print("Final DataFrame columns:", final_df.columns.tolist())

    return final_df, metadata

# Script entry point: run the whole pipeline on the bundled sample pair and
# print a short summary of the result.
if __name__ == "__main__":
    # Test with sample files (paths are relative to the working directory)
    xml_path = "../data/sample/sample_1.eaf"
    json_path = "../data/sample/sample_1.json"
    
    # process_files prints its own progress; df is the merged per-frame table
    df, metadata = process_files(xml_path, json_path)
    print("Video metadata:", metadata)
    print("\nDataFrame shape:", df.shape)
    print("\nColumns:", df.columns.tolist())
    print("\nSample data:")
    print(df.head())

Parsing JSON annotations...
Found entity in JSON: white_car
Found entity in JSON: black_car

JSON DataFrame shape: (608, 8)
JSON first few rows:
   frame  timestamp     entity     bbox_x     bbox_y  bbox_width  bbox_height  \
0     96       3.84  white_car  26.609375  54.166667    6.281250     3.833333   
1     97       3.88  white_car  26.658288  54.166667    6.232337     3.833333   
2     98       3.92  white_car  26.617712  54.166667    6.272913     3.833333   
3     99       3.96  white_car  26.577137  54.166667    6.313488     3.833333   
4    100       4.00  white_car  27.211561  54.166667    5.679064     3.833333   

   bbox_enabled  
0          True  
1          True  
2          True  
3          True  
4          True  

Base DataFrame shape: (1128, 3)

XML DataFrame shape: (4230, 6)
XML first few rows:
   frame  timestamp   entity    category         state object
0    100       4.00  ego_car  Visibility  out_of_frame   None
1    101       4.04  ego_car  Visibility  out_of_fr