In [450]:
from xml.etree import ElementTree as ET
from xml.etree.ElementTree import Element

# Path of the XML file to read, and path the modified tree is written to.
input_file: str = "../docs/tp2/in_example.exb"
output_file: str = "../docs/tp2/out_exercice.exb"

# Parse the input file once; `root` is the element later cells search for tiers.
xml_tree: ET.ElementTree = ET.parse(input_file)
root = xml_tree.getroot()

In [451]:


def create_event_node(start: str, end: str, text: str) -> Element:
    """
    Build a detached "event" element.

    Args:
        start (str): Value stored in the "start" attribute
        end (str): Value stored in the "end" attribute
        text (str): Text content of the element
    Returns:
        Element: The new "event" element; it is not attached to any parent
    """
    node = Element("event", start=start, end=end)
    node.text = text
    return node


def _merge_event_group(group: list) -> Element:
    """Collapse a non-empty run of consecutive events into a single event node."""
    # Join first, then strip the marker, so the result is the same as removing
    # "TT" from the accumulated text of the whole run.
    merged_text = "".join(event.text or "" for event in group)
    return create_event_node(
        group[0].get("start"),
        group[-1].get("end"),
        merged_text.replace("TT", ""),
    )


def summarize_tier(tier_node: Element) -> Element:
    """
    Merge the consecutive events of a tier that are chained by the "TT" marker.

    An event whose text contains "TT" is grouped with the events that follow
    it, up to and including the first event whose text has no "TT". Each group
    is replaced by one event that:
      - takes its "start" from the first event of the group,
      - takes its "end" from the last event of the group,
      - holds the concatenated texts with every "TT" removed.
    An event without "TT" that does not follow a "TT" event forms a group on
    its own. If the tier ends while a group is still open (the last events all
    contain "TT"), that group is emitted as well instead of being dropped.

    The tier is modified in place: its "event" children are removed and the
    merged events are appended after whatever children remain.

    Args:
        tier_node (Element): The tier node to process
    Returns:
        Element: The same tier node. It is returned untouched when it has no
        events or when its first event has no text.
    """
    events = tier_node.findall("event")
    if not events or not events[0].text:
        # Nothing to group: return the tier itself (not None) so the declared
        # return type always holds.
        return tier_node

    # remove all events from the tier node; the merged ones are appended below
    for event in events:
        tier_node.remove(event)

    group = []  # events of the run currently being merged
    for event in events:
        group.append(event)
        # An event may have no text at all (text is None): treat it as empty,
        # which also means it closes the current group.
        if "TT" not in (event.text or ""):
            tier_node.append(_merge_event_group(group))
            group = []

    # Flush a trailing run that was never closed by an event without "TT",
    # otherwise those events would be lost from the tier.
    if group:
        tier_node.append(_merge_event_group(group))

    return tier_node


In [452]:
# Process every "tier" element found below the root. summarize_tier modifies
# each tier in place, so its return value is not needed here.
for tier_node in root.findall(".//tier"):
    summarize_tier(tier_node)
        

In [453]:
# Save the (modified) tree to output_file as UTF-8, with an XML declaration.
xml_tree.write(output_file, encoding="utf-8", xml_declaration=True)