## TEIXML2TEXT

This notebook tries to generate exactly two witnesses from a given TEI XML document.

The idea is to treat the edit operations in exactly two different ways:

1. Witness 1: apply only the instant edit operations to the document. Ignore all other edit operations.
2. Witness 2: apply all remaining edit operations to the document.

NB: remember to retain the subst edit operations.

#### 1. Add unique identifiers to each edit operation

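Adds an 'id' attribute to every edit operation (`add`, `del` and `subst` elements). The IDs are numbered separately for each element name, so an ID identifies an edit operation only in combination with its element name. This makes it easier to refer to specific edit operations later on (if we want to remove or apply specific ones).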

In [1]:
from bs4 import BeautifulSoup
from bs4.element import Tag
import re
import json

def add_ids_to_edit_ops(filepath):
    """Number every <add>, <del> and <subst> element of an XML file.

    Each of the three element names has its own counter starting at 1, so an
    ``id`` value is unique only among elements of the same name (the result
    can contain both ``<add id="1">`` and ``<del id="1">``).  The annotated
    document is written as UTF-8 to ``<stem>_ids_added<extension>``, where
    ``<stem>`` and ``<extension>`` are the two parts of ``filepath``.

    Args:
        filepath: Path of the XML file to annotate. Must have a file
            extension.

    Raises:
        Exception: If ``filepath`` has no file extension.
    """
    import os  # function-scope import keeps this notebook cell self-contained

    # os.path.splitext only splits off the last extension, so dots elsewhere
    # in the path ("./data/ms.v2.xml") are preserved in the output path.
    stem, extension = os.path.splitext(filepath)
    if not extension:
        raise Exception("no valid filepath or file extension specified for the input file")
    output_filepath = stem + "_ids_added" + extension

    # Read with the same encoding that is used for writing below.
    with open(filepath, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file, features="lxml-xml")

    counters = {"add": 0, "del": 0, "subst": 0}
    for tag in soup.find_all():
        if tag.name in counters:
            counters[tag.name] += 1
            # Stored as text: attribute values are strings once serialised.
            tag['id'] = str(counters[tag.name])

    with open(output_filepath, "w", encoding='utf-8') as file:
        file.write(str(soup))

# Annotate the source document, then load the annotated copy for the cells
# below.  add_ids_to_edit_ops writes its output as UTF-8, so read it back as
# UTF-8 instead of relying on the platform's default encoding.
add_ids_to_edit_ops("ms-aladin-simplified.xml")

with open("ms-aladin-simplified_ids_added.xml", "r", encoding="utf-8") as file:
    soup = BeautifulSoup(file, features="lxml-xml")

#### 2. Define functions to remove / apply specific edit operations

These functions will be called when we want to remove or apply specific edit operations to transform the TEI/XML document to pure text.

In [2]:
def is_child_of(edit1, edit2):
    """Tell whether ``edit1`` is nested inside ``edit2``.

    Both arguments have to be <add>, <del> or <subst> tags; for any other
    tag name the answer is False.  ``edit1`` counts as nested when it is a
    direct child tag of ``edit2`` or when it is nested (by the same rule)
    inside one of the direct child tags of ``edit2``.
    """
    edit_names = ["add", "del", "subst"]
    if edit1.name not in edit_names or edit2.name not in edit_names:
        return False

    direct_children = edit2.findChildren(re.compile('.*'), recursive=False)
    # Short-circuit: only descend further when it is not a direct child.
    return edit1 in direct_children or is_child_of_entity_in(edit1, direct_children)
        
def is_child_of_entity_in(edit, edits):
    """Report whether ``edit`` is nested inside at least one tag of ``edits``.

    Returns False for an empty ``edits`` collection.
    """
    return any(is_child_of(edit, candidate) for candidate in edits)

def filter_child_edits(edits):
    """Return the top-level edit operations contained in ``edits``.

    A tag is kept when it is an <add>, <del> or <subst> element and is not
    nested inside any other tag of ``edits``.  The input order is preserved.
    """
    return [
        candidate
        for candidate in edits
        if not is_child_of_entity_in(candidate, edits)
        and candidate.name in ["add", "del", "subst"]
    ]

def get_deepest_nested_tag(list_of_tags):
    """Return the tag of ``list_of_tags`` with the greatest nesting depth.

    The depth of each tag is measured with ``get_tag_nesting_depth``.  When
    several tags share the greatest depth the first one wins; an empty
    collection yields None.
    """
    return max(
        list_of_tags,
        key=lambda candidate: get_tag_nesting_depth(candidate, depth=1),
        default=None,
    )

def get_tag_nesting_depth(tag, depth=1):
    """Return how many levels of tags are nested in ``tag``, itself included.

    A tag without child tags has depth ``depth``; otherwise the result is
    ``depth`` plus the depth of its most deeply nested child tag.

    Args:
        tag: Tag whose subtree is measured. Only child tags are considered,
            text nodes are not.
        depth: Depth value assigned to ``tag`` itself (1 by default).

    Returns:
        The nesting depth as an integer >= ``depth``.
    """
    children = tag.findChildren(re.compile('.*'), recursive=False)
    if not children:
        return depth
    # Measure every child subtree exactly once and continue from the deepest
    # one, instead of locating the deepest child first and then descending
    # into it a second time.
    return depth + max(get_tag_nesting_depth(child) for child in children)
    
def get_deepest_tag(tag, depth=1):
    """Return ``tag`` if it has no child tags, else its deepest-nested child.

    Note that for a tag with children the result is one of its *direct*
    child tags (the one selected by ``get_deepest_nested_tag``), not the
    innermost descendant.

    Args:
        tag: Tag to inspect.
        depth: Unused; kept so that existing calls keep working.
    """
    children = tag.findChildren(re.compile('.*'), recursive=False)
    if not children:
        return tag
    return get_deepest_nested_tag(children)
    
def apply_multiple_edit_ops(soup_obj, edit_ids=None, edit_ops=None):
    """Apply several edit operations to ``soup_obj``, one after another.

    Every entry is handed to ``apply_single_edit_op``.  When ``edit_ids`` is
    non-empty it takes precedence and ``edit_ops`` is not looked at.

    Args:
        soup_obj: Parsed document the edits are applied to.
        edit_ids: Edit operation IDs, each convertible to an integer >= 1.
            NOTE(review): apply_single_edit_op does not appear to act on an
            ID alone -- confirm whether ID-based application is implemented.
        edit_ops: bs4 ``Tag`` objects of the edit operations to apply.

    Returns:
        The document returned by the last ``apply_single_edit_op`` call.

    Raises:
        Exception: If neither IDs nor tags are given, if an ID is smaller
            than 1, or if an entry of ``edit_ops`` is not a ``Tag``.
    """
    # None instead of a shared mutable list as default value.
    edit_ids = [] if edit_ids is None else edit_ids
    edit_ops = [] if edit_ops is None else edit_ops

    if len(edit_ids) == 0 and len(edit_ops) == 0:
        raise Exception("No edit operations specified.")

    if len(edit_ids) == 0:
        for edit_op in edit_ops:
            if not isinstance(edit_op, Tag):
                # Report the offending object itself; there is no edit ID in
                # this branch.
                raise Exception("Edit operation",edit_op,"is not a valid bs4.element.Tag object")
            soup_obj = apply_single_edit_op(soup_obj, edit_op=edit_op)
        return soup_obj

    for edit_id in edit_ids:
        if int(edit_id) < 1:
            raise Exception("Edit ID",edit_id,"is not a valid edit operation ID. Must be an integer greater equal to 1.")
        soup_obj = apply_single_edit_op(soup_obj, edit_id=edit_id)
    return soup_obj

def ignore_multiple_edit_ops(soup_obj, edit_ids=None, edit_ops=None):
    """Revert several edit operations in ``soup_obj``, one after another.

    Every entry is handed to ``ignore_single_edit_op``.  When ``edit_ids``
    is non-empty it takes precedence and ``edit_ops`` is not looked at.

    Args:
        soup_obj: Parsed document the edits are reverted in.
        edit_ids: Edit operation IDs, each convertible to an integer >= 1.
            NOTE(review): ignore_single_edit_op does not appear to act on an
            ID alone -- confirm whether ID-based reverting is implemented.
        edit_ops: bs4 ``Tag`` objects of the edit operations to revert.

    Returns:
        The document returned by the last ``ignore_single_edit_op`` call.

    Raises:
        Exception: If neither IDs nor tags are given, if an ID is smaller
            than 1, or if an entry of ``edit_ops`` is not a ``Tag``.
    """
    # None instead of a shared mutable list as default value.
    edit_ids = [] if edit_ids is None else edit_ids
    edit_ops = [] if edit_ops is None else edit_ops

    if len(edit_ids) == 0 and len(edit_ops) == 0:
        raise Exception("No edit operations specified.")

    if len(edit_ids) == 0:
        for edit_op in edit_ops:
            if not isinstance(edit_op, Tag):
                # Report the offending object itself; there is no edit ID in
                # this branch.
                raise Exception("Edit operation",edit_op,"is not a valid bs4.element.Tag object")
            soup_obj = ignore_single_edit_op(soup_obj, edit_op=edit_op)
        return soup_obj

    for edit_id in edit_ids:
        if int(edit_id) < 1:
            raise Exception("Edit ID",edit_id,"is not a valid edit operation ID. Must be an integer greater equal to 1.")
        soup_obj = ignore_single_edit_op(soup_obj, edit_id=edit_id)
    return soup_obj

def apply_single_edit_op(soup_obj, edit_id=-1, edit_op=None):
    """Apply one edit operation, and every edit nested in it, to ``soup_obj``.

    Applying means: an <add> or <subst> is replaced by its
    whitespace-normalised text, a <del> is removed together with its
    content.  Child tags of ``edit_op`` are handled first, so nested edits
    are resolved from the inside out.  The element that gets modified is
    looked up in ``soup_obj`` by tag name and ``id`` attribute; if no such
    element is found nothing happens.

    Args:
        soup_obj: Parsed document that is modified in place.
        edit_id: NOTE(review): ID-based lookup is not implemented -- whenever
            ``edit_id`` differs from -1 the document is returned unchanged.
            IDs are numbered per tag name, so an ID alone may not identify
            one element; confirm the intended behaviour.
        edit_op: bs4 ``Tag`` of the edit operation to apply.

    Returns:
        ``soup_obj``.

    Raises:
        Exception: If neither ``edit_id`` nor ``edit_op`` is given, or if
            ``edit_op`` is not a ``Tag``.
    """
    if edit_id == -1 and edit_op is None:
        raise Exception("A valid edit operation is not specified.")
    if edit_id != -1:
        return soup_obj
    if not isinstance(edit_op, Tag):
        raise Exception("Edit operation",edit_op,"is not a valid bs4.element.Tag object")

    # Resolve everything nested inside this tag first.
    for child in edit_op.findChildren(re.compile('.*'), recursive=False):
        soup_obj = apply_single_edit_op(soup_obj, edit_op=child)

    # Then apply the operation itself, once.  Re-entering this function until
    # the tag has no child tags left would never terminate when a child is
    # not an edit operation and therefore stays in place.
    if edit_op.name not in ("add", "del", "subst"):
        return soup_obj

    target = soup_obj.find(edit_op.name, {"id": edit_op['id']})
    if target is None:
        return soup_obj

    if edit_op.name == "del":
        # Deleted text does not survive, so there is nothing to normalise.
        target.extract()
    else:
        # Collapse whitespace runs, then keep the text in place of the tag.
        target.string = re.sub(r'\s+', ' ', target.get_text().strip())
        target.unwrap()
    return soup_obj
        
def ignore_single_edit_op(soup_obj, edit_id=-1, edit_op=None):
    """Revert one edit operation, and every edit nested in it, in ``soup_obj``.

    Reverting means: an <add> is removed together with its content, a <del>
    is replaced by its content.  Child tags of ``edit_op`` are handled
    first.  A <subst> itself is left in place; only the edits inside it are
    reverted.  The element that gets modified is looked up in ``soup_obj``
    by tag name and ``id`` attribute; if no such element is found nothing
    happens.

    Args:
        soup_obj: Parsed document that is modified in place.
        edit_id: NOTE(review): ID-based lookup is not implemented -- whenever
            ``edit_id`` differs from -1 the document is returned unchanged.
            Confirm the intended behaviour.
        edit_op: bs4 ``Tag`` of the edit operation to revert.

    Returns:
        ``soup_obj``.

    Raises:
        Exception: If neither ``edit_id`` nor ``edit_op`` is given, or if
            ``edit_op`` is not a ``Tag``.
    """
    if edit_id == -1 and edit_op is None:
        raise Exception("A valid edit operation is not specified.")
    if edit_id != -1:
        return soup_obj
    if not isinstance(edit_op, Tag):
        raise Exception("Edit operation",edit_op,"is not a valid bs4.element.Tag object")

    # Revert everything nested inside this tag first.
    for child in edit_op.findChildren(re.compile('.*'), recursive=False):
        ignore_single_edit_op(soup_obj, edit_op=child)

    # Then revert the operation itself, once.  Re-entering this function
    # until the tag has no child tags left would never terminate when a child
    # stays in place (a <subst> or any non-edit element).
    if edit_op.name not in ("add", "del"):
        return soup_obj

    target = soup_obj.find(edit_op.name, {"id": edit_op['id']})
    if target is None:
        return soup_obj

    if edit_op.name == "add":
        target.extract()
    else:
        target.unwrap()
    return soup_obj

#### 3. Function to remove content that is not necessary for the final text

In [3]:
def clean_soup(soup):
    """Strip markup that is not needed for the plain text and return ``soup``.

    For each tag name listed below, every matching descendant of ``soup`` is
    either unwrapped (the tag is dropped, its content stays) or extracted
    (the tag is removed together with its content).  The tag names are
    handled in the listed order and ``soup`` is modified in place.
    """
    # (tag name, Tag method to call on every match)
    steps = (
        ("body", "unwrap"),
        ("hi", "unwrap"),
        ("pb", "unwrap"),
        ("div", "unwrap"),
        ("foreign", "extract"),
        ("unclear", "extract"),
        ("signature", "unwrap"),
        ("metamark", "unwrap"),
        ("name", "unwrap"),
        ("sic", "unwrap"),
        ("lb", "unwrap"),
        ("subst", "unwrap"),
        ("p", "unwrap"),
        ("title", "unwrap"),
        ("alt", "unwrap"),
        ("seg", "unwrap"),
        ("gap", "extract"),
    )
    for tag_name, method_name in steps:
        # Always look up by name: attribute access would not work for
        # "name", which is the tag's own name attribute.
        match = soup.find(tag_name)
        while match is not None:
            getattr(match, method_name)()
            match = soup.find(tag_name)
    return soup

# Strip the unneeded markup from the document body.  From here on `soup`
# refers to the cleaned <body> element only.
body = soup.body
if body is None:
    # Fail with a clear message instead of an AttributeError inside clean_soup.
    raise ValueError("no <body> element found in the parsed document")
soup = clean_soup(body)

#### 4. Generate Witness 1: only instant edits

In [4]:
# Normalise the whitespace around the markup before the document is parsed
# again.
serialized_soup = str(soup)
# Drop all whitespace that directly follows a '>' ...
without_space_after_tags = re.sub(r'(?=>).\s*', '>', serialized_soup)
# ... and collapse every whitespace run in front of a '<' to one space.
cleaned_soup = re.sub(r'\s+(?=<)', ' ', without_space_after_tags)

# Put a blank line after every heading and sub-heading.
cleaned_soup = cleaned_soup.replace('</head>','</head>\n\n')
cleaned_soup = cleaned_soup.replace('</subhead>', '</subhead>\n\n')

def do_second_clean(soup):
    """Unwrap all <head> and <subhead> tags and return the document as a string.

    The content of the unwrapped tags stays in the document; ``soup`` is
    modified in place before it is serialised with ``str``.
    """
    for heading_name in ("head", "subhead"):
        heading = soup.find(heading_name)
        while heading is not None:
            heading.unwrap()
            heading = soup.find(heading_name)
    return str(soup)
    
# print(cleaned_soup)

# Remove the heading tags, keeping their text.
cleaned_soup = do_second_clean(BeautifulSoup(cleaned_soup, "lxml-xml"))

# One independent parse tree per witness.
soupw1 = BeautifulSoup(cleaned_soup, features="lxml-xml")
soupw2 = BeautifulSoup(cleaned_soup, features="lxml-xml")

# Identify all top-level tags (not child edits within nested ones)
top_level_tags = filter_child_edits(soupw2.find_all())

# Generate Witness 1 (only instant edits applied)
# Apply instant edits
for tag in soupw1.find_all():
    if tag.has_attr('instant') and tag['instant'].lower() == 'true':
        soupw1 = apply_single_edit_op(soupw1, edit_op=tag)

# Identify non-instant edits: top-level edits that either carry no 'instant'
# attribute or have it set to "false".
edits_to_ignore = [
    tag
    for tag in soupw1.find_all()
    if (not tag.has_attr('instant') or tag['instant'].lower() == 'false')
    and tag in top_level_tags
]

print()
print('edits to ignore: ', edits_to_ignore)
print()

# Apply "ignore" function to non-instant edits (i.e., revert their changes to before the edit was made)
# Skipped when there is nothing to revert: ignore_multiple_edit_ops raises on
# an empty list.
if edits_to_ignore:
    soupw1 = ignore_multiple_edit_ops(soupw1, edit_ops=edits_to_ignore)

# Write result to file (Witness 1); explicit UTF-8 so that non-ASCII
# characters do not depend on the platform's default encoding.
with open("ms-aladin-simplified-witness_1a.txt", "w", encoding="utf-8") as outfile:
    outfile.write(soupw1.text)


edits to ignore:  [<del id="1" type="strike-through">mp</del>, <add id="1" place="above-line">ntaarn</add>, <add id="2" place="below-line">"Marga<del id="2" type="strike-through">rithas</del><add id="3" place="above-line">ridas</add>ante porcos"</add>, <del id="3" type="strike-through">trots</del>, <add id="4" place="above-line">niettegenstaande</add>, <del id="6" type="strike-through"><add id="5" place="above-line">met nageltoppe </add></del>, <del id="7" type="overwritten">s</del>, <add id="6" place="inline">toppen,</add>, <del id="8" type="strike-through">deze</del>, <del id="9" type="strike-through"><add id="7" place="above-line">het</add></del>, <add id="8" place="above-line">het</add>, <del id="12" type="strike-through">het</del>, <add id="10" place="above-line">de</add>, <add id="11" place="above-line">haar</add>, <del id="13" type="strike-through">fatale</del>, <add id="12" place="above-line">desperate</add>, <del id="14" type="strike-through">.</del>, <add id="13" place="inli

#### 5. Generate Witness 2: all edits

In [8]:
# Generate Witness 2 (all edits applied)
# Apply all edits to the document.  Skipped when the document contains no
# edits: apply_multiple_edit_ops raises on an empty list.
if top_level_tags:
    soupw2 = apply_multiple_edit_ops(soupw2, edit_ops=top_level_tags)

# Write result to file (Witness 2); explicit UTF-8 so that non-ASCII
# characters do not depend on the platform's default encoding.
with open("ms-aladin-simplified-witness_1b.txt", "w", encoding="utf-8") as outfile:
    outfile.write(soupw2.text)