## TEIXML2TEXT

This notebook tries to generate two (or three) witnesses from a given TEI XML document.

The idea is to treat the edit operations in two (or three) different ways:

1. Witness 1a: ignore all edit operations (generate a plain text document from the input XML document) because we want to obtain the original "raw" version of the author's document before he / she started to edit it.
2. Witness 1b: apply only those edits that happened immediately following the raw version (e.g. "instant = true" edits and up to "nesting level" 1 or 2 of all other edits)
3. Witness 1c: apply the full nesting depth of all edits 


In [1]:
from bs4 import BeautifulSoup
from bs4.element import Tag
import re

def get_edit_ops(edit_type, bsoup_obj, child=2):
    edits_no_child = []
    all_edits = soup.find_all(edit_type)
    if (child not in [0, 1, 2]):
        raise Exception("value for 'child' parameter in method 'get_edit_ops' is not valid. Valid values must be in [0, 1, 2] only")
        
    if child == 2:
        return all_edits
    
    if child == 0:
        for edit in all_edits:
            has_child = len(edit.find_all()) != 0
            if not has_child:
                edits_no_child.append(edit)
    else:
        for edit in all_edits:
            has_child = len(edit.find_all()) != 0
            if has_child:
                edits_no_child.append(edit)

    return edits_no_child
    
def undo_edit_operations(edit_ops, bsoup_obj, specific_depth=-1):
    if (type(edit_ops) == list):
        for edit_op in edit_ops:
            if (edit_op.name == "del"):
                if bsoup_obj.find('del', {"id": edit_op['id']}) is not None: 
                    bsoup_obj.find('del', {"id": edit_op['id']}).unwrap()
            if (edit_op.name == "add"):
                if bsoup_obj.find('add', {"id": edit_op['id']}) is not None: 
                    bsoup_obj.find('add', {"id": edit_op['id']}).extract()
    elif (type(edit_ops) == Tag):
        return undo_edit_operations([edit_ops], bsoup_obj)
    else:
        raise Exception("method 'undo_edit_operations' exclusively expects either an object of type 'bs4.element.Tag' or a list of such objects as the first parameter.", type(edit_ops), "given.")
    return bsoup_obj

def apply_edit_operations(edit_ops, bsoup_obj, specific_depth=-1):
    if (type(edit_ops) == list):
        for edit_op in edit_ops:
            if (edit_op.name == "del"):
                if bsoup_obj.find('del', {"id": edit_op['id']}) is not None: 
                    bsoup_obj.find('del', {"id": edit_op['id']}).extract()
            if (edit_op.name == "add"):
                if bsoup_obj.find('add', {"id": edit_op['id']}) is not None: 
                    bsoup_obj.find('add', {"id": edit_op['id']}).unwrap()
    elif (type(edit_ops) == Tag):
        return apply_edit_operations([edit_ops], bsoup_obj)
    else:
        raise Exception("method 'apply_edit_operations' exclusively expects either an object of type 'bs4.element.Tag' or a list of such objects as the first parameter.", type(edit_ops), "given.")
    return bsoup_obj

def add_ids_to_edit_ops(filepath):
    with open(filepath, "r") as file:
        soup = BeautifulSoup(file, features="lxml-xml")
    
    tag_index = []
    add_tag_counter = 1
    del_tag_counter = 1
    for tag in soup.find_all():
        if tag.name == "add":
            tag['id'] = add_tag_counter
            add_tag_counter += 1
        if tag.name == "del":
            tag['id'] = del_tag_counter
            del_tag_counter += 1
    
    filepath_parts = filepath.split(".")
    if len(filepath_parts) <= 1:
        raise Exception("no valid filepath or file extension specified for the input file")
      
    file_extension = filepath_parts[len(filepath_parts)-1]
    filepath_parts.pop()
    filepath_parts.append('_ids_added.')
    filepath_parts.append(file_extension)
    output_filepath = ''.join(filepath_parts) 
    
    with open(output_filepath, "w", encoding='utf-8') as file:
        file.write(str(soup))
    
def get_deepest_nested_tag(list_of_tags):
    deepest = {"tag" : None, "depth" : -1}
    for tag in list_of_tags:
        tag_depth = {"tag" : tag, "depth" : get_tag_nesting_depth(tag, depth=1)}
        if tag_depth["depth"] > deepest["depth"]:
            deepest = tag_depth
    return deepest["tag"]
        
def get_tag_nesting_depth(tag, depth=1):
    children = tag.findChildren(re.compile('.*') , recursive=False)
    if len(children) == 0:
        return depth
    else:
        depth += 1
        return get_tag_nesting_depth(get_deepest_nested_tag(children), depth=depth)

def contains_tag_type(list_of_tags, list_of_tag_names):
    for tag in list_of_tags:
        if tag.name in list_of_tag_names:
            return True
    return False

def filter_tags_of_type(list_of_tags, list_of_tag_names):
    result = []
    for tag in list_of_tags:
        if (tag.name in list_of_tag_names):
            result.append(tag)
    return result
    
def get_top_level_edit_ops(tag):
    children = tag.findChildren(re.compile('.*') , recursive=False)
    if (len(children) == 0):
        return children
    else:
        if contains_tag_type(children, ["add", "del", "subst"]):
            return filter_tags_of_type(children, ["add", "del", "subst"])
        else:
            tmp = []
            for child in children:
                tmp.extend(get_top_level_edit_ops(child))
            return list(set(tmp))
        
def is_child_of(edit1, edit2):
    children = edit2.findChildren(re.compile('.*') , recursive=False)
    if edit1 in children:
        return True
    else:
        return is_child_of_entity_in(edit1, children)
        
def is_child_of_entity_in(edit, edits):
    if edit in edits:
        edits.remove(edit)
    for e in edits:
        if is_child_of(edit, e):
            return True
    return False

def filter_child_edits(edits):
    result = []
    for edit in edits:
        if not is_child_of_entity_in(edit, edits):
            result.append(edit)
    return result

In [2]:
# Test methods to get ADD, DEL and SUBST operations

# Add IDs to each edit operation
add_ids_to_edit_ops("datasets/clean-data/ms-aladin-simplified.xml")

with open("datasets/clean-data/ms-aladin-simplified_ids_added.xml", "r") as file:
    soup = BeautifulSoup(file, features="lxml-xml")

# Remove all substitutions - they are not needed because they are equivalent to delete + add
while soup.subst is not None:
    soup.subst.unwrap()

all_adds = soup.find_all("add")
all_dels = soup.find_all("del")
all_subs = soup.find_all("subst")
all_trans = soup.find_all("transpose")

print("no. of adds: " + str(len(all_adds)))
print("no. of dels: " + str(len(all_dels)))
print("no. of subs: " + str(len(all_subs)))
print("no. of transpose: " + str(len(all_trans)))

add_ops_no_child = get_edit_ops("add", soup, child=0)
del_ops_no_child = get_edit_ops("del", soup, child=0)
add_ops_with_child = get_edit_ops("add", soup, child=1)
del_ops_with_child = get_edit_ops("del", soup, child=1)

print()
print()
print("total no. of ADDs:", len(soup.find_all("add")))
print("no. of ADDs no child:", len(add_ops_no_child))
print("no. of ADDs with child:", len(add_ops_with_child))
print("ADDs: total matches components?", str(len(soup.find_all("add")) == len(add_ops_no_child) + len(add_ops_with_child)))
print()
print()
print("total no. of DELs:", len(soup.find_all("del")))
print("no. of DELs no child:", len(del_ops_no_child))
print("no. of DELs with child:", len(del_ops_with_child))
print("DELs: total matches components?", str(len(soup.find_all("del")) == len(del_ops_no_child) + len(del_ops_with_child)))

no. of adds: 469
no. of dels: 602
no. of subs: 0
no. of transpose: 0


total no. of ADDs: 469
no. of ADDs no child: 436
no. of ADDs with child: 33
ADDs: total matches components? True


total no. of DELs: 602
no. of DELs no child: 493
no. of DELs with child: 109
DELs: total matches components? True


In [3]:
# Test removing all DEL operations from the file
del_ops_with_child.extend(del_ops_no_child)
undo_dels_output = undo_edit_operations(del_ops_with_child, soup)
with open("datasets/clean-data/aladin-test-undo-dels.xml", "w") as file:
    file.write(str(undo_dels_output))

In [4]:
# Print out all TOP level edit operations with their nesting depth
# This is interesting to show their complexity ('activity') as Elli pointed out
with open("datasets/clean-data/ms-aladin-simplified_ids_added.xml", "r") as file:
    soup = BeautifulSoup(file, features="lxml-xml")
        
# Remove substitution tags (they are not necessary since they are equivalent to a del followed by an add tag)
while soup.subst is not None:
    soup.subst.unwrap()
    
# Remove substitution tags (they are not necessary since they are equivalent to a del followed by an add tag)
while soup.hi is not None:
    soup.hi.unwrap()

# Remove substitution tags (they are not necessary since they are equivalent to a del followed by an add tag)
while soup.foreign is not None:
    soup.foreign.unwrap()
        
# Remove substitution tags (they are not necessary since they are equivalent to a del followed by an add tag)
while soup.unclear is not None:
    soup.unclear.unwrap()
    
bsoup_obj = BeautifulSoup(str(soup.body), "lxml-xml")         
root_tags = get_top_level_edit_ops(soup.TEI)
result = {}
for tag in root_tags:
    if (tag.name in ["add", "del", "subst"]):
        tag_depth = get_tag_nesting_depth(tag)
        if tag_depth > 1:
            result[tag] = tag_depth

print("no. of results:", len(result))
print()
print()
        
for item in result:
    print()
    print()
    print(item, "\t:\t", result[item])
    print()
    print()

no. of results: 117




<del id="72" type="deletion">
<add id="60" place="infralinear">er</add>
</del> 	:	 2




<del id="399" type="deletion">
<add id="316" place="supralinear">zegt van Pubbel</add>
</del> 	:	 2




<add id="10" place="infralinear">
<del id="11" type="deletion">in</del> het plastische beeld van</add> 	:	 2




<del id="54" type="subsitution">
<add id="47" place="supralinear">voorgetooverd</add>
</del> 	:	 2




<del id="587" type="strike-through">Jungen <del id="588" type="strike-through">epidemie</del>
<add id="454" place="supralinear">Le</add>
</del> 	:	 2




<add id="427" place="supralinear">nevens het kleine
               aureool van een <del id="549" type="strike-through">karton</del>
<add id="428" place="infralinear">kartonnen</add> schijfje</add> 	:	 2




<del id="558" type="deletion">
<add id="435" place="supralinear">van de ouvr</add>
</del> 	:	 2




<del id="41" type="strike-through">
<add id="38" place="supralinear"> di</add>
</del> 	:	 2




<del id="4

In [5]:
# Render using XSLT
# import lxml.html
# from lxml import etree
 
# xslt_doc = etree.parse("test-xslt.xslt")
# xslt_transformer = etree.XSLT(xslt_doc)
 
# source_doc = etree.parse("datasets/clean-data/ms-aladin-simplified.xml")
# output_doc = xslt_transformer(source_doc)

# output_doc.write("output-toc.html", pretty_print=False)

In [6]:
# Render using own method
# from bs4 import BeautifulSoup
# import re

# with open("datasets/clean-data/ms-aladin-simplified.xml", "r") as file:
#     soup = BeautifulSoup(file, features="lxml-xml")

# # Remove all substitutions - they are not needed because they are equivalent to delete + add
# while soup.subst is not None:
#     soup.subst.unwrap()
    
# while soup.div is not None:
#     soup.div.unwrap()

# while soup.pb is not None:
#     soup.pb.unwrap()
    
# titletext = soup.front
# bodytext = soup.body

# html = '<html>'

# def transformToHTML(tag):
#     # print(tag.name)
#     children = tag.findChildren(re.compile('.*') , recursive=False)
#     # if (tag.name == "head"):
#     #     print(tag.text)
#     if (len(children) == 0):
#         if tag.name == "add":
#             return '<span style="color:blue;">['+tag.text+']</span>'
#         elif tag.name == "del":
#             return '<span style="color:red;"><s>'+tag.text+'</s> </span>'
#         else:
#             print()
#             # print(tag.name)
#             # print()
#             return tag.text
#     else:
#         tmp = ""
#         for child in children:
#             tmp += tag.text + transformToHTML(child)
#             # if child.name == "add":
#             #     return tag.text + transformToHTML(child)
#             # elif child.name == "del":
#             #     return tag.text + '<span style="color:red;"><s>' + transformToHTML(child) + '</s> </span>'
#             # else:
#             #     return tag.text + "<" + child.name + ">" + transformToHTML(child) + "</" + child.name + ">"
#         return tmp

# # print(bodytext)
# # html = '<span style="color:blue;">[<add place="infralinear">"Marga<subst><span style="color:red;"><s><del type="strike-through">rithas</del></s> </span><span style="color:blue;">[<add place="supralinear">ridas</add>]</span></subst> ante porcos"</add>]</span>'
# for item in bodytext.find_all():
#     html = html + transformToHTML(item)

# html += '</html>'

# with open("test.html", "w") as file:
#      file.write(html)

In [7]:
# Generate witnesses

def prepare_soup(soup):
    # Remove substitution tags (they are not necessary since they are equivalent to a del followed by an add tag)
    while soup.subst is not None:
        soup.subst.unwrap()

    # Remove substitution tags (they are not necessary since they are equivalent to a del followed by an add tag)
    while soup.hi is not None:
        soup.hi.unwrap()

    # Remove substitution tags (they are not necessary since they are equivalent to a del followed by an add tag)
    while soup.foreign is not None:
        soup.foreign.unwrap()

    # Remove substitution tags (they are not necessary since they are equivalent to a del followed by an add tag)
    while soup.unclear is not None:
        soup.unclear.unwrap()
        
    return soup

def make_file_name(filepath, suffix):
    filepath_parts = filepath.split(".")
    if len(filepath_parts) <= 1:
        raise Exception("no valid filepath or file extension specified for the input file")
      
    file_extension = filepath_parts[len(filepath_parts)-1]
    filepath_parts.pop()
    filepath_without_ext = ''.join(filepath_parts)
    path_without_ext_parts = filepath_without_ext.split('/')
    file_name = path_without_ext_parts[len(path_without_ext_parts)-1]
    file_name += suffix
    path_without_ext_parts.pop()
    path_without_ext_parts.append(file_name)
    output_file_path = '/'.join(path_without_ext_parts) + '.' + file_extension
    return output_file_path
    
def generate_composite_witness1a(filepath):
    with open(filepath, "r") as file:
        soup = BeautifulSoup(file, features="lxml-xml")
    
    soup = prepare_soup(soup)
    
    # Get all ADD operations
    all_adds = soup.find_all("add")
    # Get all DEL operations
    all_dels = soup.find_all("del")
    all_edits = all_adds + all_dels
    print(len(all_edits))
    
    soup = undo_edit_operations(all_edits, soup)
    
    output_file_path = make_file_name(filepath, '_1a')

    with open(output_file_path, "w", encoding='utf-8') as file:
        file.write(str(soup))
        
def generate_composite_witness1b(filepath):
    with open(filepath, "r") as file:
        soup = BeautifulSoup(file, features="lxml-xml")
    
    soup = prepare_soup(soup)
        
    # Get all ADD operations
    all_adds = soup.find_all("add")
    # Get all DEL operations
    all_dels = soup.find_all("del")
    # Get all instant edit operations
    all_instant_edits = soup.find_all(attrs={"instant" : "true"})
    # # print(all_instant_edits)
    # print()
    # print("all_instant_edits: ", len(all_instant_edits))
    all_edits = (all_adds + all_dels)
    all_edits = filter_child_edits(all_edits) # remove edits that are children of other edits in the list
    print("all_edits: ", len(all_edits))
    
#     # get only edits with nesting depth 1
#     edit_operations_depth_1 = []
#     for edit in all_edits:
#         if (get_tag_nesting_depth(edit) == 1):
#             edit_operations_depth_1.append(edit)
           
#     edits_to_apply = all_instant_edits + edit_operations_depth_1
#     edits_to_apply = list(set(edits_to_apply))
#     print("edit_operations_depth_1: ", len(edit_operations_depth_1))
#     relevant_edits_to_undo = [edit for edit in all_edits if edit not in edits_to_apply]
#     print("relevant_edits_to_undo: ", len(relevant_edits_to_undo)) 
    
    # apply instant edits and depth 1 edits
    print()
    for item in all_edits:
        # print(item)
        # print()
        # print()
        if (get_tag_nesting_depth(item) < 3):
            soup = apply_edit_operations(item, soup)
        else:
            print(item)
            print()
            soup = undo_edit_operations(item, soup)
    
    # remove / undo all other edits
    # soup = undo_edit_operations(relevant_edits_to_undo, soup)
      
    output_file_path = make_file_name(filepath, '_1b')

    with open(output_file_path, "w", encoding='utf-8') as file:
        file.write(str(soup))
    
def generate_composite_witnesses(filepath):        
    generate_composite_witness1a(filepath)
    generate_composite_witness1b(filepath)
    # generate_composite_witness1c(soup, filepath)

generate_composite_witnesses("datasets/clean-data/ms-aladin-simplified_ids_added.xml")

1071
all_edits:  457

<del id="282" type="deletion">
<del id="283" type="strike-through">
<add id="225" place="supralinear">Zich</add>
</del>
<add id="226" place="overwrite">Doch</add>
</del>

