## TEIXML2TEXT

This notebook tries to generate exactly two witnesses from a given TEI XML document.

The idea is to treat the edit operations in exactly two different ways:

1. Witness 1: apply only the instant edit operations to the document. Ignore all other edit operations.
2. Witness 2: apply all remaining edit operations to the document.

**NB:** The difference between this notebook and `teixml2text-two-witnesses.ipynb` is that it uses a different XML model. Specifically, it uses the [Python minidom model](https://docs.python.org/3/library/xml.dom.minidom.html), which treats text as a node in the XML tree.

#### 1. Add unique identifiers to each edit operation

Adds an 'id' attribute to every edit operation. This makes it easier to refer to specific edit operations later on (if we want to remove or apply specific ones).

In [1]:
import json
import os
import re

from bs4 import BeautifulSoup
from bs4.element import Tag

# --- 1.1 Define function to add IDs to each edit operation in the input XML file ---

def add_ids_to_edit_ops(filepath):
    """Number the edit operations of a TEI XML file and save an annotated copy.

    Every ``add``, ``del`` and ``subst`` element receives an ``id`` attribute.
    Each of the three tag names has its own counter starting at 1, so an id is
    only unique among elements with the same tag name.

    The result is written (UTF-8) next to the input file, with ``_ids_added``
    inserted before the file extension, e.g. ``doc.xml`` -> ``doc_ids_added.xml``.

    Args:
        filepath: Path of the XML file to read; it must have a file extension.

    Raises:
        Exception: If ``filepath`` has no file extension.
    """
    # splitext only looks at the last path component and keeps every other dot
    # intact, so names such as "a.b.xml" or "../data/file.xml" are handled.
    stem, extension = os.path.splitext(filepath)
    if not extension:
        raise Exception("no valid filepath or file extension specified for the input file")
    output_filepath = stem + '_ids_added' + extension

    # Binary mode lets the parser pick the encoding declared by the document
    # instead of depending on the platform's default text encoding.
    with open(filepath, "rb") as file:
        soup = BeautifulSoup(file, features="lxml-xml")

    # One independent running counter per kind of edit operation.
    next_id = {"add": 1, "del": 1, "subst": 1}
    for tag in soup.find_all():
        if tag.name in next_id:
            tag['id'] = next_id[tag.name]
            next_id[tag.name] += 1

    with open(output_filepath, "w", encoding='utf-8') as file:
        file.write(str(soup))

# --- 1.2 call the above function and store the result in a new XML file ---

add_ids_to_edit_ops("datasets/clean-data/ms-aladin-simplified-v2.xml")

# Re-load the annotated copy. It is written as UTF-8, so it is read back as
# UTF-8 explicitly rather than with the platform's default encoding.
with open("datasets/clean-data/ms-aladin-simplified-v2_ids_added.xml", "r", encoding="utf-8") as file:
    soup = BeautifulSoup(file, features="lxml-xml")

#### 2. Calculate witnesses

In [71]:
import xml.dom.minidom as md
import re

witness_one = ''
witness_two = ''

# --- 2.1 Define functions to compute witnesses ---

def generate_witness_one(root):
    """Append the witness-1 text of the subtree below *root* to ``witness_one``.

    The children of *root* are visited recursively, in document order:

    * ``add`` elements are skipped together with everything inside them.
    * A ``<p>`` marker is appended before the content of every ``p`` element.
    * Text directly inside ``subst``, ``add`` or a ``del`` whose ``instant``
      attribute is ``"true"`` is dropped.
    * Text directly inside ``head`` is preceded by a newline and has its line
      breaks replaced by a single space.
    * Any other text has each run of whitespace collapsed to a single space.

    Args:
        root: ``xml.dom.minidom`` node whose descendants are processed.

    The function returns nothing; it extends the module-level ``witness_one``.
    """
    global witness_one
    for node in root.childNodes:
        if node.nodeType == node.ELEMENT_NODE:
            if node.tagName == 'add':
                continue  # additions are not part of witness 1
            if node.tagName == 'p':
                witness_one += '<p>'
            generate_witness_one(node)
        elif node.nodeType == node.TEXT_NODE:
            parent = node.parentNode
            if parent.tagName in ('subst', 'add'):
                continue
            # getAttribute returns '' for a missing attribute, so comparing
            # with "true" is sufficient.
            if parent.tagName == 'del' and parent.getAttribute('instant') == "true":
                continue
            if parent.tagName == 'head':
                witness_one += '\n' + re.sub(r'[\n\r]+', ' ', node.nodeValue)
            else:
                # \s already covers \n and \r; the raw string avoids the
                # invalid "\s" escape of a plain string literal.
                witness_one += re.sub(r'\s+', ' ', node.nodeValue)

def generate_witness_two(root):
    """Append the witness-2 text of the subtree below *root* to ``witness_two``.

    The children of *root* are visited recursively, in document order:

    * A ``<p>`` marker is appended before the content of every ``p`` element.
    * Text directly inside ``del`` or ``subst`` is dropped.
    * Text directly inside ``head`` is preceded by a newline and has its line
      breaks replaced by a single space.
    * Any other text (including text inside ``add``) has each run of
      whitespace collapsed to a single space.

    Args:
        root: ``xml.dom.minidom`` node whose descendants are processed.

    The function returns nothing; it extends the module-level ``witness_two``.
    """
    global witness_two
    for node in root.childNodes:
        if node.nodeType == node.ELEMENT_NODE:
            if node.tagName == 'p':
                witness_two += '<p>'
            generate_witness_two(node)
        elif node.nodeType == node.TEXT_NODE:
            parent_tag = node.parentNode.tagName
            if parent_tag in ('del', 'subst'):
                continue
            if parent_tag == 'head':
                witness_two += '\n' + re.sub(r'[\n\r]+', ' ', node.nodeValue)
            else:
                # \s already covers \n and \r; the raw string avoids the
                # invalid "\s" escape of a plain string literal.
                witness_two += re.sub(r'\s+', ' ', node.nodeValue)

# --- 2.2 load the XML file, prepare it and call the above functions to compute the witnesses ---

dom = md.parse("datasets/clean-data/ms-aladin-simplified-v2.xml")
root = dom.getElementsByTagName('body')[0]
generate_witness_one(root) # Generate witness 1
generate_witness_two(root) # Generate witness 2

# --- 2.3 Format / Prettify the witness texts in readable paragraphs and remove inconsistent whitespace ---

def _format_paragraphs(paragraphs):
    """Join paragraph strings into one text, separating them by a blank line.

    Each paragraph is stripped and its whitespace runs are collapsed to a
    single space. It is followed by two newlines, or by one newline when the
    text built so far already ends with a newline (which happens when the
    paragraph itself is empty).
    """
    formatted = ''
    for paragraph in paragraphs:
        formatted += re.sub(r'\s+', ' ', paragraph.strip())
        # endswith is safe on an empty string, unlike indexing the last
        # character, which fails when the first segment is empty.
        formatted += '\n' if formatted.endswith('\n') else '\n\n'
    return formatted

paragraphs_w1 = witness_one.split('<p>')
paragraphs_w1[0] = paragraphs_w1[0].replace('\n', '').strip() # correct title whitespace
paragraphs_w2 = witness_two.split('<p>')
paragraphs_w2[0] = paragraphs_w2[0].replace('\n', '').strip() # correct title whitespace

witness_one_formatted = _format_paragraphs(paragraphs_w1)
witness_two_formatted = _format_paragraphs(paragraphs_w2)

# --- 2.4 Write the prettified text to file ---

# write witness 1 to file (explicit UTF-8 so the output does not depend on the platform default)
with open("datasets/clean-data/ms-aladin-witness1.txt", "w", encoding="utf-8") as outfile:
    outfile.write(witness_one_formatted)

# write witness 2 to file
with open("datasets/clean-data/ms-aladin-witness2.txt", "w", encoding="utf-8") as outfile:
    outfile.write(witness_two_formatted)