# Python pull parser with string output

## Check the input XML

In [1]:
# Show the flattened (Trojan-horse) input document exactly as stored on disk.
# The file declares itself as UTF-8, so decode it explicitly instead of
# relying on the platform's default encoding.
with open('../input/basic/flattened.xml', encoding='utf-8') as source:
    print(source.read())

<?xml version="1.0" encoding="UTF-8"?><root xmlns:th="http://www.blackmesatech.com/2017/nss/trojan-horse">
    <p th:sID="d1e3"/>This is a <word th:sID="d1e5"/>paragraph<word th:eID="d1e5"/> that contains some stuff.<p th:eID="d1e3"/>
    <p th:sID="d1e9"/>This is another paragraph <phrase th:sID="d1e11"/><word th:sID="d1e12"/>that<word th:eID="d1e12"/>
            <word th:sID="d1e15"/>contains<word th:eID="d1e15"/>
            <word th:sID="d1e18"/>more<word th:eID="d1e18"/><phrase th:eID="d1e11"/> stuff.<p th:eID="d1e9"/>
</root>


## Transform it

In [2]:
from xml.dom.pulldom import CHARACTERS, START_ELEMENT, parseString, END_ELEMENT
from xml.sax.saxutils import escape


def untrojan(xml_text):
    """Turn Trojan-horse milestone markup back into ordinary nested XML.

    Empty elements carrying a ``th:sID`` attribute are written as start tags
    and empty elements carrying a ``th:eID`` attribute are written as end
    tags. All ``th:*`` attributes and the ``xmlns:th`` namespace declaration
    are removed. Every other element is copied through with its remaining
    attributes. Only element and character-data events are handled, so the
    XML declaration, comments and processing instructions do not appear in
    the result.

    Args:
        xml_text: The flattened XML document as a string.

    Returns:
        The serialized result as a single string.
    """
    output = []
    for event, node in parseString(xml_text):
        if event == START_ELEMENT:
            if node.hasAttribute('th:eID'):  # Trojan end marker
                # An end tag cannot carry attributes, so none are copied.
                output.append('</' + node.nodeName + '>')
                continue
            # Trojan start markers and non-Trojan elements
            output.append('<' + node.nodeName)
            for attname, attvalue in node.attributes.items():
                # Drop Trojan attributes and the Trojan namespace declaration.
                if attname.startswith('th:') or attname == 'xmlns:th':
                    continue
                # The parser hands back decoded values; re-escape them so the
                # output stays well-formed inside double quotes.
                safe_value = escape(attvalue, {'"': '&quot;'})
                output.append(' ' + attname + '="' + safe_value + '"')
            output.append('>')
        elif event == END_ELEMENT:
            # The end of an empty Trojan marker produces nothing: its tag was
            # already written when the marker's start event was seen.
            if not (node.hasAttribute('th:sID') or node.hasAttribute('th:eID')):
                # Use nodeName (not localName) so a prefixed element is closed
                # with the same qualified name it was opened with.
                output.append('</' + node.nodeName + '>')
        elif event == CHARACTERS:
            # Character data arrives decoded; re-escape &, < and >.
            output.append(escape(node.data))
    return "".join(output)


# In a notebook or when run as a script __name__ is "__main__", so the
# transformation runs and prints exactly as before.
if __name__ == "__main__":
    with open('../input/basic/flattened.xml', encoding='utf-8') as source:
        print(untrojan(source.read()))

<root>
    <p>This is a <word>paragraph</word> that contains some stuff.</p>
    <p>This is another paragraph <phrase><word>that</word>
            <word>contains</word>
            <word>more</word></phrase> stuff.</p>
</root>
