# What’s up with `element.removeAttribute(attname)`?

The following example runs to completion and produces the desired output unless I uncomment the two commented lines to try to remove the `@start` attribute with `node.removeAttribute('start')`. Hmm ...

In [8]:
from xml.dom.pulldom import CHARACTERS, START_ELEMENT, parseString, END_ELEMENT
from xml.dom.minidom import Document


class Stack(list):
    """List subclass that adds stack-style ``push`` and ``peek``.

    Removal from the top uses the inherited ``list.pop``.
    """

    def push(self, item):
        """Add *item* as the new top of the stack."""
        self.insert(len(self), item)

    def peek(self):
        """Return the top item without removing it.

        Raises ``IndexError`` when the stack is empty.
        """
        top = len(self) - 1
        return self[top]


# Output side: a fresh document plus a stack of the nodes that are currently
# open. The document sits at the bottom so the root element has a parent.
open_elements = Stack()
d = Document()
open_elements.push(d)

# Flattened markup: each logical element is represented by a pair of empty
# marker elements, one carrying @start and one carrying @end.
# (Named `flattened` rather than `input` so the builtin is not shadowed.)
flattened = '''<root>
    <p start="p1"/>This is a <word start="w1"/>paragraph<word end="w1"/> that contains
    some stuff.<p end="p1"/>
</root>'''

for event, node in parseString(flattened):
    if event == START_ELEMENT and not node.hasAttribute('end'): # process pseudo-end-tags on END_ELEMENT event
        # A real start tag or a pseudo-start-tag: open a new element.
        open_elements.peek().appendChild(node)
        open_elements.push(node)
        # Un-commenting the two lines below breaks the nesting: pulldom hands
        # back this very same node object on the matching END_ELEMENT event,
        # so once @start is gone the END_ELEMENT test no longer recognises the
        # pseudo-start-tag and pops the element straight away.
#         if node.hasAttribute('start'):
#             node.removeAttribute('start')
    elif event == END_ELEMENT and not node.hasAttribute('start'): # process pseudo-start-tags on START_ELEMENT event
        # A real end tag or a pseudo-end-tag: close the innermost open element.
        open_elements.pop()
    elif event == CHARACTERS:
        # Copy text into the innermost open element.
        t = d.createTextNode(node.data)
        open_elements.peek().appendChild(t)
    # Every other event type is ignored.

print(open_elements[0].toxml())

<?xml version="1.0" ?><root>
    <p start="p1">This is a <word start="w1">paragraph</word> that contains
    some stuff.</p>
</root>


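It isn’t obvious at first why removing an attribute from an element should mangle the hierarchy. The likely cause is that `pulldom` delivers the very same node object on the `END_ELEMENT` event that it delivered on `START_ELEMENT`: once `@start` has been removed, the `END_ELEMENT` test `not node.hasAttribute('start')` succeeds for the pseudo-start-tag, and the element that was just opened is popped again immediately. So try cloning the node and working with the clone instead of the original, which leaves the original’s attributes intact for that later test.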

## Cloning the node and removing the attribute from the clone works

In [25]:
from xml.dom.pulldom import CHARACTERS, START_ELEMENT, parseString, END_ELEMENT
from xml.dom.minidom import Document


class Stack(list):
    """List subclass that adds stack-style ``push`` and ``peek``.

    Removal from the top uses the inherited ``list.pop``.
    """

    def push(self, item):
        """Add *item* as the new top of the stack."""
        self.insert(len(self), item)

    def peek(self):
        """Return the top item without removing it.

        Raises ``IndexError`` when the stack is empty.
        """
        top = len(self) - 1
        return self[top]


# Output side: a fresh document plus a stack of the nodes that are currently
# open. The document sits at the bottom so the root element has a parent.
open_elements = Stack()
d = Document()
open_elements.push(d)

# Flattened markup: each logical element is represented by a pair of empty
# marker elements, one carrying @start and one carrying @end.
# (Named `flattened` rather than `input` so the builtin is not shadowed.)
flattened = '''<root>
    <p start="p1"/>This is a <word start="w1"/>paragraph<word end="w1"/> that contains
    some stuff.<p end="p1"/>
</root>'''

for event, node in parseString(flattened):
    if event == START_ELEMENT and not node.hasAttribute('end'): # process pseudo-end-tags on END_ELEMENT event
        # pulldom hands back this same node object on the matching
        # END_ELEMENT event, where its @start attribute is what stops us from
        # popping. Strip the attribute from a shallow clone instead, so the
        # original still carries it when that later test runs.
        clone = node.cloneNode(deep=False)
        if clone.hasAttribute('start'):
            clone.removeAttribute('start')
        # Trace each element as it is opened.
        print(clone.toxml())
        open_elements.peek().appendChild(clone)
        open_elements.push(clone)
    elif event == END_ELEMENT and not node.hasAttribute('start'): # process pseudo-start-tags on START_ELEMENT event
        # A real end tag or a pseudo-end-tag: close the innermost open element.
        open_elements.pop()
    elif event == CHARACTERS:
        # Copy text into the innermost open element.
        t = d.createTextNode(node.data)
        open_elements.peek().appendChild(t)
    # Every other event type is ignored.

print(open_elements[0].toxml())

<root/>
<p/>
<word/>
<?xml version="1.0" ?><root>
    <p>This is a <word>paragraph</word> that contains
    some stuff.</p>
</root>


So far, so good …

## Now with real data!

First take a look at the input file:

In [19]:
# Show the raw input. Decode explicitly as UTF-8 (the encoding the file's XML
# declaration states) instead of relying on the platform default, and avoid
# shadowing the builtin `input` with the file handle.
with open('flattened.xml', encoding='utf-8') as source:
    print(source.read())

<?xml version="1.0" encoding="UTF-8"?>
<root xmlns:th="http://www.blackmesatech.com/2017/nss/trojan-horse">
    <p th:sID="d1e3"/>This is a <word th:sID="d1e5"/>paragraph<word th:eID="d1e5"/> that contains
    some <nonTrojan type="test"/> stuff.<p th:eID="d1e3"/>
    <p th:sID="d1e9"/>This is <emphasis role="bold">another</emphasis> paragraph <phrase
        th:sID="d1e11"/><word th:sID="d1e12"/>that<word th:eID="d1e12"/>
    <word th:sID="d1e15"/>contains<word th:eID="d1e15"/>
    <word th:sID="d1e18"/>more<word th:eID="d1e18"/><phrase th:eID="d1e11"/> stuff.<p th:eID="d1e9"
    />
</root>



Then use the cloning method (full code repeated to keep everything in one place):

In [24]:
from xml.dom.pulldom import CHARACTERS, START_ELEMENT, parseString, END_ELEMENT
from xml.dom.minidom import Document


class Stack(list):
    """List subclass that adds stack-style ``push`` and ``peek``.

    Removal from the top uses the inherited ``list.pop``.
    """

    def push(self, item):
        """Add *item* as the new top of the stack."""
        self.insert(len(self), item)

    def peek(self):
        """Return the top item without removing it.

        Raises ``IndexError`` when the stack is empty.
        """
        top = len(self) - 1
        return self[top]


# Output side: a fresh document plus a stack of the nodes that are currently
# open. The document sits at the bottom so the root element has a parent.
open_elements = Stack()
d = Document()
open_elements.push(d)

# Read the whole file up front so the handle is closed before processing.
# Decode explicitly as UTF-8 (the encoding the file's XML declaration states)
# instead of relying on the platform default, and avoid shadowing the builtin
# `input` with the file handle.
with open('flattened.xml', encoding='utf-8') as source:
    flattened = source.read()

for event, node in parseString(flattened):
    if event == START_ELEMENT and not node.hasAttribute('th:eID'): # process pseudo-end-tags on END_ELEMENT event
        # pulldom hands back this same node object on the matching
        # END_ELEMENT event, where its @th:sID attribute is what stops us from
        # popping. Strip the attribute from a shallow clone instead, so the
        # original still carries it when that later test runs.
        clone = node.cloneNode(deep=False)
        if clone.hasAttribute('th:sID'):
            clone.removeAttribute('th:sID')
        open_elements.peek().appendChild(clone)
        open_elements.push(clone)
    elif event == END_ELEMENT and not node.hasAttribute('th:sID'): # process pseudo-start-tags on START_ELEMENT event
        # A real end tag or a pseudo-end-tag: close the innermost open element.
        open_elements.pop()
    elif event == CHARACTERS:
        # Copy text into the innermost open element.
        t = d.createTextNode(node.data)
        open_elements.peek().appendChild(t)
    # Every other event type is ignored.

print(open_elements[0].toxml())

<?xml version="1.0" ?><root xmlns:th="http://www.blackmesatech.com/2017/nss/trojan-horse">
    <p>This is a <word>paragraph</word> that contains
    some <nonTrojan type="test"/> stuff.</p>
    <p>This is <emphasis role="bold">another</emphasis> paragraph <phrase><word>that</word>
    <word>contains</word>
    <word>more</word></phrase> stuff.</p>
</root>


This is the desired output. Phew!