## XML Highlighting example

This example uses the Python ElementTree module, and a custom TreeBuilder class (mostly copied from the Python Standard Library) and Element factory to implement a version of ElementTree where each element knows where its start and end characters are.

In [97]:
# Needed to stop the C version of the library being imported
# See https://stackoverflow.com/a/55261552/1912
import _elementtree
try:
    del _elementtree.XMLParser
except AttributeError:
    # in case deleted twice
    pass

In [98]:
from xml.etree.ElementTree import XMLParser, TreeBuilder, Element, ElementTree, parse, Comment, ProcessingInstruction

In [160]:
from io import StringIO
xml_string = """<trk>
<trkpt lat="22.1862861" lon="-21.6978806">
<ele test_attrib1="blah" test_attrib2="blah2">0.000</ele>
<time>
2012-04-27T16:29:38+01:00
</time>
<course test_attrib="blah">268.7</course>
<empty></empty>
<speed>4.5</speed>
</trkpt>
</trk>"""

xml_file_obj = StringIO(xml_string)

In [100]:
class MyElement(Element):
    """
    A subclass of the Python Element class, which takes a start_byte argument
    in the constructor, and also sets up instance variables of `start`,
    `end` and `text_start`.
    """
    def __init__(self, tag, attrib={}, highlighted_file=None, start_byte=None, **extra):
        self.start = start_byte
        self.end = None
        self.text_start = None
        self.opening_tag_end = None
        self.highlighted_file = highlighted_file
        
        super(MyElement, self).__init__(tag, attrib, **extra)
        
    def record(self, tool: str, field: str, value: str, units: str = None, whole_element=False):
        self.highlighted_file.fill_char_array_if_needed()
        
        tool_field = tool + "/" + field
        if units is not None:
            message = "Value:" + str(value) + " Units:" + str(units)
        else:
            message = "Value:" + str(value)

        usage = SingleUsage(tool_field, message)
        
        if whole_element:
            start = self.start
            end = self.text_start
        else:
            start = self.text_start
            end = self.end
        
        for i in range(start, self.end):
            self.highlighted_file.chars[i].usages.append(usage)

In [185]:
class MyTreeBuilder:
    """
    Note: the majority of this is copied from the Python Standard Library
    TreeBuilder, but this has been extended to:
    - Store a parser instance from the constructor
    - Call self._parser.parser.CurrentByteIndex to get the byte index in
      methods like `start` and `end`
    - Pass this byte index to the MyElement constructor
    - Set the end and text_start locations in the MyElement constructor too
    
    Generic element structure builder.
    This builder converts a sequence of start, data, and end method
    calls to a well-formed element structure.
    You can use this class to build an element structure using a custom XML
    parser, or a parser for some other XML-like format.
    *element_factory* is an optional element factory which is called
    to create new Element instances, as necessary.
    *comment_factory* is a factory to create comments to be used instead of
    the standard factory.  If *insert_comments* is false (the default),
    comments will not be inserted into the tree.
    *pi_factory* is a factory to create processing instructions to be used
    instead of the standard factory.  If *insert_pis* is false (the default),
    processing instructions will not be inserted into the tree.
    """
    def __init__(self, parser=None, highlighted_file=None, element_factory=None, *,
                 comment_factory=None, pi_factory=None,
                 insert_comments=False, insert_pis=False):
        self._parser = parser
        self._highlighted_file = highlighted_file
        self._data = [] # data collector
        self._elem = [] # element stack
        self._last = None # last element
        self._root = None # root element
        self._tail = None # true if we're after an end tag
        if comment_factory is None:
            comment_factory = Comment
        self._comment_factory = comment_factory
        self.insert_comments = insert_comments
        if pi_factory is None:
            pi_factory = ProcessingInstruction
        self._pi_factory = pi_factory
        self.insert_pis = insert_pis
        if element_factory is None:
            element_factory = Element
        self._factory = element_factory

    def close(self):
        """Flush builder buffers and return toplevel document Element."""
        assert len(self._elem) == 0, "missing end tags"
        assert self._root is not None, "missing toplevel element"
        return self._root

    def _flush(self):
        if self._data:
            if self._last is not None:
                text = "".join(self._data)
                if self._tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
                    #print(f"Setting last element text start to {self._data_start}")
                    self._last.text_start = self._data_start
            self._data = []

    def data(self, data):
        """Add text to current element."""
        #print(f"Data called at position {self._parser.parser.CurrentByteIndex}")
        #print(f"self._data = {self._data}")       
        if self._data == []:
            if self._last is not None:
                print(f"End: {self._last.end}")
                if self._last.end is None:
                    print(f"Setting opening tag end = {self._parser.parser.CurrentByteIndex}")
                    self._last.opening_tag_end = self._parser.parser.CurrentByteIndex
                
            # Starting bit of text
            self._data_start = self._parser.parser.CurrentByteIndex
        #print(f"self._data_start = {self._data_start}")
        self._data.append(data)

    def start(self, tag, attrs):
        """Open new element and return it.
        *tag* is the element name, *attrs* is a dict containing element
        attributes.
        """
        self._flush()
        print(self._last)
        if self._last is not None:
            print(f"End: {self._last.end}")
            if self._last.end is None:
                print(f"Setting opening tag end = {self._parser.parser.CurrentByteIndex}")
                self._last.opening_tag_end = self._parser.parser.CurrentByteIndex
        self._last = elem = self._factory(tag, attrs, start_byte=self._parser.parser.CurrentByteIndex, highlighted_file=self._highlighted_file)
        if self._elem:
            self._elem[-1].append(elem)
        elif self._root is None:
            self._root = elem
        self._elem.append(elem)
        self._tail = 0
        return elem

    def end(self, tag):
        """Close and return current Element.
        *tag* is the element name.
        """
        self._flush()
        self._last = self._elem.pop()
        
        # Record end byte
        self._last.end = self._parser.parser.CurrentByteIndex
        
        if self._last.opening_tag_end is None:
            self._last.opening_tag_end = self._last.end
        
        assert self._last.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   self._last.tag, tag)
        self._tail = 1
        return self._last

    def comment(self, text):
        """Create a comment using the comment_factory.
        *text* is the text of the comment.
        """
        return self._handle_single(
            self._comment_factory, self.insert_comments, text)

    def pi(self, target, text=None):
        """Create a processing instruction using the pi_factory.
        *target* is the target name of the processing instruction.
        *text* is the data of the processing instruction, or ''.
        """
        return self._handle_single(
            self._pi_factory, self.insert_pis, target, text)

    def _handle_single(self, factory, insert, *args):
        elem = factory(*args)
        if insert:
            self._flush()
            self._last = elem
            if self._elem:
                self._elem[-1].append(elem)
            self._tail = 1
        return elem

In [186]:
parser = XMLParser(target=None)
parser.parser.buffer_text = False
hf = HighlightedFile('small_gpx.gpx')
tree_builder = MyTreeBuilder(parser=parser, highlighted_file=hf, element_factory=MyElement)

In [187]:
parser.__init__(target=tree_builder)
parser.parser.buffer_text = False

In [188]:
with open('small_gpx.gpx') as f:
    doc = parse(f, parser)

None
End: None
Setting opening tag end = 5
<Element 'trk' at 0x10ad74d70>
End: None
Setting opening tag end = 6
End: None
Setting opening tag end = 48
<Element 'trkpt' at 0x10ad74e30>
End: None
Setting opening tag end = 49
End: None
Setting opening tag end = 95
End: 100
<Element 'ele' at 0x10ad74f50>
End: 100
End: None
Setting opening tag end = 113
End: 140
<Element 'time' at 0x10ad74fb0>
End: 140
End: None
Setting opening tag end = 175
End: 180
<Element 'course' at 0x10b903950>
End: 180
End: 197
<Element 'empty' at 0x10b9038f0>
End: 197
End: None
Setting opening tag end = 213
End: 216
End: 225


### Find the Course element, and get it's start, end and text_start

In [189]:
el = doc.find(".//course")

In [190]:
print("Whole element: " + xml_string[el.start:el.end])

Whole element: <course test_attrib="blah">268.7


In [191]:
xml_string[el.start:el.end]

'<course test_attrib="blah">268.7'

In [192]:
xml_string[el.start:el.opening_tag_end]

'<course test_attrib="blah">'

In [193]:
el = doc.find(".//trkpt")

In [194]:
el.opening_tag_end

49

In [195]:
xml_string[el.start:el.opening_tag_end]

'<trkpt lat="22.1862861" lon="-21.6978806">\n'

In [196]:
el.end

225

In [198]:
el = doc.find(".//empty")
xml_string[el.start:el.opening_tag_end]

'<empty>'

Note, the way the currentByteIndex variable works is that it gives the start of the element that triggered it - therefore we get the text from the start of the opening tag, to the start of the closing tag.

In [89]:
print("Just text content: " + xml_string[el.text_start:el.end])

Just text content: 268.7


### Do some recording of extractions, for highlighting

In [93]:
el = doc.find(".//ele")
el.record("XML parser", "Elevation", float(el.text))

el = doc.find(".//time")
el.record("XML parser", "Time", "Parsed time")

el = doc.find(".//speed")
el.record("XML parser", "Speed", float(el.text))

el = doc.find(".//course")
el.record("XML parser", "Course", float(el.text))

In [94]:
hf.export("test_xml_highlight.html", include_key=True)

100%|██████████| 224/224 [00:00<00:00, 262803.94it/s]


In [95]:
from IPython.display import HTML
with open('test_xml_highlight.html') as f:
    contents = f.read()

HTML(contents)

### Parsing a large file

In [None]:
def parse_large_file():
    parser = XMLParser(target=None)
    parser.parser.buffer_text = False
    tree_builder = MyTreeBuilder(parser=parser, element_factory=MyElement)
    parser.__init__(target=tree_builder)
    parser.parser.buffer_text = False
    with open('large_gpx.gpx') as f:
        doc = parse(f, parser)
        
    return doc

In [None]:
#%timeit parse_large_file()

### Linking with Highlighting experimentation

In [12]:
from pepys_import.file.highlighter.highlighter import HighlightedFile
from pepys_import.file.highlighter.support.usages import SingleUsage

In [13]:
hf = HighlightedFile('small_gpx.gpx')
hf.fill_char_array_if_needed()

100%|██████████| 224/224 [00:00<00:00, 393765.34it/s]


In [14]:
usage = SingleUsage("Tool", "Message")

In [15]:
for i in range(el.text_start, el.end):
    hf.chars[i].usages.append(usage)

In [16]:
hf.export("test_xml_highlight.html")

100%|██████████| 224/224 [00:00<00:00, 347200.33it/s]


In [18]:
from IPython.display import HTML
with open('test_xml_highlight.html') as f:
    contents = f.read()

HTML(contents)