## Python XML parsing

### Using xml.dom

In [6]:
from xml.dom.minidom import parse, parseString, Node

In [7]:
# Simplest form: pass minidom a path and let it open the file itself.
xml_path = "sample.xml"
document = parse(xml_path)
print(document)

<xml.dom.minidom.Document object at 0x00000179FE9310A0>


In [8]:
# Parse from an already-open file object; the context manager closes it for us.
# The file is opened in binary mode so the XML parser decodes the bytes itself
# (following the document's own encoding declaration) rather than depending on
# the platform's default text encoding.
with open("sample.xml", "rb") as file:
    document = parse(file)
print(document)

<xml.dom.minidom.Document object at 0x00000179FE929DC0>


In [9]:
# Build the document from XML text held in memory instead of a file.
catalog_xml = '''<?xml version="1.0"?>
<catalog>
   <book id="bk101">
      <author>Gambardella, Matthew</author>
      <title>XML Developer's Guide</title>
      <genre>Computer</genre>
      <price>44.95</price>
      <publish_date>2000-10-01</publish_date>
      <description>An in-depth look at creating applications
      with XML.</description>
   </book>
   </catalog>'''
document = parseString(catalog_xml)
print(document)

<xml.dom.minidom.Document object at 0x00000179FDBAD760>


#### Accessing info from XML

In [None]:
# The Document node exposes document-level details: the XML version,
# the doctype (DTD) and the root element.
document = parse("sample.xml")
for field in ("version", "doctype", "documentElement"):
    print(getattr(document, field))

In [12]:
# Looking elements up by id does not work straight after parsing:
# both lookups print None because no attribute has been flagged as an ID yet.
document = parse("sample.xml")
for book_id in ("bk101", "bk102"):
    print(document.getElementById(book_id))

None
None


#### Solve this issue by marking each element's `id` attribute as an ID attribute

In [13]:
def set_id_attribute(parent, attribute_name="id"):
    """Flag ``attribute_name`` as an ID attribute on ``parent`` and all its descendants.

    ``Document.getElementById`` only matches attributes that have been marked
    as IDs, so this walks the tree and calls ``setIdAttribute`` on every
    element that carries the attribute.

    The walk uses an explicit stack rather than recursion, so deeply nested
    documents do not run into Python's recursion limit.

    Args:
        parent: The DOM node (typically the Document) at which to start.
        attribute_name: Name of the attribute to flag; defaults to ``"id"``.

    Returns:
        None. The nodes are modified in place.
    """
    pending = [parent]
    while pending:
        node = pending.pop()
        if node.nodeType == Node.ELEMENT_NODE and node.hasAttribute(attribute_name):
            node.setIdAttribute(attribute_name)
        # Push children reversed so they are popped in document order.
        pending.extend(reversed(node.childNodes))

In [15]:
# Flag the id attributes on the parsed document, then repeat the two lookups.
set_id_attribute(document)
for book_id in ("bk101", "bk102"):
    print(document.getElementById(book_id))

<DOM Element: book at 0x179fe94aca0>
<DOM Element: book at 0x179fe94aca0>


In [16]:
# Same approach on an SVG file: flag the ids, then look up one element by id
# and collect all elements with a given tag name.
document = parse("smiley.svg")
set_id_attribute(document)
smiley = document.getElementById("smiley")
ellipses = document.getElementsByTagName("ellipse")
print(smiley)
print(ellipses)

<DOM Element: g at 0x179fe9853a0>
[<DOM Element: ellipse at 0x179fe9855e0>, <DOM Element: ellipse at 0x179fe985940>]


#### Bad news: minidom has no `querySelector`


In [17]:
# The exception is the demonstration here: catch the AttributeError raised
# by the querySelector lookup and report it instead of failing the cell.
try:
    selected = document.querySelector("#smiley")
    print(selected)
except AttributeError:
    print("does not work")


does not work


#### For namespaced elements such as `<inkscape:custom>`

In [None]:
# Namespace-aware lookup: the first argument is the namespace URI ("*" acts as
# a wildcard for any namespace) and the second is the local name, i.e. the tag
# name without its prefix.
document.getElementsByTagNameNS("*", "custom")

In [None]:
#other stuff here


### Using xml.sax

In [18]:
# Import the SAX entry point under an alias: a plain `from xml.sax import parse`
# would rebind the `parse` name that the minidom cells above rely on, so
# re-running any of them after this cell would silently call the wrong function.
from xml.sax import parse as sax_parse
from xml.sax.handler import ContentHandler


class ParseXML(ContentHandler):
    """Placeholder SAX handler that inherits every callback from ContentHandler unchanged."""


sax_parse("smiley.svg", ParseXML())

Probably no more time to finish this section.