Use the `etree` module from the lxml library to parse XML and collect its elements into a dictionary.

In [1]:
# import the package
from lxml import etree

In [2]:
# Sample XML document held as a plain string: an <Infringement> notice.
# The root element declares a default namespace (http://www.acns.net/ACNS),
# so every parsed tag is reported as '{http://www.acns.net/ACNS}Name'.
# Note that some tags (Entity, Email, TimeStamp) appear more than once.
text = """
<Infringement xmlns="http://www.acns.net/ACNS" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.acns.net/ACNS http://www.acns.net/v1.2/ACNS2v1_2.xsd">
  <Case>
    <ID>e50456adea23e3d4f4a8</ID>
    <Status>Open</Status>
    <Severity>Normal</Severity>
  </Case>
  <Complainant>
    <Entity>Home Box Office, Inc.</Entity>
    <Contact>Vobile - Compliance</Contact>
    <Address>2880 Lakeside Drive, Suite 360
Santa Clara, CA 95054</Address>
    <Phone>+1 (408) 492 1100</Phone>
    <Email>notice@hbo.copyright-notice.com</Email>
  </Complainant>
  <Service_Provider>
    <Entity>North Carolina Research and Education Network</Entity>
    <Email>abuse@wfu.edu</Email>
  </Service_Provider>
  <Source>
    <TimeStamp>2019-03-23T15:20:07Z</TimeStamp>
    <IP_Address>152.17.143.119</IP_Address>
    <Port>61356</Port>
    <Type>BitTorrent</Type>
    <SubType BaseType="P2P" Protocol="BITTORRENT"/>
    <Number_Files>1</Number_Files>
  </Source>
  <Content>
    <Item>
      <TimeStamp>2019-03-23T15:20:07Z</TimeStamp>
      <Title>Game of Thrones</Title>
      <FileName>Game.of.Thrones.S02.720p.BluRay.x264.ShAaNiG</FileName>
      <FileSize>4768442317</FileSize>
      <Hash Type="SHA1">6698e0950dcd257a6b03af2e8b068b7ff9d4619d</Hash>
    </Item>
  </Content>
</Infringement>"""

In [3]:
# Parse the XML string; the result is the document's root element
root = etree.fromstring(text)

In [4]:
def search_elements(tree):
    """Recursively collect every descendant element of an xml tree object.

    Parameters:
        tree: a parsed XML element (for example the value returned by
            ``etree.XML``).  Its descendants are collected; ``tree``
            itself is not included in the result.

    Returns:
        A dictionary indexed by tag name, with any ``{namespace}`` prefix
        removed.  Each value is ``{'text': ..., 'attrib': ...}`` holding the
        element's text and its attribute mapping.  Elements are visited in
        document order, so when a tag occurs more than once only the last
        occurrence is kept.
    """
    elements = {}

    def collect(parent):
        for child in parent:
            # Comments and processing instructions are yielded as children
            # too, but their ``tag`` is a factory function rather than a
            # name, so there is nothing to index them by.
            if callable(child.tag):
                continue
            # Namespaced tags look like '{uri}Name'; keep only 'Name'.
            name = child.tag.split('}')[-1]
            elements[name] = {'text': child.text,
                              'attrib': child.attrib}
            collect(child)

    # Walk the element that was passed in rather than a notebook-level global.
    collect(tree)

    return elements

In [5]:
# Flatten the sample document into a {tag: {'text', 'attrib'}} dictionary
# and display it.  Tags that occur more than once in the document
# (e.g. Entity, Email) show only their last occurrence.
elements = search_elements(root)
elements

{'Address': {'attrib': {},
  'text': '2880 Lakeside Drive, Suite 360\nSanta Clara, CA 95054'},
 'Case': {'attrib': {}, 'text': '\n    '},
 'Complainant': {'attrib': {}, 'text': '\n    '},
 'Contact': {'attrib': {}, 'text': 'Vobile - Compliance'},
 'Content': {'attrib': {}, 'text': '\n    '},
 'Email': {'attrib': {}, 'text': 'abuse@wfu.edu'},
 'Entity': {'attrib': {},
  'text': 'North Carolina Research and Education Network'},
 'FileName': {'attrib': {},
  'text': 'Game.of.Thrones.S02.720p.BluRay.x264.ShAaNiG'},
 'FileSize': {'attrib': {}, 'text': '4768442317'},
 'Hash': {'attrib': {'Type': 'SHA1'},
  'text': '6698e0950dcd257a6b03af2e8b068b7ff9d4619d'},
 'ID': {'attrib': {}, 'text': 'e50456adea23e3d4f4a8'},
 'IP_Address': {'attrib': {}, 'text': '152.17.143.119'},
 'Item': {'attrib': {}, 'text': '\n      '},
 'Number_Files': {'attrib': {}, 'text': '1'},
 'Phone': {'attrib': {}, 'text': '+1 (408) 492 1100'},
 'Port': {'attrib': {}, 'text': '61356'},
 'Service_Provider': {'attrib': {}, 'te

In [6]:
# Extract one piece of useful information: look up the <Port> element by
# its tag name and read its text (the value is a string, not an int).
elements['Port']['text']

'61356'