In [1]:
from display_xml import XML
import lxml.etree as et
# Sample XML file used for the basic element examples.
test_file = 'element.xml'
# remove_blank_text=True makes the parser discard whitespace-only text between tags.
parser = et.XMLParser(remove_blank_text=True)
test_tree = et.parse(test_file, parser)
# The root element is the entry point for the examples that follow.
test_root = test_tree.getroot()

# Primer on XML elements
## Why use lxml.etree for parsing XML documents?
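* The JATS standard for scientific article XML is consistent, so a strict XML parser works well
* BeautifulSoup is better suited to unreliable (malformed) web documents
* lxml and BeautifulSoup can each use the other as a parser backend (`lxml.html.soupparser` wraps BeautifulSoup; BeautifulSoup accepts `lxml` as its parser)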
* The following XML element examples are derived from actual PLOS articles

# Primer on XML elements
## XML elements have four key properties in the lxml.etree library
1. element.tag
2. element.text
3. element.attrib
4. element.tail


## Example basic element

In [2]:
# Take the first child of the root by indexing the element directly.
# (Element.getchildren() is deprecated in lxml; indexing returns the same child.)
element = test_root[0]
XML(element)

In [3]:
# .tag is the element's name, as a string
element.tag

'article-title'

In [4]:
# .text is the text found directly after the opening tag
element.text

'Why Most Published Research Findings Are False'

## Basic element with attribute

In [5]:
# Take the second child of the root by indexing the element directly.
# (Element.getchildren() is deprecated in lxml; indexing returns the same child.)
element = test_root[1]
XML(element)

In [6]:
# tag name of the second child
element.tag

'alt-title'

In [7]:
# text content of the second child
element.text

'Essay'

In [8]:
# .attrib maps each attribute name in the opening tag to its value
element.attrib

{'alt-title-type': 'running-head'}

In [9]:
# .tail holds any text that comes directly after this element's closing tag
# and before the next tag; it is None when there is no such text, in which
# case the notebook displays nothing for this cell
element.tail

## An element attribute is a dictionary
### Attributes appear inside the element's opening tag

In [10]:
# re-display the current element to show where its attribute sits in the markup
XML(element)

In [11]:
# the full attribute mapping (dictionary-like)
element.attrib

{'alt-title-type': 'running-head'}

In [12]:
# look up one attribute by key, as with a dict; a missing key raises KeyError
element.attrib['alt-title-type']

'running-head'

In [13]:
# .get() does the same lookup but returns None when the attribute is absent
element.attrib.get('alt-title-type')

'running-head'

## XML elements can have sub-elements

In [14]:
# switch back to the root element, which contains the sub-elements
element = test_root
XML(element)

In [15]:
# to find direct descendants (children); don't need to know their tags.
# list(element) replaces the deprecated element.getchildren() and returns
# the same list of child elements.
list(element)

[<Element article-title at 0x10ab16888>, <Element alt-title at 0x1087d06c8>]

In [16]:
# First child of the current element, taken by indexing
# (replaces the deprecated element.getchildren()[0]).
new_element = element[0]
new_element.tag

'article-title'

In [17]:
# to find the direct ancestor (parent); don't need to know its tag
new_element.getparent()

<Element title-group at 0x10ab16588>

## Finding sub-elements by name with xpath
### Example element: Creative Commons License

In [18]:
# Parse a second sample file for the xpath examples, again discarding
# whitespace-only text between tags.
parser = et.XMLParser(remove_blank_text=True)
test_file = 'element_2.xml'
test_tree = et.parse(test_file, parser)
# xpath() returns a list of matches, so [0] takes the first <license> found.
# NOTE(review): the name `license` shadows Python's interactive built-in of
# the same name; it is kept because later cells refer to it.
license = test_tree.xpath('.//license')[0]
# NOTE(review): style='perldoc' presumably selects a syntax-highlighting
# style; confirm against the display_xml documentation.
display(XML(license, style='perldoc'))

#### Xpath returns a list of search results

In [19]:
# search direct descendants (children) by name: './tag'
license.xpath('./license-p')

[<Element license-p at 0x10ab47488>]

In [20]:
# search descendants of direct descendants (grandchildren): './child/grandchild'
license.xpath('./license-p/ext-link')

[<Element ext-link at 0x10ab445c8>]

In [21]:
# search ALL descendants, at any depth below this element: './/tag'
license.xpath('.//ext-link')

[<Element ext-link at 0x10ab445c8>]

## Warning 1: Multiple elements can have the same xpath location
### Remember that `xpath()` always returns a list, even when there is only one match

In [22]:
# Inline XML: a <contrib-group> containing two author <contrib> elements
text_to_display4 = """<contrib-group>
<contrib contrib-type="author"><name name-style="western"><surname>dos Santos</surname><given-names>Renato Vieira</given-names></name></contrib>
<contrib contrib-type="author"><name name-style="western"><surname>da Silva</surname><given-names>Linaena Mericy</given-names></name></contrib>
</contrib-group>"""
# Parse the string into an element (default parser, no file needed)
element = et.fromstring(text_to_display4)
# Display the raw XML string
XML(text_to_display4)

In [23]:
# both <contrib> children match the same path, so the list has two entries
element.xpath('./contrib')

[<Element contrib at 0x10ab44608>, <Element contrib at 0x10ab4d508>]

## Warning 2: `element.text` doesn't always work
### It only holds text that comes before the first sub-element; when in doubt, use `lxml.etree.tostring()`

In [24]:
# show the <license> element again for reference
display(XML(license, style='perldoc'))

In [25]:
# .text is None here: the readable text lives inside the <license-p>
# sub-element, not directly in <license>
print(license.text)

None


In [26]:
import lxml.etree as et
# method='text' serializes only the text content (tags are left out),
# including text nested inside sub-elements; encoding='unicode' makes
# tostring() return a str instead of bytes.
license_text = et.tostring(license, method='text', encoding='unicode')
print(license_text)

This is an open access article distributed under the terms of the Creative Commons Attribution License, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.


## Quick quiz before we move on!

In [27]:
# Quiz element: a <contrib> with two attributes, one <name> child, and
# trailing text ("plossy") after the </name> closing tag
text_to_display3 = """
<contrib contrib-type="author" equal-contrib="yes">
    <name name-style="western">
        <surname>Chen</surname>
        <given-names>Ximing</given-names>
    </name>plossy
</contrib>"""
# Parse the string with the parser configured earlier (remove_blank_text=True)
element = et.fromstring(text_to_display3, parser)
# Print the raw string so its original layout is visible
print(text_to_display3)


<contrib contrib-type="author" equal-contrib="yes">
    <name name-style="western">
        <surname>Chen</surname>
        <given-names>Ximing</given-names>
    </name>plossy
</contrib>


* element.tag?
* element.attrib?

In [28]:
# Quiz answers: the tag name and the attribute mapping of the quiz element,
# one per line
print('\n'.join([
    'element.tag: {}'.format(element.tag),
    'element.attrib: {}'.format(element.attrib),
]))

element.tag: contrib
element.attrib: {'contrib-type': 'author', 'equal-contrib': 'yes'}


## Quick quiz before we move on!

In [29]:
# Same quiz element as in the previous quiz cell, re-created so this cell
# can be run on its own
text_to_display3 = """
<contrib contrib-type="author" equal-contrib="yes">
    <name name-style="western">
        <surname>Chen</surname>
        <given-names>Ximing</given-names>
    </name>plossy
</contrib>"""
# Parse the string with the parser configured earlier (remove_blank_text=True)
element = et.fromstring(text_to_display3, parser)
# Print the raw string so its original layout is visible
print(text_to_display3)


<contrib contrib-type="author" equal-contrib="yes">
    <name name-style="western">
        <surname>Chen</surname>
        <given-names>Ximing</given-names>
    </name>plossy
</contrib>


In [30]:
# xpath() returns a list; [0] takes the first <name> child of the quiz element
new_element = element.xpath('./name')[0]

* new_element.tag?
* new_element.attrib?
* new_element.tail?

In [31]:
# Quiz answers for the <name> sub-element: tag, attributes, and the text
# that follows its closing tag, one per line
print('\n'.join([
    'new_element.tag: {}'.format(new_element.tag),
    'new_element.attrib: {}'.format(new_element.attrib),
    'new_element.tail: {}'.format(new_element.tail),
]))

new_element.tag: name
new_element.attrib: {'name-style': 'western'}
new_element.tail: plossy



# Using lxml.etree and allofplos Article class to parse XML files

In [32]:
from allofplos import Article
# Build an Article object from a DOI
doi = '10.1371/journal.pone.0183591'
article = Article(doi)
# Path of the XML file that corresponds to this article
article.filename

'/Users/Elizabeth/PLOS_Corpus_Project/allofplos/allofplos/allofplos_xml/journal.pone.0183591.xml'

In [33]:
# Is the article XML file locally stored? (True/False)
article.local

True

In [34]:
# article.tree supports xpath() like the parsed trees above
# (presumably an lxml ElementTree; confirm in the allofplos docs)
tree = article.tree
# './body' looks for <body> directly under the root element
tree.xpath('./body')

[<Element body at 0x10b176a48>]

In [35]:
# article.root is the element to search from
xml_root = article.root
# './/license' searches all descendants for <license> elements
xml_root.xpath('.//license')

[<Element license at 0x10ab58488>]

In [36]:
# take the first <license> match from the article
license = xml_root.xpath('.//license')[0]
# namespaced attribute names are shown in '{namespace-uri}name' form
license.attrib

{'{http://www.w3.org/1999/xlink}href': 'http://creativecommons.org/licenses/by/4.0/', '{http://www.w3.org/1999/xlink}type': 'simple'}

# Tying it all together!
## Project example: which articles use PCR in their Methods section?
## First: finding elements with xpath by tag AND attribute
* Body of article is divided into sections ('sec')
* Methods sections are marked with the attribute `{'sec-type': 'materials|methods'}` or `{'sec-type': 'methods'}`

In [37]:
# '//sec' searches the whole document for <sec> elements;
# [@sec-type='...'] keeps only those whose sec-type attribute has that exact value
methods_sections = xml_root.xpath("//sec[@sec-type='materials|methods']")
print(methods_sections)

[<Element sec at 0x10b176b48>]


In [39]:
from allofplos.samples.corpus_analysis import get_random_list_of_dois
from allofplos.article_class import Article
import lxml.etree as et

# Goal: collect the DOIs of sampled articles whose Methods section mentions 'PCR'.
# NOTE(review): the DOI list presumably comes from a random sample, so the
# printed result will differ from run to run.

# First get list of articles/DOIs
dois = get_random_list_of_dois(count=50)
pcr_list = []
# Initialize a single article object; the loop reuses it for every DOI
article = Article(dois[0])

for doi in dois:
    # Step 1: reuse the article object by assigning it the next DOI
    article.doi = doi
    xml_root = article.root
    # Step 2: find Methods sections, trying 'materials|methods' first and
    # falling back to plain 'methods' when nothing matches
    methods_sections = xml_root.xpath("//sec[@sec-type='materials|methods']")
    if not methods_sections:
        methods_sections = xml_root.xpath("//sec[@sec-type='methods']")
    # Step 3: turn each Methods section into a plain-text string
    # Step 4: add the DOI if 'PCR' is in any of those strings; any() stops at
    # the first matching section, so each DOI is added at most once
    if any('PCR' in et.tostring(sec, method='text', encoding='unicode')
           for sec in methods_sections):
        pcr_list.append(article.doi)

# Show the first five matching DOIs
print(pcr_list[0:5])

['10.1371/journal.pone.0128195', '10.1371/journal.pone.0165464', '10.1371/journal.pone.0136574', '10.1371/journal.pone.0072749', '10.1371/journal.pone.0060101']
