In [77]:
from display_xml import XML
import lxml.etree as et
# Parse the sample file; remove_blank_text=True makes the parser discard
# whitespace-only text between tags
parser = et.XMLParser(remove_blank_text=True)
test_file = 'element.xml'
test_tree = et.parse(test_file, parser=parser)
test_root = test_tree.getroot()

# Primer on XML elements
## Why use lxml.etree for parsing XML documents?
* JATS standard for scientific article XML is consistent
* BeautifulSoup better for unreliable web documents
* lxml and BeautifulSoup can each use the other: BeautifulSoup accepts lxml as its parser, and lxml ships `lxml.html.soupparser`, which parses with BeautifulSoup
* The following XML element examples are from actual PLOS articles

# Primer on XML elements
## XML elements have four key properties in the lxml.etree library
1. element.tag
2. element.text
3. element.attrib
4. element.tail


## Example basic element

In [55]:
# index the parent element directly to get its first child
# (element.getchildren() is deprecated in lxml)
element = test_root[0]
XML(element)

In [56]:
element.tag

'article-title'

In [57]:
element.text

'Why Most Published Research Findings Are False'

## Basic element with attribute

In [96]:
# index the parent element directly to get its second child
# (element.getchildren() is deprecated in lxml)
element = test_root[1]
XML(element)

In [97]:
element.tag

'alt-title'

In [98]:
element.text

'Essay'

In [99]:
element.attrib

{'alt-title-type': 'running-head'}

In [100]:
# .tail holds any text that comes directly after this element's closing tag
# and before the next tag
element.tail

## Element attributes are stored in a dictionary-like object (`element.attrib`)
### Attributes appear inside the element's opening tag

In [101]:
XML(element)

In [102]:
element.attrib

{'alt-title-type': 'running-head'}

In [103]:
element.attrib['alt-title-type']

'running-head'

In [104]:
element.attrib.get('alt-title-type')

'running-head'

## XML elements can have sub-elements

In [105]:
element = test_root
XML(element)

In [106]:
# to find direct children; don't need to know their tags
# (list(element) replaces the deprecated element.getchildren())
list(element)

[<Element article-title at 0x10b6d6808>, <Element alt-title at 0x10b6cfe08>]

In [107]:
# the first child can be reached by indexing the parent element
# (element.getchildren() is deprecated in lxml)
new_element = element[0]
new_element.tag

'article-title'

In [108]:
# to find the parent (the direct ancestor); don't need to know its tag
new_element.getparent()

<Element title-group at 0x10b69a948>

## Finding sub-elements by name with xpath
### Example element: Creative Commons License

In [110]:
# Parse the second sample file and take the first <license> element found
# anywhere below the root (xpath() returns a list, hence the [0])
test_file = 'element_2.xml'
parser = et.XMLParser(remove_blank_text=True)
test_tree = et.parse(test_file, parser=parser)
license = test_tree.xpath('.//license')[0]
display(XML(license, style='perldoc'))

#### `xpath()` returns a list of search results

In [75]:
# search direct descendants by name
license.xpath('./license-p')

[<Element license-p at 0x10b69ad08>]

In [89]:
# search descendants of direct descendants
license.xpath('./license-p/ext-link')

[<Element ext-link at 0x10b6980c8>]

In [90]:
# search ALL descendants
license.xpath('.//ext-link')

[<Element ext-link at 0x10b6980c8>]

## Warning 1: Multiple elements can have the same XPath location
### Remember that `xpath()` always returns a list

In [116]:
# Sample with two sibling <contrib> elements under one <contrib-group>
text_to_display4 = """<contrib-group>
<contrib contrib-type="author"><name name-style="western"><surname>dos Santos</surname><given-names>Renato Vieira</given-names></name></contrib>
<contrib contrib-type="author"><name name-style="western"><surname>da Silva</surname><given-names>Linaena Mericy</given-names></name></contrib>
</contrib-group>"""
# parse the string into an element; the raw string is what gets displayed
element = et.fromstring(text_to_display4)
XML(text_to_display4)

In [117]:
element.xpath('./contrib')

[<Element contrib at 0x10b6d9648>, <Element contrib at 0x10b6cc988>]

## Warning 2: element.text doesn't always work
### `element.text` only holds the text that comes before the first sub-element
### When in doubt, use `lxml.etree.tostring()`

In [111]:
display(XML(license, style='perldoc'))

In [112]:
print(license.text)

None


In [114]:
# `et` (lxml.etree) is already imported in the first cell, so it is not re-imported here.
# method='text' serializes only the text content of the element and its
# sub-elements; encoding='unicode' returns a str instead of bytes
license_text = et.tostring(license, method='text', encoding='unicode')
print(license_text)

This is an open access article distributed under the terms of the Creative Commons Attribution License, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are credited.


## Quick quiz before we move on!

In [132]:
# Quiz sample: a <contrib> element with attributes, a <name> sub-element,
# and the text "plossy" directly after </name>
text_to_display3 = """
<contrib contrib-type="author" equal-contrib="yes">
    <name name-style="western">
        <surname>Chen</surname>
        <given-names>Ximing</given-names>
    </name>plossy
</contrib>"""
# `parser` is the XMLParser created in an earlier cell
element = et.fromstring(text_to_display3, parser)
print(text_to_display3)


<contrib contrib-type="author" equal-contrib="yes">
    <name name-style="western">
        <surname>Chen</surname>
        <given-names>Ximing</given-names>
    </name>plossy
</contrib>


* element.tag?
* element.attrib?

In [137]:
# Answers: print each property next to its label
print('element.tag:', element.tag)
print('element.attrib:', element.attrib)

element.tag: contrib
element.attrib: {'contrib-type': 'author', 'equal-contrib': 'yes'}


## Quick quiz before we move on!

In [134]:
# Same quiz sample as before: a <contrib> element with a <name> sub-element
# followed by the text "plossy"
text_to_display3 = """
<contrib contrib-type="author" equal-contrib="yes">
    <name name-style="western">
        <surname>Chen</surname>
        <given-names>Ximing</given-names>
    </name>plossy
</contrib>"""
# `parser` is the XMLParser created in an earlier cell
element = et.fromstring(text_to_display3, parser)
print(text_to_display3)


<contrib contrib-type="author" equal-contrib="yes">
    <name name-style="western">
        <surname>Chen</surname>
        <given-names>Ximing</given-names>
    </name>plossy
</contrib>


In [139]:
new_element = element.xpath('./name')[0]

* new_element.tag?
* new_element.attrib?
* new_element.tail?

In [140]:
# Answers: print each property of the <name> sub-element next to its label
print('new_element.tag:', new_element.tag)
print('new_element.attrib:', new_element.attrib)
print('new_element.tail:', new_element.tail)

new_element.tag: name
new_element.attrib: {'name-style': 'western'}
new_element.tail: plossy



# Using lxml.etree and allofplos Article class to parse XML files

In [161]:
from allofplos import Article
doi = '10.1371/journal.pone.0183591'
# build an Article object from its DOI
article = Article(doi)
# path of the article's XML file (the value shown depends on the local installation)
article.filename

'/Users/Elizabeth/PLOS_Corpus_Project/allofplos/allofplos/allofplos_xml/journal.pone.0183591.xml'

In [162]:
# Is the article XML file locally stored?
article.local

True

In [165]:
tree = article.tree
tree.xpath('./body')

[<Element body at 0x10e84a588>]

In [164]:
xml_root = article.root
xml_root.xpath('.//license')

[<Element license at 0x10e84a888>]

In [167]:
license = xml_root.xpath('.//license')[0]
license.attrib

{'{http://www.w3.org/1999/xlink}href': 'http://creativecommons.org/licenses/by/4.0/', '{http://www.w3.org/1999/xlink}type': 'simple'}

# Project example: which articles use PCR in their methods section?
## First: finding elements with XPath by tag AND attribute
* The body of an article is divided into sections (`sec`)
* Methods sections are marked with the attribute `{'sec-type': 'materials|methods'}` or `{'sec-type': 'methods'}`

In [177]:
methods_sections = xml_root.xpath("//sec[@sec-type='materials|methods']")
print(methods_sections)

[<Element sec at 0x10e863cc8>]


In [194]:
from allofplos.samples.corpus_analysis import get_random_list_of_dois

# First get list of articles/DOIs
# (the helper's name indicates a random sample, so expect a different
# pcr_list on each run)
dois = get_random_list_of_dois(count=50)
pcr_list = []

for doi in dois:
    # Step 1: initialize a new article object for this DOI, leaving the
    # `article` created in an earlier cell unchanged
    sampled_article = Article(doi)
    sampled_root = sampled_article.root
    # Step 2: find Method sections; the plain 'methods' sec-type is only
    # searched when no 'materials|methods' section exists
    methods_sections = (
        sampled_root.xpath("//sec[@sec-type='materials|methods']")
        or sampled_root.xpath("//sec[@sec-type='methods']")
    )
    # Step 3: turn the method sections into strings
    # Step 4: add DOI (once per article) if 'PCR' is in any of those strings
    if any('PCR' in et.tostring(sec, method='text', encoding='unicode')
           for sec in methods_sections):
        pcr_list.append(sampled_article.doi)

print(pcr_list)
        
    

['10.1371/journal.pone.0142631', '10.1371/journal.pone.0174559', '10.1371/journal.pone.0164614', '10.1371/journal.pone.0042632', '10.1371/journal.pone.0022867', '10.1371/journal.pone.0112671', '10.1371/journal.pone.0165158', '10.1371/journal.pone.0062082', '10.1371/journal.pone.0111352', '10.1371/journal.pone.0039120', '10.1371/journal.pone.0052299', '10.1371/journal.pone.0150292', '10.1371/journal.pgen.1003790', '10.1371/journal.pone.0102851', '10.1371/journal.pone.0126598', '10.1371/journal.pbio.2002267', '10.1371/journal.pone.0015766', '10.1371/journal.pone.0033040']
