Skip to content

Commit

Permalink
+ strip whitespace when extracting metadata from PubMed Central XML
Browse files Browse the repository at this point in the history
  • Loading branch information
erlehmann committed Oct 2, 2012
1 parent 7d860a3 commit 66b5568
Showing 1 changed file with 8 additions and 2 deletions.
10 changes: 8 additions & 2 deletions sources/pmc.py
Expand Up @@ -106,6 +106,12 @@ def list_articles(target_directory, supplementary_materials=False, skip=[]):
result['supplementary-materials'] = _get_supplementary_materials(tree)
yield result

def _strip_whitespace(text):
    """
    Strip leading and trailing whitespace from every line of ``text``.

    The text is split with ``splitlines()``, each resulting line is
    stripped on its own, and the lines are joined back together with a
    single newline character.

    Consequences of that approach:

    * Any line-break style recognised by ``splitlines()`` is normalised
      to a newline character.
    * Lines that are empty or contain only whitespace are kept as empty
      lines; they are not removed.
    * A line break at the very end of ``text`` is dropped.

    Returns the cleaned-up string; an empty input yields an empty string.
    """
    # Strip per line rather than once on the whole string, so that the
    # indentation inside the text is removed as well.
    stripped_lines = [line.strip() for line in text.splitlines()]
    return '\n'.join(stripped_lines)

def _get_article_categories(tree):
"""
Given an ElementTree, return (some) article categories.
Expand Down Expand Up @@ -176,7 +182,7 @@ def _get_article_abstract(tree):
print abstract.attrib['abstract-type']
continue
else:
return ''.join(abstract.itertext())
return _strip_whitespace(''.join(abstract.itertext()))
return None

def _get_journal_title(tree):
Expand Down Expand Up @@ -578,7 +584,7 @@ def _get_supplementary_material(tree, rid):
caption = sup_tree.find('caption')
result['caption'] = ''
if caption is not None:
result['caption'] = ' '.join(caption.itertext())
result['caption'] = _strip_whitespace(' '.join(caption.itertext()))

media = sup_tree.find('media')
if media is not None:
Expand Down

0 comments on commit 66b5568

Please sign in to comment.