https://github.com/dhimmel/delays/blob/master/process-esummary.ipynb

TODO update with https://github.com/dhimmel/delays/blob/upgrade/pydelays/xml_to_dates.py

In [1]:
import collections
import itertools
import pathlib
import pandas
from pubmedpy.xml_to_dates import iterparse as iterparse_xml

In [2]:
def iter_extract_elems(path, tag):
    """
    Lazily yield every element whose tag equals `tag` from an XML file, such as
    one produced by pubmedpy.eutilities.download_pubmed_ids.

    For memory-efficiency, the XML element tree root is cleared after each
    yielded element has been consumed (i.e. when the caller requests the next
    one), and once more when parsing is finished.
    """
    elems = iterparse_xml(str(path))
    # The first item from the parser is treated as the tree root.
    root = next(elems)
    for candidate in elems:
        if candidate.tag == tag:
            yield candidate
            # Execution resumes here only when the consumer asks for another
            # element, so the element just handed out is no longer needed.
            root.clear()
    root.clear()

In [3]:
# Map three-letter English month abbreviations to month numbers (Jan=1 ... Dec=12).
month_abbrev_to_int = {
    abbrev: number
    for number, abbrev in enumerate(
        ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
        start=1,
    )
}

def date_elem_to_str(elem):
    """
    Render an XML date element as a string at the finest precision available:
    '2002', '2002-01', or '2002-01-05'.

    The element's 'Year', 'Month' and 'Day' child texts are read in that order.
    'Month' may be numeric or a three-letter abbreviation found in
    month_abbrev_to_int. Returns None when `elem` is None or the year is
    missing or not an integer; otherwise the string stops at the first
    component that is missing or not an integer.
    """
    if elem is None:
        return None

    def as_int(text):
        # None (child absent) and non-numeric text both count as "not available".
        try:
            return int(text)
        except (ValueError, TypeError):
            return None

    year = as_int(elem.findtext('Year'))
    if year is None:
        return None
    year_str = f"{year:04d}"

    month_text = elem.findtext('Month')
    # Translate abbreviations such as 'Jan'; any other text is passed through unchanged.
    month = as_int(month_abbrev_to_int.get(month_text, month_text))
    if month is None:
        return year_str
    year_month_str = f"{year:04d}-{month:02d}"

    day = as_int(elem.findtext('Day'))
    if day is None:
        return year_month_str
    return f"{year:04d}-{month:02d}-{day:02d}"

In [4]:
# Lazily stream XML PubmedArticle elements from each *.xml.xz file under
# data/pubmed/efetch, visiting the files in sorted path order.
article_elems = itertools.chain.from_iterable(map(
    lambda xml_path: iter_extract_elems(xml_path, tag='PubmedArticle'),
    sorted(pathlib.Path('data/pubmed/efetch').glob('*.xml.xz')),
))

In [5]:
# Build one record per PubmedArticle element.
# NOTE: article_elems is a one-shot iterator; re-create it before re-running this cell,
# otherwise the loop below sees no elements and `articles` ends up empty.
articles = list()
for elem in article_elems:
    # Example efetch XML for <PubmedArticle> at https://github.com/dhimmel/pubmedpy/blob/f554a06e13e24d661dc5ff93ad07179fb3d7f0af/pubmedpy/data/efetch.xml
    article = collections.OrderedDict()
    for id_type in 'pubmed', 'pmc', 'doi':
        article[id_type] = elem.findtext(f"PubmedData/ArticleIdList/ArticleId[@IdType={id_type!r}]")
    article['journal'] = elem.findtext("MedlineCitation/MedlineJournalInfo/MedlineTA")
    # Prefer the earliest ArticleDate. Unparseable dates come back as None and are
    # dropped: keeping them would make the comparison raise TypeError (None vs str)
    # or store None without ever consulting the journal issue PubDate.
    article_dates = [
        date_str
        for date_str in map(date_elem_to_str, elem.findall("MedlineCitation/Article/ArticleDate"))
        if date_str is not None
    ]
    if article_dates:
        # Date strings start with a zero-padded year, so the lexicographic minimum is the earliest.
        article['pub_date'] = min(article_dates)
    else:
        article['pub_date'] = date_elem_to_str(elem.find("MedlineCitation/Article/Journal/JournalIssue/PubDate"))
    article['authors'] = [
        {
            "fore_name": author_elem.findtext("ForeName"),
            "last_name": author_elem.findtext("LastName"),
            "affiliations": [x.text for x in author_elem.findall("AffiliationInfo/Affiliation")],
        }
        for author_elem in elem.findall("MedlineCitation/Article/AuthorList/Author")
    ]
    articles.append(article)
len(articles)

28436

In [6]:
# Tabulate the article records, ordered by the 'pubmed' column
# (the IDs are strings taken from the XML, so the ordering is lexicographic).
article_df = pandas.DataFrame(articles).sort_values('pubmed')
# Preview a reproducible random sample of 20 articles.
article_df.sample(n=20, random_state=0).sort_values('pubmed')

Unnamed: 0,pubmed,pmc,doi,journal,pub_date,authors
67,12854978,PMC166169,10.1186/1471-2105-4-29,BMC Bioinformatics,2003-07-10,"[{'fore_name': 'James J', 'last_name': 'Campan..."
10332,15073026,,10.1093/bioinformatics/bth174,Bioinformatics,2004-04-08,"[{'fore_name': 'R', 'last_name': 'Pieler', 'af..."
279,15527510,PMC533868,10.1186/1471-2105-5-176,BMC Bioinformatics,2004-11-04,"[{'fore_name': 'Charu G', 'last_name': 'Kumar'..."
330,15673474,PMC548130,10.1186/1471-2105-6-18,BMC Bioinformatics,2005-01-26,"[{'fore_name': 'Rifat A', 'last_name': 'Hamoud..."
11502,16303795,,10.1093/bioinformatics/bti793,Bioinformatics,2005-11-22,"[{'fore_name': 'F', 'last_name': 'Collyn', 'af..."
1265,17217507,PMC1780126,10.1186/1471-2105-7-S4-S15,BMC Bioinformatics,2006-12-12,"[{'fore_name': 'Chaoyang', 'last_name': 'Zhang..."
1574,17594507,PMC1925121,10.1186/1471-2105-8-223,BMC Bioinformatics,2007-06-27,"[{'fore_name': 'Jo-Lan', 'last_name': 'Chung',..."
1736,17931407,PMC2148069,10.1186/1471-2105-8-380,BMC Bioinformatics,2007-10-11,"[{'fore_name': 'Oliviero', 'last_name': 'Carug..."
12780,18006552,,10.1093/bioinformatics/btm522,Bioinformatics,2007-11-15,"[{'fore_name': 'Robin', 'last_name': 'Nunkesse..."
22770,19008937,PMC2570617,10.1371/journal.pcbi.1000221,PLoS Comput Biol,2008-11-14,"[{'fore_name': 'Martin C', 'last_name': 'Stump..."


In [7]:
# Number of PubMed articles per journal
article_df['journal'].value_counts()

Bioinformatics        12963
BMC Bioinformatics     9124
PLoS Comput Biol       6349
Name: journal, dtype: int64

In [8]:
# Cross-tabulate PMC availability (whether a PMC ID is present) against journal
pandas.crosstab(article_df['pmc'].notna(), article_df['journal'])

journal,BMC Bioinformatics,Bioinformatics,PLoS Comput Biol
pmc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
False,25,8166,52
True,9099,4797,6297


In [9]:
# Articles for which no publication date could be extracted
article_df[article_df['pub_date'].isna()]

Unnamed: 0,pubmed,pmc,doi,journal,pub_date,authors


In [10]:
# Article counts by publication-date precision, inferred from string length:
# 10 = 'YYYY-MM-DD', 7 = 'YYYY-MM', 4 = 'YYYY'
article_df['pub_date'].str.len().value_counts()

10    27767
7       510
4       159
Name: pub_date, dtype: int64