# Process PubMed data and export  to TSVs

In [1]:
import datetime
import importlib
import itertools
import logging
import mimetypes
import os

import pandas
from lxml import etree
import tqdm

# Extract historical dates from PubMed records

In [2]:
encoding_to_module = {
    'gzip': 'gzip',
    'bzip2': 'bz2',
    'xz': 'lzma',
}

def iterparse(path):
    """
    First yield the ElementTree root, then yield elements from an XML file.
    """
    # Automatically detect compression
    type_, encoding = mimetypes.guess_type(path)
    if encoding is None:
        opener = open
    else:
        module = encoding_to_module[encoding]
        opener = importlib.import_module(module).open
    
    # Open file and yield from the element tree
    with opener(path, 'rb') as read_file:
        context = etree.iterparse(read_file, events=('start', 'end'))
        yield next(context)[1]
        yield from (elem for event, elem in context if event == 'end')

In [3]:
def parse_date_text(text):
    """
    Parse an `eSummaryResult/DocSum/Item[@Name='History']/Item[@Type='Date']` element.
    The time on the date is discarded. A `datetime.date` object is returned
    """
    date_, time_ = text.split(' ')
    date_tuple = date_.split('/')
    year, month, day = map(int, date_tuple)
    return datetime.date(year, month, day)

In [4]:
def parse_esummary_history(docsum):
    """docsum is an xml Element."""
    # Extract all historical dates
    date_pairs = list()
    seen = set()
    for item in docsum.findall("Item[@Name='History']/Item[@Type='Date']"):
        name = item.get('Name')
        try:
            date_ = parse_date_text(item.text)
        except Exception as e:
            msg = 'article {}; name: {}; date: {}, threw: {}'.format(article['pubmed_id'], name, item.text, e)
            logging.warning(msg)
            continue
        
        date_pair = name, date_
        if date_pair in seen:
            continue
        seen.add(date_pair)
        date_pairs.append(date_pair)
    date_pairs.sort(key=lambda x: x[0])
    history = dict()
    for name, group in itertools.groupby(date_pairs, key=lambda x: x[0]):
        for i, (name, date_) in enumerate(group):
            history[f'{name}_{i}'] = date_
    return history

In [5]:
path = os.path.join('download', 'esearch_journal-articles_1960-2017.tsv.xz')
n_article_ids = len(pandas.read_table(path))
print(f'{n_article_ids:,} article IDs retrieved via esearch')

23,711,961 article IDs retrieved via esearch


In [6]:
articles = list()
path = os.path.join('download', 'esummary_journal-articles_1960-2017.xml.xz')
parser = iterparse(path)
root = next(parser)
progress_bar = tqdm.tqdm_notebook(total=n_article_ids, unit='articles')
for elem in parser:
    if elem.tag != 'DocSum':
        continue
    
    article = dict()
    article['pubmed_id'] = int(elem.findtext('Id'))
    article['journal_nlm_id'] = elem.findtext("Item[@Name='NlmUniqueID']")
    article.update(parse_esummary_history(elem))
    articles.append(article)
    progress_bar.update(1)
    
    # Reset element to free memory 
    root.clear()

progress_bar.close()

Widget Javascript not detected.  It may not be installed properly. Did you enable the widgetsnbextension? If not, then run "jupyter nbextension enable --py --sys-prefix widgetsnbextension"





In [10]:
print(f'{len(articles):,} processed in esummary output')

23,711,958 processed in esummary output


In [11]:
article_df = pandas.DataFrame(articles)
article_df = article_df.sort_values(by='pubmed_id')

In [12]:
# Combine `aheadofprint` and `epublish`
article_df['date_online'] = article_df['aheadofprint_0']
article_df['date_online'].fillna(article_df['epublish_0'], inplace=True)

# Compute proportion missing for each column
lead_columns = ['pubmed_id', 'journal_nlm_id']
missing_pct = article_df.drop(lead_columns, axis=1).isnull().mean().sort_values()
article_df = article_df[lead_columns + missing_pct.index.tolist()]
missing_pct

KeyError: 'aheadofprint_0'

In [14]:
article_df.tail()

Unnamed: 0,accepted_0,accepted_1,entrez_0,journal_nlm_id,medline_0,pmc-release_0,pubmed_0,pubmed_id,received_0,received_1,received_2,retracted_0,revised_0,revised_1,revised_2,revised_3,revised_4,revised_5,revised_6,version_0
23711953,2017-01-17,,2017-02-14,8809128,2017-02-14,,2017-02-14,28192871,2016-12-06,,,,2017-01-08,,,,,,,
23711954,2017-02-02,,2017-02-14,8809128,2017-02-14,,2017-02-14,28192872,2016-11-10,,,,2017-01-24,,,,,,,
23711955,2016-11-01,,2017-02-14,101292775,2017-02-14,,2017-02-14,28192873,2016-04-03,,,,2016-09-13,,,,,,,
23711956,2017-02-02,,2017-02-14,8809128,2017-02-14,,2017-02-14,28192874,2016-12-29,,,,2017-02-02,,,,,,,
23711957,2017-02-03,,2017-02-14,8809128,2017-02-14,,2017-02-14,28192875,2017-01-17,,,,,,,,,,,


In [None]:
# Save article_df
path = os.path.join('data', 'history-dates.tsv.xz')
article_df.to_csv(path, index=False, sep='\t', compression='xz')