# Process PubMed data and export to TSVs

In [1]:
import bz2
import datetime
import importlib
import itertools
import logging
import mimetypes
import os
import re
import xml.etree.ElementTree

import pandas

# Extract historical dates from PubMed records

In [2]:
# Maps the content-encoding names reported by `mimetypes.guess_type` to the
# standard-library module whose `open` function reads that compression format.
encoding_to_module = {
    'gzip': 'gzip',
    'bzip2': 'bz2',
    'xz': 'lzma',
}

def iterparse(path):
    """
    First yield the ElementTree root, then yield elements from an XML file.

    Compression is detected from the file extension of `path` using
    `mimetypes.guess_type`; uncompressed files are opened with the builtin
    `open`. After the root element, every element is yielded once, when its
    closing tag has been parsed (the root itself is therefore yielded again
    as the final item).

    Raises `KeyError` if the detected compression has no entry in
    `encoding_to_module`.
    """
    # Automatically detect compression
    _, encoding = mimetypes.guess_type(path)
    if encoding is None:
        opener = open
    else:
        module = encoding_to_module[encoding]
        opener = importlib.import_module(module).open
    # Open in binary mode so that the XML parser decodes the bytes according
    # to the document's own encoding declaration rather than relying on the
    # locale's default text encoding.
    with opener(path, 'rb') as read_file:
        context = xml.etree.ElementTree.iterparse(read_file, events=('start', 'end'))
        # The first event is the 'start' of the root element
        yield next(context)[1]
        yield from (elem for event, elem in context if event == 'end')


In [3]:
def parse_date_text(text):
    """
    Parse an `eSummaryResult/DocSum/Item[@Name='History']/Item[@Type='Date']` element.

    `text` is expected to look like `YYYY/MM/DD HH:MM`. Only the first
    whitespace-separated field is used, so the time is discarded and may be
    absent. A `datetime.date` object is returned.

    Raises `ValueError` if `text` is blank or its date field is not three
    `/`-separated integers forming a valid calendar date.
    """
    # Split on any run of whitespace; everything after the date is ignored
    fields = text.split(None, 1)
    if not fields:
        raise ValueError('empty date text: {!r}'.format(text))
    year, month, day = map(int, fields[0].split('/'))
    return datetime.date(year, month, day)

In [4]:
def parse_esummary_history(docsum):
    """
    Extract the historical dates of a single `DocSum` element.

    docsum is an xml Element. The text of every
    `Item[@Name='History']/Item[@Type='Date']` descendant is parsed with
    `parse_date_text`. Items whose text cannot be parsed are reported with
    `logging.warning` and skipped. Exact duplicate (name, date) pairs are
    kept only once.

    Returns a dict whose keys are `'{name}_{i}'`, where `i` numbers the
    distinct dates sharing the same name in document order, and whose values
    are the parsed dates.
    """
    # Extract all historical dates
    date_pairs = list()
    seen = set()
    for item in docsum.findall("Item[@Name='History']/Item[@Type='Date']"):
        name = item.get('Name')
        try:
            date_ = parse_date_text(item.text)
        except Exception as e:
            # Best effort: report the problem and move on to the next item.
            # The article id is read from `docsum` itself so that this
            # function does not depend on any module-level variable.
            logging.warning(
                'article %s; name: %s; date: %s, threw: %s',
                docsum.findtext('Id'), name, item.text, e,
            )
            continue

        date_pair = name, date_
        if date_pair in seen:
            continue
        seen.add(date_pair)
        date_pairs.append(date_pair)
    # groupby needs its input sorted by the grouping key; the sort is stable,
    # so dates keep their document order within each name.
    date_pairs.sort(key=lambda x: x[0])
    history = dict()
    for name, group in itertools.groupby(date_pairs, key=lambda x: x[0]):
        for i, (_, date_) in enumerate(group):
            history['{}_{}'.format(name, i)] = date_
    return history


In [5]:
# Stream the eSummary export and collect one record per DocSum element
path = os.path.join('download', 'esummary_journal-articles_1960-2015.xml.bz2')
articles = []
n_articles = 0

parser = iterparse(path)
# The first item produced by the parser is the root element
root = next(parser)
for elem in parser:
    if elem.tag == 'DocSum':
        n_articles += 1
        article = {
            'pubmed_id': int(elem.findtext('Id')),
            'journal_nlm_id': elem.findtext("Item[@Name='NlmUniqueID']"),
        }
        article.update(parse_esummary_history(elem))
        articles.append(article)

        # Clear the root once a DocSum has been processed to free memory
        root.clear()

# Number of DocSum elements seen and number of records collected
n_articles, len(articles)



(22341848, 22341848)

In [6]:
# One row per article, ordered by PubMed identifier
article_df = pandas.DataFrame(articles).sort_values(by='pubmed_id')

In [8]:
# Combine `aheadofprint` and `epublish`: use the ahead-of-print date and fall
# back to the electronic publication date where it is missing.
# The filled Series is assigned directly instead of calling
# `fillna(..., inplace=True)` on `article_df['date_online']`, because an
# in-place operation on a column selection is not guaranteed to modify
# `article_df` itself.
article_df['date_online'] = article_df['aheadofprint_0'].fillna(article_df['epublish_0'])

# Compute proportion missing for each column
lead_columns = ['pubmed_id', 'journal_nlm_id']
missing_pct = article_df.drop(lead_columns, axis=1).isnull().mean().sort_values()
# Keep the identifier columns first, then order the rest from least to most missing
article_df = article_df[lead_columns + missing_pct.index.tolist()]
missing_pct

medline_0         0.000000e+00
pubmed_0          0.000000e+00
entrez_0          9.399402e-07
date_online       7.951436e-01
aheadofprint_0    8.204849e-01
received_0        8.393478e-01
accepted_0        8.443315e-01
revised_0         9.211524e-01
epublish_0        9.706509e-01
pmc-release_0     9.968478e-01
ecollection_0     9.996401e-01
ppublish_0        9.997162e-01
version_0         9.999842e-01
dtype: float64

In [9]:
# Preview the final rows of the article table
article_df.tail()

Unnamed: 0,pubmed_id,journal_nlm_id,medline_0,pubmed_0,entrez_0,date_online,aheadofprint_0,received_0,accepted_0,revised_0,epublish_0,pmc-release_0,ecollection_0,ppublish_0,version_0
22341843,26771983,101138582,2016-01-16,2016-01-16,2016-01-16,,,,,,,,,,
22341844,26771984,101138582,2016-01-16,2016-01-16,2016-01-16,,,,,,,,,,
22341845,26771985,101138582,2016-01-16,2016-01-16,2016-01-16,,,,,,,,,,
22341846,26771986,101138582,2016-01-16,2016-01-16,2016-01-16,,,,,,,,,,
22341847,26771990,7506858,2016-01-16,2016-01-16,2016-01-16,,,,,,,,,,


In [13]:
# Export the article history dates as a bzip2-compressed, tab-separated file
path = os.path.join('data', 'history-dates.tsv.bz2')
with bz2.open(path, mode='wt') as write_file:
    article_df.to_csv(write_file, sep='\t', index=False)

# Process PubMed journal catalog

Download and process the PubMed/NLM [journal catalog](http://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.journal_lists/).

In [None]:
# Download PubMed Journals
# IPython shell escape: runs wget with `{url}` interpolated from the Python
# variable above. The file is saved under the `download` directory;
# `--timestamping` lets wget skip the transfer when the local copy is already
# current (see the wget manual for the exact rules).
url = 'ftp://ftp.ncbi.nih.gov/pubmed/J_Medline.txt'
! wget --no-verbose --directory-prefix download --timestamping {url}

In [None]:
# Load the whole journal catalog into memory as a single string
path = os.path.join('download', 'J_Medline.txt')
with open(path, mode='r') as read_file:
    text = read_file.read()

In [None]:
# Build a dataframe of PubMed journals.
# The catalog consists of stanzas separated by lines made only of dashes;
# each stanza line has the form `key: value`.
rows = []
pattern = re.compile('^-+$', re.MULTILINE)
for stanza in pattern.split(text):
    stanza = stanza.strip()
    if stanza:
        row = {}
        for line in stanza.split('\n'):
            key, value = line.split(': ', 1)
            # Empty values are stored as missing
            row[key] = value if value else None
        rows.append(row)

journal_df = pandas.DataFrame(rows).sort_values(by='NlmId')

In [None]:
# Reorder the journal columns from least to most missing values
missing_pct = journal_df.isnull().mean()
missing_pct = missing_pct.sort_values()
journal_df = journal_df[missing_pct.index]
# Display the fraction of missing values per column
missing_pct

In [None]:
# Write the journal table to a tab-separated file without the index column
path = 'data/pubmed-journals.tsv'
journal_df.to_csv(path, index=False, sep='\t')