# Process PubMed data to extract article history dates

Documentation of the PubMed XML format and its fields is [available from NCBI](http://www.ncbi.nlm.nih.gov/books/NBK3828/#_publisherhelp_XML_Tag_Descriptions_).

In [12]:
import xml.etree.ElementTree
import gzip
import bz2
import os
import datetime
import calendar

import pandas

In [2]:
# Download `pubmed-dates-3.txt.gz`
# The leading `!` passes the line to the system shell, and IPython replaces
# `{url}` with the value of the Python variable `url` before running it.
# `--directory-prefix download` makes wget store the file under `download/`.
# NOTE(review): `--timestamping` is expected to skip the transfer when the
# local copy is already current -- confirm against the wget manual.
url = 'ftp://ftp.ncbi.nlm.nih.gov/pubmed/pubmed-dates-3.txt.gz'
! wget --timestamping --no-verbose --directory-prefix download {url}

2015-12-22 20:06:51 URL: ftp://ftp.ncbi.nlm.nih.gov/pubmed/pubmed-dates-3.txt.gz [2234] -> "download/.listing" [1]


In [3]:
# Read the xml export. Memory intensive: the whole document is parsed into a
# single in-memory ElementTree.
# The archive is opened in binary mode so the XML parser determines the
# character encoding itself, instead of the text layer decoding the bytes with
# the platform's default encoding before the parser ever sees them.
path = os.path.join('download', 'pubmed-dates-3.txt.gz')
with gzip.open(path, 'rb') as read_file:
    element_tree = xml.etree.ElementTree.parse(read_file)

In [4]:
# Lookup from month abbreviation (as provided by `calendar.month_abbr`) to
# month number. Index 0 of `calendar.month_abbr` is an empty placeholder, so
# only indices 1 through 12 are used.
month_abbr_to_int = {
    calendar.month_abbr[number]: number for number in range(1, 13)
}

def parse_pmpd(elem):
    """Parse an `ArticleDates/History/PubMedPubDate` element.

    Reads the text of the `Year`, `Month` and `Day` children of `elem`.
    `Month` may be an integer or a month abbreviation present in
    `month_abbr_to_int`.

    Returns a `datetime.date`, or `None` when any part is missing, cannot be
    parsed, or the parts do not form a valid date.
    """
    year_text = elem.findtext('Year')
    month_text = elem.findtext('Month')
    day_text = elem.findtext('Day')

    # `findtext` returns None for an absent child and `int(None)` raises
    # TypeError rather than ValueError, so reject missing parts explicitly
    # instead of letting one malformed record abort the whole parse.
    if year_text is None or month_text is None or day_text is None:
        return None

    # parse year and day
    try:
        year = int(year_text)
        day = int(day_text)
    except ValueError:
        return None

    # parse month: try a number first, then fall back to an abbreviation.
    # Surrounding whitespace is stripped for the lookup, matching what
    # `int()` already tolerates for numeric months.
    try:
        month = int(month_text)
    except ValueError:
        month = month_abbr_to_int.get(month_text.strip())

    # An unknown abbreviation leaves `month` as None, which
    # `date_no_exceptions` turns into a `None` result.
    return date_no_exceptions(year, month, day)

def date_no_exceptions(year, month, day):
    """Build a `datetime.date`, yielding `None` instead of raising.

    A `TypeError` or `ValueError` raised by the `datetime.date` constructor
    (for example a `None` component or a day that does not exist in the
    month) results in `None`.
    """
    try:
        result = datetime.date(year, month, day)
    except (TypeError, ValueError):
        result = None
    return result

In [5]:
# Parse and process xml: build one record per `ArticleDates` element, holding
# the article identifiers plus one entry per successfully parsed history date.
articles = []
for article_elem in element_tree.iterfind('ArticleDates'):
    article = {
        'pubmed_id': int(article_elem.findtext('PMID')),
        'journal': article_elem.findtext('MedlineJournalInfo/MedlineTA'),
        'journal_issn': article_elem.findtext('MedlineJournalInfo/ISSNLinking'),
        'journal_nlm_id': article_elem.findtext('MedlineJournalInfo/NlmUniqueID'),
    }

    # Each history date is stored under its `PubStatus` attribute; dates that
    # fail to parse are left out of the record.
    for date_elem in article_elem.findall('History/PubMedPubDate[@PubStatus]'):
        parsed_date = parse_pmpd(date_elem)
        if parsed_date is not None:
            article[date_elem.get('PubStatus')] = parsed_date

    articles.append(article)

article_df = pandas.DataFrame(articles)

In [6]:
# Combine `aheadofprint` and `epublish`: use the `aheadofprint` date and fall
# back to `epublish` where it is missing.
# The filled Series is assigned back explicitly. Calling
# `fillna(..., inplace=True)` on `article_df['date_online']` acts on a
# temporary object and does not update the DataFrame under pandas
# Copy-on-Write.
article_df['date_online'] = article_df['aheadofprint'].fillna(article_df['epublish'])

# Compute proportion missing for each column, and reorder the columns from
# least to most missing
missing_pct = article_df.isnull().mean().sort_values()
article_df = article_df[missing_pct.index]
missing_pct

journal           0.000000
journal_nlm_id    0.000000
pubmed_id         0.000000
accepted          0.018447
journal_issn      0.027405
received          0.035496
date_online       0.217349
aheadofprint      0.342773
revised           0.505117
epublish          0.874222
pmc-release       0.989231
ecollection       0.998232
ppublish          0.998984
medline           0.999730
pubmed            0.999730
entrez            0.999933
dtype: float64

In [13]:
# Save article_df as a bzip2-compressed, tab-separated file
path = os.path.join('data', 'articles-all.tsv.bz2')
# Create the output directory first so that a checkout without `data/` does
# not fail with FileNotFoundError when the file is opened for writing.
os.makedirs(os.path.dirname(path), exist_ok=True)
with bz2.open(path, 'wt') as write_file:
    article_df.to_csv(write_file, index=False, sep='\t')