# Process PubMed data and export to TSVs

In [None]:
import bz2
import datetime
import importlib
import mimetypes
import os
import re
import xml.etree.ElementTree

import pandas

# Extract historical dates from PubMed records

## Retrieve PubMed records for all journal articles between 1950 and 2015

Enter the following search in [PubMed](http://www.ncbi.nlm.nih.gov/pubmed/):

```
journal article[pt] AND ("1950/01/01"[PDAT] : "2015/12/31"[PDAT])
```

Then click `Send to:`, choose `File`, and select `XML` as the format and publication date for `Sort by`.

After `pubmed_result.xml` finishes downloading, rename it to `pubmed_journal-articles_1950-2015.xml` and compress the file with bzip2.

**Note**: This method for retrieving PubMed records has been [used](https://twitter.com/clathrin/status/688054056800092165) by Steve Royle for a [similar analysis](https://quantixed.wordpress.com/2015/03/16/waiting-to-happen-ii-publication-lag-times/). For this project, we were [previously retrieving](https://github.com/dhimmel/delays/blob/756ffebf309499a500ec1f83d68803c044ec8729/process.ipynb) history dates from `pubmed-dates-3.txt.gz` available at `ftp://ftp.ncbi.nlm.nih.gov/pubmed/`. However, K. Majewski from the NLM said that `pubmed-dates-3.txt.gz` was created for a special research project and may not contain all records. Therefore, we adopted the Royle method.

In [None]:
# Map the compression name reported by `mimetypes.guess_type` to the
# standard-library module whose `open` function reads that format.
encoding_to_module = {
    'gzip': 'gzip',
    'bzip2': 'bz2',
    'xz': 'lzma',
}

def iterparse(path):
    """
    Return an element tree generator.

    The compression of `path` is inferred from its file extension, and the
    file is opened with the matching standard-library module (or the builtin
    `open` when no compression is detected). The generator yields the
    `(event, elem)` pairs produced by `xml.etree.ElementTree.iterparse`
    using its default events.

    Raises `KeyError` when the detected compression has no entry in
    `encoding_to_module`.
    """
    # Automatically detect compression
    type_, encoding = mimetypes.guess_type(path)
    if encoding is None:
        opener = open
    else:
        module = encoding_to_module[encoding]
        opener = importlib.import_module(module).open
    # Read bytes rather than text so that the XML parser decodes the document
    # according to its own declaration. Text mode would decode with the
    # locale's preferred encoding, which may not match the file.
    with opener(path, 'rb') as read_file:
        yield from xml.etree.ElementTree.iterparse(read_file)

In [None]:
def parse_pmpd(elem):
    """
    Convert a `PubmedArticle/PubmedData/History/PubMedPubDate` element
    into a `datetime.date`.

    The `Year`, `Month`, and `Day` child texts are read from `elem`. When
    any of them is absent or empty, `None` is returned. Otherwise the three
    values are converted to integers and passed to `datetime.date`.
    """
    parts = [elem.findtext(tag) for tag in ('Year', 'Month', 'Day')]
    for part in parts:
        # A missing child gives None and an empty child gives ''
        if not part:
            return None
    return datetime.date(*(int(part) for part in parts))

In [None]:
# Build one record per `PubmedArticle` element in the downloaded XML
path = os.path.join('download', 'pubmed_dhimmel.xml.bz2')
articles = list()
for event, elem in iterparse(path):
    # Skip every element other than a whole article record
    if elem.tag != 'PubmedArticle':
        continue
    article = dict()

    article['pubmed_id'] = int(elem.findtext('MedlineCitation/PMID'))
    article['journal_nlm_id'] = elem.findtext('MedlineCitation/MedlineJournalInfo/NlmUniqueID')

    # Extract all historical dates, keyed by their `PubStatus` attribute
    for pmpd in elem.findall('PubmedData/History/PubMedPubDate[@PubStatus]'):
        key = pmpd.get('PubStatus')
        value = parse_pmpd(pmpd)
        if value is None:
            continue
        article[key] = value
    articles.append(article)

    # Everything needed has been copied into `article`, so drop the element's
    # children, text, and attributes. Otherwise the whole XML tree would
    # accumulate in memory while the file is parsed.
    elem.clear()

# One row per article, ordered by PubMed identifier
article_df = pandas.DataFrame(articles)
article_df = article_df.sort_values(by='pubmed_id')

In [None]:
# Combine `aheadofprint` and `epublish`: use `aheadofprint` where present and
# fall back to `epublish` otherwise. The filled series is assigned directly
# because calling `fillna(..., inplace=True)` on a column selected from the
# dataframe is chained assignment, which may act on a temporary copy and
# leave `article_df` unchanged.
article_df['date_online'] = article_df['aheadofprint'].fillna(article_df['epublish'])

# Compute proportion missing for each column and order the columns from
# least to most missing
missing_pct = article_df.isnull().mean().sort_values()
article_df = article_df[missing_pct.index]
missing_pct

In [None]:
# Preview the first rows of the article dataframe
article_df.head()

In [None]:
# Write the article dataframe to a bzip2-compressed, tab-separated file
# without the dataframe index
path = os.path.join('data', 'articles-all.tsv.bz2')
with bz2.open(path, mode='wt') as write_file:
    article_df.to_csv(write_file, sep='\t', index=False)

# Process PubMed journal catalog

Download and process PubMed/NLM [journal catalog](http://www.ncbi.nlm.nih.gov/books/NBK3827/table/pubmedhelp.T.journal_lists/).

In [None]:
# Download the PubMed journal list into the `download` directory using an
# IPython shell escape. `--timestamping` makes wget skip the transfer unless
# the remote file is newer than the local copy, and `--no-verbose` limits
# the output to basic information and errors.
url = 'ftp://ftp.ncbi.nih.gov/pubmed/J_Medline.txt'
! wget --no-verbose --directory-prefix download --timestamping {url}

In [None]:
# Load the whole downloaded journal list into memory as a single string
# NOTE(review): no encoding is given, so the platform default is used --
# confirm that it matches the encoding of J_Medline.txt
path = os.path.join('download', 'J_Medline.txt')
with open(path, mode='r') as read_file:
    text = read_file.read()

In [None]:
# Create a dataframe of PubMed journals. The text is cut into stanzas at
# lines made up solely of dashes, and each stanza line is read as a
# `key: value` field.
rows = list()
pattern = re.compile('^-+$', re.MULTILINE)
for stanza in re.split(pattern, text):
    stanza = stanza.strip()
    if not stanza:
        continue
    row = dict()
    for line in stanza.split('\n'):
        key, delimiter, value = line.partition(': ')
        if not delimiter:
            # `stanza.strip()` removes the trailing space of an empty final
            # field ("Key: " becomes "Key:"), so accept a bare trailing
            # colon as a field with no value.
            if not line.endswith(':'):
                raise ValueError(
                    'Cannot parse journal field: {!r}'.format(line))
            key = line[:-1]
        # Store empty values as missing
        row[key] = value or None
    rows.append(row)

# One row per journal, ordered by NLM identifier
journal_df = pandas.DataFrame(rows)
journal_df = journal_df.sort_values(by='NlmId')

In [None]:
# Arrange the columns from least to most missing, based on the proportion
# of null values in each column, then display those proportions
missing_pct = journal_df.isnull().mean().sort_values()
journal_df = journal_df.loc[:, missing_pct.index]
missing_pct

In [None]:
# Write the journal dataframe to a tab-separated file without the index
path = 'data/pubmed-journals.tsv'
journal_df.to_csv(path, index=False, sep='\t')