# Process PubMed data to extract article history dates

In [None]:
import collections
import xml.etree.ElementTree
import gzip
import os
import datetime
import calendar
import csv

import pandas
import matplotlib
import seaborn
%matplotlib inline

In [None]:
# Download `pubmed-dates-3.txt.gz` from the NCBI FTP server into the
# `download` directory. With `--timestamping`, wget only re-fetches the file
# when the remote copy is newer than the local one.
url = 'ftp://ftp.ncbi.nlm.nih.gov/pubmed/pubmed-dates-3.txt.gz'
! wget --timestamping --no-verbose --directory-prefix download {url}

In [None]:
# Read the gzipped XML export into an in-memory ElementTree (memory intensive).
# The stream is opened in binary mode so that the XML parser decodes the bytes
# according to the document's own encoding declaration rather than the
# platform's default text encoding.
path = os.path.join('download', 'pubmed-dates-3.txt.gz')
with gzip.open(path, 'rb') as read_file:
    element_tree = xml.etree.ElementTree.parse(read_file)

In [None]:
# Map month abbreviations, as listed by `calendar.month_abbr` (locale
# dependent), to month numbers 1-12. Index 0 of `calendar.month_abbr` is the
# empty string and is skipped.
month_abbr_to_int = {
    abbr: number for number, abbr in enumerate(calendar.month_abbr) if abbr
}


def parse_pmpd(elem):
    """Parse an `ArticleDates/History/PubMedPubDate` element.

    Reads the `Year`, `Month` and `Day` children of `elem`. `Month` may be
    either an integer or a month abbreviation present in `month_abbr_to_int`.

    Returns a `datetime.date`, or `None` when a component is missing or
    unparseable, or when the components do not form a valid calendar date.
    """
    # `findtext` returns None for a missing child, and `int(None)` raises
    # TypeError (not ValueError), so both exception types are handled.
    try:
        year = int(elem.findtext('Year'))
        day = int(elem.findtext('Day'))
    except (TypeError, ValueError):
        return None

    month = elem.findtext('Month')
    try:
        month = int(month)
    except (TypeError, ValueError):
        # Not numeric (or missing): try the abbreviation table, which yields
        # None for unknown values; `date_no_exceptions` then returns None.
        month = month_abbr_to_int.get(month)

    return date_no_exceptions(year, month, day)


def date_no_exceptions(year, month, day):
    """Return a datetime.date or `None` in case of error.

    Errors covered: a non-integer component (TypeError), an out-of-range
    component (ValueError), or an integer too large for the date
    constructor (OverflowError).
    """
    try:
        return datetime.date(year, month, day)
    except (TypeError, ValueError, OverflowError):
        return None


In [None]:
# Parse and process xml: build one record (dict) per `ArticleDates` element
articles = list()
for elem in element_tree.iterfind('ArticleDates'):
    article = dict()

    # Article and journal identifiers
    article['pubmed_id'] = int(elem.findtext('PMID'))
    article['journal'] = elem.findtext('MedlineJournalInfo/MedlineTA')
    article['journal_issn'] = elem.findtext('MedlineJournalInfo/ISSNLinking')
    article['journal_nlm_id'] = elem.findtext('MedlineJournalInfo/NlmUniqueID')

    # One entry per history date, keyed by its `PubStatus` attribute
    for pmpd in elem.findall('History/PubMedPubDate[@PubStatus]'):
        key = pmpd.get('PubStatus')
        value = parse_pmpd(pmpd)
        if value is None:
            # Skip only the unparseable date. The remaining history entries
            # are independent of it and may still hold valid dates.
            continue
        article[key] = value
    articles.append(article)

# Statuses absent from an article become missing values in its row
article_df = pandas.DataFrame(articles)

In [None]:
# Combine `aheadofprint` and `epublish`: use `aheadofprint` where present and
# fall back to `epublish` otherwise. The filled Series is assigned directly,
# because calling `fillna(..., inplace=True)` on a column selection is a
# chained in-place update that does not modify the DataFrame under pandas
# Copy-on-Write.
article_df['date_online'] = article_df['aheadofprint'].fillna(article_df['epublish'])

# Compute proportion missing for each column, then order the columns from
# least to most missing
missing_pct = article_df.isnull().mean().sort_values()
article_df = article_df[missing_pct.index]
missing_pct

In [None]:
# Keep only the columns used for the delay analysis
columns = ['journal', 'journal_issn', 'pubmed_id', 'received', 'revised', 'accepted', 'date_online']
filtered_df = article_df[columns]
print('{} articles before filtering'.format(len(filtered_df)))

# Drop articles missing any selected column; `revised` is allowed to be absent
columns.remove('revised')
filtered_df = filtered_df.dropna(subset=columns)
print('{} articles after removing missing dates'.format(len(filtered_df)))

# Delays in whole days: received -> accepted, and accepted -> online
filtered_df['acceptance_delay'] = (filtered_df['accepted'] - filtered_df['received']).dt.days
filtered_df['publication_delay'] = (filtered_df['date_online'] - filtered_df['accepted']).dt.days

# Discard anachronistic articles: negative delays, received before 2000,
# or an online date in the future
filtered_df = filtered_df[
    (filtered_df['acceptance_delay'] >= 0) & (filtered_df['publication_delay'] >= 0)
]
filtered_df = filtered_df[filtered_df['received'] >= datetime.date(2000, 1, 1)]
filtered_df = filtered_df[filtered_df['date_online'] <= datetime.date.today()]
print('{} articles after removing anachronistic dates'.format(len(filtered_df)))

## Outdated code

In [None]:
def read_dates(path):
    """
    Parse `pubmed-dates.txt` or `pubmed-dates-2.txt`.

    Each non-empty line is tab-delimited: the PubMed ID, the journal, and then
    zero or more `key:value` fields. Only the first colon of a field separates
    the key from the value, so values may themselves contain colons.

    Returns a list with one `collections.OrderedDict` per line, holding
    `pubmed_id`, `journal` and every parsed key; all values are strings.

    Raises `IndexError` for a non-empty line with fewer than two columns and
    `ValueError` for a field that contains no colon.
    """
    articles = list()
    # The context manager closes the file even if a row fails to parse;
    # `newline=''` is the mode the csv module expects for its input files.
    with open(path, newline='') as read_file:
        for row in csv.reader(read_file, delimiter='\t'):
            if not row:
                # csv yields an empty list for a blank line
                continue
            article = collections.OrderedDict()
            article['pubmed_id'] = row[0]
            article['journal'] = row[1]
            for field in row[2:]:
                key, value = field.split(':', 1)
                article[key] = value
            articles.append(article)
    return articles

In [None]:
# Load both older tab-delimited exports with `read_dates`.
# NOTE(review): these are hard-coded absolute paths in a user's home
# directory; the cells visible here do not download these files.
path = '/home/dhimmels/Desktop/pubmed-dates.txt'
articles_1 = read_dates(path)

path = '/home/dhimmels/Desktop/pubmed-dates-2.txt'
articles_2 = read_dates(path)

In [None]:
# One DataFrame per export, built from the records returned by `read_dates`
a1_df = pandas.DataFrame(articles_1)
a2_df = pandas.DataFrame(articles_2)

In [None]:
# Number of rows (articles) in each export
print(len(a1_df), len(a2_df))

In [None]:
# Count the rows of `a1_df` whose `pubmed_id` also occurs in `a2_df`, using
# the vectorised Series sum instead of iterating the Series in Python
a1_df.pubmed_id.isin(a2_df.pubmed_id).sum()

In [None]:
# Show the last rows of `a1_df` when ordered by `pubmed_id`. `read_dates`
# stores `pubmed_id` as a string, so this ordering is lexicographic.
a1_df.sort_values('pubmed_id').tail()

In [None]:
# Show the last rows of `a2_df` when ordered by `pubmed_id`. `read_dates`
# stores `pubmed_id` as a string, so this ordering is lexicographic.
a2_df.sort_values('pubmed_id').tail()