# Extract delays from PubMed history dates

In [1]:
# Attach dplyr (provides the %>% pipe used below) without the masking notes.
# The argument is spelled out in full: `warn` only worked through partial
# matching, and `F` is an ordinary variable that can be reassigned.
library(dplyr, warn.conflicts = FALSE)

In [2]:
# Load the per-article history dates. Only the three date columns used by the
# delay calculations get an explicit type; the received/accepted columns are
# then given shorter names.
# NOTE(review): journal_nlm_id is not given an explicit type here; if the raw
# identifiers can carry leading zeros or letters, confirm the guessed type
# preserves them.
path <- file.path("data", "history-dates.tsv.bz2")
col_types <- list(
  accepted_0 = readr::col_date(),
  received_0 = readr::col_date(),
  date_online = readr::col_date()
)
all_df <- dplyr::rename(
  readr::read_tsv(path, col_types = col_types),
  received = received_0,
  accepted = accepted_0
)
# Preview the first two rows
head(all_df, n = 2)

Unnamed: 0,pubmed_id,journal_nlm_id,medline_0,pubmed_0,entrez_0,date_online,aheadofprint_0,received,accepted,revised_0,epublish_0,pmc-release_0,ecollection_0,ppublish_0,version_0
1,1,151424,1975-06-01,1975-06-01,1975-06-01,,,,,,,,,,
2,2,372516,1975-10-27,1975-10-27,1975-10-27,,,,,,,,,,


In [3]:
# Inclusive date window; records dated outside it are treated as erroneous
# and eliminated by the filters below.
earliest <- readr::parse_date("1955-01-01")
latest <- readr::parse_date("2015-12-31")

In [None]:
# Acceptance delay dataset: days elapsed from receipt to acceptance, dated by
# the acceptance date. Rows are kept only when the delay is known, strictly
# positive, at most 365 * 5 days, and the date lies within [earliest, latest].
accept_df <- all_df %>%
  dplyr::mutate(
    delay_type = "Acceptance",
    delay = as.numeric(accepted - received, units = "days")
  ) %>%
  dplyr::select(
    journal_nlm_id, pubmed_id, delay_type,
    date = accepted,
    delay
  ) %>%
  dplyr::filter(
    !is.na(delay),
    delay > 0,
    delay <= 365 * 5,
    date >= earliest,
    date <= latest
  )

# Number of acceptance-delay records retained
nrow(accept_df)

In [None]:
# Preview the first two acceptance-delay rows
accept_df %>% head(n = 2)

In [None]:
# Publication delay dataset: days elapsed from acceptance to online
# publication, dated by the online date. Rows are kept only when the delay is
# known, non-negative (same-day publication is allowed), at most 365 * 3 days,
# and the date lies within [earliest, latest].
publish_df <- all_df %>%
  dplyr::mutate(
    delay_type = "Publication",
    delay = as.numeric(date_online - accepted, units = "days")
  ) %>%
  dplyr::select(
    journal_nlm_id, pubmed_id, delay_type,
    date = date_online,
    delay
  ) %>%
  dplyr::filter(
    !is.na(delay),
    delay >= 0,
    delay <= 365 * 3,
    date >= earliest,
    date <= latest
  )

# Number of publication-delay records retained
nrow(publish_df)

In [None]:
# Preview the first two publication-delay rows
publish_df %>% head(n = 2)

In [None]:
# Stack the acceptance and publication tables into one long table, ordered by
# journal, then article, then delay type.
delay_df <- dplyr::arrange(
  dplyr::bind_rows(accept_df, publish_df),
  journal_nlm_id, pubmed_id, delay_type
)

In [None]:
# Preview the first rows of the combined table
delay_df %>% head()

In [None]:
# Preview the last rows of the combined table
delay_df %>% tail()

In [None]:
# Save as a gzipped TSV: write the plain TSV first, then compress it with the
# external `gzip` program. `--force` makes gzip overwrite an existing
# compressed file from an earlier run.
path <- file.path("data", "delays.tsv")
readr::write_tsv(delay_df, path)

# system2() returns the command's exit status. Fail loudly when gzip is missing
# or reports an error, rather than silently leaving an uncompressed or stale
# output behind.
gzip_status <- system2("gzip", c("--force", path))
if (!identical(as.integer(gzip_status), 0L)) {
  stop(
    "gzip failed with exit status ", gzip_status, " while compressing ", path,
    call. = FALSE
  )
}