# Extract delays from PubMed history dates

In [1]:
`%>%` = dplyr::`%>%`

In [2]:
# Load the per-article PubMed history dates from the bzip2-compressed TSV.
# The five columns listed in col_types are parsed as dates; every other
# column is left to readr's default handling.
path = file.path('data', 'history-dates.tsv.bz2')
col_types = list(
  received_0 = readr::col_date(),
  accepted_0 = readr::col_date(),
  date_online = readr::col_date(),
  pubmed_0 = readr::col_date(),
  medline_0 = readr::col_date()
)
# Drop the `_0` suffix from the date columns that later cells refer to
all_df = dplyr::rename(
  readr::read_tsv(path, col_types = col_types),
  received = received_0,
  accepted = accepted_0,
  pubmed = pubmed_0,
  medline = medline_0
)
head(all_df, n = 2)

pubmed_id,journal_nlm_id,medline,pubmed,entrez_0,date_online,aheadofprint_0,received,accepted,revised_0,epublish_0,pmc-release_0,ecollection_0,ppublish_0,version_0
1,151424,1975-06-01,1975-06-01,1975-06-01,,,,,,,,,,
2,372516,1975-10-27,1975-10-27,1975-10-27,,,,,,,,,,


In [3]:
# Set time constraints to eliminate erroneous records:
# the filtering cells keep only rows dated within [earliest, latest]
earliest = readr::parse_date('1960-01-01')
latest = readr::parse_date('2017-02-14')

In [4]:
# Count distinct journals and articles by PubMed year, keeping only
# records whose PubMed date lies within [earliest, latest]
year_df = all_df %>%
  dplyr::filter(pubmed >= earliest) %>%
  dplyr::filter(pubmed <= latest) %>%
  dplyr::mutate(year = lubridate::year(pubmed)) %>%
  dplyr::group_by(year) %>%
  dplyr::summarize(
    # Namespace-qualified: the visible cells never attach dplyr (only `%>%`
    # is aliased), so bare n_distinct()/n() may not resolve
    n_journals = dplyr::n_distinct(journal_nlm_id),
    n_articles = dplyr::n()
  )

# Write the yearly totals as an uncompressed TSV
path = file.path('data', 'yearly-pubmed-totals.tsv')
year_df %>%
  readr::write_tsv(path)

head(year_df, 2)

year,n_journals,n_articles
1960,1925,111955
1961,2440,119933


In [5]:
# Build the acceptance delay dataset: days elapsed between the received and
# accepted dates, with each row dated by its acceptance date
accept_df = all_df %>%
  dplyr::mutate(
    delay_type = 'Acceptance',
    delay = as.numeric(accepted - received, units='days')
  ) %>%
  dplyr::select(journal_nlm_id, pubmed_id, delay_type, date = accepted, delay) %>%
  dplyr::filter(
    ! is.na(delay),        # both dates must be present
    delay > 0,             # strictly positive delays only
    delay <= 365 * 5,      # upper bound of 5 * 365 days
    date >= earliest,
    date <= latest
  )

nrow(accept_df)

In [6]:
# Preview the first two rows of the acceptance delay dataset
head(accept_df, 2)

journal_nlm_id,pubmed_id,delay_type,date,delay
8214379,2408592,Acceptance,1984-12-07,2
8214379,2412502,Acceptance,1985-03-21,55


In [7]:
# Build the publication delay dataset: days elapsed between the accepted and
# online dates, with each row dated by its online date
publish_df = all_df %>%
  dplyr::mutate(
    delay_type = 'Publication',
    delay = as.numeric(date_online - accepted, units='days')
  ) %>%
  dplyr::select(journal_nlm_id, pubmed_id, delay_type, date = date_online, delay) %>%
  dplyr::filter(
    ! is.na(delay),        # both dates must be present
    delay >= 0,            # zero-day delays are kept
    delay <= 365 * 3,      # upper bound of 3 * 365 days
    date >= earliest,
    date <= latest
  )

nrow(publish_df)

In [8]:
# Preview the first two rows of the publication delay dataset
head(publish_df, 2)

journal_nlm_id,pubmed_id,delay_type,date,delay
9305878,10089389,Publication,1999-01-01,74
9305878,10089390,Publication,1999-01-01,247


In [9]:
# Stack the acceptance and publication delay datasets into one long table,
# ordered by journal, then article, then delay type
delay_df = dplyr::arrange(
  dplyr::bind_rows(accept_df, publish_df),
  journal_nlm_id, pubmed_id, delay_type
)

In [10]:
# Preview the first rows of the combined, sorted delay table
head(delay_df)

journal_nlm_id,pubmed_id,delay_type,date,delay
1027,22221113,Acceptance,2011-11-15,111
1027,22221113,Publication,2012-01-05,51
1027,22221154,Acceptance,2011-11-15,227
1027,22221154,Publication,2012-01-05,51
1027,22224504,Acceptance,2011-11-15,88
1027,22224504,Publication,2012-01-08,54


In [11]:
# Preview the last rows of the combined, sorted delay table
tail(delay_df)

journal_nlm_id,pubmed_id,delay_type,date,delay
9892366,21423322,Acceptance,2010-08-15,90
9892366,21423322,Publication,2010-09-01,17
9892366,26097404,Acceptance,2014-11-19,168
9892366,26097404,Publication,2015-01-10,52
9892366,26321875,Acceptance,2013-01-08,100
9892366,26321875,Publication,2013-02-07,30


In [12]:
# Save as an xz-compressed TSV: write the plain TSV first, then compress it
# with the external `xz` command-line tool
path = file.path('data', 'delays.tsv')
delay_df %>%
  readr::write_tsv(path)
# --force lets xz overwrite an existing compressed file from a previous run
status = system2('xz', c('--force', path))
# system2 reports failure through its exit status rather than an R error,
# so check it explicitly instead of silently leaving stale or missing output
if (status != 0) {
  stop('xz failed to compress ', path, ' (exit status ', status, ')')
}