# Starting with all PubMed records since 2014, parse dates and keep only the records that have received, accepted, and electronic publication dates.

In [1]:
library(readr)
library(dplyr, warn=F)
library(lubridate)
library(magrittr)

In [2]:
# Column parsers for the date fields. Five columns share the
# "year/month/day hour:minute" layout; epub_date is written as
# "year abbreviated-month day" and gets its own parser.
col_types <- c(
  sapply(
    c("date_accepted", "date_epublish", "date_medline", "date_pubmed", "date_received"),
    function(column) readr::col_datetime(format = "%Y/%m/%d %H:%M"),
    simplify = FALSE,
    USE.NAMES = TRUE
  ),
  list(epub_date = readr::col_datetime(format = "%Y %b %d"))
)

# Load the gzipped TSV of PubMed records with the parsers above.
# NOTE(review): "2104" in the file name looks like a transposition of "2014"
# (the output files below use 2014) -- confirm against the file on disk
# before renaming anything.
pubmed_df <- 'data/pubmed-since-2104.tsv.gz' %>%
  readr::read_tsv(col_types = col_types)
nrow(pubmed_df)



: 1 problems parsing 'data/pubmed-since-2104.tsv.gz'. See problems(...) for more details.

In [3]:
# Keep only records that carry all three dates, then express the two
# intervals (received -> accepted, accepted -> epub) in days by dividing the
# time difference by a one-day duration. Finally keep just the identifying
# columns, epub_date, and the two derived intervals.
pubmed_df <- pubmed_df %>%
  dplyr::filter(
    !is.na(date_received),
    !is.na(date_accepted),
    !is.na(epub_date)
  ) %>%
  dplyr::mutate(
    acceptance_days = (date_accepted - date_received) / lubridate::ddays(1),
    publication_days = (epub_date - date_accepted) / lubridate::ddays(1)
  ) %>%
  dplyr::select(
    pubmed_id, doi, pubtype, journal, journal_abbrev,
    epub_date, acceptance_days, publication_days
  )

nrow(pubmed_df)

In [4]:
# Drop rows whose intervals look like data errors: both delays must be
# strictly positive, with publication under 750 days and acceptance under
# 1000 days (bounds are exclusive).
pubmed_df <- pubmed_df %>%
  dplyr::filter(
    publication_days > 0, publication_days < 750,
    acceptance_days > 0, acceptance_days < 1000
  )

nrow(pubmed_df)

In [5]:
# Preview the first rows of the filtered records.
pubmed_df %>% head()

Unnamed: 0,pubmed_id,doi,pubtype,journal,journal_abbrev,epub_date,acceptance_days,publication_days
1,26111384,10.7554/eLife.07072,Journal Article,eLife,Elife,2015-06-25,111,17
2,26111374,10.1371/journal.pcbi.1004310,Journal Article,PLoS computational biology,PLoS Comput Biol,2015-06-25,163,56
3,26111373,10.1111/wrr.12333,Journal Article,Wound repair and regeneration : official publication of the Wound Healing Society [and] the European Tissue Repair Society,Wound Repair Regen,2015-06-24,97,7
4,26111363,10.1002/ppul.23226,Journal Article,Pediatric pulmonology,Pediatr Pulmonol,2015-06-25,159,32
5,26111358,10.1111/jgh.13026,Journal Article,Journal of gastroenterology and hepatology,J Gastroenterol Hepatol,2015-06-25,151,16
6,26111357,10.1002/asia.201500332,Journal Article,"Chemistry, an Asian journal",Chem Asian J,2015-06-25,82,2


In [6]:
# Write the filtered records as a TSV, then gzip that file in place.
pubmed_df %>%
  readr::write_tsv('data/pubmed-since-2014-filtered.tsv')

# --force lets gzip replace a .gz left behind by an earlier run; without it
# gzip declines to overwrite and the stale archive would sit next to a fresh
# uncompressed file. The exit status is checked so a missing gzip binary or a
# failed compression stops the notebook instead of passing silently.
gzip_status <- system2('gzip', c('--force', 'data/pubmed-since-2014-filtered.tsv'))
if (gzip_status != 0) {
  stop('gzip failed with exit status ', gzip_status, call. = FALSE)
}

In [7]:
# Per-journal summary of the filtered records: the number of articles plus
# the mean, median, maximum and minimum of both delay columns.
summary_df <- pubmed_df %>%
  dplyr::group_by(journal, journal_abbrev) %>%
  dplyr::summarize(
    articles = n(),
    publication_days_mean = mean(publication_days),
    publication_days_median = median(publication_days),
    publication_days_max = max(publication_days),
    publication_days_min = min(publication_days),
    acceptance_days_mean = mean(acceptance_days),
    acceptance_days_median = median(acceptance_days),
    acceptance_days_max = max(acceptance_days),
    acceptance_days_min = min(acceptance_days)
  ) %>%
  # summarize() only removes the innermost grouping variable, so the result
  # would otherwise stay grouped by journal; drop the leftover grouping so
  # later operations on summary_df act on the whole table.
  dplyr::ungroup()

nrow(summary_df)

In [8]:
# Preview the first rows of the per-journal summary.
summary_df %>% head()

Unnamed: 0,journal,journal_abbrev,articles,publication_days_mean,publication_days_median,publication_days_max,publication_days_min,acceptance_days_mean,acceptance_days_median,acceptance_days_max,acceptance_days_min
1,AAPS PharmSciTech,AAPS PharmSciTech,282,32.96809,27.0,210,8,116.4043,108,447,1
2,ACS macro letters,ACS Macro Lett,17,6.294118,5.0,22,1,54.47059,44,105,8
3,ACS medicinal chemistry letters,ACS Med Chem Lett,59,6.915254,5.0,62,1,65.20339,57,150,6
4,AIDS research and therapy,AIDS Res Ther,60,18.41667,13.5,65,3,139.4333,136,336,23
5,AIDS research and treatment,AIDS Res Treat,41,29.58537,26.0,66,7,116.2927,111,226,32
6,AIP advances,AIP Adv,5,43.0,9.0,176,8,48.2,43,89,16


In [9]:
# Save the per-journal summary as an uncompressed TSV.
readr::write_tsv(summary_df, 'data/pubmed-since-2014-summary.tsv')