In [1]:
library(dplyr, warn = F)
library(ggplot2)

## Read Scopus-PubMed mapping and delays

In [2]:
# Base URL of the Scopus data files, pinned to one journalmetrics commit.
scopus_base_url <- "https://github.com/dhimmel/journalmetrics/raw/34c000d4a00adf02b25a3b62a7d5e5e437091c1a/data/"

# Column types shared by every Scopus table read below: `scopus_id` is parsed
# as text because ids such as `2200147401` are too large for an R integer.
scopus_col_types <- list(scopus_id = readr::col_character())

In [3]:
# Load the table linking journal NLM ids to Scopus ids and preview it.
scopus_map_df <- readr::read_tsv(
  paste0(scopus_base_url, "pubmed-map.tsv"),
  col_types = scopus_col_types
)
head(scopus_map_df, 2)

Unnamed: 0,journal_nlm_id,scopus_id
1,431420,26729
2,1251052,70264


In [4]:
# Read the per-article delays and replace each journal's NLM id with its
# Scopus id (journals missing from the mapping are dropped by the inner join).
delay_df <- "data/delays.tsv.gz" %>%
  readr::read_tsv(col_types = list(
    journal_nlm_id = readr::col_character(),
    date = readr::col_date()
  )) %>%
  dplyr::inner_join(scopus_map_df) %>%
  dplyr::select(-journal_nlm_id) %>%
  # Keep one row per article and delay type. `.keep_all = TRUE` retains the
  # remaining columns (date, delay, scopus_id); without it current dplyr
  # returns only the two key columns and the analysis below loses `delay`.
  dplyr::distinct(pubmed_id, delay_type, .keep_all = TRUE)

head(delay_df, 2)

Joining by: "journal_nlm_id"


Unnamed: 0,pubmed_id,delay_type,date,delay,scopus_id
1,22221113,Acceptance,2011-11-15,111,12547
2,22221113,Publication,2012-01-05,51,12547


## Facet-category plotting functions

In [12]:
# Left-justify `x` in a field of five characters (padding with trailing
# spaces). Longer values are returned unshortened.
pad <- function(x) {
  sprintf("%-5s", x)
}

# Abbreviate a count into a short, padded label such as "999  ", "1.2K ",
# "123K " or "2.5M ".
#
# The magnitude is decided from the numeric value itself. Counting the
# characters of the printed number is unreliable because doubles may print in
# scientific notation (1e6 prints as "1e+06", five characters), which
# mislabelled one million as "1000K".
#
# x: a single number; the vectorised wrapper created below accepts vectors.
# Returns a character string padded by `pad()`.
abbreviate_number <- function(x) {
  magnitude <- abs(x)
  # Missing values and numbers below one thousand are shown unabbreviated.
  if (is.na(magnitude) || magnitude < 1e3) {
    return(pad(x))
  }
  # 1,000 up to 99,999: two significant digits of thousands.
  if (magnitude < 1e5) {
    return(pad(paste0(signif(x / 1e3, digits = 2), "K")))
  }
  # Whole thousands, unless rounding reaches 1000 (e.g. 999,999), in which
  # case the value is reported in millions instead of as "1000K".
  thousands <- round(x / 1e3)
  if (abs(thousands) < 1e3) {
    return(pad(paste0(thousands, "K")))
  }
  pad(paste0(signif(x / 1e6, digits = 2), "M"))
}

# Allow whole vectors (e.g. a data frame column) to be abbreviated at once.
abbreviate_number <- Vectorize(abbreviate_number)

In [35]:
# Summarise the delays of one group of articles as a one-row data frame.
#
# df: data frame with a numeric `delay` column and a `scopus_id` column.
#
# Returns a data frame with the columns
#   n_articles  number of rows in `df`
#   n_journals  number of distinct `scopus_id` values
#   median      median delay
#   low, high   bounds of the 99% t confidence interval of the mean delay
#   mid         mean delay
#
# `t.test` stops with an error when it gets fewer than two observations or
# when all observations are identical, so both cases are handled explicitly:
# the interval is NA when it is undefined and collapses onto the single
# observed value when the delays do not vary.
summarize_delays <- function(df) {
  observed <- df$delay[!is.na(df$delay)]
  if (length(observed) < 2) {
    ci <- c(NA_real_, NA_real_)
  } else if (length(unique(observed)) == 1) {
    ci <- c(observed[1], observed[1])
  } else {
    ci <- t.test(df$delay, conf.level = 0.99)$conf.int
  }

  data.frame(
    n_articles = nrow(df),
    n_journals = length(unique(df$scopus_id)),
    median = median(df$delay),
    low = ci[1],
    mid = mean(df$delay),
    high = ci[2]
  )
}


# Build the faceted delay plot for a table of per-category summaries.
#
# article_df: data frame with one row per plotted bar and the columns
#   `category`, `facet`, `delay_type`, `n_articles`, `median`, `low`, `mid`
#   and `high`.
#
# Returns a ggplot object with one row of panels per `facet` and one column
# per `delay_type`. Each category is drawn as a wide bar from 0 to `mid`, a
# narrow bar from `low` to `high`, a point at `median`, and a text label with
# the abbreviated `n_articles` at the left edge.
prepare_gg <- function(article_df) {
  # Order the categories by `mid`. A category may occur in several facets, so
  # only its first row is used to rank it. This is done with base R because
  # `dplyr::distinct(category)` drops `mid` in current dplyr, which made the
  # subsequent `arrange(mid)` fail.
  first_rows <- article_df[!duplicated(article_df$category), , drop = FALSE]
  categories <- first_rows$category[order(first_rows$mid)]

  gg <- article_df %>%
    dplyr::mutate(category = factor(category, levels = categories)) %>%
    ggplot2::ggplot(ggplot2::aes(y = category)) +
    # Wide bar from zero to the mean delay.
    ggplot2::geom_errorbarh(ggplot2::aes(xmax = mid), height = 0, size = 4, xmin = 0, x = 0, color = "#80A5F9") +
    # Median delay.
    ggplot2::geom_point(ggplot2::aes(x = median), size = 1, color = "#E4F1FE") +
    # Narrow bar spanning `low` to `high`.
    ggplot2::geom_errorbarh(ggplot2::aes(xmin = low, x = mid, xmax = high), height = 0, size = 1, color = "#0000FF") +
    ggplot2::facet_grid(facet ~ delay_type, scales = "free", space = "free_y", shrink = TRUE) +
    ggplot2::theme_bw() +
    ggplot2::scale_x_continuous(name = "Days of delay") +
    ggplot2::scale_y_discrete(name = NULL) +
    ggplot2::expand_limits(x = 0) +
    ggplot2::theme(strip.background = ggplot2::element_rect(fill = "#FEF2E2")) +
    ggplot2::theme(axis.text.y = ggplot2::element_text(size = 8, angle = 30, hjust = 1)) +
    ggplot2::theme(plot.margin = grid::unit(c(2, 2, 2, 2), "points")) +
    # Article count, drawn at the start of each bar.
    ggplot2::geom_text(ggplot2::aes(label = abbreviate_number(n_articles)), x = 0, hjust = -0.1, size = 2, color = "#E4F1FE")

  gg
}


# Summarise delays per (delay_type, facet, category) group and draw the
# acceptance and publication plots side by side.
#
# article_df: data frame with the columns `delay_type`, `facet`, `category`,
#   `scopus_id` and `delay`.
#
# Only groups with at least 100 rows and at least 5 distinct `scopus_id`
# values are summarised. Returns the grob produced by
# `gridExtra::arrangeGrob`, with both plots in a single row.
plot_by_category <- function(article_df) {
  group_summaries <- article_df %>%
    dplyr::group_by(delay_type, facet, category) %>%
    dplyr::filter(n() >= 100, dplyr::n_distinct(scopus_id) >= 5) %>%
    dplyr::do(summarize_delays(.)) %>%
    dplyr::ungroup()

  acceptance_rows <- dplyr::filter(group_summaries, delay_type == "Acceptance")
  publication_rows <- dplyr::filter(group_summaries, delay_type == "Publication")

  gridExtra::arrangeGrob(
    prepare_gg(acceptance_rows),
    prepare_gg(publication_rows),
    nrow = 1
  )
}

## Subject comparison

In [23]:
# Load the Scopus subject areas assigned to each journal and preview them.
subject_df <- readr::read_tsv(
  paste0(scopus_base_url, "subject-areas.tsv"),
  col_types = scopus_col_types
)
head(subject_df, 2)

Unnamed: 0,scopus_id,asjc_code,asjc_description
1,12000,1700,Computer Science
2,12000,3300,Social Sciences


In [24]:
# Load the Scopus top-level subjects assigned to each journal and preview them.
top_df <- readr::read_tsv(
  paste0(scopus_base_url, "title-top-levels.tsv"),
  col_types = scopus_col_types
)
head(top_df, 2)

Unnamed: 0,scopus_id,top_level_subject
1,12000,Physical Sciences
2,12000,Social Sciences


In [25]:
# Stack two labelled copies of the delays: one categorised by top-level
# subject and one by subject area. `facet` records which categorisation a row
# belongs to and `category` holds the subject name.
gg_category_df <- dplyr::bind_rows(
  dplyr::inner_join(delay_df, top_df) %>%
    dplyr::mutate(facet = "Top-Level") %>%
    dplyr::rename(category = top_level_subject),
  dplyr::inner_join(delay_df, subject_df) %>%
    dplyr::mutate(facet = "Subject Area") %>%
    dplyr::select(-asjc_code) %>%
    dplyr::rename(category = asjc_description)
)

head(gg_category_df, 2)

Joining by: "scopus_id"
Joining by: "scopus_id"


Unnamed: 0,pubmed_id,delay_type,date,delay,scopus_id,category,facet
1,22221113,Acceptance,2011-11-15,111,12547,Health Sciences,Top-Level
2,22221113,Publication,2012-01-05,51,12547,Health Sciences,Top-Level


In [29]:
# Draw the subject comparison and write it to a PDF file.
gg <- plot_by_category(gg_category_df)
ggplot2::ggsave(
  filename = "viz/delays-by-subject.pdf",
  plot = gg,
  width = 8,
  height = 6
)

## Journal attributes

In [7]:
# Load the Scopus title attributes of each journal and preview them.
attribute_df <- readr::read_tsv(
  paste0(scopus_base_url, "title-attributes.tsv"),
  col_types = scopus_col_types
)
head(attribute_df, 2)

Unnamed: 0,scopus_id,active,open_access,main_publisher,source_type,publisher_country
1,12000,0,1,Columbus State University,Journal,United States
2,12001,1,1,"Society for the Experimental Analysis of Behavior, Inc.",Journal,United States


In [8]:
# Attach the journal attributes to every delay record and reshape them to long
# form: `facet` names the attribute (with a display label) and `category`
# holds its value.
gg_attribute_df <- attribute_df %>%
  dplyr::select(-source_type) %>%
  dplyr::mutate(
    # Turn the 0/1 flags into readable labels.
    open_access = c("0" = "Closed", "1" = "Open")[as.character(open_access)],
    active = c("0" = "Inactive", "1" = "Active")[as.character(active)]
  ) %>%
  dplyr::inner_join(delay_df) %>%
  tidyr::gather(key = "facet", value = "category", active:publisher_country) %>%
  dplyr::mutate(
    facet = c(
      active = "Active",
      open_access = "Open",
      main_publisher = "Publisher",
      publisher_country = "Publisher Country"
    )[facet]
  )

head(gg_attribute_df, 2)

Joining by: "scopus_id"


Unnamed: 0,scopus_id,pubmed_id,delay_type,date,delay,facet,category
1,12001,19794831,Acceptance,2008-10-07,124,Active,Active
2,12001,19794832,Acceptance,2008-11-17,42,Active,Active


In [36]:
# Draw the journal-attribute comparison and write it to a PDF file.
gg <- plot_by_category(gg_attribute_df)
ggplot2::ggsave(
  filename = "viz/delays-by-attribute.pdf",
  plot = gg,
  width = 12,
  height = 16
)