Accuracy analysis: select 200 random DOIs (#17)

Refs #15
greenelab · Dec 19, 2017 · a0cea58 · a0cea58
1 parent b7fe08c
commit a0cea58
Show file tree

Hide file tree

Showing 6 changed files with 361 additions and 94 deletions.
diff --git a/README.md b/README.md
@@ -18,6 +18,10 @@ conda env create --file=environment.yml
 Then use `source activate library-access` and `source deactivate` to activate or deactivate the environment.
 On windows, use `activate library-access` and `deactivate` instead.
 
+## Using the Code
+
+The code files in this repository assume that your working directory is set to the top-level directory of this repository.
+
 ## License
 
 The files in this repository are released under the CC0 1.0 public domain dedication ([`LICENSE-CC0.md`](LICENSE-CC0.md)), excepting those that match the glob patterns listed below.

diff --git a/environment.yml b/environment.yml
@@ -9,6 +9,12 @@ dependencies:
 - anaconda::pytest=3.2.1
 - anaconda::python=3.6.1
 - anaconda::r-base=3.4.1
+- anaconda::r-dplyr=0.7.0
+- anaconda::r-ggplot2=2.2.1
+- anaconda::r-knitr=1.16
+- anaconda::r-markdown=0.8
+- anaconda::r-readr=1.1.1
+- anaconda::r-rmarkdown=1.5
 - anaconda::requests=2.14.2
 - anaconda::spyder=3.1.4
 - anaconda::sqlalchemy=1.1.9

diff --git a/evaluate_library_access_from_output_tsv.Rmd b/evaluate_library_access_from_output_tsv.Rmd
diff --git a/evaluate_library_access_from_output_tsv/create_stratefied_sample_of_dois.R b/evaluate_library_access_from_output_tsv/create_stratefied_sample_of_dois.R
@@ -0,0 +1,56 @@
+# Draw a reproducible stratified random sample of DOIs from the library access
+# dataset (the same number of DOIs for every `full_text_indicator` value), add
+# empty columns for the manual full-text checks, and write the result to a TSV.
+
+# Load dependencies ------------------------------------------------------------
+
+# Make the magrittr pipe available without attaching dplyr; every other
+# function below is called with an explicit `pkg::` prefix.
+`%>%` <- dplyr::`%>%`
+
+# Settings ---------------------------------------------------------------------
+
+# Input: the xz-compressed TSV with the automated full-text indicators.
+lzma_compressed_library_access_data_location <- file.path(
+  'data', 'library_coverage_xml_and_fulltext_indicators.tsv.xz'
+)
+
+# Number of DOIs to sample for each cell, i.e. for each full_text_indicator
+# status. The total sample size is this number multiplied by the number of
+# statuses (2).
+sample_size_per_cell <- 100
+
+# Output: the sample, with the columns that are to be filled in manually.
+output_tsv_location <- file.path(
+  'evaluate_library_access_from_output_tsv',
+  'manual-doi-checks.tsv'
+)
+
+# Ensure that random sampling will always return the same result.
+randomizer_seed_to_set <- 3
+
+# Read the dataset -------------------------------------------------------------
+
+library_access_data <- readr::read_tsv(
+  lzma_compressed_library_access_data_location
+)
+# View(library_access_data)  # Check the dataset
+
+# Convert variable to factor:
+library_access_data <- library_access_data %>% dplyr::mutate(
+  full_text_indicator = as.factor(full_text_indicator)
+)
+
+# Create stratified sample, and clean up the tibble ----------------------------
+
+set.seed(randomizer_seed_to_set)
+stratefied_sample <- library_access_data %>%
+  dplyr::group_by(full_text_indicator) %>%
+  dplyr::sample_n(sample_size_per_cell) %>%
+  # The grouping is only needed for the sampling itself; drop it so that the
+  # steps below (and anything that uses this tibble later) see plain rows:
+  dplyr::ungroup() %>%
+  # Add columns to fill in manually to the stratified sample dataframe:
+  dplyr::rename('full_text_indicator_automated' = 'full_text_indicator') %>%
+  dplyr::mutate(
+    date_of_manual_full_text_check_inside_campus = NA,
+    full_text_indicator_manual_inside_campus = NA,
+    date_of_manual_full_text_check_outside_campus = NA,
+    full_text_indicator_manual_outside_campus = NA
+  )
+
+# Write the output to a TSV ----------------------------------------------------
+
+# Missing values are written as empty fields, so that the manual columns start
+# out blank.
+readr::write_tsv(
+  stratefied_sample,
+  output_tsv_location,
+  na = ''
+)
diff --git a/evaluate_library_access_from_output_tsv/facilitate_going_through_dois_manually.R b/evaluate_library_access_from_output_tsv/facilitate_going_through_dois_manually.R
@@ -0,0 +1,94 @@
+# Settings ---------------------------------------------------------------------
+
+# The TSV with the sampled DOIs and the columns that are filled in manually.
+manual_tsv_location <- file.path(
+  'evaluate_library_access_from_output_tsv',
+  'manual-doi-checks.tsv'
+)
+
+# Open the tsv -----------------------------------------------------------------
+
+# Empty fields are read as NA; they mark the checks that are still to be done.
+#
+# The manually filled columns are read as character on purpose. What they
+# contain depends on how far the manual checks have progressed (nothing yet,
+# only 0/1, dates, or the text "invalid"), so letting readr guess would give
+# them a different type from one session to the next. All other columns are
+# still guessed.
+dataset_to_go_through <- readr::read_tsv(
+  manual_tsv_location,
+  na = '',
+  col_types = readr::cols(
+    date_of_manual_full_text_check_inside_campus = readr::col_character(),
+    full_text_indicator_manual_inside_campus = readr::col_character(),
+    date_of_manual_full_text_check_outside_campus = readr::col_character(),
+    full_text_indicator_manual_outside_campus = readr::col_character()
+  )
+)
+# View(dataset_to_go_through)
+
+# Facilitate going through the rows that haven't been filled in ----------------
+
+# readline() does not wait for input in a non-interactive session (it returns
+# '' immediately), which would make the prompt loop below run forever. Stop
+# with a clear error instead.
+if (!interactive()) {
+  stop('This script must be run in an interactive R session.', call. = FALSE)
+}
+
+# Ask whether the checks are done from inside or outside the campus network,
+# and keep asking until the answer is y or n (case and surrounding whitespace
+# are ignored).
+repeat {
+  user_location_input <- readline(paste0(
+    'Are you on the university campus network ',
+    '(y for on-campus, n for off-campus)? [y/n] '
+  ))
+  normalised_location_input <- tolower(trimws(user_location_input))
+
+  if (normalised_location_input %in% c('y', 'n')) {
+    break  # Valid answer: move on.
+  }
+
+  message('Please enter y or n. Asking again...')
+}
+
+# The answer decides which pair of columns (result + date) gets filled in.
+if (normalised_location_input == 'y') {
+  column_for_data_entry <- 'full_text_indicator_manual_inside_campus'
+  column_for_date <- 'date_of_manual_full_text_check_inside_campus'
+} else {
+  column_for_data_entry <- 'full_text_indicator_manual_outside_campus'
+  column_for_date <- 'date_of_manual_full_text_check_outside_campus'
+}
+
+# What gets stored for each accepted answer. The values are strings because
+# the column also has to hold the text 'invalid'; keeping every entry a string
+# avoids switching the column type back and forth while it is being filled in.
+# The saved TSV looks the same either way.
+values_for_answers <- c(y = '1', n = '0', invalid = 'invalid')
+
+# Go through every row for which the selected column is still empty: open the
+# DOI in the browser, ask for the outcome, record it together with today's
+# date, and save.
+for (row_number in which(
+  is.na(dataset_to_go_through[[column_for_data_entry]])
+)) {
+  # `[[` gives the DOI as a plain string (single-bracket indexing of a tibble
+  # returns a 1 x 1 tibble).
+  doi_for_row <- dataset_to_go_through[['doi']][row_number]
+
+  url_to_visit <- paste0('https://doi.org/', doi_for_row)
+
+  message('Opening URL "', url_to_visit, '"...')
+
+  utils::browseURL(url_to_visit)
+
+  # Keep asking until the answer is y, n, or invalid (case and surrounding
+  # whitespace are ignored).
+  repeat {
+    user_full_text_input <- readline(paste0(
+      'Do we have full-text access to this DOI? [y/n/invalid]\n',
+      '  ("invalid" = invalid DOI)'
+    ))
+    normalised_full_text_input <- tolower(trimws(user_full_text_input))
+
+    if (normalised_full_text_input %in% names(values_for_answers)) {
+      # Replace single elements of the column vectors, so that base R's
+      # ordinary coercion rules apply whatever type the columns were read as.
+      dataset_to_go_through[[column_for_date]][row_number] <-
+        as.character(Sys.Date())
+      dataset_to_go_through[[column_for_data_entry]][row_number] <-
+        values_for_answers[[normalised_full_text_input]]
+
+      break  # Valid answer: move on to saving.
+    }
+
+    message('Please enter y, n, or invalid. Asking again...')
+  }
+
+  # Save the changes to the tsv after every DOI, so that no work is lost when
+  # the session is interrupted. readr::write_tsv() keeps the file in the same
+  # unquoted format in which it was created.
+  readr::write_tsv(
+    dataset_to_go_through,
+    manual_tsv_location,
+    na = ''
+  )
+}