Skip to content

Commit

Permalink
Accuracy analysis: select 200 random DOIs (#17)
Browse files Browse the repository at this point in the history
Refs #15
  • Loading branch information
Jacob Levernier authored and dhimmel committed Dec 19, 2017
1 parent b7fe08c commit a0cea58
Show file tree
Hide file tree
Showing 6 changed files with 361 additions and 94 deletions.
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ conda env create --file=environment.yml
Then use `source activate library-access` and `source deactivate` to activate or deactivate the environment.
On Windows, use `activate library-access` and `deactivate` instead.

## Using the Code

The code files in this repository assume that your working directory is set to the top-level directory of this repository.

## License

The files in this repository are released under the CC0 1.0 public domain dedication ([`LICENSE-CC0.md`](LICENSE-CC0.md)), excepting those that match the glob patterns listed below.
Expand Down
6 changes: 6 additions & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@ dependencies:
- anaconda::pytest=3.2.1
- anaconda::python=3.6.1
- anaconda::r-base=3.4.1
- anaconda::r-dplyr=0.7.0
- anaconda::r-ggplot2=2.2.1
- anaconda::r-knitr=1.16
- anaconda::r-markdown=0.8
- anaconda::r-readr=1.1.1
- anaconda::r-rmarkdown=1.5
- anaconda::requests=2.14.2
- anaconda::spyder=3.1.4
- anaconda::sqlalchemy=1.1.9
Expand Down
94 changes: 0 additions & 94 deletions evaluate_library_access_from_output_tsv.Rmd

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
# Draw a seeded random sample of rows from the library access dataset, taking
# the same number of rows from each full_text_indicator group, and write it to
# a TSV with empty columns to be filled in by manual full-text checks.

# Load dependencies ------------------------------------------------------------

# Make the magrittr pipe available without attaching all of dplyr.
`%>%` <- dplyr::`%>%`

# Settings ---------------------------------------------------------------------

# Path of the compressed input TSV, relative to the working directory.
lzma_compressed_library_access_data_location <- file.path(
  'data', 'library_coverage_xml_and_fulltext_indicators.tsv.xz'
)

# Number of rows drawn from EACH full_text_indicator group. The original note
# expects 2 indicator statuses, i.e. 2 * 100 rows in total.
sample_size_per_cell <- 100

# Path of the TSV that will hold the sample and the manual-check columns.
output_tsv_location <- file.path(
  'evaluate_library_access_from_output_tsv',
  'manual-doi-checks.tsv'
)

# Fixed seed so that the random sampling always returns the same rows.
randomizer_seed_to_set <- 3

# Read the dataset -------------------------------------------------------------

library_access_data <- readr::read_tsv(
  lzma_compressed_library_access_data_location
)
# View(library_access_data) # Check the dataset

# Convert the grouping variable to a factor:
library_access_data <- library_access_data %>%
  dplyr::mutate(full_text_indicator = as.factor(full_text_indicator))

# Create stratified sample, and clean up the tibble ----------------------------

set.seed(randomizer_seed_to_set)
stratefied_sample <- library_access_data %>%
  dplyr::group_by(full_text_indicator) %>%
  dplyr::sample_n(sample_size_per_cell) %>%
  # Drop the grouping once sampling is done, so that the later steps (and
  # anything that reuses this object) operate on a plain, ungrouped tibble.
  dplyr::ungroup() %>%
  # Keep the automated result under an explicit name, next to the columns
  # that will be filled in manually:
  dplyr::rename('full_text_indicator_automated' = 'full_text_indicator') %>%
  dplyr::mutate(
    date_of_manual_full_text_check_inside_campus = NA,
    full_text_indicator_manual_inside_campus = NA,
    date_of_manual_full_text_check_outside_campus = NA,
    full_text_indicator_manual_outside_campus = NA
  )

# Write the output to a TSV ----------------------------------------------------

# NA values are written as empty fields so the manual columns start out blank.
readr::write_tsv(
  stratefied_sample,
  output_tsv_location,
  na = ''
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
# Settings ---------------------------------------------------------------------

# TSV holding the sampled DOIs and the manual-check columns; it is read here
# and later overwritten with the answers entered by the user.
manual_tsv_location <- file.path(
  'evaluate_library_access_from_output_tsv',
  'manual-doi-checks.tsv'
)

# Open the tsv -----------------------------------------------------------------

# Empty fields are read as NA, which is how not-yet-checked rows are found.
dataset_to_go_through <- readr::read_tsv(
  manual_tsv_location,
  na = ''
)
# View(dataset_to_go_through)

# Facilitate going through the rows that haven't been filled in ----------------

# readline() returns '' immediately in a non-interactive session, which would
# make the prompt loop below repeat forever, so fail early instead.
if (!interactive()) {
  stop(
    'This script asks questions via readline() and must be run in an ',
    'interactive R session.',
    call. = FALSE
  )
}

# Keep asking until the user answers y or n (case-insensitive, surrounding
# whitespace ignored).
repeat {
  user_location_input <- readline(paste0(
    'Are you on the university campus network ',
    '(y for on-campus, n for off-campus)? [y/n] '
  ))
  location_answer <- tolower(trimws(user_location_input))

  if (location_answer %in% c('y', 'n')) {
    break # A valid answer was given; move on.
  }

  message('Please enter y or n. Asking again...')
}

# Choose which pair of columns (result + date of check) this session fills in.
if (location_answer == 'y') {
  column_for_data_entry <- 'full_text_indicator_manual_inside_campus'
  column_for_date <- 'date_of_manual_full_text_check_inside_campus'
} else {
  column_for_data_entry <- 'full_text_indicator_manual_outside_campus'
  column_for_date <- 'date_of_manual_full_text_check_outside_campus'
}

# For every row whose result column is still empty: open the DOI in the
# browser, ask the user whether full text is accessible, record the answer and
# today's date, and save the table after each row so progress is never lost.

# The result column holds '1', '0' or 'invalid', and the date column holds
# ISO date strings. read_tsv() may have guessed another type for them (e.g.
# logical when still empty), so store both as character up front; assigning a
# string into a differently-typed tibble column is otherwise rejected by
# current tibble versions.
for (manual_column in c(column_for_data_entry, column_for_date)) {
  dataset_to_go_through[[manual_column]] <- as.character(
    dataset_to_go_through[[manual_column]]
  )
}

# Row indices that have not been checked yet for the chosen location.
rows_to_check <- which(is.na(dataset_to_go_through[[column_for_data_entry]]))

for (row_number in rows_to_check) {
  # `[[` extracts the column as a plain vector, so this is a single string
  # rather than a 1x1 tibble.
  doi_for_row <- dataset_to_go_through[['doi']][row_number]

  url_to_visit <- paste0('https://doi.org/', doi_for_row)

  message('Opening URL "', url_to_visit, '"...')

  utils::browseURL(url_to_visit)

  # Keep asking until one of the accepted answers is given (case-insensitive,
  # surrounding whitespace ignored).
  repeat {
    user_full_text_input <- readline(paste0(
      'Do we have full-text access to this DOI? [y/n/invalid]\n',
      '("invalid" = invalid DOI) '
    ))
    full_text_answer <- tolower(trimws(user_full_text_input))

    if (full_text_answer %in% c('y', 'n', 'invalid')) {
      break # A valid answer was given; move on.
    }

    message('Please enter y, n, or invalid. Asking again...')
  }

  # Record when the check was made and what the outcome was.
  dataset_to_go_through[[column_for_date]][row_number] <-
    as.character(Sys.Date())
  dataset_to_go_through[[column_for_data_entry]][row_number] <- switch(
    full_text_answer,
    y = '1',
    n = '0',
    invalid = 'invalid'
  )

  # Save the changes to the tsv. Use the same writer that created the file so
  # its format (unquoted fields, empty string for NA) stays stable across runs.
  readr::write_tsv(
    dataset_to_go_through,
    manual_tsv_location,
    na = ''
  )
}
Loading

0 comments on commit a0cea58

Please sign in to comment.