Adding full text true rate estimator #8

Closed

Changes from all commits
33 commits
46bf663
Added Conda environment list, created using 'conda env export > envir…
Oct 13, 2017
225683e
Renamed files, and added __init.py__ file.
Oct 16, 2017
37f2517
Re-added config. file template.
Oct 16, 2017
ac41cd9
Implemented two-table solution for DOI, allowing repeat measurement o…
Oct 16, 2017
b28eeab
Wrote example for re-joining the dois and XML tables using SQLAlchemy…
Oct 16, 2017
9871cd5
Separated code for inserting into database into its own function.
Oct 16, 2017
72ced9d
Started work to filter down existing DOIs in list, but will likely ab…
Oct 16, 2017
ec4370f
Replaced print() with logging.info, as it seems to be working now in …
Oct 16, 2017
309ec38
Manually pared down Conda environment list.
Oct 16, 2017
55c0240
Added state-of-oa-dois dataset file.
Oct 16, 2017
8929288
Added example database of 10 DOI results.
Oct 16, 2017
6647570
Made minor PEP8 indentation change.
Oct 16, 2017
8d32f2f
Switched from using % method of string formation to using f-strings.
Oct 16, 2017
b66a993
Added channels explicitly to Conda environment YAML file.
Oct 16, 2017
22b0f19
Further refined f-strings implementation.
Oct 16, 2017
b94df06
Set custom user header more sensibly.
Oct 16, 2017
ea814de
Moved ErrorWithAPI placeholder out of function definition, and used .…
Oct 17, 2017
2e939e4
Added .gitignore file.
Oct 17, 2017
08802c7
Specified channels line-by-line in environment.yml.
Oct 18, 2017
06b0fce
Made dataset variable name clearer.
Oct 18, 2017
934782b
Changed input datset in config. template file to use relative path.
Oct 18, 2017
468e1e7
Created unittests from examples for fulltext_indication(), and fixed …
Oct 18, 2017
e05bb2c
Changed multi-line comment to a triple-quoted comment.
Oct 18, 2017
5bade9f
Made PEP8 change to separate builtin and local imports.
Oct 18, 2017
f8cf46c
Added clearer comment re: backoff settings.
Oct 18, 2017
658f7b4
Expanded ErrorWithAPI error with actual API response text.
Oct 18, 2017
71b93e4
Removed superfluous 'is True' in if statement.
Oct 18, 2017
885a03b
Removed whitespace for PEP8.
Oct 18, 2017
e9be10d
Added in-progress proof-of-concept bayesian estimator using randomly-…
Oct 19, 2017
33d899f
Hooked up (and tested) the Bayesian estimator to our dataset, as it i…
Oct 19, 2017
3df9804
Added whitespace to make code easier to read.
Oct 19, 2017
0e82309
Replaced tab charaters with two spaces.
Oct 19, 2017
f184786
Added whitespace in front of a comment, following Google's R style gu…
Oct 19, 2017
9 changes: 9 additions & 0 deletions .gitignore
@@ -0,0 +1,9 @@
.directory
__pycache__
pull_request_Information.md
Datasets/State_of_OA/Not_Currently_Used
Datasets/State_of_OA/state-of-oa-dois_CLEAN.tsv
library_coverage_xml_and_fulltext_indicators.db*
library_management_system_downloader/Not_Currently_Used
library_management_system_downloader/downloader_configuration_file.py
example_sql_for_looking_at_sqlite_database.sql
290,121 changes: 290,121 additions & 0 deletions Datasets/State_of_OA/state-of-oa-dois.tsv

Large diffs are not rendered by default.

18 changes: 18 additions & 0 deletions environment.yml
@@ -0,0 +1,18 @@
name: UPenn_Holdings_Project
dependencies:
  - anaconda::beautifulsoup4=4.6.0
  - anaconda::ipython=5.3.0
  - anaconda::pandas=0.20.1
  - anaconda::pep8=1.7.0
  - anaconda::pip=9.0.1
  - anaconda::python=3.6.1
  - anaconda::r-base=3.4.1
  - anaconda::requests=2.14.2
  - anaconda::spyder=3.1.4
  - anaconda::sqlalchemy=1.1.9
  - anaconda::sqlite=3.13.0
  - anaconda::yaml=0.1.6
  - pip:
    - backoff==1.4.3
    - ipython-genutils==0.2.0
    - ratelimit==1.4.1
204 changes: 204 additions & 0 deletions estimate_true_rate_of_fulltext_access_bayesian_approach.R
@@ -0,0 +1,204 @@
# TODO(jlevern) Copyright statement placeholder
# Portions of this code (marked below) are based on a stan model released
# by Bob Carpenter in 2012 under a "new BSD license" (i.e., a BSD 3-Clause
# license).
# TODO(jlevern) Author placeholder
# TODO(jlevern) File description placeholder

# Settings ---------------------------------------------------------------------

# The location of the tsv dataset with DOI information (specifically, the
# DOI open access "colors"). This will be used below to allow subsetting
# data based on those colors.
location_of_original_tsv_datset <- 'Datasets/State_of_OA/state-of-oa-dois.tsv'

location_of_sqlite_full_text_database <- 'library_coverage_xml_and_fulltext_indicators.db'

# Load packages ----------------------------------------------------------------

## Load rstan ------------------------------------------------------------------

# rstan can be finicky to install (in Linux), so I've included an
# extended comment below to facilitate the process.

# Per https://github.com/stan-dev/rstan/wiki/Installing-RStan-on-Mac-or-Linux,
# rstan should be installed with the following:
# install.packages("rstan", repos = "https://cloud.r-project.org/", dependencies=TRUE)

# If you encounter an error about "lang__grammars__expression_grammar_inst" (or
# something similar) when installing rstan, doing the following may help to
# resolve it:
# - Create a file called ~/.R/Makevars, and add to it the following:
# CXXFLAGS=-DBOOST_PHOENIX_NO_VARIADIC_EXPRESSION
# This follows
# https://github.com/stan-dev/rstan/issues/447#issuecomment-325172186
# - Also, following
# http://sites.psu.edu/theubunturblog/2012/09/03/installing-rstan-in-ubuntu/,
# install r-cran-rcpp and r-cran-inline using your system's package manager
# (those are the Ubuntu package names).

# This follows the guide for loading rstan at
# http://mc-stan.org/workshops/ASA2016/day-1.pdf
library(rstan)
rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())

## Load other packages ---------------------------------------------------------

library('bayesplot')
library('DBI') # Load DBI to use RSQLite, following the RSQLite vignette
# (https://cran.r-project.org/web/packages/RSQLite/vignettes/RSQLite.html)
library('RSQLite')
library('rstantools')
library('shinystan')

# Load data --------------------------------------------------------------------

# Load the original dois dataset:
doi_metadata_dataset <- read.table(
  location_of_original_tsv_datset,
  header = TRUE,
  sep = "\t",
  quote = "",
  strip.white = TRUE,
  # The two columns are DOI and O.A. color; read.table's stringsAsFactors
  # argument is a single logical, so per-column types go in colClasses:
  colClasses = c("character", "factor")
)
# View(doi_metadata_dataset) # Check the dataset visually

# Load the sqlite database that contains full-text holdings information:
fulltext_access_database <- dbConnect(
  RSQLite::SQLite(),
  location_of_sqlite_full_text_database
)
# dbListTables(fulltext_access_database) # Check the list of database tables

# Get whatever full-text information we've stored in our sqlite database:
dois_and_full_text_indicator <- dbGetQuery(
  fulltext_access_database,
  'SELECT
    dois_table.doi,
    library_holdings_data.full_text_indicator
  FROM library_holdings_data
  JOIN dois_table
    ON library_holdings_data.doi_foreign_key = dois_table.database_id
  WHERE
    library_holdings_data.full_text_indicator IS NOT NULL'
)

dbDisconnect(fulltext_access_database) # Close our connection with the database
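The two-table join above can be exercised end-to-end against a throwaway SQLite database. The following stdlib-Python sketch reuses the table and column names from the query; the rows themselves are invented for illustration:

```python
import sqlite3

# Build an in-memory database with the two tables the query expects:
conn = sqlite3.connect(':memory:')
conn.executescript("""
    CREATE TABLE dois_table (
        database_id INTEGER PRIMARY KEY,
        doi TEXT);
    CREATE TABLE library_holdings_data (
        doi_foreign_key INTEGER,
        full_text_indicator INTEGER);
    INSERT INTO dois_table VALUES (1, '10.1000/aaa'), (2, '10.1000/bbb');
    -- One DOI has a recorded result; the other is still NULL:
    INSERT INTO library_holdings_data VALUES (1, 1), (2, NULL);
""")

# The same join and NULL filter as in the dbGetQuery() call:
rows = conn.execute("""
    SELECT
        dois_table.doi,
        library_holdings_data.full_text_indicator
    FROM library_holdings_data
    JOIN dois_table
        ON library_holdings_data.doi_foreign_key = dois_table.database_id
    WHERE library_holdings_data.full_text_indicator IS NOT NULL
""").fetchall()

print(rows)  # → [('10.1000/aaa', 1)]
conn.close()
```

The NULL filter is what lets the two-table design support repeat measurements: unmeasured DOIs simply drop out of the join result.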

# Join open-access color to the above:
full_doi_information_dataset <- merge(
  dois_and_full_text_indicator,
  doi_metadata_dataset,
  by = 'doi'
)
# View(full_doi_information_dataset)  # Check our work visually.

# Remove all duplicate DOIs except the *last* (i.e., most recently added to the
# database) instance of each:
full_doi_information_dataset <- full_doi_information_dataset[
  !duplicated(
    full_doi_information_dataset[, 'doi'],
    fromLast = TRUE
  ),
]
# View(full_doi_information_dataset)  # Check our work visually.
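The keep-the-last-duplicate step above mirrors a plain dict build in Python, where later entries overwrite earlier ones. A tiny sketch with invented DOIs:

```python
# Each pair is (doi, full_text_indicator); building a dict keeps only the
# last entry per DOI, matching the duplicated(..., fromLast) logic above:
measurements = [
    ('10.1000/aaa', 0),
    ('10.1000/bbb', 1),
    ('10.1000/aaa', 1),  # A repeat measurement of the first DOI
]
latest_per_doi = dict(measurements)
print(latest_per_doi)  # → {'10.1000/aaa': 1, '10.1000/bbb': 1}
```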

# Summarize our dataset for the user -------------------------------------------

message("Our dataset has ", nrow(full_doi_information_dataset), " rows, each ",
        "of a unique doi (in cases of duplicate DOIs, only the last-listed ",
        "instance is retained).")

# Print a frequency table summarizing the different types of colors:
message("We have the following numbers of each type of Open Access color:")
message(paste0(capture.output(
  as.data.frame(
    table(
      full_doi_information_dataset[, c('oadoi_color')],
      dnn = 'Color'
    ),
    responseName = 'Frequency'
  )
), collapse = "\n"))
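The frequency table printed via capture.output() corresponds to a plain counter over the color column. A stdlib-Python equivalent, with invented colors:

```python
from collections import Counter

# Hypothetical oadoi_color values for a handful of DOIs:
oadoi_colors = ['closed', 'closed', 'gold', 'bronze', 'closed', 'hybrid']
color_frequencies = Counter(oadoi_colors)
print(color_frequencies.most_common())
# → [('closed', 3), ('gold', 1), ('bronze', 1), ('hybrid', 1)]
```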


# Define a model for stan ------------------------------------------------------

# Below, we define a basic stan model with a Bernoulli likelihood (data-
# generating) function and a flat beta-distributed prior.
# This comes from the stan developers' examples, at
# https://github.com/stan-dev/example-models/blob/master/basic_estimators/bernoulli.stan,
# which, per https://github.com/stan-dev/example-models, is released under a
# New BSD License, copyright Bob Carpenter 2012.
stan_model <- "
data {
  int<lower=0> N;
  int<lower=0,upper=1> y[N];
}
parameters {
  real<lower=0,upper=1> theta;
}
model {
  theta ~ beta(1,1); // A flat prior
  for (n in 1:N)
    y[n] ~ bernoulli(theta);
}
"
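Since the Beta(1,1) prior is conjugate to the Bernoulli likelihood, the posterior this model samples from is known in closed form: Beta(1 + s, 1 + n - s), where s is the number of full-text successes out of n trials. A stdlib-Python sketch with invented counts shows the analytic answer the MCMC run should approximate:

```python
import random

# Hypothetical counts: n trials, s of them with full text available.
n, s = 200, 140

# Closed-form posterior under a Beta(1, 1) prior: Beta(1 + s, 1 + n - s).
alpha, beta = 1 + s, 1 + (n - s)

# The posterior mean is alpha / (alpha + beta):
posterior_mean = alpha / (alpha + beta)
print(round(posterior_mean, 3))  # → 0.698

# A Monte Carlo 95% credible interval from stdlib beta draws:
random.seed(0)
draws = sorted(random.betavariate(alpha, beta) for _ in range(20000))
lower, upper = draws[int(0.025 * 20000)], draws[int(0.975 * 20000)]
```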

# Run the stan model -----------------------------------------------------------

model_fit <- stan(
  model_code = stan_model,
  model_name = "Bernoulli likelihood, Beta(1,1) Prior",
  data = list(
    N = nrow(full_doi_information_dataset),
    y = full_doi_information_dataset$full_text_indicator
  ),
  iter = 1000,
  chains = 4,
  verbose = TRUE
)

# Analyze the stan output ------------------------------------------------------

## Explore the stan output manually --------------------------------------------

shinystan_object <- launch_shinystan(model_fit)

## Additional notes about interpreting this model ------------------------------

# Regarding the relationship between a Bayesian Credible Interval vs. a more
# standard Confidence Interval: From https://en.wikipedia.org/wiki/Credible_interval:
# "For the case of a single parameter and data that can be summarised in a
# single sufficient statistic, it can be shown that the credible interval and
# the confidence interval will coincide if the unknown parameter is a location
# parameter, with a prior that is a uniform flat distribution."

## Make some visualizations from the stan output -------------------------------

# This follows
# http://mc-stan.org/bayesplot/reference/bayesplot-package.html#examples

posterior <- as.matrix(model_fit)

plot_title <- ggtitle(
  "Posterior distribution",
  "with median and 95% credible interval"
)
mcmc_areas(
  posterior,
  pars = c("theta"),
  prob = 0.95
) +
  plot_title

# If we would like to get the Credible Interval ourselves, we can use the
# following:
# theta_posterior <- as.data.frame(posterior)$theta
# mean(theta_posterior)
# sd(theta_posterior)
# quantile(theta_posterior, c(.025, .50, .975))

Binary file not shown.
1 change: 1 addition & 0 deletions library_management_system_downloader/__init.py__
@@ -0,0 +1 @@

55 changes: 55 additions & 0 deletions library_management_system_downloader/create_api_request.py
@@ -0,0 +1,55 @@
import logging

import backoff
from ratelimit import rate_limited
import requests


class ErrorWithAPI(Exception):
    pass


# From the backoff documentation (https://pypi.python.org/pypi/backoff),
# set the maximum number of tries on a failed download:
@backoff.on_exception(
    backoff.expo,
    requests.exceptions.RequestException,
    max_tries=8)
# From the ratelimit documentation (https://pypi.python.org/pypi/ratelimit),
# set the rate limit (in seconds) for API calls:
# A rate limit of 0.2 means max five queries per one second.
@rate_limited(0.2)
def create_api_request(
        item_doi,
        api_base_url,
        static_api_request_parameters_dictionary=None,
        custom_user_agent_string=None):
    """Given an item DOI (lacking the 'doi:/' prefix), query an OpenURL
    resolver and return the XML response from it."""

    if static_api_request_parameters_dictionary is None:
        static_api_request_parameters_dictionary = {}

    custom_api_query_header = {}
    if custom_user_agent_string:
        custom_api_query_header['user-agent'] = custom_user_agent_string

    # Update the static API parameters to include the (dynamic) DOI:
    api_request_parameters = static_api_request_parameters_dictionary.copy()
    api_request_parameters.update({
        'rft_id': f'info:doi/{item_doi}'})

    api_response = requests.get(
        api_base_url,
        params=api_request_parameters,
        headers=custom_api_query_header)

    if api_response.status_code != 200:
        raise ErrorWithAPI(
            f'Problem contacting API: We received Status Code '
            f'{api_response.status_code}. The full response text is '
            f'below: {api_response.text}')

    logging.info(f'Returning query results from URL "{api_response.url}"')

    return api_response
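The two decorators above compose retrying and throttling. The retry half, exponential backoff on failure, can be sketched with the stdlib alone (hypothetical names; the real code delegates this to the backoff package):

```python
import time


def retry_with_expo_backoff(func, max_tries=8, base_delay=0.01):
    """Call func(), retrying on any exception with exponentially
    growing waits, up to max_tries attempts in total."""
    for attempt in range(max_tries):
        try:
            return func()
        except Exception:
            if attempt == max_tries - 1:
                raise  # Out of tries: surface the last failure
            time.sleep(base_delay * (2 ** attempt))


# A flaky callable that fails twice, then succeeds:
attempts = []

def flaky():
    attempts.append(1)
    if len(attempts) < 3:
        raise RuntimeError('transient failure')
    return 'ok'

result = retry_with_expo_backoff(flaky)
print(result, len(attempts))  # → ok 3
```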
@@ -0,0 +1,21 @@
# These are parameters that stay the same from query to query (unlike the
# doi):
static_parameters_for_api = {}

user_agent_custom_string = (
    'This is part of a bulk download for a research '
    'project by Jane Doe of the University of ________. '
    'I have put a threshold on the download in order '
    'not to cause a problem with your servers. If this '
    'download IS causing a problem, please feel free to email me at '
    '________, or to call me directly at ________.')

api_base_url = (
    'https://example.com/openurl')

# This dataset is currently expected to contain two columns:
# 'doi' and 'oadoi_color' (containing values 'closed', 'bronze', 'hybrid',
# etc.)
input_tsv_dataset_location = 'Datasets/State_of_OA/state-of-oa-dois.tsv'

rerun_dois_that_are_already_in_database = False
@@ -0,0 +1,48 @@
import logging
import unittest

from bs4 import BeautifulSoup


def fulltext_indication(api_response_xml):
    """Given XML from an API response, determine whether fulltext is available
    for that item. Return 1 if fulltext is available, and 0 if it is
    not."""
    parseable_response_xml = BeautifulSoup(
        api_response_xml,
        features='xml')

    doi_value = parseable_response_xml.find(
        id='rft.doi')

    full_text_indicator_value = parseable_response_xml.find(
        id='full_text_indicator')

    logging.info(
        f'The value for the "full_text_indicator" key for doi "{doi_value}" '
        f'is "{full_text_indicator_value}".')

    if (full_text_indicator_value is not None and
            full_text_indicator_value.string == 'true'):
        return 1
    else:
        return 0

# =============================================================================
# Function examples / tests
# =============================================================================


class TestFulltextIndication(unittest.TestCase):

    def test_true(self):
        self.assertEqual(
            fulltext_indication(
                '<key id="full_text_indicator">true</key>'),
            1)

    def test_false(self):
        self.assertEqual(
            fulltext_indication('tester'),
            0)

# unittest.main()  # To run the above unit tests.
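For comparison, the same check can be written against the stdlib XML parser. This sketch (hypothetical helper name) mirrors fulltext_indication() without the BeautifulSoup dependency:

```python
import xml.etree.ElementTree as ET


def fulltext_indication_stdlib(api_response_xml):
    """Return 1 if any element with id="full_text_indicator" contains the
    text 'true'; return 0 otherwise (including for unparseable input)."""
    try:
        root = ET.fromstring(api_response_xml)
    except ET.ParseError:
        return 0
    # iter() visits the root element itself as well as all descendants:
    for element in root.iter():
        if element.get('id') == 'full_text_indicator':
            return 1 if (element.text or '').strip() == 'true' else 0
    return 0


print(fulltext_indication_stdlib(
    '<key id="full_text_indicator">true</key>'))  # → 1
print(fulltext_indication_stdlib('tester'))  # → 0
```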