cmu-delphi · krivard · Jan 19, 2022 · Nov 11, 2021 · Nov 12, 2021 · Nov 12, 2021
diff --git a/.github/workflows/python-ci.yml b/.github/workflows/python-ci.yml
@@ -16,7 +16,7 @@ jobs:
     if: github.event.pull_request.draft == false
     strategy:
       matrix:
-        packages: [_delphi_utils_python, changehc, claims_hosp, combo_cases_and_deaths, doctor_visits, google_symptoms, hhs_hosp, hhs_facilities, jhu, nchs_mortality, nowcast, quidel, quidel_covidtest, safegraph_patterns, sir_complainsalot, usafacts]
+        packages: [_delphi_utils_python, changehc, claims_hosp, combo_cases_and_deaths, doctor_visits, dsew_community_profile, google_symptoms, hhs_hosp, hhs_facilities, jhu, nchs_mortality, nowcast, quidel, quidel_covidtest, safegraph_patterns, sir_complainsalot, usafacts]
     defaults:
       run:
         working-directory: ${{ matrix.packages }}

diff --git a/ansible/templates/dsew_community_profile-prod.json.j2 b/ansible/templates/dsew_community_profile-prod.json.j2
@@ -0,0 +1,32 @@
+{
+  "common": {
+    "export_dir": "./receiving",
+    "log_filename": "dsew_cpr.log"
+  },
+  "indicator": {
+    "input_cache": "./input_cache",
+    "reports": "new"
+  },
+  "validation": {
+    "common": {
+      "data_source": "dsew_cpr",
+      "span_length": 14,
+      "min_expected_lag": {"all": "5"},
+      "max_expected_lag": {"all": "9"},
+      "dry_run": true,
+      "suppressed_errors": []
+    },
+    "static": {
+      "minimum_sample_size": 0,
+      "missing_se_allowed": true,
+      "missing_sample_size_allowed": true
+    },
+    "dynamic": {
+      "ref_window_size": 7,
+      "smoothed_signals": [
+        "naats_total_7dav",
+        "naats_positivity_7dav"
+      ]
+    }
+  }
+}
diff --git a/dsew_community_profile/.pylintrc b/dsew_community_profile/.pylintrc
@@ -0,0 +1,22 @@
+
+[MESSAGES CONTROL]
+
+disable=logging-format-interpolation,
+    too-many-locals,
+    too-many-arguments,
+    # Allow pytest functions to be part of a class.
+    no-self-use,
+    # Allow pytest classes to have one test.
+    too-few-public-methods
+
+[BASIC]
+
+# Allow arbitrarily short-named variables.
+variable-rgx=[a-z_][a-z0-9_]*
+argument-rgx=[a-z_][a-z0-9_]*
+attr-rgx=[a-z_][a-z0-9_]*
+
+[DESIGN]
+
+# Don't complain about pytest "unused" arguments.
+ignored-argument-names=(_.*|run_as_module)
diff --git a/dsew_community_profile/DETAILS.md b/dsew_community_profile/DETAILS.md
@@ -0,0 +1,133 @@
+# Dataset layout
+
+The Data Strategy and Execution Workgroup (DSEW) publishes a Community Profile
+Report each weekday, comprising a pair of files: an Excel workbook (.xlsx) and a
+PDF which shows select metrics from the workbook as time series charts and
+choropleth maps. These files are listed as attachments on the healthdata.gov
+site:
+
+https://healthdata.gov/Health/COVID-19-Community-Profile-Report/gqxm-d9w9
+
+Each Excel file attachment has a filename. The filename contains a date,
+presumably the publish date. The attachment also has an alphanumeric
+assetId. Both the filename and the assetId are required for downloading the
+file. Whether this means that updated versions of a particular file may be
+uploaded by DSEW at later times is not known. The attachment does not explicitly
+list an upload timestamp. To be safe, we cache our downloads using both the
+assetId and the filename.
+
+# Workbook layout
+
+Each Excel file is a workbook with multiple sheets. The exemplar file used in
+writing this indicator is "Community Profile Report 20211102.xlsx". The sheets
+include:
+
+- User Notes: Instructions for using the workbook
+- Overview: US National figures for the last 5 weeks, plus monthly peaks back to
+  April 2020
+- Regions*: Figures for FEMA regions (double-checked: they match HHS regions
+  except that FEMA 2 does not include Palau while HHS 2 does)
+- States*: Figures for US states and territories
+- CBSAs*: Figures for US Core-Based Statistical Areas
+- Counties*: Figures for US counties
+- Weekly Transmission Categories: Lists of high, substantial, and moderate
+  transmission states and territories
+- National Peaks: Monthly national peaks back to April 2020
+- National Historic: Daily national figures back to January 22 2020
+- Data Notes: Source and methods information for all metrics
+- Color Thresholds: Color-coding is used extensively in all sheets; these are
+  the keys
+
+The starred sheets above have nearly-identical column layouts, and together
+cover the county, MSA, state, and HHS geographical levels used in
+covidcast. Rather than aggregate them ourselves and risk a mismatch, this
+indicator lifts these geographical aggregations directly from the corresponding
+sheets of the workbook. 
+
+GeoMapper _is_ used to generate national figures from
+state, due to architectural differences between the starred sheets and the
+Overview sheet. If we discover that our nation-level figures differ too much
+from those listed in the Overview sheet, we can add dedicated parsing for the
+Overview sheet and remove GeoMapper from this indicator altogether.
+
+# Sheet layout
+
+## Headers
+
+Each starred sheet has two rows of headers. The first row uses merged cells to
+group several columns together under a single "overheader". This overheader
+often includes the reference period for that group of columns, such as:
+
+- CASES/DEATHS: LAST WEEK (October 26-November 1)
+- TESTING: LAST WEEK (October 24-30, Test Volume October 20-26)
+- TESTING: PREVIOUS WEEK (October 17-23, Test Volume October 13-19)
+
+Overheaders have changed periodically since the first report. For example, the
+"TESTING: LAST WEEK" overheader above has also appeared as "VIRAL (RT-PCR) LAB
+TESTING: LAST WEEK", with and without a separate reference date for Test
+Volume. All known overheader forms are checked in test_pull.py.
+
+The second row contains a header for each column. The headers uniquely identify
+each column included in the sheet. Column headers include spaces, and typically
+specify both the metric and the reference period over which it was calculated,
+such as:
+
+- Total NAATs - last 7 days (may be an underestimate due to delayed reporting)
+- NAAT positivity rate - previous 7 days (may be an underestimate due to delayed
+  reporting)
+
+Column headers have also changed periodically since the first report. For
+example, the "Total NAATs - last 7 days" header above has also appeared as
+"Total RT-PCR diagnostic tests - last 7 days".
+
+## Contents
+
+Each starred sheet contains test positivity and total test volume figures for
+two reference periods, "last [week]" and "previous [week]". In some reports, the
+reference periods for test positivity and total test volume are the same; in
+others, they are different, such that the report contains figures for four
+distinct reference periods, two for each metric we extract.
+
+# Time series conversions and parsing notes
+
+## Reference date
+
+The reference period in the overheader never includes the year. We guess the
+reference year by picking the same year as the publish date (i.e., the date
+extracted from the filename), and if the reference month is greater than the
+publish month, subtract 1 from the reference year. This adequately covers the
+December-January boundary.
+
+We select as reference date the end date of the reference period for each
+metric. Reference periods are always 7 days, so this indicator produces
+seven-day averages. We divide the total testing volume by seven and leave the
+test positivity alone.
+
+## Geo ID
+
+The Counties sheet lists FIPS codes numerically, such that FIPS with a leading
+zero only have four digits. We fix this by zero-filling to five characters.
+
+MSAs are a subset of CBSAs. We fix this by selecting only CBSAs with type
+"Metropolitan".
+
+Most of the starred sheets have the geo id as the first non-index column. The
+Region sheet has no such column. We fix this by generating the HHS ids from the
+index column instead.
+
+## Combining multiple reports
+
+Each report file generates two reference dates for each metric, up to four
+reference dates total. Since it's not clear whether new versions of past files
+are ever made available, the default mode (params.indicator.reports="new")
+fetches any files that are not already in the input cache, then combines the
+results into a single data frame before exporting. This will generate correct
+behavior should (for instance) a previously-downloaded file get a new assetId.
+
+For the initial run on an empty input cache, and for runs configured to process
+a range of reports (using params.indicator.reports=YYYY-mm-dd--YYYY-mm-dd), this
+indicator makes no distinction between figures that came from different
+reports. That may not be what you want. If the covidcast issue date needs to
+match the date on the report filename, then the indicator must instead be run
+repeatedly, with equal start and end dates, keeping the output of each run
+separate.
diff --git a/dsew_community_profile/Makefile b/dsew_community_profile/Makefile
@@ -0,0 +1,30 @@
+# Command-style targets are declared phony so a same-named file or directory can never shadow them.
+.PHONY: venv install install-ci lint test clean
+dir = $(shell find ./delphi_* -name __init__.py | grep -o 'delphi_[_[:alnum:]]*')
+
+venv:  # create the virtual environment in ./env
+	python3.8 -m venv env
+
+install: venv  # editable (-e) install of delphi_utils and this package
+	. env/bin/activate; \
+	pip install wheel ; \
+	pip install -e ../_delphi_utils_python ;\
+	pip install -e .
+
+install-ci: venv  # same as install, but without editable mode
+	. env/bin/activate; \
+	pip install wheel ; \
+	pip install ../_delphi_utils_python ;\
+	pip install .
+
+lint:  # static checks on the package directory found in $(dir)
+	. env/bin/activate; pylint $(dir)
+	. env/bin/activate; pydocstyle $(dir)
+
+test:  # run the unit tests from ./tests with a coverage report
+	. env/bin/activate ;\
+	(cd tests && ../env/bin/pytest --cov=$(dir) --cov-report=term-missing)
+
+clean:  # remove the virtual environment and the local params file
+	rm -rf env
+	rm -f params.json
diff --git a/dsew_community_profile/README.md b/dsew_community_profile/README.md
@@ -0,0 +1,84 @@
+# COVID-19 Community Profile Report
+
+The Data Strategy and Execution Workgroup (DSEW) publishes a Community Profile
+Report each weekday at this location:
+
+https://healthdata.gov/Health/COVID-19-Community-Profile-Report/gqxm-d9w9
+
+This indicator extracts COVID-19 test figures from these reports.
+
+Indicator-specific parameters:
+
+* `input_cache`: a directory where Excel (.xlsx) files downloaded from
+  healthdata.gov will be stored for posterity. Each file is 3.3 MB in size, so
+  we expect this directory to require ~1GB of disk space for each year of
+  operation.
+* `reports`: {new | all | YYYY-mm-dd--YYYY-mm-dd} a string indicating which
+  reports to export. The default, "new", downloads and exports only reports not
+  already found in the input cache. The "all" setting exports data for all
+  available reports, downloading them to the input cache if necessary. The date
+  range setting refers to the date listed in the filename for the report,
+  presumably the publish date. Only reports named with a date within the
+  specified range (inclusive) will be downloaded to the input cache if necessary
+  and exported.
+* `export_start_date`: a YYYY-mm-dd string indicating the first date to export.
+* `export_end_date`: a YYYY-mm-dd string indicating the final date to export.
+
+## Running the Indicator
+
+The indicator is run by directly executing the Python module contained in this
+directory. The safest way to do this is to create a virtual environment,
+install the common DELPHI tools, and then install the module and its
+dependencies. To do this, run the following command from this directory:
+
+```
+make install
+```
+
+This command will install the package in editable mode, so you can make changes that
+will automatically propagate to the installed package. 
+
+All of the user-changeable parameters are stored in `params.json`. To execute
+the module and produce the output datasets (by default, in `receiving`), run
+the following:
+
+```
+env/bin/python -m delphi_dsew_community_profile
+```
+
+If you want to enter the virtual environment in your shell, 
+you can run `source env/bin/activate`. Run `deactivate` to leave the virtual environment. 
+
+Once you are finished, you can remove the virtual environment and 
+params file with the following:
+
+```
+make clean
+```
+
+## Testing the code
+
+To run static tests of the code style, run the following command:
+
+```
+make lint
+```
+
+Unit tests are also included in the module. To execute these, run the following
+command from this directory:
+
+```
+make test
+```
+
+To run individual tests, run the following:
+
+```
+(cd tests && ../env/bin/pytest <your_test>.py --cov=delphi_dsew_community_profile --cov-report=term-missing)
+```
+
+The output will show the number of unit tests that passed and failed, along
+with the percentage of code covered by the tests. 
+
+None of the linting or unit tests should fail, and the code lines that are not covered by unit tests should be small and
+should not include critical sub-routines. 
diff --git a/dsew_community_profile/REVIEW.md b/dsew_community_profile/REVIEW.md
@@ -0,0 +1,38 @@
+## Code Review (Python)
+
+A code review of this module should include a careful look at the code and the
+output. To assist in the process, but certainly not in place of it, please
+check the following items.
+
+**Documentation**
+
+- [ ] the README.md file template is filled out and currently accurate; it is
+possible to load and test the code using only the instructions given
+- [ ] minimal docstrings (one line describing what the function does) are
+included for all functions; full docstrings describing the inputs and expected
+outputs should be given for non-trivial functions
+
+**Structure**
+
+- [ ] code should pass lint checks (`make lint`)
+- [ ] any required metadata files are checked into the repository and placed
+within the directory `static`
+- [ ] any intermediate files that are created and stored by the module should
+be placed in the directory `cache`
+- [ ] final expected output files to be uploaded to the API are placed in the
+`receiving` directory; output files should not be committed to the repository
+- [ ] all options and API keys are passed through the file `params.json`
+- [ ] template parameter file (`params.json.template`) is checked into the
+code; no personal (i.e., usernames) or private (i.e., API keys) information is
+included in this template file
+
+**Testing**
+
+- [ ] module can be installed in a new virtual environment (`make install`)
+- [ ] reasonably high level of unit test coverage covering all of the main logic
+of the code (e.g., missing coverage for raised errors that do not currently seem
+possible to reach are okay; missing coverage for options that will be needed are
+not)
+- [ ] all unit tests run without errors (`make test`)
+- [ ] indicator directory has been added to GitHub CI
+(`covidcast-indicators/.github/workflows/python-ci.yml`)
diff --git a/dsew_community_profile/cache/.gitignore b/dsew_community_profile/cache/.gitignore
diff --git a/dsew_community_profile/delphi_dsew_community_profile/__init__.py b/dsew_community_profile/delphi_dsew_community_profile/__init__.py
@@ -0,0 +1,13 @@
+# -*- coding: utf-8 -*-
+"""Module to pull and clean indicators from the DSEW Community Profile Report.
+
+This file defines the functions that are made public by the module. As the
+module is intended to be executed through the main method, these are primarily
+for testing.
+"""
+
+from __future__ import absolute_import
+
+from . import run
+
+__version__ = "0.1.0"
diff --git a/dsew_community_profile/delphi_dsew_community_profile/__main__.py b/dsew_community_profile/delphi_dsew_community_profile/__main__.py
@@ -0,0 +1,12 @@
+# -*- coding: utf-8 -*-
+"""Call the function run_module when executed.
+
+This file indicates that calling the module (`python -m delphi_dsew_community_profile`) will
+call the function `run_module` found within the run.py file. There should be
+no need to change this template.
+"""
+
+from delphi_utils import read_params
+from .run import run_module  # pragma: no cover
+
+run_module(read_params())  # pragma: no cover