.bumpversion.cfg (2 changes: 1 addition & 1 deletion)

@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 4.1.38
+current_version = 4.1.39
 commit = False
 tag = False
dev/local/setup.cfg (2 changes: 1 addition & 1 deletion)

@@ -1,6 +1,6 @@
 [metadata]
 name = Delphi Development
-version = 4.1.38
+version = 4.1.39
 
 [options]
 packages =
integrations/server/test_covidcast.py (11 changes: 11 additions & 0 deletions)

@@ -476,3 +476,14 @@ def test_week_formats(self):
         self.assertEqual(expected, colond)
         self.assertEqual(expected, dashed)
         self.assertEqual(expected, enumed)
+
+    def test_hsa_nci(self):
+        row = CovidcastTestRow.make_default_row(geo_type='hsa_nci', geo_value='99')
+        self._insert_rows([row])
+        response = self.request_based_on_row(row)
+        expected = [row.as_api_row_dict()]
+        self.assertEqual(response, {
+            'result': 1,
+            'epidata': expected,
+            'message': 'success',
+        })
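The integration test above exercises the full query path, so once a row exists at the new resolution it can be requested like any other geo type. A minimal sketch of the equivalent client-side query follows; the `src`/`sig` names and the date are hypothetical placeholders (not real datasets), and an API key may be required in practice:

```python
# Sketch of querying the covidcast endpoint for an hsa_nci row.
# 'src', 'sig', and the date below are hypothetical placeholders.
import requests

params = {
    "data_source": "src",        # hypothetical data source
    "signals": "sig",            # hypothetical signal
    "time_type": "day",
    "time_values": "20200414",   # hypothetical date
    "geo_type": "hsa_nci",       # the newly supported resolution
    "geo_value": "99",           # a 1-3 digit NCI-modified HSA code
}
resp = requests.get("https://api.delphi.cmu.edu/epidata/covidcast/", params=params)
print(resp.json())  # on a hit: {'result': 1, 'epidata': [...], 'message': 'success'}
```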
src/acquisition/covidcast/csv_importer.py (29 changes: 17 additions & 12 deletions)

@@ -52,18 +52,21 @@ class CsvRowValue:
 class CsvImporter:
     """Finds and parses covidcast CSV files."""
 
+    # set of allowed resolutions (aka "geo_type")
+    GEOGRAPHIC_RESOLUTIONS = {'county', 'hrr', 'msa', 'dma', 'state', 'hhs', 'nation', 'hsa_nci'}
+
+    # regex pattern for matching geo types, note: sort longer string first to avoid wrong substring matches
+    geo_types_pattern = "|".join(sorted(GEOGRAPHIC_RESOLUTIONS, key=len, reverse=True))
+
     # .../source/yyyymmdd_geo_signal.csv
-    PATTERN_DAILY = re.compile(r'^.*/([^/]*)/(\d{8})_(\w+?)_(\w+)\.csv$')
+    PATTERN_DAILY = re.compile(r'^.*/([^/]*)/(\d{8})_(' + geo_types_pattern + r')_(.+)\.csv$')
 
     # .../source/weekly_yyyyww_geo_signal.csv
-    PATTERN_WEEKLY = re.compile(r'^.*/([^/]*)/weekly_(\d{6})_(\w+?)_(\w+)\.csv$')
+    PATTERN_WEEKLY = re.compile(r'^.*/([^/]*)/weekly_(\d{6})_(' + geo_types_pattern + r')_(.+)\.csv$')
 
     # .../issue_yyyymmdd
     PATTERN_ISSUE_DIR = re.compile(r'^.*/([^/]*)/issue_(\d{8})$')
 
-    # set of allowed resolutions (aka "geo_type")
-    GEOGRAPHIC_RESOLUTIONS = {'county', 'hrr', 'msa', 'dma', 'state', 'hhs', 'nation'}
-
     # set of required CSV columns
     REQUIRED_COLUMNS = {'geo_id', 'val', 'se', 'sample_size'}
 
@@ -158,7 +161,7 @@ def find_csv_files(scan_dir, issue=(date.today(), epi.Week.fromdate(date.today()
             daily_match = CsvImporter.PATTERN_DAILY.match(path.lower())
             weekly_match = CsvImporter.PATTERN_WEEKLY.match(path.lower())
             if not daily_match and not weekly_match:
-                logger.warning(event='invalid csv path/filename', detail=path, file=path)
+                logger.warning(event='invalid csv path/filename or geo_type', detail=path, file=path)
                 yield (path, None)
                 continue
 
@@ -186,12 +189,8 @@ def find_csv_files(scan_dir, issue=(date.today(), epi.Week.fromdate(date.today()
                 issue_value=issue_epiweek_value
                 lag_value=delta_epiweeks(time_value_week, issue_epiweek_value)
 
-            # extract and validate geographic resolution
+            # extract geographic resolution
             geo_type = match.group(3).lower()
-            if geo_type not in CsvImporter.GEOGRAPHIC_RESOLUTIONS:
-                logger.warning(event='invalid geo_type', detail=geo_type, file=path)
-                yield (path, None)
-                continue
 
             # extract additional values, lowercased for consistency
             source = match.group(1).lower()
@@ -300,7 +299,7 @@ def extract_and_check_row(row: DataFrameRow, geo_type: str, filepath: Optional[s
             # geo_id was `None`
             return (None, 'geo_id')
 
-        if geo_type in ('hrr', 'msa', 'dma', 'hhs'):
+        if geo_type in ('hrr', 'msa', 'dma', 'hhs', 'hsa_nci'):
             # these particular ids are prone to be written as ints -- and floats
             try:
                 geo_id = str(CsvImporter.floaty_int(geo_id))
@@ -339,6 +338,12 @@ def extract_and_check_row(row: DataFrameRow, geo_type: str, filepath: Optional[s
             if len(geo_id) != 2 or not 'aa' <= geo_id <= 'zz':
                 return (None, 'geo_id')
 
+        elif geo_type == 'hsa_nci':
+            # valid codes should be 1-3 digit numbers, or the special code of "1022" for blank
+            # https://seer.cancer.gov/seerstat/variables/countyattribs/hsa.html
+            if not re.match(r'^(1022|\d{1,3})$', geo_id):
+                return (None, 'geo_id')
+
         else:
             return (None, 'geo_type')
 
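Why the length-descending sort matters for `geo_types_pattern`: regex alternation is tried in order, so a geo type that is a prefix of another could win the match and push its suffix into the signal group. No two of the current resolutions overlap this way, so the sort is defensive; here is a standalone sketch using a hypothetical overlapping pair and a simplified form of `PATTERN_DAILY` (the source group is omitted):

```python
# Standalone sketch of why longer geo types must come first in the alternation.
# 'hsa' is a hypothetical geo type, added only to create an overlapping pair.
import re

geo_types = {'hsa', 'hsa_nci'}

shorter_first = "|".join(sorted(geo_types, key=len))               # 'hsa|hsa_nci'
longer_first = "|".join(sorted(geo_types, key=len, reverse=True))  # 'hsa_nci|hsa'

path = 'source/20200408_hsa_nci_sig.csv'
for alternation in (shorter_first, longer_first):
    m = re.match(r'^.*/(\d{8})_(' + alternation + r')_(.+)\.csv$', path)
    print(alternation, '-> geo:', m.group(2), 'signal:', m.group(3))

# hsa|hsa_nci -> geo: hsa     signal: nci_sig   (wrong split)
# hsa_nci|hsa -> geo: hsa_nci signal: sig       (intended split)
```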
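The success cases in the unit tests later in this diff ('012' normalizing to '12', and '1022' passing) follow from pairing `floaty_int` with the new range regex. A standalone sketch of just that pairing, with `floaty_int` re-implemented here on the assumption that it only rejects non-integral values:

```python
# Sketch of the two-stage hsa_nci geo_id handling: ids arriving as ints/floats
# are normalized first, then validated as 1-3 digit codes or the blank code "1022".
import re

def floaty_int(value) -> int:
    # assumed to mirror CsvImporter.floaty_int: accept '12', '12.0', 12.0;
    # reject non-integral values like '12.5'
    as_float = float(value)
    if not as_float.is_integer():
        raise ValueError(f'{value} is not an int')
    return int(as_float)

def check_hsa_nci_geo_id(geo_id):
    geo_id = str(floaty_int(geo_id))  # '012' -> '12', '99.0' -> '99'
    if not re.match(r'^(1022|\d{1,3})$', geo_id):
        return (None, 'geo_id')
    return (geo_id, None)

print(check_hsa_nci_geo_id('012'))   # ('12', None)
print(check_hsa_nci_geo_id('1022'))  # ('1022', None) -- the special "blank" code
print(check_hsa_nci_geo_id('1111'))  # (None, 'geo_id')
```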
src/client/delphi_epidata.R (2 changes: 1 addition & 1 deletion)

@@ -15,7 +15,7 @@ Epidata <- (function() {
   # API base url
   BASE_URL <- getOption('epidata.url', default = 'https://api.delphi.cmu.edu/epidata/')
 
-  client_version <- '4.1.38'
+  client_version <- '4.1.39'
 
   auth <- getOption("epidata.auth", default = NA)
src/client/delphi_epidata.js (2 changes: 1 addition & 1 deletion)

@@ -22,7 +22,7 @@
   }
 })(this, function (exports, fetchImpl, jQuery) {
   const BASE_URL = "https://api.delphi.cmu.edu/epidata/";
-  const client_version = "4.1.38";
+  const client_version = "4.1.39";
 
   // Helper function to cast values and/or ranges to strings
   function _listitem(value) {
src/client/packaging/npm/package.json (2 changes: 1 addition & 1 deletion)

@@ -2,7 +2,7 @@
   "name": "delphi_epidata",
   "description": "Delphi Epidata API Client",
   "authors": "Delphi Group",
-  "version": "4.1.38",
+  "version": "4.1.39",
   "license": "MIT",
   "homepage": "https://github.com/cmu-delphi/delphi-epidata",
   "bugs": {
src/server/_config.py (2 changes: 1 addition & 1 deletion)

@@ -7,7 +7,7 @@
 
 load_dotenv()
 
-VERSION = "4.1.38"
+VERSION = "4.1.39"
 
 MAX_RESULTS = int(10e6)
 MAX_COMPATIBILITY_RESULTS = int(3650)
src/server/_params.py (1 change: 1 addition & 0 deletions)

@@ -59,6 +59,7 @@ def __init__(self, geo_type: str, geo_values: Union[bool, Sequence[str]]):
         if geo_values == ['']:
             raise ValidationFailedException(f"geo_value is empty for the requested geo_type {geo_type}!")
         # TODO: keep this translator in sync with CsvImporter.GEOGRAPHIC_RESOLUTIONS in acquisition/covidcast/ and with GeoMapper
+        # NOTE: We are not including `hsa_nci` here as the geomapper code does not support that version of the HSA definition.
         geo_type_translator = {
             "county": "fips",
             "state": "state_id",
tests/acquisition/covidcast/test_csv_importer.py (9 changes: 9 additions & 0 deletions)

@@ -94,6 +94,10 @@ def test_find_csv_files(self, mock_glob: MagicMock):
            path_prefix + 'invalid/weekly_222222_b_c.csv',
            # invalid geography
            path_prefix + 'invalid/20200418_province_c.csv',
+           # valid hsa_nci day
+           path_prefix + 'valid/20200408_hsa_nci_sig.csv',
+           # valid hsa_nci week
+           path_prefix + 'valid/weekly_202015_hsa_nci_sig.csv',
            # ignored
            path_prefix + 'ignored/README.md',
        ]
@@ -113,6 +117,8 @@ def test_find_csv_files(self, mock_glob: MagicMock):
            (glob_paths[5], None),
            (glob_paths[6], None),
            (glob_paths[7], None),
+           (glob_paths[8], PathDetails(expected_issue_day, (date.today() - date(year=time_value_day // 10000, month=(time_value_day // 100) % 100, day=time_value_day % 100)).days, 'valid', 'sig', 'day', time_value_day, 'hsa_nci')),
+           (glob_paths[9], PathDetails(expected_issue_week, delta_epiweeks(202015, expected_issue_week), 'valid', 'sig', 'week', 202015, 'hsa_nci')),
        ])
        self.assertEqual(found, expected)
 
@@ -182,6 +188,7 @@ def make_row(
 
        # cases to test each failure mode
        failure_cases = [
+           (make_row(geo_type='hsa_nci', geo_id='1111'), 'geo_id'),
            (make_row(geo_type='county', geo_id='1234'), 'geo_id'),
            (make_row(geo_type='county', geo_id='00000'), 'geo_id'),
            (make_row(geo_type='hrr', geo_id='600'), 'geo_id'),
@@ -215,6 +222,8 @@ def make_row(
            (make_row(value=None, stderr=np.nan, sample_size='', missing_value=str(float(Nans.DELETED)), missing_stderr=str(float(Nans.DELETED)), missing_sample_size=str(float(Nans.DELETED))), CsvRowValue('vi', None, None, None, Nans.DELETED, Nans.DELETED, Nans.DELETED)),
            (make_row(stderr='', sample_size='NA', missing_stderr=str(float(Nans.OTHER)), missing_sample_size=str(float(Nans.OTHER))), CsvRowValue('vi', 1.23, None, None, Nans.NOT_MISSING, Nans.OTHER, Nans.OTHER)),
            (make_row(sample_size=None, missing_value='missing_value', missing_stderr=str(float(Nans.OTHER)), missing_sample_size=str(float(Nans.NOT_MISSING))), CsvRowValue('vi', 1.23, 4.56, None, Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.OTHER)),
+           (make_row(geo_type='hsa_nci', geo_id='1022'), CsvRowValue('1022', 1.23, 4.56, 100.5, Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.NOT_MISSING)),
+           (make_row(geo_type='hsa_nci', geo_id='012'), CsvRowValue('12', 1.23, 4.56, 100.5, Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.NOT_MISSING)),
        ]
 
        for ((geo_type, row), field) in success_cases: