From fd6250f7fb6a13adbbcac4ffe902f274eff6071f Mon Sep 17 00:00:00 2001 From: minhkhul <118945681+minhkhul@users.noreply.github.com> Date: Wed, 12 Nov 2025 11:53:12 -0500 Subject: [PATCH 1/2] Add hsa_nci geo resolution + Adjust csv regex patterns (#1690) * Add hsa_nci geo resolution + Adjust csv regex patterns * adjust test * replace old geo_type check with different warning * Add validation check for hsa_nci + validation test cases * add warning to input validation code chunk about hsa_nci * remove hsa from GEOGRAPHIC_RESOLUTIONS * remove hsa from test * adjust PATTERN_DAILY PATTERN_WEEKLY string construction to avoid double braces * Update src/server/_params.py Co-authored-by: george * Update src/acquisition/covidcast/csv_importer.py Co-authored-by: george * integration test --------- Co-authored-by: george --- integrations/server/test_covidcast.py | 11 +++++++ src/acquisition/covidcast/csv_importer.py | 29 +++++++++++-------- src/server/_params.py | 1 + .../covidcast/test_csv_importer.py | 9 ++++++ 4 files changed, 38 insertions(+), 12 deletions(-) diff --git a/integrations/server/test_covidcast.py b/integrations/server/test_covidcast.py index 8b7fc6f52..bc9bfedfa 100644 --- a/integrations/server/test_covidcast.py +++ b/integrations/server/test_covidcast.py @@ -476,3 +476,14 @@ def test_week_formats(self): self.assertEqual(expected, colond) self.assertEqual(expected, dashed) self.assertEqual(expected, enumed) + + def test_hsa_nci(self): + row = CovidcastTestRow.make_default_row(geo_type='hsa_nci', geo_value='99') + self._insert_rows([row]) + response = self.request_based_on_row(row) + expected = [row.as_api_row_dict()] + self.assertEqual(response, { + 'result': 1, + 'epidata': expected, + 'message': 'success', + }) diff --git a/src/acquisition/covidcast/csv_importer.py b/src/acquisition/covidcast/csv_importer.py index 33df578a9..7b8d13c45 100644 --- a/src/acquisition/covidcast/csv_importer.py +++ b/src/acquisition/covidcast/csv_importer.py @@ -52,18 +52,21 @@
class CsvRowValue: class CsvImporter: """Finds and parses covidcast CSV files.""" + # set of allowed resolutions (aka "geo_type") + GEOGRAPHIC_RESOLUTIONS = {'county', 'hrr', 'msa', 'dma', 'state', 'hhs', 'nation', 'hsa_nci'} + + # regex pattern for matching geo types, note: sort longer string first to avoid wrong substring matches + geo_types_pattern = "|".join(sorted(GEOGRAPHIC_RESOLUTIONS, key=len, reverse=True)) + # .../source/yyyymmdd_geo_signal.csv - PATTERN_DAILY = re.compile(r'^.*/([^/]*)/(\d{8})_(\w+?)_(\w+)\.csv$') + PATTERN_DAILY = re.compile(r'^.*/([^/]*)/(\d{8})_(' + geo_types_pattern + r')_(.+)\.csv$') # .../source/weekly_yyyyww_geo_signal.csv - PATTERN_WEEKLY = re.compile(r'^.*/([^/]*)/weekly_(\d{6})_(\w+?)_(\w+)\.csv$') + PATTERN_WEEKLY = re.compile(r'^.*/([^/]*)/weekly_(\d{6})_(' + geo_types_pattern + r')_(.+)\.csv$') # .../issue_yyyymmdd PATTERN_ISSUE_DIR = re.compile(r'^.*/([^/]*)/issue_(\d{8})$') - # set of allowed resolutions (aka "geo_type") - GEOGRAPHIC_RESOLUTIONS = {'county', 'hrr', 'msa', 'dma', 'state', 'hhs', 'nation'} - # set of required CSV columns REQUIRED_COLUMNS = {'geo_id', 'val', 'se', 'sample_size'} @@ -158,7 +161,7 @@ def find_csv_files(scan_dir, issue=(date.today(), epi.Week.fromdate(date.today() daily_match = CsvImporter.PATTERN_DAILY.match(path.lower()) weekly_match = CsvImporter.PATTERN_WEEKLY.match(path.lower()) if not daily_match and not weekly_match: - logger.warning(event='invalid csv path/filename', detail=path, file=path) + logger.warning(event='invalid csv path/filename or geo_type', detail=path, file=path) yield (path, None) continue @@ -186,12 +189,8 @@ def find_csv_files(scan_dir, issue=(date.today(), epi.Week.fromdate(date.today() issue_value=issue_epiweek_value lag_value=delta_epiweeks(time_value_week, issue_epiweek_value) - # # extract and validate geographic resolution + # extract geographic resolution geo_type = match.group(3).lower() - if geo_type not in CsvImporter.GEOGRAPHIC_RESOLUTIONS: - 
logger.warning(event='invalid geo_type', detail=geo_type, file=path) - yield (path, None) - continue # extract additional values, lowercased for consistency source = match.group(1).lower() @@ -300,7 +299,7 @@ def extract_and_check_row(row: DataFrameRow, geo_type: str, filepath: Optional[s # geo_id was `None` return (None, 'geo_id') - if geo_type in ('hrr', 'msa', 'dma', 'hhs'): + if geo_type in ('hrr', 'msa', 'dma', 'hhs', 'hsa_nci'): # these particular ids are prone to be written as ints -- and floats try: geo_id = str(CsvImporter.floaty_int(geo_id)) @@ -339,6 +338,12 @@ def extract_and_check_row(row: DataFrameRow, geo_type: str, filepath: Optional[s if len(geo_id) != 2 or not 'aa' <= geo_id <= 'zz': return (None, 'geo_id') + elif geo_type == 'hsa_nci': + # valid codes should be 1-3 digit numbers, or the special code of "1022" for blank + # https://seer.cancer.gov/seerstat/variables/countyattribs/hsa.html + if not re.match(r'^(1022|\d{1,3})$', geo_id): + return (None, 'geo_id') + else: return (None, 'geo_type') diff --git a/src/server/_params.py b/src/server/_params.py index d565f894d..da0ed0ae1 100644 --- a/src/server/_params.py +++ b/src/server/_params.py @@ -59,6 +59,7 @@ def __init__(self, geo_type: str, geo_values: Union[bool, Sequence[str]]): if geo_values == ['']: raise ValidationFailedException(f"geo_value is empty for the requested geo_type {geo_type}!") # TODO: keep this translator in sync with CsvImporter.GEOGRAPHIC_RESOLUTIONS in acquisition/covidcast/ and with GeoMapper + # NOTE: We are not including `hsa_nci` here as the geomapper code does not support that version of the HSA definition. 
geo_type_translator = { "county": "fips", "state": "state_id", diff --git a/tests/acquisition/covidcast/test_csv_importer.py b/tests/acquisition/covidcast/test_csv_importer.py index 0906febd1..684ba01b1 100644 --- a/tests/acquisition/covidcast/test_csv_importer.py +++ b/tests/acquisition/covidcast/test_csv_importer.py @@ -94,6 +94,10 @@ def test_find_csv_files(self, mock_glob: MagicMock): path_prefix + 'invalid/weekly_222222_b_c.csv', # invalid geography path_prefix + 'invalid/20200418_province_c.csv', + # valid hsa_nci day + path_prefix + 'valid/20200408_hsa_nci_sig.csv', + # valid hsa_nci week + path_prefix + 'valid/weekly_202015_hsa_nci_sig.csv', # ignored path_prefix + 'ignored/README.md', ] @@ -113,6 +117,8 @@ def test_find_csv_files(self, mock_glob: MagicMock): (glob_paths[5], None), (glob_paths[6], None), (glob_paths[7], None), + (glob_paths[8], PathDetails(expected_issue_day, (date.today() - date(year=time_value_day // 10000, month=(time_value_day // 100) % 100, day=time_value_day % 100)).days, 'valid', 'sig', 'day', time_value_day, 'hsa_nci')), + (glob_paths[9], PathDetails(expected_issue_week, delta_epiweeks(202015, expected_issue_week), 'valid', 'sig', 'week', 202015, 'hsa_nci')), ]) self.assertEqual(found, expected) @@ -182,6 +188,7 @@ def make_row( # cases to test each failure mode failure_cases = [ + (make_row(geo_type='hsa_nci', geo_id='1111'), 'geo_id'), (make_row(geo_type='county', geo_id='1234'), 'geo_id'), (make_row(geo_type='county', geo_id='00000'), 'geo_id'), (make_row(geo_type='hrr', geo_id='600'), 'geo_id'), @@ -215,6 +222,8 @@ def make_row( (make_row(value=None, stderr=np.nan, sample_size='', missing_value=str(float(Nans.DELETED)), missing_stderr=str(float(Nans.DELETED)), missing_sample_size=str(float(Nans.DELETED))), CsvRowValue('vi', None, None, None, Nans.DELETED, Nans.DELETED, Nans.DELETED)), (make_row(stderr='', sample_size='NA', missing_stderr=str(float(Nans.OTHER)), missing_sample_size=str(float(Nans.OTHER))), CsvRowValue('vi', 1.23, 
None, None, Nans.NOT_MISSING, Nans.OTHER, Nans.OTHER)), (make_row(sample_size=None, missing_value='missing_value', missing_stderr=str(float(Nans.OTHER)), missing_sample_size=str(float(Nans.NOT_MISSING))), CsvRowValue('vi', 1.23, 4.56, None, Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.OTHER)), + (make_row(geo_type='hsa_nci', geo_id='1022'), CsvRowValue('1022', 1.23, 4.56, 100.5, Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.NOT_MISSING)), + (make_row(geo_type='hsa_nci', geo_id='012'), CsvRowValue('12', 1.23, 4.56, 100.5, Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.NOT_MISSING)), ] for ((geo_type, row), field) in success_cases: From 986c5485e1ab95d10d43aa06ed27c2646effa76b Mon Sep 17 00:00:00 2001 From: melange396 Date: Thu, 13 Nov 2025 15:35:57 +0000 Subject: [PATCH 2/2] chore: release delphi-epidata 4.1.39 --- .bumpversion.cfg | 2 +- dev/local/setup.cfg | 2 +- src/client/delphi_epidata.R | 2 +- src/client/delphi_epidata.js | 2 +- src/client/packaging/npm/package.json | 2 +- src/server/_config.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index e10c0c411..1728fd7e2 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 4.1.38 +current_version = 4.1.39 commit = False tag = False diff --git a/dev/local/setup.cfg b/dev/local/setup.cfg index d4025d35b..abeae2d61 100644 --- a/dev/local/setup.cfg +++ b/dev/local/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = Delphi Development -version = 4.1.38 +version = 4.1.39 [options] packages = diff --git a/src/client/delphi_epidata.R b/src/client/delphi_epidata.R index 9a180a576..99d72b1f5 100644 --- a/src/client/delphi_epidata.R +++ b/src/client/delphi_epidata.R @@ -15,7 +15,7 @@ Epidata <- (function() { # API base url BASE_URL <- getOption('epidata.url', default = 'https://api.delphi.cmu.edu/epidata/') - client_version <- '4.1.38' + client_version <- '4.1.39' auth <- getOption("epidata.auth", default = NA) diff --git 
a/src/client/delphi_epidata.js b/src/client/delphi_epidata.js index f9d5928b7..fee6a10df 100644 --- a/src/client/delphi_epidata.js +++ b/src/client/delphi_epidata.js @@ -22,7 +22,7 @@ } })(this, function (exports, fetchImpl, jQuery) { const BASE_URL = "https://api.delphi.cmu.edu/epidata/"; - const client_version = "4.1.38"; + const client_version = "4.1.39"; // Helper function to cast values and/or ranges to strings function _listitem(value) { diff --git a/src/client/packaging/npm/package.json b/src/client/packaging/npm/package.json index e6708b4da..fa4ff5376 100644 --- a/src/client/packaging/npm/package.json +++ b/src/client/packaging/npm/package.json @@ -2,7 +2,7 @@ "name": "delphi_epidata", "description": "Delphi Epidata API Client", "authors": "Delphi Group", - "version": "4.1.38", + "version": "4.1.39", "license": "MIT", "homepage": "https://github.com/cmu-delphi/delphi-epidata", "bugs": { diff --git a/src/server/_config.py b/src/server/_config.py index 776de4a0f..f5924f803 100644 --- a/src/server/_config.py +++ b/src/server/_config.py @@ -7,7 +7,7 @@ load_dotenv() -VERSION = "4.1.38" +VERSION = "4.1.39" MAX_RESULTS = int(10e6) MAX_COMPATIBILITY_RESULTS = int(3650)