From ff823c65860c33f45710e2bdcd14aaa264fcb4d4 Mon Sep 17 00:00:00 2001
From: minhkhul <minhkhul@andrew.cmu.edu>
Date: Tue, 11 Nov 2025 12:03:54 -0500
Subject: [PATCH 01/11] Add hsa_nci geo resolution + Adjust csv regex patterns

---
 src/acquisition/covidcast/csv_importer.py        | 13 ++++++++-----
 tests/acquisition/covidcast/test_csv_importer.py | 12 ++++++++++++
 2 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/src/acquisition/covidcast/csv_importer.py b/src/acquisition/covidcast/csv_importer.py
index 33df578a9..3d22290a5 100644
--- a/src/acquisition/covidcast/csv_importer.py
+++ b/src/acquisition/covidcast/csv_importer.py
@@ -52,18 +52,21 @@ class CsvRowValue:
 class CsvImporter:
   """Finds and parses covidcast CSV files."""
 
+  # set of allowed resolutions (aka "geo_type")
+  GEOGRAPHIC_RESOLUTIONS = {'county', 'hrr', 'msa', 'dma', 'state', 'hhs', 'nation', "hsa", "hsa_nci"}
+
+  # regex pattern for matching geo types, note: sort longer string first to avoid wrong substring matches
+  geo_types_pattern = "|".join(sorted(GEOGRAPHIC_RESOLUTIONS, key=len, reverse=True))
+
   # .../source/yyyymmdd_geo_signal.csv
-  PATTERN_DAILY = re.compile(r'^.*/([^/]*)/(\d{8})_(\w+?)_(\w+)\.csv$')
+  PATTERN_DAILY = re.compile(rf'^.*/([^/]*)/(\d{{8}})_({geo_types_pattern})_(\w+)\.csv$')
 
   # .../source/weekly_yyyyww_geo_signal.csv
-  PATTERN_WEEKLY = re.compile(r'^.*/([^/]*)/weekly_(\d{6})_(\w+?)_(\w+)\.csv$')
+  PATTERN_WEEKLY = re.compile(rf'^.*/([^/]*)/weekly_(\d{{6}})_({geo_types_pattern})_(\w+)\.csv$')
 
   # .../issue_yyyymmdd
   PATTERN_ISSUE_DIR = re.compile(r'^.*/([^/]*)/issue_(\d{8})$')
 
-  # set of allowed resolutions (aka "geo_type")
-  GEOGRAPHIC_RESOLUTIONS = {'county', 'hrr', 'msa', 'dma', 'state', 'hhs', 'nation'}
-
   # set of required CSV columns
   REQUIRED_COLUMNS = {'geo_id', 'val', 'se', 'sample_size'}
 
diff --git a/tests/acquisition/covidcast/test_csv_importer.py b/tests/acquisition/covidcast/test_csv_importer.py
index 0906febd1..2c9ff58c1 100644
--- a/tests/acquisition/covidcast/test_csv_importer.py
+++ b/tests/acquisition/covidcast/test_csv_importer.py
@@ -96,6 +96,14 @@ def test_find_csv_files(self, mock_glob: MagicMock):
       path_prefix + 'invalid/20200418_province_c.csv',
       # ignored
       path_prefix + 'ignored/README.md',
+      # valid hsa day
+      path_prefix + 'valid/20200408_hsa_sig.csv',
+      # valid hsa_nci day
+      path_prefix + 'valid/20200408_hsa_nci_sig.csv',
+      # valid hsa_nci week
+      path_prefix + 'valid/weekly_202015_hsa_nci_sig.csv',
+      # valid hsa week
+      path_prefix + 'valid/weekly_202015_hsa_sig.csv',
     ]
     mock_glob.return_value = glob_paths
 
@@ -113,6 +121,10 @@ def test_find_csv_files(self, mock_glob: MagicMock):
       (glob_paths[5], None),
       (glob_paths[6], None),
       (glob_paths[7], None),
+      (glob_paths[8], PathDetails(expected_issue_day, (date.today() - date(year=time_value_day // 10000, month=(time_value_day // 100) % 100, day=time_value_day % 100)).days, 'valid', 'sig', 'day', time_value_day, 'hsa')),
+      (glob_paths[9], PathDetails(expected_issue_day, (date.today() - date(year=time_value_day // 10000, month=(time_value_day // 100) % 100, day=time_value_day % 100)).days, 'valid', 'sig', 'day', time_value_day, 'hsa_nci')),
+      (glob_paths[10], PathDetails(expected_issue_week, delta_epiweeks(202015, expected_issue_week), 'valid', 'sig', 'week', 202015, 'hsa_nci')),
+      (glob_paths[11], PathDetails(expected_issue_week, delta_epiweeks(202015, expected_issue_week), 'valid', 'sig', 'week', 202015, 'hsa')),
     ])
     self.assertEqual(found, expected)
 

From d2c8188a8b547d43fe62906986ccd5a855f588d1 Mon Sep 17 00:00:00 2001
From: minhkhul <minhkhul@andrew.cmu.edu>
Date: Tue, 11 Nov 2025 12:38:19 -0500
Subject: [PATCH 02/11] adjust test

---
 tests/acquisition/covidcast/test_csv_importer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/acquisition/covidcast/test_csv_importer.py b/tests/acquisition/covidcast/test_csv_importer.py
index 2c9ff58c1..79bcd3d7a 100644
--- a/tests/acquisition/covidcast/test_csv_importer.py
+++ b/tests/acquisition/covidcast/test_csv_importer.py
@@ -94,8 +94,6 @@ def test_find_csv_files(self, mock_glob: MagicMock):
       path_prefix + 'invalid/weekly_222222_b_c.csv',
       # invalid geography
       path_prefix + 'invalid/20200418_province_c.csv',
-      # ignored
-      path_prefix + 'ignored/README.md',
       # valid hsa day
       path_prefix + 'valid/20200408_hsa_sig.csv',
       # valid hsa_nci day
@@ -104,6 +102,8 @@ def test_find_csv_files(self, mock_glob: MagicMock):
       path_prefix + 'valid/weekly_202015_hsa_nci_sig.csv',
       # valid hsa week
       path_prefix + 'valid/weekly_202015_hsa_sig.csv',
+      # ignored
+      path_prefix + 'ignored/README.md',
     ]
     mock_glob.return_value = glob_paths
 

From 012b38ba6b4661f815339830c254d78c614513b8 Mon Sep 17 00:00:00 2001
From: minhkhul <minhkhul@andrew.cmu.edu>
Date: Tue, 11 Nov 2025 18:08:35 -0500
Subject: [PATCH 03/11] replace old geo_type check with different warning

---
 src/acquisition/covidcast/csv_importer.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/src/acquisition/covidcast/csv_importer.py b/src/acquisition/covidcast/csv_importer.py
index 3d22290a5..1ada0ba03 100644
--- a/src/acquisition/covidcast/csv_importer.py
+++ b/src/acquisition/covidcast/csv_importer.py
@@ -59,10 +59,10 @@ class CsvImporter:
   geo_types_pattern = "|".join(sorted(GEOGRAPHIC_RESOLUTIONS, key=len, reverse=True))
 
   # .../source/yyyymmdd_geo_signal.csv
-  PATTERN_DAILY = re.compile(rf'^.*/([^/]*)/(\d{{8}})_({geo_types_pattern})_(\w+)\.csv$')
+  PATTERN_DAILY = re.compile(rf'^.*/([^/]*)/(\d{{8}})_({geo_types_pattern})_(.+)\.csv$')
 
   # .../source/weekly_yyyyww_geo_signal.csv
-  PATTERN_WEEKLY = re.compile(rf'^.*/([^/]*)/weekly_(\d{{6}})_({geo_types_pattern})_(\w+)\.csv$')
+  PATTERN_WEEKLY = re.compile(rf'^.*/([^/]*)/weekly_(\d{{6}})_({geo_types_pattern})_(.+)\.csv$')
 
   # .../issue_yyyymmdd
   PATTERN_ISSUE_DIR = re.compile(r'^.*/([^/]*)/issue_(\d{8})$')
@@ -161,7 +161,7 @@ def find_csv_files(scan_dir, issue=(date.today(), epi.Week.fromdate(date.today()
       daily_match = CsvImporter.PATTERN_DAILY.match(path.lower())
       weekly_match = CsvImporter.PATTERN_WEEKLY.match(path.lower())
       if not daily_match and not weekly_match:
-        logger.warning(event='invalid csv path/filename', detail=path, file=path)
+        logger.warning(event='invalid csv path/filename or geo_type', detail=path, file=path)
         yield (path, None)
         continue
 
@@ -191,10 +191,6 @@ def find_csv_files(scan_dir, issue=(date.today(), epi.Week.fromdate(date.today()
 
       # # extract and validate geographic resolution
       geo_type = match.group(3).lower()
-      if geo_type not in CsvImporter.GEOGRAPHIC_RESOLUTIONS:
-        logger.warning(event='invalid geo_type', detail=geo_type, file=path)
-        yield (path, None)
-        continue
 
       # extract additional values, lowercased for consistency
       source = match.group(1).lower()

From 6654f8862bea4123a6e8f1806ed588d30b04a7d9 Mon Sep 17 00:00:00 2001
From: minhkhul <minhkhul@andrew.cmu.edu>
Date: Tue, 11 Nov 2025 19:14:50 -0500
Subject: [PATCH 04/11] Add validation check for hsa_nci + validation test
 cases

---
 src/acquisition/covidcast/csv_importer.py        | 8 +++++++-
 tests/acquisition/covidcast/test_csv_importer.py | 3 +++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/acquisition/covidcast/csv_importer.py b/src/acquisition/covidcast/csv_importer.py
index 1ada0ba03..59b9928a0 100644
--- a/src/acquisition/covidcast/csv_importer.py
+++ b/src/acquisition/covidcast/csv_importer.py
@@ -299,7 +299,7 @@ def extract_and_check_row(row: DataFrameRow, geo_type: str, filepath: Optional[s
       # geo_id was `None`
       return (None, 'geo_id')
 
-    if geo_type in ('hrr', 'msa', 'dma', 'hhs'):
+    if geo_type in ('hrr', 'msa', 'dma', 'hhs', 'hsa_nci'):
       # these particular ids are prone to be written as ints -- and floats
       try:
         geo_id = str(CsvImporter.floaty_int(geo_id))
@@ -338,6 +338,12 @@ def extract_and_check_row(row: DataFrameRow, geo_type: str, filepath: Optional[s
       if len(geo_id) != 2 or not 'aa' <= geo_id <= 'zz':
         return (None, 'geo_id')
 
+    elif geo_type == 'hsa_nci':
+      # valid codes should be 1-3 digit numbers, or the special code of "1022" for blank
+      # https://seer.cancer.gov/seerstat/variables/countyattribs/hsa.html
+      if not re.match(r'^(1022|\d{1,3})$', geo_id):
+        return (None, 'geo_id')
+
     else:
       return (None, 'geo_type')
 
diff --git a/tests/acquisition/covidcast/test_csv_importer.py b/tests/acquisition/covidcast/test_csv_importer.py
index 79bcd3d7a..29c800d99 100644
--- a/tests/acquisition/covidcast/test_csv_importer.py
+++ b/tests/acquisition/covidcast/test_csv_importer.py
@@ -194,6 +194,7 @@ def make_row(
 
     # cases to test each failure mode
     failure_cases = [
+      (make_row(geo_type='hsa_nci', geo_id='1111'), 'geo_id'),
       (make_row(geo_type='county', geo_id='1234'), 'geo_id'),
       (make_row(geo_type='county', geo_id='00000'), 'geo_id'),
       (make_row(geo_type='hrr', geo_id='600'), 'geo_id'),
@@ -227,6 +228,8 @@ def make_row(
       (make_row(value=None, stderr=np.nan, sample_size='', missing_value=str(float(Nans.DELETED)), missing_stderr=str(float(Nans.DELETED)), missing_sample_size=str(float(Nans.DELETED))), CsvRowValue('vi', None, None, None, Nans.DELETED, Nans.DELETED, Nans.DELETED)),
       (make_row(stderr='', sample_size='NA', missing_stderr=str(float(Nans.OTHER)), missing_sample_size=str(float(Nans.OTHER))), CsvRowValue('vi', 1.23, None, None, Nans.NOT_MISSING, Nans.OTHER, Nans.OTHER)),
       (make_row(sample_size=None, missing_value='missing_value', missing_stderr=str(float(Nans.OTHER)), missing_sample_size=str(float(Nans.NOT_MISSING))), CsvRowValue('vi', 1.23, 4.56, None, Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.OTHER)),
+      (make_row(geo_type='hsa_nci', geo_id='1022'), CsvRowValue('1022', 1.23, 4.56, 100.5, Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.NOT_MISSING)),
+      (make_row(geo_type='hsa_nci', geo_id='012'), CsvRowValue('12', 1.23, 4.56, 100.5, Nans.NOT_MISSING, Nans.NOT_MISSING, Nans.NOT_MISSING)),
     ]
 
     for ((geo_type, row), field) in success_cases:

From ea02d3321f4e8363af4f3cf59c1c1c259d3b977b Mon Sep 17 00:00:00 2001
From: minhkhul <minhkhul@andrew.cmu.edu>
Date: Tue, 11 Nov 2025 19:19:06 -0500
Subject: [PATCH 05/11] add warning to input validation code chunk about
 hsa_nci

---
 src/server/_params.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/server/_params.py b/src/server/_params.py
index d565f894d..92a7c7aad 100644
--- a/src/server/_params.py
+++ b/src/server/_params.py
@@ -59,6 +59,7 @@ def __init__(self, geo_type: str, geo_values: Union[bool, Sequence[str]]):
             if geo_values == ['']:
                 raise ValidationFailedException(f"geo_value is empty for the requested geo_type {geo_type}!")
             # TODO: keep this translator in sync with CsvImporter.GEOGRAPHIC_RESOLUTIONS in acquisition/covidcast/ and with GeoMapper
+            # We are not updating hsa_nci here as this was done with the older geomapper code which does not support the same hsa definition.
             geo_type_translator = {
                 "county": "fips",
                 "state": "state_id",

From 23d3fcb671950dcbf7bb62b62644ddb36ca90a13 Mon Sep 17 00:00:00 2001
From: minhkhul <minhkhul@andrew.cmu.edu>
Date: Tue, 11 Nov 2025 19:22:39 -0500
Subject: [PATCH 06/11] remove hsa from GEOGRAPHIC_RESOLUTIONS

---
 src/acquisition/covidcast/csv_importer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/acquisition/covidcast/csv_importer.py b/src/acquisition/covidcast/csv_importer.py
index 59b9928a0..7c24ad730 100644
--- a/src/acquisition/covidcast/csv_importer.py
+++ b/src/acquisition/covidcast/csv_importer.py
@@ -53,7 +53,7 @@ class CsvImporter:
   """Finds and parses covidcast CSV files."""
 
   # set of allowed resolutions (aka "geo_type")
-  GEOGRAPHIC_RESOLUTIONS = {'county', 'hrr', 'msa', 'dma', 'state', 'hhs', 'nation', "hsa", "hsa_nci"}
+  GEOGRAPHIC_RESOLUTIONS = {'county', 'hrr', 'msa', 'dma', 'state', 'hhs', 'nation', 'hsa_nci'}
 
   # regex pattern for matching geo types, note: sort longer string first to avoid wrong substring matches
   geo_types_pattern = "|".join(sorted(GEOGRAPHIC_RESOLUTIONS, key=len, reverse=True))

From de0205256573f51f91e137200bb8546f533fdd5e Mon Sep 17 00:00:00 2001
From: minhkhul <minhkhul@andrew.cmu.edu>
Date: Tue, 11 Nov 2025 19:43:03 -0500
Subject: [PATCH 07/11] remove hsa from test

---
 tests/acquisition/covidcast/test_csv_importer.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/tests/acquisition/covidcast/test_csv_importer.py b/tests/acquisition/covidcast/test_csv_importer.py
index 29c800d99..684ba01b1 100644
--- a/tests/acquisition/covidcast/test_csv_importer.py
+++ b/tests/acquisition/covidcast/test_csv_importer.py
@@ -94,14 +94,10 @@ def test_find_csv_files(self, mock_glob: MagicMock):
       path_prefix + 'invalid/weekly_222222_b_c.csv',
       # invalid geography
       path_prefix + 'invalid/20200418_province_c.csv',
-      # valid hsa day
-      path_prefix + 'valid/20200408_hsa_sig.csv',
       # valid hsa_nci day
       path_prefix + 'valid/20200408_hsa_nci_sig.csv',
       # valid hsa_nci week
       path_prefix + 'valid/weekly_202015_hsa_nci_sig.csv',
-      # valid hsa week
-      path_prefix + 'valid/weekly_202015_hsa_sig.csv',
       # ignored
       path_prefix + 'ignored/README.md',
     ]
@@ -121,10 +117,8 @@ def test_find_csv_files(self, mock_glob: MagicMock):
       (glob_paths[5], None),
       (glob_paths[6], None),
       (glob_paths[7], None),
-      (glob_paths[8], PathDetails(expected_issue_day, (date.today() - date(year=time_value_day // 10000, month=(time_value_day // 100) % 100, day=time_value_day % 100)).days, 'valid', 'sig', 'day', time_value_day, 'hsa')),
-      (glob_paths[9], PathDetails(expected_issue_day, (date.today() - date(year=time_value_day // 10000, month=(time_value_day // 100) % 100, day=time_value_day % 100)).days, 'valid', 'sig', 'day', time_value_day, 'hsa_nci')),
-      (glob_paths[10], PathDetails(expected_issue_week, delta_epiweeks(202015, expected_issue_week), 'valid', 'sig', 'week', 202015, 'hsa_nci')),
-      (glob_paths[11], PathDetails(expected_issue_week, delta_epiweeks(202015, expected_issue_week), 'valid', 'sig', 'week', 202015, 'hsa')),
+      (glob_paths[8], PathDetails(expected_issue_day, (date.today() - date(year=time_value_day // 10000, month=(time_value_day // 100) % 100, day=time_value_day % 100)).days, 'valid', 'sig', 'day', time_value_day, 'hsa_nci')),
+      (glob_paths[9], PathDetails(expected_issue_week, delta_epiweeks(202015, expected_issue_week), 'valid', 'sig', 'week', 202015, 'hsa_nci')),
     ])
     self.assertEqual(found, expected)
 

From 064a4fdadbd1ae900165002b14bafd71328e8596 Mon Sep 17 00:00:00 2001
From: minhkhul <minhkhul@andrew.cmu.edu>
Date: Tue, 11 Nov 2025 19:48:17 -0500
Subject: [PATCH 08/11] adjust PATTERN_DAILY PATTERN_WEEKLY string construction
 to avoid double braces

---
 src/acquisition/covidcast/csv_importer.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/acquisition/covidcast/csv_importer.py b/src/acquisition/covidcast/csv_importer.py
index 7c24ad730..e30c72a81 100644
--- a/src/acquisition/covidcast/csv_importer.py
+++ b/src/acquisition/covidcast/csv_importer.py
@@ -59,10 +59,10 @@ class CsvImporter:
   geo_types_pattern = "|".join(sorted(GEOGRAPHIC_RESOLUTIONS, key=len, reverse=True))
 
   # .../source/yyyymmdd_geo_signal.csv
-  PATTERN_DAILY = re.compile(rf'^.*/([^/]*)/(\d{{8}})_({geo_types_pattern})_(.+)\.csv$')
+  PATTERN_DAILY = re.compile(r'^.*/([^/]*)/(\d{8})_(' + geo_types_pattern + r')_(.+)\.csv$')
 
   # .../source/weekly_yyyyww_geo_signal.csv
-  PATTERN_WEEKLY = re.compile(rf'^.*/([^/]*)/weekly_(\d{{6}})_({geo_types_pattern})_(.+)\.csv$')
+  PATTERN_WEEKLY = re.compile(r'^.*/([^/]*)/weekly_(\d{6})_(' + geo_types_pattern + r')_(.+)\.csv$')
 
   # .../issue_yyyymmdd
   PATTERN_ISSUE_DIR = re.compile(r'^.*/([^/]*)/issue_(\d{8})$')

From fc6ea47724bbb0458fc0cabed563a7a207bd4e90 Mon Sep 17 00:00:00 2001
From: minhkhul <118945681+minhkhul@users.noreply.github.com>
Date: Wed, 12 Nov 2025 11:08:16 -0500
Subject: [PATCH 09/11] Update src/server/_params.py

Co-authored-by: george <george.haff@gmail.com>
---
 src/server/_params.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/server/_params.py b/src/server/_params.py
index 92a7c7aad..da0ed0ae1 100644
--- a/src/server/_params.py
+++ b/src/server/_params.py
@@ -59,7 +59,7 @@ def __init__(self, geo_type: str, geo_values: Union[bool, Sequence[str]]):
             if geo_values == ['']:
                 raise ValidationFailedException(f"geo_value is empty for the requested geo_type {geo_type}!")
             # TODO: keep this translator in sync with CsvImporter.GEOGRAPHIC_RESOLUTIONS in acquisition/covidcast/ and with GeoMapper
-            # We are not updating hsa_nci here as this was done with the older geomapper code which does not support the same hsa definition.
+            # NOTE: We are not including `hsa_nci` here as the geomapper code does not support that version of the HSA definition.
             geo_type_translator = {
                 "county": "fips",
                 "state": "state_id",

From 048a901f118d56117d7a930b48d7ab7e78427b04 Mon Sep 17 00:00:00 2001
From: minhkhul <118945681+minhkhul@users.noreply.github.com>
Date: Wed, 12 Nov 2025 11:08:25 -0500
Subject: [PATCH 10/11] Update src/acquisition/covidcast/csv_importer.py

Co-authored-by: george <george.haff@gmail.com>
---
 src/acquisition/covidcast/csv_importer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/acquisition/covidcast/csv_importer.py b/src/acquisition/covidcast/csv_importer.py
index e30c72a81..7b8d13c45 100644
--- a/src/acquisition/covidcast/csv_importer.py
+++ b/src/acquisition/covidcast/csv_importer.py
@@ -189,7 +189,7 @@ def find_csv_files(scan_dir, issue=(date.today(), epi.Week.fromdate(date.today()
         issue_value=issue_epiweek_value
         lag_value=delta_epiweeks(time_value_week, issue_epiweek_value)
 
-      # # extract and validate geographic resolution
+      # extract geographic resolution
       geo_type = match.group(3).lower()
 
       # extract additional values, lowercased for consistency

From 9a678296f7caee5328011a284a845e4c4102c30a Mon Sep 17 00:00:00 2001
From: minhkhul <minhkhul@andrew.cmu.edu>
Date: Wed, 12 Nov 2025 11:42:50 -0500
Subject: [PATCH 11/11] integration test

---
 integrations/server/test_covidcast.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/integrations/server/test_covidcast.py b/integrations/server/test_covidcast.py
index 8b7fc6f52..bc9bfedfa 100644
--- a/integrations/server/test_covidcast.py
+++ b/integrations/server/test_covidcast.py
@@ -476,3 +476,14 @@ def test_week_formats(self):
     self.assertEqual(expected, colond)
     self.assertEqual(expected, dashed)
     self.assertEqual(expected, enumed)
+
+  def test_hsa_nci(self):
+    row = CovidcastTestRow.make_default_row(geo_type='hsa_nci', geo_value='99')
+    self._insert_rows([row])
+    response = self.request_based_on_row(row)
+    expected = [row.as_api_row_dict()]
+    self.assertEqual(response, {
+      'result': 1,
+      'epidata': expected,
+      'message': 'success',
+    })