diff --git a/nchs_mortality/.gitignore b/nchs_mortality/.gitignore index d98285328..344822e5b 100644 --- a/nchs_mortality/.gitignore +++ b/nchs_mortality/.gitignore @@ -5,6 +5,9 @@ params.json # Do not commit output files receiving/*.csv +daily_receiving/*.csv +cache/*.csv +daily_cache/*.csv # Do not commit test files tests/receiving/*.csv diff --git a/nchs_mortality/delphi_nchs_mortality/run.py b/nchs_mortality/delphi_nchs_mortality/run.py index ec5416bb2..b8a7832d4 100644 --- a/nchs_mortality/delphi_nchs_mortality/run.py +++ b/nchs_mortality/delphi_nchs_mortality/run.py @@ -9,7 +9,7 @@ from typing import Dict, Any import numpy as np -from delphi_utils import S3ArchiveDiffer, get_structured_logger, create_export_csv +from delphi_utils import S3ArchiveDiffer, get_structured_logger, create_export_csv, Nans from .archive_diffs import arch_diffs from .constants import (METRICS, SENSOR_NAME_MAP, @@ -17,6 +17,18 @@ from .pull import pull_nchs_mortality_data +def add_nancodes(df): + """Add nancodes to the dataframe.""" + # Default missingness codes + df["missing_val"] = Nans.NOT_MISSING + df["missing_se"] = Nans.NOT_APPLICABLE + df["missing_sample_size"] = Nans.NOT_APPLICABLE + + # Mark any remaining nans with unknown + remaining_nans_mask = df["val"].isnull() + df.loc[remaining_nans_mask, "missing_val"] = Nans.OTHER + return df + def run_module(params: Dict[str, Any]): """Run module for processing NCHS mortality data. @@ -67,7 +79,8 @@ def run_module(params: Dict[str, Any]): df["val"] = df[metric] df["se"] = np.nan df["sample_size"] = np.nan - df = df[~df["val"].isnull()] + df = add_nancodes(df) + # df = df[~df["val"].isnull()] sensor_name = "_".join([SENSOR_NAME_MAP[metric]]) dates = create_export_csv( df, @@ -91,7 +104,8 @@ def run_module(params: Dict[str, Any]): df["val"] = df[metric] / df["population"] * INCIDENCE_BASE df["se"] = np.nan df["sample_size"] = np.nan - df = df[~df["val"].isnull()] + df = add_nancodes(df) + # df = df[~df["val"].isnull()] sensor_name = "_".join([SENSOR_NAME_MAP[metric], sensor]) dates = create_export_csv( df, diff --git a/nchs_mortality/tests/test_run.py b/nchs_mortality/tests/test_run.py index 36dba6698..b842bd4c1 100644 --- a/nchs_mortality/tests/test_run.py +++ b/nchs_mortality/tests/test_run.py @@ -60,4 +60,8 @@ def test_output_file_format(self, run_as_module, date): df = pd.read_csv( join(output_folder, "weekly_202026_state_deaths_covid_incidence_prop.csv") ) - assert (df.columns.values == ["geo_id", "val", "se", "sample_size"]).all() + expected_columns = [ + "geo_id", "val", "se", "sample_size", + "missing_val", "missing_se", "missing_sample_size" + ] + assert (df.columns.values == expected_columns).all()