Merged
16 changes: 11 additions & 5 deletions google_symptoms/delphi_google_symptoms/geo.py
@@ -37,7 +37,7 @@ def generate_transition_matrix(geo_res):
     if geo_res == "hrr":
         map_df["population"] = map_df["population"] * map_df["weight"]

-    aggregated_pop = map_df.groupby(geo_res).sum().reset_index()
+    aggregated_pop = map_df.groupby(geo_res).sum(numeric_only=True).reset_index()
Contributor:
Wondering about the numeric_only=True. The function's arg type hint says there's no default, but the docs below it say it defaults to True. Guessing that it actually tries to use NaNs by default?

Contributor (author):
I changed this because I'm getting a warning about to-be-deprecated behavior. Thought I'd just throw it in here.

FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. 
In a future version, numeric_only will default to False. Either specify numeric_only or select 
only columns which should be valid for the function.

Contributor (author):

The current default does actually seem to be numeric_only=True.

Contributor:

Ah thanks for that! Curious why they're moving away from numeric_only=True.
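For context, a minimal sketch of the deprecation being discussed (the frame and column names are hypothetical; behavior as of pandas 1.5):

```python
import pandas as pd

# Hypothetical frame mixing numeric and non-numeric columns, similar in
# shape to map_df (a geo column, a population column, and a string column).
df = pd.DataFrame({
    "hrr": ["a", "a", "b"],
    "population": [10, 20, 30],
    "label": ["x", "y", "z"],  # non-numeric
})

# On pandas 1.5 this emits the FutureWarning quoted above, because the
# grouped sum silently drops the non-numeric "label" column:
#     df.groupby("hrr").sum()

# Passing numeric_only explicitly pins that behavior and silences the warning:
aggregated = df.groupby("hrr").sum(numeric_only=True).reset_index()
#   hrr  population
# 0   a          30
# 1   b          30
```

Once the default flips to numeric_only=False, non-numeric columns are no longer silently dropped; callers must either pass numeric_only or select the valid columns first, which is exactly what the warning asks for.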

     map_df = map_df.merge(
         aggregated_pop, on=geo_res, how="inner", suffixes=["_raw", "_groupsum"]
     )
@@ -79,8 +79,11 @@ def geo_map(df, geo_res, namescols = None):
         return df

     map_df = generate_transition_matrix(geo_res)
-    converted_df = pd.DataFrame(columns = df.columns)
-    for _date in df["timestamp"].unique():
+
+    dates_list = df["timestamp"].unique()
+    dfs_list = [pd.DataFrame()] * len(dates_list)
+
+    for i, _date in enumerate(dates_list):
         val_lists = df[df["timestamp"] == _date].merge(
             map_df["geo_id"], how="right"
         )[namescols].fillna(0)
@@ -92,5 +95,8 @@
newdf["geo_id"] = list(map_df.keys())[1:]
mask = (newdf == 0)
newdf[mask] = np.nan
converted_df = converted_df.append(newdf)
return converted_df
dfs_list[i] = newdf

# Reindex to make sure output has same columns as input df. Filled with
# NaN values if column doesn't already exist.
return pd.concat(dfs_list).reindex(df.columns, axis=1)
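A note on the pattern above: DataFrame.append is deprecated (and removed in pandas 2.0), and appending inside a loop re-copies the accumulated frame on every iteration. Collecting frames in a list and concatenating once avoids both problems. A minimal sketch with made-up data:

```python
import pandas as pd

dates = ["2020-01-01", "2020-01-02"]  # stand-in for df["timestamp"].unique()
frames = [pd.DataFrame()] * len(dates)

for i, d in enumerate(dates):
    # One small frame per date, mirroring newdf in geo_map.
    frames[i] = pd.DataFrame({"geo_id": ["a", "b"], "timestamp": d, "val": [1, 2]})

# Single concat at the end, then reindex to guarantee the output carries
# exactly the expected columns; missing ones are created as all-NaN.
out = pd.concat(frames).reindex(["geo_id", "timestamp", "val", "extra"], axis=1)
```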
3 changes: 2 additions & 1 deletion google_symptoms/delphi_google_symptoms/pull.py
@@ -82,6 +82,7 @@ def preprocess(df, level):
     index_df = pd.MultiIndex.from_product(
         [geo_list, date_list], names=['geo_id', 'date']
     )
+    df.date = pd.to_datetime(df.date)
     df = df.set_index(
         ["geo_id", "date"]
     ).reindex(
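The added pd.to_datetime line is the core fix here: the BigQuery fetch types the date column as the db-dtypes `dbdate` extension dtype, and if those values don't line up with the datetime64 entries in index_df, the reindex matches nothing and nulls out the rows (the failure exercised by test_no_rows_nulled below). A rough sketch of the normalization, assuming db-dtypes is installed:

```python
import pandas as pd
import db_dtypes  # noqa: F401  (importing registers the "dbdate" dtype with pandas)

# Hypothetical frame as returned by the BigQuery fetch: dates typed as dbdate.
df = pd.DataFrame({"geo_id": ["ca"], "date": ["2020-01-01"], "Cough": [1.0]})
df = df.astype({"date": "dbdate"})

# Normalize to datetime64 so the set_index/reindex keys compare equal to a
# datetime-based MultiIndex like index_df:
df.date = pd.to_datetime(df.date)
assert df.date.dtype == "datetime64[ns]"
```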
@@ -296,7 +297,7 @@ def pull_gs_data(credentials, export_start_date, export_end_date, num_export_day
         df_dc_county = dfs["state"][dfs["state"]["geo_id"] == "dc"].drop(
             "geo_id", axis=1)
         df_dc_county["geo_id"] = DC_FIPS
-        dfs["county"] = dfs["county"].append(df_dc_county)
+        dfs["county"] = pd.concat([dfs["county"], df_dc_county])
     except KeyError:
         pass

5 changes: 3 additions & 2 deletions google_symptoms/setup.py
@@ -4,14 +4,15 @@
 required = [
     "mock",
     "numpy",
-    "pandas==1.3.5",
+    "pandas",
     "pydocstyle",
     "pytest",
     "pytest-cov",
     "pylint==2.8.3",
     "delphi-utils",
     "freezegun",
-    "pandas-gbq"
+    "pandas-gbq",
+    "db-dtypes"
 ]

 setup(
8 changes: 8 additions & 0 deletions google_symptoms/tests/test_geo.py
@@ -47,6 +47,7 @@ def test_hrr(self):
         ).drop("weight", axis="columns")
         hrr_pop = fips2hrr.groupby("hrr"
         ).sum(
+            numeric_only=True
         ).reset_index(
         ).rename(columns={"population": "hrr_pop"})
         df_plus = df.merge(fips2hrr, left_on="geo_id", right_on="fips", how="left"
@@ -59,6 +60,7 @@
             combined_metric = lambda x: x.metric_0/3 + x.metric_1/3 + x.metric_2/3
         ).groupby("hrr"
         ).sum(
+            numeric_only=True
         ).drop(
             labels=[METRICS[23], METRICS[24], METRICS[25], COMBINED_METRIC[4]],
             axis="columns"
@@ -91,6 +93,7 @@ def test_msa(self):
         fips2msa = gmpr.add_population_column(gmpr.get_crosswalk("fips", "msa"), "fips")
         msa_pop = fips2msa.groupby("msa"
         ).sum(
+            numeric_only=True
         ).reset_index(
         ).rename(columns={"population": "msa_pop"})
         df_plus = df.merge(fips2msa, left_on="geo_id", right_on="fips", how="left"
@@ -103,6 +106,7 @@
             combined_metric = lambda x: x.metric_0/3 + x.metric_1/3 + x.metric_2/3
         ).groupby("msa"
         ).sum(
+            numeric_only=True
         ).drop(
             labels=[METRICS[23], METRICS[24], METRICS[25], COMBINED_METRIC[4]],
             axis="columns"
@@ -136,6 +140,7 @@ def test_hhs(self):
         state2hhs = gmpr.add_geocode(state2hhs, "state_code", "hhs")
         hhs_pop = state2hhs.groupby("hhs"
         ).sum(
+            numeric_only=True
         ).reset_index(
         ).rename(columns={"population": "hhs_pop"})
         df_plus = df.merge(state2hhs, left_on="geo_id", right_on="state_id", how="left"
@@ -148,6 +153,7 @@
             combined_metric = lambda x: x.metric_0/3 + x.metric_1/3 + x.metric_2/3
         ).groupby("hhs"
         ).sum(
+            numeric_only=True
         ).drop(
             labels=[METRICS[23], METRICS[24], METRICS[25], COMBINED_METRIC[4]],
             axis="columns"
@@ -181,6 +187,7 @@ def test_nation(self):
         state2nation = gmpr.add_geocode(state2nation, "state_code", "nation")
         nation_pop = state2nation.groupby("nation"
         ).sum(
+            numeric_only=True
         ).reset_index(
         ).rename(columns={"population": "nation_pop"})
         df_plus = df.merge(state2nation, left_on="geo_id", right_on="state_id", how="left"
@@ -193,6 +200,7 @@
             combined_metric = lambda x: x.metric_0/3 + x.metric_1/3 + x.metric_2/3
         ).groupby("nation"
         ).sum(
+            numeric_only=True
         ).drop(
             labels=[METRICS[23], METRICS[24], METRICS[25], COMBINED_METRIC[4]],
             axis="columns"
11 changes: 11 additions & 0 deletions google_symptoms/tests/test_pull.py
@@ -1,5 +1,6 @@
 import pytest
 import mock
+import db_dtypes
 from freezegun import freeze_time
 from datetime import date, datetime
 import pandas as pd
@@ -90,6 +91,16 @@ def test_invalid_fips(self):
         with pytest.raises(AssertionError):
             preprocess(df, "county")

+    def test_no_rows_nulled(self):
+        """
+        Check that rows are not mysteriously nulled out. See
+        https://github.com/cmu-delphi/covidcast-indicators/pull/1496 for motivating issue.
+        """
+        # Cast date field to `dbdate` to match dataframe dtypes as provided by the BigQuery fetch.
+        df = pd.read_csv(good_input["state"]).astype({"date": "dbdate"})
+        out = preprocess(df, "state")
+        assert df.shape[0] == out[~out.Cough.isna()].shape[0]
+

 class TestPullHelperFuncs:
     @freeze_time("2021-01-05")