# CDC: Monkeypox derived timeseries

In [1]:
%load_ext lab_black

In [2]:
import pandas as pd
import us
import urllib.request, json
import datetime as dt

In [3]:
today = pd.Timestamp.now(tz="America/Los_Angeles").strftime("%Y-%m-%d")
time = pd.Timestamp.now(tz="America/Los_Angeles").strftime("%-I:%M %p")

---

## Get historical case timeseries

#### CDC only updates this weekly

In [44]:
cdc_timeseries = (
    pd.read_csv(
        "data/processed/monkeypox_cases_timeseries_cdc_latest.csv",
        parse_dates=["date"],
        names=["date", "cases", "asof", "cumulative_sum"],
        header=0,
    )
    .drop(["asof"], axis=1)
    .sort_values("date", ascending=False)
).reset_index(drop=True)

In [45]:
cdc_timeseries.head()

In [65]:
cdc_timeseries["date"] = pd.to_datetime(cdc_timeseries["date"]).dt.strftime("%Y-%m-%d")

In [26]:
cdc_max_date = cdc_timeseries["date"].max()

In [27]:
cdc_max_date

In [63]:
cdc_timeseries.dtypes

date              datetime64[ns]
cases                      int64
cumulative_sum             int64
dtype: object

#### The latest timeseries we have

In [28]:
# historical_src = (
#     pd.read_csv(
#         "data/processed/monkeypox_cases_timeseries_cdc_historical.csv",
#         parse_dates=["date"],
#     )
#     .sort_values("date", ascending=False)
#     .reset_index(drop=True)
# )

In [29]:
# historical_src["date"] = historical_src["date"].astype(str)

In [30]:
# historical_df = historical_src[historical_src["date"] < today].reset_index(drop=True)

In [31]:
# historical_df.head()

---

## CDC Monkeypox

#### Latest totals, aggregated by state

In [66]:
states_src = pd.read_csv(
    "https://www.cdc.gov/wcms/vizdata/poxvirus/monkeypox/data/USmap_counts.csv"
)

In [67]:
states_src.columns = states_src.columns.str.lower().str.replace(" ", "_", regex=False)

In [68]:
states_src.drop(["case_range"], axis=1, inplace=True)

In [69]:
states_src["cases"] = states_src["cases"].astype(int)

In [70]:
states = states_src[
    (states_src["location"] != "Total") & (states_src["location"] != "Non-US Resident")
].copy()

In [71]:
states.head()

Unnamed: 0,location,cases,asof
0,Alabama,34,Data as of 17 Aug 2022 2:00 PM EDT
1,Alaska,2,Data as of 17 Aug 2022 2:00 PM EDT
2,Arizona,213,Data as of 17 Aug 2022 2:00 PM EDT
3,Arkansas,21,Data as of 17 Aug 2022 2:00 PM EDT
4,California,2356,Data as of 17 Aug 2022 2:00 PM EDT


---

#### Aggregate totals among all states to add to timeseries

In [72]:
latest_total = states["cases"].sum()

In [73]:
historical_total = cdc_timeseries[
    cdc_timeseries["date"] == cdc_timeseries["date"].max()
]["cumulative_sum"][0]

In [74]:
cdc_timeseries[cdc_timeseries["date"] == cdc_timeseries["date"].max()][
    "cumulative_sum"
][0]

13057

In [75]:
change = latest_total - historical_total

In [76]:
change

457

In [77]:
updated_data = {"date": today, "cases": change, "cumulative_sum": latest_total}

In [78]:
updated_data_df = pd.DataFrame(updated_data, index=[0])

In [82]:
updated_data_df

Unnamed: 0,date,cases,cumulative_sum
0,2022-08-18,457,13514


In [83]:
updated_data_df["date"] = pd.to_datetime(updated_data_df["date"]).dt.strftime(
    "%Y-%m-%d"
)

In [93]:
df = (
    pd.concat([cdc_timeseries, updated_data_df])
    .drop_duplicates(subset="date")
    .sort_values("date", ascending=False)
    .copy()
)

In [94]:
df

Unnamed: 0,date,cases,cumulative_sum
0,2022-08-18,457,13514
0,2022-08-17,347,13057
1,2022-08-16,361,12710
2,2022-08-15,670,12349
3,2022-08-14,10,11679
...,...,...,...
90,2022-05-19,1,3
91,2022-05-18,0,2
92,2022-05-17,1,2
93,2022-05-16,0,1


---

## Exports

In [95]:
df.to_csv(f"data/processed/monkeypox_cases_timeseries_cdc_historical.csv", index=False)
df.to_csv(
    f"data/processed/monkeypox_cases_timeseries_cdc_historical_{today}.csv", index=False
)
df.to_csv(f"data/processed/monkeypox_cases_derived_timeseries_latest.csv", index=False)
df.to_json(
    f"data/processed/monkeypox_cases_derived_timeseries_latest.json",
    orient="records",
    indent=4,
)