# Get vaccine coverage data by ZIP Code from CDPH

In [1]:
# Load the lab_black IPython extension (presumably auto-formats code cells
# with Black — confirm against the extension's docs).
%load_ext lab_black

In [2]:
import pandas as pd
import datetime as dt
import json
import os
import glob
import urllib.request

In [3]:
# Raise pandas' display limits: up to 50 columns, 1000 rows, and no
# truncation of cell contents.
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 1000)
pd.options.display.max_colwidth = None

In [4]:
# Today's local date as a YYYY-MM-DD string.
today = dt.date.today().isoformat()

### Get the metadata from the API

In [5]:
# Metadata URL for the dataset; the `id` query parameter names the package.
endpoint = (
    "https://data.chhs.ca.gov/api/3/action/package_show"
    "?id=covid-19-vaccine-progress-dashboard-data-by-zip-code"
)

In [6]:
# Request the package metadata and decode the JSON body into `text`.
# The `with` block closes the HTTP response even if reading or parsing
# fails, instead of leaving the connection open.
with urllib.request.urlopen(endpoint) as jsonurl:
    text = json.loads(jsonurl.read())

### Get the object with the max date

In [7]:
# Drill into the response: the resource entries sit under result -> resources.
package = text["result"]
data = package["resources"]

Loop through the resource entries to collect the file names that contain dates

In [8]:
# Collect every resource whose name, once the fixed title prefix is removed,
# parses as a date. `dates` and `slugs` are filled in lockstep, so dates[i]
# is the YYYY-MM-DD date parsed from slugs[i].
dates = []
slugs = []
for obj in data:
    slug = obj["name"]
    try:
        date = pd.to_datetime(
            slug.replace("COVID-19 Vaccines by ZIP Code ", "")
        ).strftime("%Y-%m-%d")
    except (AttributeError, TypeError, ValueError, OverflowError):
        # The name is not a string or does not parse as a date, so this is
        # not a dated data file. Only parsing failures are skipped; anything
        # else (e.g. KeyboardInterrupt) still propagates.
        continue
    dates.append(date)
    slugs.append(slug)

Use that list of dates to pick out the latest file

In [9]:
# ISO-formatted (YYYY-MM-DD) strings sort chronologically, so the largest
# string is the most recent date.
latest_date_str = max(dates)

In [10]:
# Find the resource whose name-derived date equals `latest_date_str`.
# The loop does not stop at the first hit, so if several resources share
# that date the last one in `data` wins.
for obj in data:
    slug = obj["name"]
    try:
        date = pd.to_datetime(
            slug.replace("COVID-19 Vaccines by ZIP Code ", "")
        ).strftime("%Y-%m-%d")
    except (AttributeError, TypeError, ValueError, OverflowError):
        # The name is not a string or does not parse as a date — skip it.
        # Only parsing failures are skipped; other errors still propagate.
        continue
    if date == latest_date_str:
        latest_obj = obj

In [11]:
# Date label (YYYY-MM-DD) taken from the resource's "created" timestamp,
# not from the date embedded in its name.
created_at = pd.to_datetime(latest_obj["created"])
latest_date = created_at.strftime("%Y-%m-%d")

### Read the latest file into a dataframe, filter out redacted rows (`VEM Source` of "No VEM Assigned"), and save it in `data/raw`

In [12]:
# Load the latest resource's CSV straight from its URL.
latest_url = latest_obj["url"]
df = pd.read_csv(latest_url)

In [13]:
# Keep only rows whose "VEM Source" is something other than "No VEM Assigned".
df = df.loc[df["VEM Source"].ne("No VEM Assigned")]

In [14]:
# Save the filtered snapshot as data/raw/<latest_date>.csv, without the index.
df.to_csv(f"data/raw/{latest_date}.csv", index=False)

---

## Concatenate all the weekly updates

### Get all files and assign a date to each table based on the file name

In [15]:
# Prefix joined in front of "data/raw"; empty means the pattern is resolved
# relative to the current working directory.
path = ""
# glob returns matches in arbitrary order, so sort them to make the row order
# of the concatenated output reproducible between runs and machines.
files = sorted(glob.glob(os.path.join(path, "data/raw/*.csv")))

In [16]:
# Read each raw CSV and tag its rows with the source file's base name
# (extension included) in a `date` column.
# Built as a list rather than a generator: a generator is exhausted after one
# pass, so re-running the concat cell on its own would find nothing to
# concatenate.
file_df = [
    pd.read_csv(f, low_memory=False).assign(date=os.path.basename(f)) for f in files
]

### Concatenate them into one timeseries and clean the update date field

In [17]:
# Stack all the per-file tables into one frame with a fresh 0..n-1 index.
concat_df = pd.concat(objs=file_df, ignore_index=True)

In [18]:
# Remove the literal ".csv" from the file-name labels, then parse what is
# left into datetimes.
date_labels = concat_df["date"].str.replace(".csv", "", regex=False)
concat_df["date"] = pd.to_datetime(date_labels)

---

## Export

### All updates

In [19]:
# Write the full combined table, without the index column.
concat_df.to_csv(path_or_buf="data/timeseries.csv", index=False)

### Latest update

In [20]:
# Write only the rows that carry the most recent `date` value.
newest_date = concat_df["date"].max()
is_newest = concat_df["date"] == newest_date
concat_df[is_newest].to_csv("data/latest.csv", index=False)