# Get vaccine coverage data by ZIP Code from CDPH

In [107]:
%load_ext lab_black

The lab_black extension is already loaded. To reload it, use:
  %reload_ext lab_black


In [108]:
import pandas as pd
import datetime as dt
import json
import os
import glob
import urllib.request

In [109]:
# Notebook display settings: show up to 50 columns and 1000 rows, and never
# truncate the contents of a cell.
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 1000)
pd.options.display.max_colwidth = None

In [110]:
# Current local date as a YYYY-MM-DD string.
today = f"{dt.datetime.today():%Y-%m-%d}"

### Get the metadata from the API

In [111]:
# `package_show` action of the data.chhs.ca.gov API, queried for the
# ZIP-code-level vaccine progress dataset.
endpoint = (
    "https://data.chhs.ca.gov/api/3/action/package_show"
    "?id=covid-19-vaccine-progress-dashboard-data-by-zip-code"
)

In [112]:
# Fetch the dataset metadata and parse the JSON body into `text`.
# The `with` block closes the HTTP response as soon as the payload has been
# read, instead of leaving the connection open for the life of the kernel.
with urllib.request.urlopen(endpoint) as jsonurl:
    text = json.load(jsonurl)

### Get the object with the max date

In [113]:
# Pull the list of resource entries out of the API response.
result = text["result"]
data = result["resources"]

Loop through the list of resources to collect the dates embedded in the file names

In [114]:
# Collect, as YYYY-MM-DD strings, every date that can be parsed out of a
# resource name.
dates = []
for obj in data:
    slug = obj["name"]
    # Strip the fixed name prefix; whatever remains is treated as a date.
    try:
        date = pd.to_datetime(
            slug.replace("COVID-19 Vaccines by ZIP Code ", "")
        ).strftime("%Y-%m-%d")
    except (ValueError, TypeError, AttributeError, OverflowError):
        # The name does not carry a parseable date: skip this resource.
        # Only parse-related errors are ignored, so interrupts and genuine
        # bugs (e.g. a NameError) still surface.
        continue
    dates.append(date)

Pick out the latest date from that list

In [115]:
# The entries are YYYY-MM-DD strings, so the lexicographic maximum is also
# the most recent date.
latest_date_str = max(dates)

Loop through the objects again and keep the one whose name contains the latest date

In [116]:
# Find the resource whose name carries the latest date. If several resources
# share that date, the last one in the list is kept.
for obj in data:
    slug = obj["name"]
    # Strip the fixed name prefix; whatever remains is treated as a date.
    try:
        date = pd.to_datetime(
            slug.replace("COVID-19 Vaccines by ZIP Code ", "")
        ).strftime("%Y-%m-%d")
    except (ValueError, TypeError, AttributeError, OverflowError):
        # The name does not carry a parseable date: skip this resource.
        # Only parse-related errors are ignored, so interrupts and genuine
        # bugs still surface.
        continue
    if date == latest_date_str:
        latest_obj = obj

In [117]:
# Format the selected resource's "created" timestamp as YYYY-MM-DD.
created_ts = pd.to_datetime(latest_obj["created"])
latest_date = created_ts.strftime("%Y-%m-%d")

### Read the latest file into a dataframe, filter out rows with no VEM assigned (to limit redacted rows), and save it to `data/raw`

In [118]:
# Load the CSV that the selected resource points to.
latest_url = latest_obj["url"]
df = pd.read_csv(latest_url)

In [119]:
# Keep only rows whose "VEM Source" is something other than "No VEM Assigned".
has_vem = df["VEM Source"].ne("No VEM Assigned")
df = df.loc[has_vem]

In [120]:
# Save the filtered snapshot under data/raw/, named after its date.
raw_path = f"data/raw/{latest_date}.csv"
df.to_csv(raw_path, index=False)

---

## Concatenate all the weekly updates

### Get all files and assign a date to each table based on the file name

In [121]:
# List every raw snapshot CSV. glob returns matches in arbitrary order, so the
# list is sorted to make the row order of the concatenated timeseries the same
# on every run.
path = ""
files = sorted(glob.glob(os.path.join(path, "data/raw/*.csv")))

In [122]:
# Read each snapshot and tag its rows with the file name in a "date" column.
# This is a list rather than a generator so that the concat cell can be
# re-run on its own; a generator would already be exhausted after first use.
file_df = [
    pd.read_csv(f, low_memory=False).assign(date=os.path.basename(f)) for f in files
]

### Concatenate them into one timeseries and clean update date field

In [123]:
# Stack all snapshots into one frame with a fresh 0..n-1 index.
concat_df = pd.concat(objs=file_df, ignore_index=True)

In [124]:
# The "date" column holds file names; drop the ".csv" text and parse what
# remains as a datetime.
file_stems = concat_df["date"].str.replace(".csv", "", regex=False)
concat_df["date"] = pd.to_datetime(file_stems)

In [127]:
# Spot-check: show every snapshot row for ZCTA 91320.
concat_df.loc[concat_df["Zip Code Tabulation Area"].eq(91320)]

Unnamed: 0,Zip Code Tabulation Area,Local Health Jurisdiction,Vaccine Equity Metric Quartile,VEM Source,16+ Population,Persons Fully Vaccinated,Persons Partially Vaccinated,Percent of Population Fully Vaccinated,Percent of Population Partially Vaccinated,Percent of Population with 1+ Dose,Redacted,date,County,12+ Population
1636,91320,Ventura,4.0,Healthy Places Index Score,35413.0,13005.0,7558.0,0.367238,0.213424,0.580662,No,2021-04-14,,
3325,91320,Ventura,4.0,Healthy Places Index Score,35413.0,13356.0,7690.0,0.37715,0.217152,0.594302,No,2021-04-21,,
4992,91320,Ventura,4.0,Healthy Places Index Score,35413.0,15020.0,7690.0,0.424138,0.217152,0.64129,No,2021-04-28,,
6688,91320,Ventura,4.0,Healthy Places Index Score,35413.0,16972.0,6816.0,0.479259,0.192472,0.671731,No,2021-05-05,Ventura,
8384,91320,Ventura,4.0,Healthy Places Index Score,35413.0,18768.0,5824.0,0.529975,0.164459,0.694434,No,2021-05-12,Ventura,
10087,91320,Ventura,4.0,Healthy Places Index Score,,20246.0,5165.0,0.529767,0.13515,0.664917,No,2021-05-18,Ventura,38216.800781
11769,91320,Ventura,4.0,Healthy Places Index Score,,21618.0,4773.0,0.565667,0.124893,0.69056,No,2021-05-25,Ventura,38216.800781
13469,91320,Ventura,4.0,Healthy Places Index Score,,22315.0,4635.0,0.583906,0.121282,0.705187,No,2021-06-01,Ventura,38216.800781
15146,91320,Ventura,4.0,Healthy Places Index Score,,22315.0,4635.0,0.583906,0.121282,0.705187,No,2021-06-02,Ventura,38216.800781


---

## Export

### All updates

In [125]:
# Write the full multi-snapshot table, without the index column.
timeseries_path = "data/timeseries.csv"
concat_df.to_csv(timeseries_path, index=False)

### Latest update

In [126]:
# Write only the rows that belong to the most recent snapshot date.
is_latest = concat_df["date"] == concat_df["date"].max()
concat_df[is_latest].to_csv("data/latest.csv", index=False)