# Get vaccine coverage data by ZIP code from CDPH

In [1]:
# Load the `lab_black` IPython extension; nothing else in this notebook refers to it.
# NOTE(review): presumably this auto-formats code cells with Black — confirm it is installed.
%load_ext lab_black

In [2]:
import pandas as pd
import datetime as dt
import json
import os
import glob

In [3]:
# Notebook display limits: show up to 50 columns and 1,000 rows, and never
# truncate the text inside a cell.
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_colwidth", None)

In [4]:
# Run date as an ISO "YYYY-MM-DD" string, taken from the local clock.
today = dt.date.today().isoformat()

### Get the "download all" url/file from CDPH

In [5]:
# "Download all" archive for the vaccine-progress-by-ZIP-code dataset.
# Adjacent string literals are concatenated into a single URL.
url = (
    "https://data.chhs.ca.gov/dataset/ead44d40-fd63-4f9f-950a-3b0111074de8"
    "/resource/654d8c82-9d54-4178-b850-360ef082a0a4"
    "/download/covid-19-vaccine-progress-dashboard-data-by-zip-code-pkyfgd.zip"
)

In [6]:
# Fetch the archive at `url` (interpolated by IPython) and write it to
# data/raw/zip/datapackage.zip.
# NOTE(review): assumes the data/raw/zip/ directory already exists — confirm.
!wget {url} -O data/raw/zip/datapackage.zip

--2021-05-07 19:03:58--  https://data.chhs.ca.gov/dataset/ead44d40-fd63-4f9f-950a-3b0111074de8/resource/654d8c82-9d54-4178-b850-360ef082a0a4/download/covid-19-vaccine-progress-dashboard-data-by-zip-code-pkyfgd.zip
Resolving data.chhs.ca.gov (data.chhs.ca.gov)... 2606:4700::6813:da70, 2606:4700::6813:db70, 104.19.218.112, ...
Connecting to data.chhs.ca.gov (data.chhs.ca.gov)|2606:4700::6813:da70|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://og-production-open-data-chelseama-892364687672.s3.amazonaws.com/resources/654d8c82-9d54-4178-b850-360ef082a0a4/covid-19-vaccine-progress-dashboard-data-by-zip-code-pkyfgd.zip?Signature=B4ErXtqUkUH3SlL580VzHsKjQ3I%3D&Expires=1620443040&AWSAccessKeyId=AKIAJJIENTAPKHZMIPXQ [following]
--2021-05-07 19:04:00--  https://og-production-open-data-chelseama-892364687672.s3.amazonaws.com/resources/654d8c82-9d54-4178-b850-360ef082a0a4/covid-19-vaccine-progress-dashboard-data-by-zip-code-pkyfgd.zip?Signature=B4ErXtqUkUH3Sl

In [7]:
# Extract the archive into data/raw/zip/ (-o: overwrite existing files without
# prompting), then delete the downloaded archive itself.
!unzip -o data/raw/zip/datapackage.zip -d data/raw/zip/
!rm -rf data/raw/zip/datapackage.zip

Archive:  data/raw/zip/datapackage.zip
  inflating: data/raw/zip/covid-19-vaccines-by-zip-code-data-dictionary.xlsx  
  inflating: data/raw/zip/covid-19-vaccines-by-zip-code-4-21-21.csv  
  inflating: data/raw/zip/covid-19-vaccines-by-zip-code-4-28-21.csv  
  inflating: data/raw/zip/datapackage.json  


---

### Get the latest datapackage

In [8]:
# Read the package manifest that was extracted alongside the CSV files.
with open("data/raw/zip/datapackage.json") as package_file:
    data = json.load(package_file)

### Turn the useful resource details into a dataframe

In [9]:
def parse_resources(d):
    """Flatten one datapackage resource entry into a small record.

    Parameters
    ----------
    d : dict
        A resource entry with a "description" key and a non-empty "sources"
        list whose first item has "path" and "title" keys.

    Returns
    -------
    dict
        The resource's description plus the path and title of its first source.

    Raises
    ------
    KeyError, IndexError
        If any of the keys above is missing or "sources" is empty.
    """
    description = d["description"]
    # Only the first listed source is used; any further sources are ignored.
    first_source = d["sources"][0]
    record = {"description": description}
    record["path"] = first_source["path"]
    record["title"] = first_source["title"]
    return record

In [10]:
# One row per resource; entries with a missing or empty "sources" list are skipped
# because parse_resources needs at least one source.
df = pd.DataFrame(
    [
        parse_resources(resource)
        for resource in data["resources"]
        if resource.get("sources")
    ]
)

### Remove rows without weekly update URLs and parse a date field for the remaining rows

In [11]:
# Keep only the rows whose description does not mention the data dictionary.
is_data_dictionary = df["description"].str.contains("Data Dictionary")
df = df[~is_data_dictionary]

In [12]:
# Parse the date that follows the fixed title prefix in each description.
#
# `str.strip(chars)` treats its argument as a *set of characters*, not a prefix,
# so stripping "COVID-19 Vaccines by ZIP Code " also removes leading/trailing
# date characters such as "1", "9" and "-" (e.g. "4-21-21" -> "4-21-2").
# Remove the literal prefix instead, then trim surrounding whitespace.
#
# `assign` returns a new frame, which avoids writing a column onto the filtered
# slice produced in the previous step.
df = df.assign(
    date=pd.to_datetime(
        df["description"]
        .str.replace("COVID-19 Vaccines by ZIP Code ", "", regex=False)
        .str.strip()
    )
)

### Download the latest url

In [13]:
# Pick the single most recent resource and pull its fields out as scalars.
# `Series.to_string` is a display formatter: it yields a multi-line string when
# several rows share the max date and may pad values, neither of which is safe
# to interpolate into a shell command or file name.
latest_resource = df.loc[df["date"].idxmax()]
url = latest_resource["path"]
# Same "YYYY-MM-DD" text the date previously rendered as; used as the file name.
update_date = latest_resource["date"].strftime("%Y-%m-%d")

In [14]:
# Save the most recent weekly CSV as data/raw/<update_date>.csv.
# NOTE(review): `url` and `update_date` are derived from the downloaded
# datapackage.json and are interpolated into a shell command unquoted —
# confirm the source is trusted.
!wget {url} -O data/raw/{update_date}.csv

--2021-05-07 19:04:01--  https://data.chhs.ca.gov/dataset/ead44d40-fd63-4f9f-950a-3b0111074de8/resource/1fd9be93-8c39-47ac-a60b-b189e34dda51/download/coveragebyzip_od-table-1.csv
Resolving data.chhs.ca.gov (data.chhs.ca.gov)... 2606:4700::6813:da70, 2606:4700::6813:db70, 104.19.218.112, ...
Connecting to data.chhs.ca.gov (data.chhs.ca.gov)|2606:4700::6813:da70|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://og-production-open-data-chelseama-892364687672.s3.amazonaws.com/resources/1fd9be93-8c39-47ac-a60b-b189e34dda51/coveragebyzip_od-table-1.csv?Signature=av%2BNfIuEE%2B2i05YbsSV29PJjnAE%3D&Expires=1620443042&AWSAccessKeyId=AKIAJJIENTAPKHZMIPXQ [following]
--2021-05-07 19:04:02--  https://og-production-open-data-chelseama-892364687672.s3.amazonaws.com/resources/1fd9be93-8c39-47ac-a60b-b189e34dda51/coveragebyzip_od-table-1.csv?Signature=av%2BNfIuEE%2B2i05YbsSV29PJjnAE%3D&Expires=1620443042&AWSAccessKeyId=AKIAJJIENTAPKHZMIPXQ
Resolving og-production-o

---

## Concatenate past weekly updates

### Get all files and assign a date to each table based on the file name

In [15]:
# Base directory for the data folder; empty means the current working directory.
path = ""
# glob returns matches in arbitrary, filesystem-dependent order. Sort them so the
# concatenated timeseries has the same row order on every run and machine.
files = sorted(glob.glob(os.path.join(path, "data/raw/*.csv")))

In [16]:
# Read every weekly CSV and tag its rows with the file name (e.g. "2021-04-28.csv"),
# which is turned into a proper date after concatenation.
# A list is used rather than a generator: a generator is exhausted after the first
# `pd.concat`, so re-running the concatenation cell on its own would fail.
file_df = [
    pd.read_csv(f, low_memory=False).assign(date=os.path.basename(f)) for f in files
]

### Concatenate them into one timeseries and clean the update date field

In [17]:
# Stack all weekly tables into one frame with a fresh 0..n-1 index.
concat_df = pd.concat(objs=file_df, ignore_index=True)

In [18]:
# The date column still holds file names; drop the literal ".csv" suffix
# (regex=False so the dot is not treated as a wildcard) and parse what remains.
date_labels = concat_df["date"].str.replace(".csv", "", regex=False)
concat_df["date"] = pd.to_datetime(date_labels)

---

## Export

### All updates

In [19]:
# Write every weekly update to a single CSV, omitting the positional index.
timeseries_path = "data/timeseries.csv"
concat_df.to_csv(timeseries_path, index=False)

### Latest update

In [20]:
# Write only the rows belonging to the most recent update date.
is_latest = concat_df["date"] == concat_df["date"].max()
latest_df = concat_df[is_latest]
latest_df.to_csv("data/latest.csv", index=False)