URL preparation

In [1]:
# Geometry URL
zipcode_geometry_url = "http://www.dph.illinois.gov/sites/default/files/COVID19/il_illinois_zip_codes.json?nocache=1"
county_geometry_url = "http://www.dph.illinois.gov/sites/default/files/Illinois_County_Geo_ch.json"

# Data URL
county_data_url = "http://www.dph.illinois.gov/sitefiles/CountyDemos.json?nocache=1"
zipcode_data_url = "http://www.dph.illinois.gov/sitefiles/COVIDZip.json?nocache=1"
history_data_url = "http://www.dph.illinois.gov/sitefiles/COVIDHistoricalTestResults.json?nocache=1"

In [2]:
import pandas as pd
import json
import numpy as np
import geopandas as gpd
from urllib.request import urlopen

Download data

In [3]:
with urlopen(zipcode_data_url) as response:
    zipcode_dict = json.load(response)
    zipcode_data = pd.DataFrame(zipcode_dict['zip_values'])
    

In [4]:
with urlopen(county_data_url) as response:
    county_dict = json.load(response)
    county_data = pd.DataFrame(county_dict['county_demographics'])
    

In [5]:
with urlopen(history_data_url) as response:
    history_dict = json.load(response)


In [6]:
zipcode_geometry = gpd.read_file(zipcode_geometry_url)
county_geometry = gpd.read_file(county_geometry_url)

In [7]:
with urlopen(zipcode_geometry_url) as response:
    tmp = json.load(response)
    zipcode_gpd = gpd.GeoDataFrame(tmp['features'])

In [8]:
zipcode_gpd['geometry'] = zipcode_geometry
zipcode_gpd = zipcode_gpd[['id','geometry']]
zipcode_gpd['id'] = zipcode_gpd['id'].astype(str)

In [9]:
county_gpd = county_geometry[['id','geometry']]

Zipcode Data generation

In [10]:
zipcode_data

Unnamed: 0,confirmed_cases,demographics,total_tested,zip
0,7,"{'age': [{'age_group': 'Unknown', 'count': 0, ...",77,46324
1,6,"{'age': [{'age_group': 'Unknown', 'count': 0, ...",37,46394
2,13,"{'age': [{'age_group': 'Unknown', 'count': 0, ...",89,53142
3,6,"{'age': [{'age_group': 'Unknown', 'count': 0, ...",54,53143
4,7,"{'age': [{'age_group': 'Unknown', 'count': 0, ...",50,53215
5,8,"{'age': [{'age_group': 'Unknown', 'count': 0, ...",33,54302
6,56,"{'age': [{'age_group': 'Unknown', 'count': 0, ...",400,60002
7,154,"{'age': [{'age_group': 'Unknown', 'count': 0, ...",1573,60004
8,82,"{'age': [{'age_group': 'Unknown', 'count': 0, ...",481,60005
9,116,"{'age': [{'age_group': 'Unknown', 'count': 0, ...",563,60007


In [11]:
zipcode_gpd.dtypes

id            object
geometry    geometry
dtype: object

In [12]:
zipcode_final_gpd = pd.merge(zipcode_gpd,zipcode_data, how = 'left', left_on=['id'], right_on=['zip'])

In [13]:
zipcode_final_gpd['confirmed_cases'] = zipcode_final_gpd['confirmed_cases'].replace(np.nan,0)
zipcode_final_gpd['total_tested'] = zipcode_final_gpd['total_tested'].replace(np.nan,0)
zipcode_final_gpd = zipcode_final_gpd[['id','confirmed_cases','total_tested','geometry']]

In [14]:
zipcode_final_gpd.to_file('dph_zipcode_data.geojson', driver='GeoJSON', encoding='utf-8')
print('done')

done


Static County Data Generation

In [45]:
county_static = pd.merge(county_gpd, county_data, how="left", left_on="id",right_on="County")

In [47]:
county_static.to_file('dph_county_static_data.geojson', driver='GeoJSON', encoding='utf-8')
print('done')

done


County Data generation


In [15]:
# transform to new york times format
county_history = pd.DataFrame(history_dict['historical_county']['values'])
l = []
for case in history_dict['historical_county']['values']:
    testDate = case['testDate']
    values = case['values']
    for x in values:
        x['date'] = testDate
        l.append(x)
county_history = pd.DataFrame(l)

In [16]:
# eliminate unassigned data
county_history = county_history[county_history['County'] != 'Unassigned']

In [17]:
# pivot table
county_pivot = pd.pivot_table(county_history, index=['County'],columns=['date'])

In [37]:
county_pivot['cases_ts'][0]

[0.0,
 0.0,
 0.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 1.0,
 2.0,
 2.0,
 2.0,
 2.0,
 24.0,
 25.0,
 25.0,
 26.0,
 27.0,
 27.0,
 29.0,
 30.0,
 31.0,
 34.0,
 2.0,
 29.0,
 29.0,
 29.0,
 29.0,
 32.0,
 32.0,
 32.0,
 33.0,
 40.0,
 40.0,
 2.0,
 40.0,
 3.0,
 3.0,
 8.0,
 9.0,
 11.0,
 22.0,
 40.0,
 40.0,
 40.0,
 40.0,
 40.0]

In [40]:
county_pivot['confirmed_cases'].iloc[0]

date
3/17/2020     0.0
3/18/2020     0.0
3/19/2020     0.0
3/20/2020     1.0
3/21/2020     1.0
3/22/2020     1.0
3/24/2020     1.0
3/25/2020     1.0
3/26/2020     1.0
3/27/2020     1.0
3/28/2020     1.0
3/29/2020     2.0
3/30/2020     2.0
3/31/2020     2.0
4/1/2020      2.0
4/10/2020    24.0
4/11/2020    25.0
4/12/2020    25.0
4/13/2020    26.0
4/14/2020    27.0
4/15/2020    27.0
4/16/2020    29.0
4/17/2020    30.0
4/18/2020    31.0
4/19/2020    34.0
4/2/2020      2.0
4/20/2020    29.0
4/21/2020    29.0
4/22/2020    29.0
4/23/2020    29.0
4/24/2020    32.0
4/25/2020    32.0
4/26/2020    32.0
4/27/2020    33.0
4/28/2020    40.0
4/29/2020    40.0
4/3/2020      2.0
4/30/2020    40.0
4/4/2020      3.0
4/5/2020      3.0
4/6/2020      8.0
4/7/2020      9.0
4/8/2020     11.0
4/9/2020     22.0
5/1/2020     40.0
5/2/2020     40.0
5/3/2020     40.0
5/4/2020     40.0
5/5/2020     40.0
Name: Adams, dtype: float64

In [34]:
# get case time series
county_pivot['cases_ts'] = county_pivot['confirmed_cases'].values.tolist()

In [19]:
# get first case date
county_pivot['dt_first_case'] = (county_pivot['confirmed_cases'] > 0).idxmax(axis=1)
county_pivot.loc[county_pivot['confirmed_cases'].iloc[:, -1] <= 0, 'dt_first_case'] = np.nan

In [20]:
# Get date information
date = county_pivot['confirmed_cases'].columns
dt_first = date[0]
dt_today = date[-1]
dt_yesterday = date[-2]

In [42]:
date

Index(['3/17/2020', '3/18/2020', '3/19/2020', '3/20/2020', '3/21/2020',
       '3/22/2020', '3/24/2020', '3/25/2020', '3/26/2020', '3/27/2020',
       '3/28/2020', '3/29/2020', '3/30/2020', '3/31/2020', '4/1/2020',
       '4/10/2020', '4/11/2020', '4/12/2020', '4/13/2020', '4/14/2020',
       '4/15/2020', '4/16/2020', '4/17/2020', '4/18/2020', '4/19/2020',
       '4/2/2020', '4/20/2020', '4/21/2020', '4/22/2020', '4/23/2020',
       '4/24/2020', '4/25/2020', '4/26/2020', '4/27/2020', '4/28/2020',
       '4/29/2020', '4/3/2020', '4/30/2020', '4/4/2020', '4/5/2020',
       '4/6/2020', '4/7/2020', '4/8/2020', '4/9/2020', '5/1/2020', '5/2/2020',
       '5/3/2020', '5/4/2020', '5/5/2020'],
      dtype='object', name='date')

In [21]:
# Add today_case and today_new_case columns
county_pivot['today_case'] = county_pivot['confirmed_cases'][dt_today]
county_pivot['today_new_case'] = county_pivot['confirmed_cases'][dt_today] - county_pivot['confirmed_cases'][dt_yesterday]

In [22]:
# Reformat
county_report = county_pivot[['cases_ts','dt_first_case','today_case','today_new_case']]
county_report = county_report.reset_index()
county_report.columns = ['County','cases_ts','dt_first_case','today_case','today_new_case']
county_report['cases_ts'] = county_report['cases_ts'].apply(lambda x: ','.join(map(str,x)))

In [23]:
np.setdiff1d(county_report['County'],county_gpd['id'])
# Illinois, Out of State and Suburban Cook is not in geometry

array(['Illinois', 'Out Of State', 'Suburban Cook'], dtype=object)

In [24]:
county_report = county_report[(county_report['County'] != 'Illinois') & (county_report['County'] != 'Out of State') & (county_report['County'] != 'Suburban Cook')]

In [25]:
def standardDate(str):
    l = str.split('/')
    month = l[0]
    day = l[1]
    year = l[2]
    if len(month) < 2:
        month = '0' + month
    if len(day) < 2:
        day = '0' + day
    return year + '-' + month + '-' + day

In [26]:
county_report['dt_first_case'] = county_report['dt_first_case'].apply(lambda x: standardDate(x) if type(x) == str else x)
county_report['dt_start'] = standardDate(dt_first)
county_report['dt_end'] = standardDate(dt_today)
county_report['dt_unit'] = 'day'

In [27]:
county_final_gpd = pd.merge(county_gpd, county_report, how="left", left_on="id", right_on="County")

In [28]:
county_final_gpd['population'] = 1

In [29]:
county_final_gpd.to_file('dph_county_data.geojson', driver='GeoJSON', encoding='utf-8')
print('done')


done
