URL preparation

In [91]:
# Geometry URLs: boundary files hosted on dph.illinois.gov, one for ZIP codes
# and one for counties. Both are read over the network further down.
zipcode_geometry_url = "http://www.dph.illinois.gov/sites/default/files/COVID19/il_illinois_zip_codes.json?nocache=1"
county_geometry_url = "http://www.dph.illinois.gov/sites/default/files/Illinois_County_Geo_ch.json"

# Data URLs: JSON feeds on the same site. In the cells visible in this notebook they
# are referenced only from commented-out download code; the active cells read
# local snapshot files instead.
county_data_url = "http://www.dph.illinois.gov/sitefiles/CountyDemos.json?nocache=1"
zipcode_data_url = "http://www.dph.illinois.gov/sitefiles/COVIDZip.json?nocache=1"
history_data_url = "http://www.dph.illinois.gov/sitefiles/COVIDHistoricalTestResults.json?nocache=1"

In [92]:
import pandas as pd
import json
import numpy as np
import geopandas as gpd
from urllib.request import urlopen

Download data

In [93]:
# Live download (disabled); re-enable to refresh instead of using the local snapshot:
# with urlopen(zipcode_data_url) as response:
#     zipcode_dict = json.load(response)
#     zipcode_data = pd.DataFrame(zipcode_dict['zip_values'])

# Read the saved snapshot. JSON text is UTF-8, so decode it explicitly instead of
# relying on the platform's default locale encoding.
with open("./idph_COVIDZip.json", encoding="utf-8") as f:
    zipcode_dict = json.load(f)

# One row per entry of the 'zip_values' list.
zipcode_data = pd.DataFrame(zipcode_dict['zip_values'])

In [94]:
# Disabled: download of the county demographics feed. While this stays commented
# out, `county_dict` and `county_data` are never defined; no later cell visible in
# this notebook reads them.
# with urlopen(county_data_url) as response:
#     county_dict = json.load(response)
#     county_data = pd.DataFrame(county_dict['county_demographics'])
    

In [95]:
# Live download (disabled); re-enable to refresh instead of using the local snapshot:
# with urlopen(history_data_url) as response:
#     history_dict = json.load(response)

# Read the saved snapshot. JSON text is UTF-8, so decode it explicitly instead of
# relying on the platform's default locale encoding.
with open("./idph_COVIDHistoricalTestResults.json", encoding="utf-8") as f:
    history_dict = json.load(f)

In [96]:
# Read both boundary layers straight from their URLs (requires network access).
zipcode_geometry = gpd.read_file(zipcode_geometry_url)
county_geometry = gpd.read_file(county_geometry_url)

In [97]:
# Fetch the ZIP-code boundary file again, this time as plain JSON, and turn its
# 'features' list into rows. The 'id' column used in later cells comes from here.
with urlopen(zipcode_geometry_url) as response:
    tmp = json.load(response)

zipcode_gpd = gpd.GeoDataFrame(tmp['features'])

In [98]:
# zipcode_gpd (raw feature records) and zipcode_geometry (parsed layer) are loaded
# from the same URL, so their rows are expected to line up one-to-one. Fail loudly
# if they do not, instead of silently pairing ids with the wrong polygons.
if len(zipcode_gpd) != len(zipcode_geometry):
    raise ValueError(
        "zipcode feature list and parsed geometry have different lengths: "
        f"{len(zipcode_gpd)} != {len(zipcode_geometry)}"
    )

# Assign only the geometry column; assigning the whole GeoDataFrame to a single
# column works only when that frame happens to contain exactly one column.
zipcode_gpd['geometry'] = zipcode_geometry.geometry

# Keep the join key and the shapes. Copy so the dtype change below modifies an
# independent frame rather than a slice of the wider one.
zipcode_gpd = zipcode_gpd[['id', 'geometry']].copy()
zipcode_gpd['id'] = zipcode_gpd['id'].astype(str)

In [99]:
# Keep only the join key and the shapes; 'id' is matched against county names in later merges.
county_gpd = county_geometry[['id','geometry']]

Zipcode Data generation

In [100]:
# Inspect the ZIP-level table loaded from the snapshot.
zipcode_data

Unnamed: 0,zip,confirmed_cases,total_tested,demographics
0,53142,8,145,"{'age': [{'age_group': 'Unknown', 'count': 0, ..."
1,60002,68,549,"{'age': [{'age_group': 'Unknown', 'count': 0, ..."
2,60004,205,2416,"{'age': [{'age_group': 'Unknown', 'count': 6, ..."
3,60005,113,734,"{'age': [{'age_group': 'Unknown', 'count': 0, ..."
4,60007,181,877,"{'age': [{'age_group': 'Unknown', 'count': 0, ..."
...,...,...,...,...
491,62920,54,190,"{'age': [{'age_group': 'Unknown', 'count': 0, ..."
492,62948,8,262,"{'age': [{'age_group': 'Unknown', 'count': 0, ..."
493,62959,24,632,"{'age': [{'age_group': 'Unknown', 'count': 0, ..."
494,62966,48,440,"{'age': [{'age_group': 'Unknown', 'count': 0, ..."


In [101]:
# Check column types: 'id' should be a string (object) column and 'geometry' a geometry column.
zipcode_gpd.dtypes

id            object
geometry    geometry
dtype: object

In [102]:
# Left join keeps every ZIP polygon, including ZIP codes with no row in zipcode_data
# (their data columns come back as NaN).
# NOTE(review): 'id' was cast to str earlier; this assumes 'zip' is also a string,
# otherwise the keys will not match — confirm.
zipcode_final_gpd = pd.merge(zipcode_gpd,zipcode_data, how = 'left', left_on=['id'], right_on=['zip'])

In [103]:
# ZIP codes without a data row came out of the left join as NaN; report them as 0.
for count_col in ('confirmed_cases', 'total_tested'):
    zipcode_final_gpd[count_col] = zipcode_final_gpd[count_col].fillna(0)

# Keep only the columns that go into the exported layer.
export_columns = ['id', 'confirmed_cases', 'total_tested', 'geometry']
zipcode_final_gpd = zipcode_final_gpd[export_columns]

In [104]:
# Write the ZIP-level layer as GeoJSON (path is relative to the working directory).
zipcode_final_gpd.to_file('dph_zipcode_data.geojson', driver='GeoJSON', encoding='utf-8')
print('done')

done


Static County Data Generation


In [105]:
# Per-county table built from the 'characteristics_by_county' section of the history JSON.
county_static_data = pd.DataFrame(history_dict['characteristics_by_county']['values'])

In [106]:
# Left join on county name keeps every county polygon, even one with no matching 'County' row.
county_static = pd.merge(county_gpd, county_static_data, how="left", left_on="id",right_on="County")

In [107]:
# Expose the geometry key as 'NAME', the same column name the county report layer uses.
county_static = county_static.rename(columns={'id':'NAME'})

In [108]:
# Write the static county layer as GeoJSON (path is relative to the working directory).
county_static.to_file('dph_county_static_data.geojson', driver='GeoJSON', encoding='utf-8')
print('done')

done


County Data generation


In [109]:
# Transform to New York Times format: one flat record per (date, county).
# Each entry of historical_county['values'] carries a 'testDate' plus a nested
# 'values' list of per-county dicts. Every county dict is copied and tagged with
# its date; copying (instead of adding the key in place) leaves history_dict
# unmodified, so this cell can be re-run safely.
county_records = [
    {**county_row, 'date': day['testDate']}
    for day in history_dict['historical_county']['values']
    for county_row in day['values']
]
county_history = pd.DataFrame(county_records)

In [110]:
# Exclude rows whose County is 'Unassigned'
is_assigned = county_history['County'].ne('Unassigned')
county_history = county_history.loc[is_assigned]

In [111]:
def standardDate(str):
    """Convert an 'M/D/YYYY' date string into zero-padded 'YYYY-MM-DD'.

    The first three '/'-separated fields are read as month, day and year; any
    further fields are ignored. Month and day shorter than two characters get a
    single leading '0'. Raises IndexError when fewer than three fields exist.

    The parameter name shadows the built-in ``str``; it is kept unchanged so
    existing callers keep working.
    """
    fields = str.split('/')
    month, day, year = fields[0], fields[1], fields[2]

    def pad(part):
        # Prefix exactly one '0' when the field is shorter than two characters.
        return part if len(part) >= 2 else '0' + part

    return '-'.join([year, pad(month), pad(day)])

In [112]:
# Reshape to one row per county with a (measure, date) column hierarchy.
# No aggfunc is given, so pandas applies its default aggregation to duplicate cells.
county_pivot = county_history.pivot_table(index=['County'], columns=['date'])

In [113]:
# Spot-check the deaths series of the third county row. Column labels are still raw
# 'M/D/YYYY' strings here, so they are ordered as text rather than by date.
county_pivot['deaths'].iloc[2]

date
3/17/2020    0.0
3/18/2020    0.0
3/19/2020    0.0
3/20/2020    0.0
3/21/2020    0.0
3/22/2020    0.0
3/24/2020    0.0
3/25/2020    0.0
3/26/2020    0.0
3/27/2020    0.0
3/28/2020    0.0
3/29/2020    0.0
3/30/2020    0.0
3/31/2020    0.0
4/1/2020     0.0
4/10/2020    0.0
4/11/2020    0.0
4/12/2020    0.0
4/13/2020    0.0
4/14/2020    0.0
4/15/2020    0.0
4/16/2020    1.0
4/17/2020    1.0
4/18/2020    1.0
4/19/2020    1.0
4/2/2020     0.0
4/20/2020    1.0
4/21/2020    1.0
4/22/2020    1.0
4/23/2020    1.0
4/24/2020    1.0
4/25/2020    1.0
4/26/2020    1.0
4/27/2020    1.0
4/28/2020    1.0
4/29/2020    1.0
4/3/2020     0.0
4/30/2020    1.0
4/4/2020     0.0
4/5/2020     0.0
4/6/2020     0.0
4/7/2020     0.0
4/8/2020     0.0
4/9/2020     0.0
5/1/2020     1.0
5/10/2020    1.0
5/11/2020    1.0
5/12/2020    1.0
5/13/2020    1.0
5/14/2020    1.0
5/15/2020    1.0
5/2/2020     1.0
5/3/2020     1.0
5/4/2020     1.0
5/5/2020     1.0
5/6/2020     1.0
5/7/2020     1.0
5/8/2020     1.0
5/9/2020 

In [114]:
# Split the pivot by measure: one County x date table each for cases and deaths
# (the date labels are standardized in the next step).
county_cases, county_deaths = (
    county_pivot[measure] for measure in ('confirmed_cases', 'deaths')
)

In [115]:
# Relabel every date column from 'M/D/YYYY' to 'YYYY-MM-DD' so that labels sort chronologically.
county_cases, county_deaths = (
    frame.rename(columns=standardDate) for frame in (county_cases, county_deaths)
)

In [116]:
# The history has no 2020-03-23 entry (the dates jump from 03-22 to 03-24), so
# carry the 03-22 values forward. Fill only when the column is really absent,
# so real 03-23 figures are never overwritten if the feed ever includes them.
for frame in (county_cases, county_deaths):
    if '2020-03-23' not in frame.columns:
        frame['2020-03-23'] = frame['2020-03-22']

In [117]:
# Peek at one row. The back-filled 2020-03-23 column sits last, so columns are not yet in date order.
county_cases.head(1)

date,2020-03-17,2020-03-18,2020-03-19,2020-03-20,2020-03-21,2020-03-22,2020-03-24,2020-03-25,2020-03-26,2020-03-27,...,2020-05-15,2020-05-02,2020-05-03,2020-05-04,2020-05-05,2020-05-06,2020-05-07,2020-05-08,2020-05-09,2020-03-23
County,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Adams,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,42.0,40.0,40.0,40.0,40.0,40.0,40.0,41.0,41.0,1.0


In [118]:
# Date labels in chronological order ('YYYY-MM-DD' strings sort correctly as text),
# plus the first, latest and second-latest labels.
date = sorted(county_cases.columns.tolist())
dt_first, dt_today, dt_yesterday = date[0], date[-1], date[-2]

In [119]:
from datetime import datetime
from datetime import timedelta

In [120]:
def find_missing_date(date):
    """Print a report of gaps and repeats in a sorted list of date strings.

    Parameters
    ----------
    date : list of str
        Dates formatted 'YYYY-MM-DD', expected in ascending order.

    Returns
    -------
    None
        Findings are printed. Nothing is printed when the list is empty or
        every entry is exactly one day after the previous one.

    Raises
    ------
    ValueError
        If an entry does not match the '%Y-%m-%d' format.
    """
    if not date:
        # Nothing to check; avoids indexing into an empty list.
        return

    # Parse each label once instead of re-parsing inside the comparison loop.
    parsed = [datetime.strptime(d, "%Y-%m-%d") for d in date]

    missing = []
    redundant = []
    for previous, current in zip(parsed, parsed[1:]):
        gap = (current - previous).days
        if gap > 1:
            # Report every absent day in the gap, not only the first one.
            missing.extend(previous + timedelta(days=k) for k in range(1, gap))
        elif gap < 1:
            # Same day repeated (gap 0) or list not ascending (gap < 0).
            redundant.append(current)

    # Always scan neighbouring pairs: a duplicate and a gap can cancel out in a
    # plain length check and would otherwise go unnoticed.
    if missing or redundant:
        print('Alert! Missing Date or Redundant date')
    for missing_date in missing:
        print('Missing Date:')
        print(missing_date)
    for redundant_date in redundant:
        print('Redundant or out-of-order Date:')
        print(redundant_date)

In [121]:
# Sanity check on the sorted date labels: prints an alert when a problem with the
# sequence is detected and prints nothing otherwise.
find_missing_date(date)

In [122]:
# Put the date columns into chronological order (moves the back-filled column into place)
county_cases, county_deaths = county_cases[date], county_deaths[date]

In [123]:
# Capture each county's series as a plain list of values (one list per row), in
# column order, while the frames still contain only date columns.
cases_ts, deaths_ts = (
    frame.to_numpy().tolist() for frame in (county_cases, county_deaths)
)

In [124]:
# First date label with a positive count, per county. idxmax falls back to the
# first column when no value is positive, so rows whose last date column is not
# positive are blanked to NaN instead.
for frame, first_col in ((county_cases, 'dt_first_case'), (county_deaths, 'dt_first_death')):
    # Read the last date column before the new column is appended behind it.
    last_not_positive = frame.iloc[:, -1] <= 0
    first_positive = (frame > 0).idxmax(axis=1)
    frame[first_col] = first_positive.mask(last_not_positive)

In [125]:
# Attach the per-county value lists captured earlier. They were taken before the
# dt_first_* columns were added, so each list holds date values only.
county_cases['cases_ts'] = cases_ts
county_deaths['deaths_ts'] = deaths_ts

In [126]:
# Latest cumulative value and its change against the previous date, for cases and deaths
for frame, total_col, new_col in (
    (county_cases, 'today_case', 'today_new_case'),
    (county_deaths, 'today_death', 'today_new_death'),
):
    frame[total_col] = frame[dt_today]
    frame[new_col] = frame[dt_today] - frame[dt_yesterday]

In [127]:
# Summary columns for cases (the per-date columns are dropped)
case_columns = ['cases_ts', 'dt_first_case', 'today_case', 'today_new_case']
case_report = county_cases.loc[:, case_columns]

In [128]:
# Summary columns for deaths (the per-date columns are dropped)
death_columns = ['deaths_ts', 'dt_first_death', 'today_death', 'today_new_death']
death_report = county_deaths.loc[:, death_columns]

In [129]:
# Put case and death summaries side by side, matched on the shared County index;
# the outer join keeps a county that appears in either frame.
county_report = case_report.join(death_report, how="outer")

In [130]:
# Move County out of the index and label all columns positionally (County -> NAME).
county_report = county_report.reset_index()
county_report.columns = [
    'NAME',
    'cases_ts', 'dt_first_case', 'today_case', 'today_new_case',
    'deaths_ts', 'dt_first_death', 'today_death', 'today_new_death',
]

# Serialize each value list as one comma-separated string.
for ts_col in ('cases_ts', 'deaths_ts'):
    county_report[ts_col] = county_report[ts_col].map(
        lambda series: ','.join(str(value) for value in series)
    )

In [131]:
# Names that appear in the report but have no polygon in the county geometry.
np.setdiff1d(county_report['NAME'],county_gpd['id'])
# 'Illinois', 'Out Of State' and 'Suburban Cook' are not in the geometry
# (note the capital 'O' in 'Out Of State').

array(['Illinois', 'Out Of State', 'Suburban Cook'], dtype=object)

In [132]:
# Drop aggregate / non-county rows that have no polygon in county_gpd.
# The data spells the value 'Out Of State' (capital 'O' in 'Of'); comparing
# against 'Out of State' alone never matches, so both spellings are listed.
non_county_names = ['Illinois', 'Out Of State', 'Out of State', 'Suburban Cook']

# .copy() gives an independent frame, so adding columns afterwards does not
# write into a slice of the unfiltered report.
county_report = county_report[~county_report['NAME'].isin(non_county_names)].copy()



In [133]:
# Describe the time axis of the *_ts strings: first label, last label and step unit.
county_report = county_report.assign(
    dt_start=dt_first,
    dt_end=dt_today,
    dt_unit='day',
)

In [134]:
# Inspect the assembled per-county report.
county_report

Unnamed: 0,NAME,cases_ts,dt_first_case,today_case,today_new_case,deaths_ts,dt_first_death,today_death,today_new_death,dt_start,dt_end,dt_unit
0,Adams,"0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1....",2020-03-20,42.0,1.0,"0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....",2020-05-01,1.0,0.0,2020-03-17,2020-05-15,day
1,Alexander,"0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....",2020-04-16,8.0,0.0,"0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....",,0.0,0.0,2020-03-17,2020-05-15,day
2,Bond,"0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....",2020-03-29,11.0,1.0,"0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....",2020-04-16,1.0,0.0,2020-03-17,2020-05-15,day
3,Boone,"0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....",2020-04-05,274.0,6.0,"0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....",2020-04-08,14.0,1.0,2020-03-17,2020-05-15,day
4,Brown,"0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....",2020-05-01,9.0,0.0,"0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....",,0.0,0.0,2020-03-17,2020-05-15,day
...,...,...,...,...,...,...,...,...,...,...,...,...
101,Whiteside,"1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2....",2020-03-17,113.0,2.0,"0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....",2020-04-02,8.0,0.0,2020-03-17,2020-05-15,day
102,Will,"2.0,3.0,9.0,10.0,12.0,21.0,21.0,28.0,40.0,67.0...",2020-03-17,4277.0,187.0,"0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,2.0,3.0,3.0,4....",2020-03-20,237.0,6.0,2020-03-17,2020-05-15,day
103,Williamson,"0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1....",2020-03-19,51.0,-1.0,"0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....",2020-03-19,1.0,0.0,2020-03-17,2020-05-15,day
104,Winnebago,"1.0,1.0,1.0,2.0,4.0,5.0,5.0,5.0,7.0,8.0,8.0,9....",2020-03-17,1390.0,53.0,"0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....",2020-04-01,33.0,1.0,2020-03-17,2020-05-15,day


In [135]:
# Attach the report to every county polygon. Left join on county name: report rows
# with no matching polygon are dropped, polygons with no report row get NaN values.
county_final_gpd = pd.merge(county_gpd, county_report, how="left", left_on="id", right_on="NAME")

In [136]:
# NOTE(review): every county gets the constant 1 — this looks like a placeholder,
# not real population data; confirm what consumers of the file expect.
county_final_gpd['population'] = 1

In [137]:
# Inspect the merged county layer before export.
county_final_gpd

Unnamed: 0,id,geometry,NAME,cases_ts,dt_first_case,today_case,today_new_case,deaths_ts,dt_first_death,today_death,today_new_death,dt_start,dt_end,dt_unit,population
0,McHenry,"POLYGON ((-88.70742 42.49352, -88.70741 42.493...",McHenry,"2.0,4.0,6.0,8.0,11.0,12.0,12.0,14.0,19.0,27.0,...",2020-03-17,1123.0,40.0,"0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2....",2020-03-26,62.0,5.0,2020-03-17,2020-05-15,day,1
1,Boone,"POLYGON ((-88.70742 42.49352, -88.70750 42.493...",Boone,"0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....",2020-04-05,274.0,6.0,"0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....",2020-04-08,14.0,1.0,2020-03-17,2020-05-15,day,1
2,Ogle,"POLYGON ((-89.68809 42.19950, -89.68807 42.184...",Ogle,"0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....",2020-03-31,165.0,2.0,"0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....",2020-04-15,2.0,0.0,2020-03-17,2020-05-15,day,1
3,Will,"POLYGON ((-88.26146 41.72439, -88.26103 41.708...",Will,"2.0,3.0,9.0,10.0,12.0,21.0,21.0,28.0,40.0,67.0...",2020-03-17,4277.0,187.0,"0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,2.0,3.0,3.0,4....",2020-03-20,237.0,6.0,2020-03-17,2020-05-15,day,1
4,LaSalle,"POLYGON ((-88.93885 41.62837, -88.93891 41.628...",LaSalle,"0.0,0.0,1.0,1.0,1.0,1.0,1.0,3.0,3.0,3.0,3.0,3....",2020-03-19,117.0,4.0,"0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....",2020-03-29,7.0,2.0,2020-03-17,2020-05-15,day,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,Lawrence,"POLYGON ((-87.90806 38.85013, -87.90819 38.835...",Lawrence,"0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....",2020-04-07,4.0,0.0,"0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....",,0.0,0.0,2020-03-17,2020-05-15,day,1
99,Marion,"POLYGON ((-89.13844 38.73633, -89.13847 38.721...",Marion,"0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....",2020-03-30,47.0,0.0,"0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....",,0.0,0.0,2020-03-17,2020-05-15,day,1
100,Union,"POLYGON ((-89.04143 37.59650, -89.06017 37.597...",Union,"0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....",2020-04-15,117.0,4.0,"0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....",2020-05-05,4.0,3.0,2020-03-17,2020-05-15,day,1
101,Pope,"POLYGON ((-88.70860 37.59926, -88.70876 37.584...",Pope,"0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....",2020-05-08,1.0,0.0,"0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0....",,0.0,0.0,2020-03-17,2020-05-15,day,1


In [138]:
# Write the county time-series layer as GeoJSON (path is relative to the working directory).
county_final_gpd.to_file('dph_county_data.geojson', driver='GeoJSON', encoding='utf-8')
print('done')


done
