forked from reichlab/covid19-forecast-hub
-
Notifications
You must be signed in to change notification settings - Fork 0
/
zoltar-truth-data.py
168 lines (130 loc) · 6.04 KB
/
zoltar-truth-data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import json
from zoltpy import util
import pandas as pd
import pymmwr as pm
import datetime
import warnings
import requests
import io
warnings.simplefilter(action='ignore')
def get_epi_data(date):
format_str = '%m/%d/%y' # The format
dt = datetime.datetime.strptime(date, format_str).date()
epi = pm.date_to_epiweek(dt)
return epi.year, epi.week, epi.day
def get_epi_data_TZ(date):
format_str = '%Y-%m-%d' # The format
dt = datetime.datetime.strptime(date, format_str).date()
epi = pm.date_to_epiweek(dt)
epi_week = epi.week
epi_day = epi.day
if epi_day >= 3: # cut off is Tuesday
epi_week = epi_week + 1
return epi.year, epi_week, epi.day
def get_available_timezeros(project_name):
conn = util.authenticate()
project = [project for project in conn.projects if project.name == project_name][0]
project_timezeros = project.timezeros
timezero = []
for timezero_array in project_timezeros:
timezero += [timezero_array.timezero_date]
return timezero
def configure_JHU_data(df, target):
# convert matrix to repeating row format
df_truth = df.unstack()
df_truth = df_truth.reset_index()
# get epi data from date
df_truth['year'], df_truth['week'], df_truth['day'] = \
zip(*df_truth['level_0'].map(get_epi_data))
# rename columns
df_truth = df_truth.rename(columns={0: "value",
"level_1": "location_long"})
# Get state IDs
df_truth = df_truth.merge(fips_codes, left_on='location_long', right_on='location_name', how='left')
# Drop duplicate column
df_truth = df_truth.drop(['location_name'], axis=1)
# Drop NAs
df_truth = df_truth.dropna(subset=['location', 'value'])
# add leading zeros to state code
df_truth['location'] = df_truth['location'].apply(lambda x: '{0:0>2}'.format(x))
# Observed data on the seventh day
# or group by week for incident deaths
if target == 'Incident Deaths':
df_vis = df_truth.groupby(['week', 'location_long'], as_index=False).agg({'level_0': 'last',
'value': 'sum',
'year': 'last',
'day': 'last',
'location': 'last',
'abbreviation': 'last'})
else:
df_vis = df_truth
df_vis['week'] = df_vis['week'] + 1 # shift epiweek on axis
# add leading zeros to epi week
df_vis['week'] = df_vis['week'].apply(lambda x: '{0:0>2}'.format(x))
# define epiweek
df_vis['epiweek'] = df_vis['year'].astype(str) + df_vis['week']
# rename columns
df_truth_long = df_vis.rename(columns={"week": "epiweek",
"location": "unit",
"level_0": "date"})
# get timezero
df_truth_long['date'] = pd.to_datetime(df_truth_long['date'])
# initialize df_targets
df_targets = pd.DataFrame(columns=list(df_truth_long.columns).append('target'))
# use Saturday truth values
df_truth_values = df_truth_long[df_truth_long['day'] == 7]
# find week-ahead targets
for i in range(4):
weeks_ahead = i + 1 # add one to [0,3]
days_back = 5 + ((weeks_ahead - 1) * 7) # timezero is on Mondays
df_calc = df_truth_values # initialize df
# find timezero and target
df_calc['timezero'] = df_calc['date'] - datetime.timedelta(days=days_back)
if target == "Cumulative Deaths":
df_calc['target'] = "%i wk ahead cum death" % weeks_ahead
else:
df_calc['target'] = "%i wk ahead inc death" % weeks_ahead
# concatenate truth
df_targets = pd.concat([df_targets, df_calc])
# get epi data from Timezero
df_targets['timezero'] = df_targets['timezero'].astype(str)
df_targets['tz_year'], df_targets['tz_week'], df_targets['tz_day'] = \
zip(*df_targets['timezero'].map(get_epi_data_TZ))
# truth targets by timezero week
df_targets = df_targets[["tz_week", "unit", "target", "value"]]
# Map all timezeros in Zoltar to Corresponding weeks
df_map_wk_to_tz = pd.DataFrame(columns=['timezero'])
df_map_wk_to_tz['timezero'] = get_available_timezeros("COVID-19 Forecasts")
df_map_wk_to_tz['tz_year'], df_map_wk_to_tz['tz_week'], df_map_wk_to_tz['tz_day'] = \
zip(*df_map_wk_to_tz['timezero'].map(get_epi_data_TZ))
# Merge timezeros with truth values and targets
df_final = pd.merge(df_targets, df_map_wk_to_tz, how='right', on=['tz_week'])
# select columns
df_final = df_final[["timezero", "unit", "target", "value"]]
# drop empty rows
nan_value = float("NaN")
df_final.replace("", nan_value, inplace=True)
df_final.dropna(inplace=True)
return df_final
url = "https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_US.csv"
url_req = requests.get(url).content
df = pd.read_csv(io.StringIO(url_req.decode('utf-8')))
fips_codes = pd.read_csv('../data-locations/locations.csv')
# aggregate by state and nationally
state_agg = df.groupby(['Province_State']).sum()
us_nat = df.groupby(['Country_Region']).sum()
df_state_nat = state_agg.append(us_nat)
# drop unnecessary columns
cols = list(range(0, 6))
df_truth = df_state_nat.drop(df_state_nat.columns[cols], axis=1)
# calculate incidents from cumulative
df_truth_cumulative = df_truth
df_truth_incident = df_truth - df_truth.shift(periods=1, axis='columns')
# re-format files
df_cum_death = configure_JHU_data(df_truth_cumulative, "Cumulative Deaths")
df_inc_death = configure_JHU_data(df_truth_incident, "Incident Deaths")
# concatenate targers
zoltar_truth = pd.concat([df_cum_death, df_inc_death])
# write truth to csv
file_path = './zoltar-truth.csv'
zoltar_truth.to_csv(file_path, index=False)