In [1]:
import warnings

import pandas as pd
import requests
from tqdm.notebook import tqdm

In [2]:
def next_monday(date):
    """Return the first Monday on or after ``date``.

    A date that already is a Monday is returned unchanged; any other day is
    moved forward to the following Monday.
    """
    # Seven consecutive days (date .. date + 6) always contain exactly one Monday.
    week_end = date + pd.offsets.Day(6)
    mondays = pd.date_range(start=date, end=week_end, freq='W-MON')
    return mondays[0]

# Settings

In [3]:
# Date of the test set: forecast files whose timezero equals this date are used
# as test data, and the training windows are computed relative to it.
test_date = pd.to_datetime('2021-03-22')

In [4]:
# Models whose forecast files are dropped from the file list.
models_to_exclude = [
    'COVIDhub-ensemble',
    'COVIDhub-trained_ensemble',
]

# Location codes that are removed from the forecasts
# (inline notes: abbreviation, name).
locations_to_exclude = [
    "11",  # DC, District of Columbia
    "60",  # AS, American Samoa
    "66",  # GU, Guam
    "69",  # MP, Northern Mariana Islands
    "72",  # PR, Puerto Rico
    "74",  # UM, U.S. Minor Outlying Islands
    "78",  # VI, Virgin Islands
]

In [5]:
# Column types applied to every forecast CSV. `location` is read as text because
# the location filters compare it with string codes and check its string length.
dtype = {
    'target': str,
    'location': str,
    'type': str,
    'quantile': float,
    'value': float,
}

# Columns parsed as datetimes when a forecast CSV is read.
parse_dates = ['forecast_date', 'target_end_date']

# Load Files

In [6]:
# List every file of the forecast hub repository with one call to the GitHub
# git-trees API (recursive listing of the master branch).
url = "https://api.github.com/repos/reichlab/covid19-forecast-hub/git/trees/master?recursive=1"
r = requests.get(url, timeout=60)

# Fail right here with the HTTP status (e.g. when the API rate limit is hit)
# instead of with an opaque KeyError on res["tree"] in a later cell.
r.raise_for_status()
res = r.json()

# The API limits the size of recursive listings and marks incomplete responses
# with "truncated"; a partial listing would silently drop forecast files.
if res.get("truncated"):
    warnings.warn("GitHub returned a truncated file tree; some forecast files may be missing.")

In [7]:
# Paths of all CSV files located under data-processed/.
tree_paths = (entry["path"] for entry in res["tree"])
files = [
    path
    for path in tree_paths
    if path.startswith('data-processed/') and path.endswith('.csv')
]

In [8]:
# Preview the first five paths to check the expected layout.
files[:5]

['data-processed/AIpert-pwllnod/2020-12-21-AIpert-pwllnod.csv',
 'data-processed/AIpert-pwllnod/2020-12-28-AIpert-pwllnod.csv',
 'data-processed/AIpert-pwllnod/2021-01-04-AIpert-pwllnod.csv',
 'data-processed/AIpert-pwllnod/2021-01-11-AIpert-pwllnod.csv',
 'data-processed/AIpert-pwllnod/2021-01-18-AIpert-pwllnod.csv']

In [9]:
# Catalogue of forecast files: one row per CSV with the model, the forecast date
# taken from the file name, and the Monday ("timezero") the file is assigned to.
# Paths have the form 'data-processed/<model>/<YYYY-MM-DD>-<model>.csv'.
df_files = pd.DataFrame({'filename': files})

path_parts = df_files.filename.str.split('/')
df_files['model'] = path_parts.str[1]

# The first ten characters of the file name are the forecast date.
df_files['forecast_date'] = pd.to_datetime(path_parts.str[2].str[:10])

# Forecasts made between Tuesday and Monday are attributed to that Monday.
df_files['timezero'] = df_files.forecast_date.apply(next_monday)

# Explicit copy: a later cell adds a column to this frame, which would otherwise
# raise pandas' SettingWithCopyWarning on the filtered result.
df_files = df_files[~df_files.model.isin(models_to_exclude)].copy()

In [10]:
# Inspect the file catalogue (excluded models already removed).
df_files

Unnamed: 0,filename,model,forecast_date,timezero
0,data-processed/AIpert-pwllnod/2020-12-21-AIper...,AIpert-pwllnod,2020-12-21,2020-12-21
1,data-processed/AIpert-pwllnod/2020-12-28-AIper...,AIpert-pwllnod,2020-12-28,2020-12-28
2,data-processed/AIpert-pwllnod/2021-01-04-AIper...,AIpert-pwllnod,2021-01-04,2021-01-04
3,data-processed/AIpert-pwllnod/2021-01-11-AIper...,AIpert-pwllnod,2021-01-11,2021-01-11
4,data-processed/AIpert-pwllnod/2021-01-18-AIper...,AIpert-pwllnod,2021-01-18,2021-01-18
...,...,...,...,...
2913,data-processed/epiforecasts-ensemble1/2021-02-...,epiforecasts-ensemble1,2021-02-22,2021-02-22
2914,data-processed/epiforecasts-ensemble1/2021-03-...,epiforecasts-ensemble1,2021-03-01,2021-03-01
2915,data-processed/epiforecasts-ensemble1/2021-03-...,epiforecasts-ensemble1,2021-03-08,2021-03-08
2916,data-processed/epiforecasts-ensemble1/2021-03-...,epiforecasts-ensemble1,2021-03-15,2021-03-15


## Test Data

In [11]:
# Targets kept from the forecast files: horizons 1-4 for each of the four
# target types, grouped by type in the order given below.
VALID_TARGETS = [
    f"{horizon} wk ahead {kind}"
    for kind in ("inc death", "cum death", "inc case", "cum case")
    for horizon in range(1, 5)
]

In [12]:
# Forecast files assigned to the test date; these are loaded as test data.
df_test_files = df_files[df_files.timezero == test_date]

In [14]:
# Download every forecast file of the test date, keep only the rows for the
# targets in VALID_TARGETS and tag them with the model that produced them.
dfs_test = []

for row in tqdm(df_test_files.itertuples(index=False), total=df_test_files.shape[0]):
    df_temp = pd.read_csv('https://github.com/reichlab/covid19-forecast-hub/raw/master/' + row.filename,
                          dtype=dtype, parse_dates=parse_dates)
    # `assign` returns a new frame, so the model column is not written into a
    # filtered slice (which triggers pandas' SettingWithCopyWarning).
    df_temp = df_temp[df_temp.target.isin(VALID_TARGETS)].assign(model=row.model)
    dfs_test.append(df_temp)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=69.0), HTML(value='')))




In [15]:
# Stack the per-file frames into one long table of test forecasts.
df_test = pd.concat(dfs_test)

In [16]:
# (rows, columns) of the test forecasts before any location filtering.
df_test.shape

(2765311, 8)

In [17]:
# only consider US + 50 states:
# two-character location codes, minus the codes listed in locations_to_exclude
has_two_char_code = df_test.location.str.len() == 2
is_excluded = df_test.location.isin(locations_to_exclude)
df_test = df_test[has_two_char_code & ~is_excluded]

In [18]:
# ensure that for all targets each model provides forecasts for all locations:
# a (target, model) pair is kept only if it covers 51 distinct locations
locations_per_pair = df_test.groupby(['target', 'model'])['location'].transform('nunique')
df_test = df_test[locations_per_pair == 51]

In [19]:
# target -> array of the models that provide test forecasts for that target
available_models = df_test.groupby('target')['model'].unique().to_dict()

In [21]:
# available_models

## Training Data

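For test date d and horizon h, the forecasts used for training come from a four-week window of forecast dates: start = d - 4 weeks - (h - 1) weeks, end = d - h weeks.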

In [22]:
# Maps each horizon (in weeks) to [first, last] forecast date of its training
# window, both measured back from the test date.
four_weeks = pd.Timedelta(weeks=4)
h_dict = {}
for h in range(1, 5):
    window_end = test_date - pd.Timedelta(weeks=h)
    window_start = test_date - four_weeks - pd.Timedelta(weeks=(h - 1))
    h_dict[h] = [window_start, window_end]

In [23]:
# Show the [start, end] training window of every horizon.
h_dict

{1: [Timestamp('2021-02-22 00:00:00'), Timestamp('2021-03-15 00:00:00')],
 2: [Timestamp('2021-02-15 00:00:00'), Timestamp('2021-03-08 00:00:00')],
 3: [Timestamp('2021-02-08 00:00:00'), Timestamp('2021-03-01 00:00:00')],
 4: [Timestamp('2021-02-01 00:00:00'), Timestamp('2021-02-22 00:00:00')]}

In [24]:
def relevant_horizons(d):
    """Return the horizons (1-4) whose training window in ``h_dict`` contains ``d``.

    Both window bounds are inclusive. The horizons are returned in increasing
    order; the list is empty when ``d`` lies in none of the windows.
    """
    return [h for h in range(1, 5) if h_dict[h][0] <= d <= h_dict[h][1]]

In [26]:
# Attach to every file the horizons for which its timezero falls into the
# training window. `assign` builds a new frame, so the column is not written
# into a frame that was itself produced by filtering (which triggers pandas'
# SettingWithCopyWarning).
df_files = df_files.assign(horizons=df_files.timezero.apply(relevant_horizons))

# only keep relevant training data (files that serve at least one horizon)
df_files = df_files[df_files.horizons.apply(len) > 0]

In [27]:
# Download every training file and keep only the targets of the horizons for
# which that file lies inside the training window.
dfs = []
for row in tqdm(df_files.itertuples(index=False), total=df_files.shape[0]):
    # Per-file target list, kept in its own name so the notebook-level
    # VALID_TARGETS (used when loading the test data) is not overwritten.
    row_targets = [
        f"{horizon} wk ahead {kind}"
        for kind in ("inc death", "cum death", "inc case", "cum case")
        for horizon in row.horizons
    ]
    df_temp = pd.read_csv('https://github.com/reichlab/covid19-forecast-hub/raw/master/' + row.filename,
                          dtype=dtype, parse_dates=parse_dates)
    # `assign` returns a new frame, so the model column is not written into a
    # filtered slice (which triggers pandas' SettingWithCopyWarning).
    df_temp = df_temp[df_temp.target.isin(row_targets)].assign(model=row.model)
    dfs.append(df_temp)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=453.0), HTML(value='')))




In [28]:
# Stack the per-file frames into one long table of training forecasts.
df = pd.concat(dfs)

# Data Cleaning

In [29]:
# Same location restriction as for the test data: two-character location
# codes, minus the codes listed in locations_to_exclude.
has_two_char_code = df.location.str.len() == 2
is_excluded = df.location.isin(locations_to_exclude)
df = df[has_two_char_code & ~is_excluded]

In [30]:
# Sanity check: number of distinct locations left after filtering.
df.location.nunique() # US + 50 states

51

In [31]:
# Keep a (target, model, location) combination only if it has forecasts for
# 4 distinct target end dates.
end_dates_per_series = df.groupby(['target', 'model', 'location'])['target_end_date'].transform('nunique')
df = df[end_dates_per_series == 4]

In [32]:
# Keep a (target, model, target_end_date) combination only if it has forecasts
# for 51 distinct locations.
locations_per_date = df.groupby(['target', 'model', 'target_end_date'])['location'].transform('nunique')
df = df[locations_per_date == 51]

In [33]:
# (rows, columns) of the training data after the completeness filters.
df.shape

(1233384, 8)

In [34]:
# Preview the first rows of the cleaned training data.
df.head()

Unnamed: 0,forecast_date,target,target_end_date,quantile,type,value,location,model
72,2021-01-31,4 wk ahead inc death,2021-02-27,0.01,quantile,13527.57889,US,BPagano-RtDriven
73,2021-01-31,4 wk ahead inc death,2021-02-27,0.025,quantile,14229.39794,US,BPagano-RtDriven
74,2021-01-31,4 wk ahead inc death,2021-02-27,0.05,quantile,14856.78719,US,BPagano-RtDriven
75,2021-01-31,4 wk ahead inc death,2021-02-27,0.1,quantile,15591.43945,US,BPagano-RtDriven
76,2021-01-31,4 wk ahead inc death,2021-02-27,0.15,quantile,16090.33317,US,BPagano-RtDriven


In [35]:
# Targets that remain in the training data after cleaning.
df.target.unique()

array(['4 wk ahead inc death', '4 wk ahead cum death',
       '4 wk ahead inc case', '3 wk ahead inc death',
       '3 wk ahead cum death', '3 wk ahead inc case',
       '2 wk ahead inc death', '2 wk ahead cum death',
       '2 wk ahead inc case', '1 wk ahead inc death',
       '1 wk ahead cum death', '1 wk ahead inc case'], dtype=object)

In [36]:
# Shape check; no rows have been removed since the previous check.
df.shape

(1233384, 8)

In [37]:
# Number of distinct models per target in the training data.
df.groupby('target').model.nunique()

target
1 wk ahead cum death    29
1 wk ahead inc case     28
1 wk ahead inc death    31
2 wk ahead cum death    26
2 wk ahead inc case     25
2 wk ahead inc death    26
3 wk ahead cum death    28
3 wk ahead inc case     24
3 wk ahead inc death    28
4 wk ahead cum death    28
4 wk ahead inc case     23
4 wk ahead inc death    27
Name: model, dtype: int64

In [38]:
# Row count before restricting to the models available for the test date.
df.shape

(1233384, 8)

In [39]:
# Keep only training forecasts whose (target, model) pair is also available
# for the test date. Set membership on the zipped columns replaces a row-wise
# DataFrame.apply, which is very slow on a frame of this size; a target without
# an entry in available_models is dropped instead of raising a KeyError.
allowed_pairs = {
    (target, model)
    for target, models in available_models.items()
    for model in models
}
is_available = pd.Series(
    [pair in allowed_pairs for pair in zip(df['target'], df['model'])],
    index=df.index,
    dtype=bool,
)
df = df[is_available]

In [40]:
# Row count after restricting to the models available for the test date.
df.shape

(1202376, 8)

In [42]:
# check if there are models used for training that are not available for the test date
# (an empty list means every training (target, model) pair also exists in the test data)
train_models = df.groupby('target').model.unique().to_dict()

train_pairs = [(target, model) for target, models in train_models.items() for model in models]
test_pairs = [(target, model) for target, models in available_models.items() for model in models]

[pair for pair in train_pairs if pair not in test_pairs]

[]

In [43]:
# One row per (target, model) pair that remains in the training data.
pd.DataFrame(
    [(target, model) for target, models in train_models.items() for model in models],
    columns=['target', 'model'],
)

Unnamed: 0,target,model
0,1 wk ahead cum death,BPagano-RtDriven
1,1 wk ahead cum death,CEID-Walk
2,1 wk ahead cum death,COVIDhub-baseline
3,1 wk ahead cum death,CU-nochange
4,1 wk ahead cum death,CU-scenario_high
...,...,...
307,4 wk ahead inc death,UA-EpiCovDA
308,4 wk ahead inc death,UCSD_NEU-DeepGLEAM
309,4 wk ahead inc death,UMass-MechBayes
310,4 wk ahead inc death,USC-SI_kJalpha


# Export

In [44]:
# Write the cleaned training forecasts to disk (row labels are not exported).
df.to_csv('df_train.csv', index=False)

In [45]:
# Write the test forecasts to disk (row labels are not exported).
df_test.to_csv('df_test.csv', index=False)