In [113]:
import pandas as pd
import requests

In [114]:
def next_monday(date):
    """Return the first Monday on or after ``date``.

    A date that already falls on a Monday is returned as that same day.
    """
    # A window of seven consecutive days starting at `date` contains exactly one Monday.
    window_end = date + pd.offsets.Day(6)
    mondays_in_window = pd.date_range(start=date, end=window_end, freq='W-MON')
    return mondays_in_window[0]

# Settings

In [115]:
# Forecast week ("timezero") that is held out as the test set.
test_date = pd.to_datetime('2021-03-22')

# Each file's timezero is the Monday on or after its forecast date, so a test date on any
# other weekday would match no files and only fail much later when concatenating nothing.
assert test_date.weekday() == 0, f"test_date must be a Monday, got {test_date.date()}"

In [116]:
# Models that are dropped from the list of forecast files.
models_to_exclude = [
    'COVIDhub-ensemble',
    'COVIDhub-trained_ensemble',
]

# Location codes that are dropped from the forecasts (code: abbreviation, name).
locations_to_exclude = [
    "11",  # DC, District of Columbia
    "60",  # AS, American Samoa
    "66",  # GU, Guam
    "69",  # MP, Northern Mariana Islands
    "72",  # PR, Puerto Rico
    "74",  # UM, U.S. Minor Outlying Islands
    "78",  # VI, Virgin Islands
]

# Load Files

In [118]:
# List every path in the forecast hub repository with a single recursive Git-trees API call.
url = "https://api.github.com/repos/reichlab/covid19-forecast-hub/git/trees/master?recursive=1"
# A timeout keeps the cell from hanging indefinitely if the API does not answer.
r = requests.get(url, timeout=60)
# Fail here on HTTP errors (e.g. rate limiting) instead of with a KeyError on res["tree"] later.
r.raise_for_status()
res = r.json()

if res.get("truncated"):
    # GitHub caps the size of recursive tree listings; a truncated listing would silently
    # drop forecast files from everything computed below.
    print("WARNING: GitHub returned a truncated tree; the list of forecast files is incomplete.")

In [119]:
# Paths of all CSV files located under data-processed/.
tree_paths = [entry["path"] for entry in res["tree"]]
files = [
    path
    for path in tree_paths
    if path.startswith('data-processed/') and path.endswith('.csv')
]

In [148]:
files[:5]

['data-processed/AIpert-pwllnod/2020-12-21-AIpert-pwllnod.csv',
 'data-processed/AIpert-pwllnod/2020-12-28-AIpert-pwllnod.csv',
 'data-processed/AIpert-pwllnod/2021-01-04-AIpert-pwllnod.csv',
 'data-processed/AIpert-pwllnod/2021-01-11-AIpert-pwllnod.csv',
 'data-processed/AIpert-pwllnod/2021-01-18-AIpert-pwllnod.csv']

In [24]:
# models = [file["path"].split('/')[1] for file in res["tree"] if (file["path"].startswith('data-processed/') and '.' not in file["path"])]

In [128]:
# One row per forecast file. Paths look like
# data-processed/<model>/<YYYY-MM-DD>-<model>.csv, so the model is the second path
# component and the forecast date is the first 10 characters of the file name.
df_files = pd.DataFrame({'filename': files})

path_parts = df_files['filename'].apply(lambda path: path.split('/'))
df_files['model'] = path_parts.apply(lambda parts: parts[1])

date_prefix = path_parts.apply(lambda parts: parts[2][:10])
df_files['forecast_date'] = pd.to_datetime(date_prefix)

# timezero: the Monday on or after the forecast date.
df_files['timezero'] = df_files['forecast_date'].apply(next_monday)

is_excluded_model = df_files['model'].isin(models_to_exclude)
df_files = df_files[~is_excluded_model].copy()

In [129]:
df_files

Unnamed: 0,filename,model,forecast_date,timezero
0,data-processed/AIpert-pwllnod/2020-12-21-AIper...,AIpert-pwllnod,2020-12-21,2020-12-21
1,data-processed/AIpert-pwllnod/2020-12-28-AIper...,AIpert-pwllnod,2020-12-28,2020-12-28
2,data-processed/AIpert-pwllnod/2021-01-04-AIper...,AIpert-pwllnod,2021-01-04,2021-01-04
3,data-processed/AIpert-pwllnod/2021-01-11-AIper...,AIpert-pwllnod,2021-01-11,2021-01-11
4,data-processed/AIpert-pwllnod/2021-01-18-AIper...,AIpert-pwllnod,2021-01-18,2021-01-18
...,...,...,...,...
2913,data-processed/epiforecasts-ensemble1/2021-02-...,epiforecasts-ensemble1,2021-02-22,2021-02-22
2914,data-processed/epiforecasts-ensemble1/2021-03-...,epiforecasts-ensemble1,2021-03-01,2021-03-01
2915,data-processed/epiforecasts-ensemble1/2021-03-...,epiforecasts-ensemble1,2021-03-08,2021-03-08
2916,data-processed/epiforecasts-ensemble1/2021-03-...,epiforecasts-ensemble1,2021-03-15,2021-03-15


## Test Data

In [133]:
# Target names kept from the forecast files: horizons of 1-4 weeks for each of
# incident/cumulative deaths and incident/cumulative cases.
VALID_TARGETS = [
    f"{horizon} wk ahead {quantity}"
    for quantity in ("inc death", "cum death", "inc case", "cum case")
    for horizon in range(1, 5)
]

In [134]:
# Files whose forecast week (timezero) is the held-out test week.
is_test_week = df_files['timezero'] == test_date
df_test_files = df_files[is_test_week]

In [137]:
# Column types and date columns for the forecast CSVs. They are defined in this cell because
# it is the first one (top to bottom) that reads forecast files; without them a fresh
# "Restart & Run All" fails here with a NameError.
# `location` is read as a string because the filters applied later compare it to string codes.
dtype = {'target': str, 'location': str, 'type': str, 'quantile': float, 'value': float}
parse_dates = ['forecast_date', 'target_end_date']

# Download every forecast file of the test week, keep only the valid targets and tag each
# row with the model the file belongs to.
dfs_test = []

for _, row in df_test_files.iterrows():
    print(row['filename'])
    df_temp = pd.read_csv('https://github.com/reichlab/covid19-forecast-hub/raw/master/' + row['filename'],
                         dtype=dtype, parse_dates=parse_dates)
    # Copy the filtered rows so that adding a column does not trigger SettingWithCopyWarning.
    df_temp = df_temp[df_temp.target.isin(VALID_TARGETS)].copy()
    df_temp['model'] = row['model']
    dfs_test.append(df_temp)

data-processed/AIpert-pwllnod/2021-03-22-AIpert-pwllnod.csv
data-processed/BPagano-RtDriven/2021-03-21-BPagano-RtDriven.csv
data-processed/CEID-Walk/2021-03-22-CEID-Walk.csv
data-processed/CMU-TimeSeries/2021-03-22-CMU-TimeSeries.csv
data-processed/COVIDhub-baseline/2021-03-22-COVIDhub-baseline.csv
data-processed/CU-nochange/2021-03-21-CU-nochange.csv
data-processed/CU-scenario_high/2021-03-21-CU-scenario_high.csv
data-processed/CU-scenario_low/2021-03-21-CU-scenario_low.csv
data-processed/CU-scenario_mid/2021-03-21-CU-scenario_mid.csv
data-processed/CU-select/2021-03-21-CU-select.csv
data-processed/Caltech-CS156/2021-03-21-Caltech-CS156.csv
data-processed/Columbia_UNC-SurvCon/2021-03-21-Columbia_UNC-SurvCon.csv
data-processed/Covid19Sim-Simulator/2021-03-21-Covid19Sim-Simulator.csv
data-processed/CovidAnalytics-DELPHI/2021-03-22-CovidAnalytics-DELPHI.csv
data-processed/DDS-NBDS/2021-03-22-DDS-NBDS.csv
data-processed/FAIR-NRAR/2021-03-17-FAIR-NRAR.csv
data-processed/FAIR-NRAR/2021-03-1

In [151]:
# Stack the per-file frames into one frame; row labels are not reset, so they repeat across files.
df_test = pd.concat(dfs_test)

In [152]:
df_test.shape

(2765311, 8)

In [153]:
# Keep rows with a two-character location code that is not on the exclusion list.
has_two_char_code = df_test.location.str.len() == 2
is_excluded_location = df_test.location.isin(locations_to_exclude)
df_test = df_test[has_two_char_code & ~is_excluded_location]

In [154]:
# Keep a (target, model) combination only if it has forecasts for exactly 51 distinct locations.
locations_per_target_model = df_test.groupby(['target', 'model'])['location'].transform('nunique')
df_test = df_test[locations_per_target_model == 51]

In [155]:
df_test.shape

(373779, 8)

In [160]:
# Mapping: target name -> array of the models that remain in the test data for that target.
models_by_target = df_test.groupby(['target'])['model'].unique()
available_models = dict(models_by_target)

In [161]:
available_models

{'1 wk ahead cum death': array(['AIpert-pwllnod', 'BPagano-RtDriven', 'CEID-Walk',
        'COVIDhub-baseline', 'CU-nochange', 'CU-scenario_high',
        'CU-scenario_low', 'CU-scenario_mid', 'CU-select',
        'Covid19Sim-Simulator', 'CovidAnalytics-DELPHI', 'DDS-NBDS',
        'Geneva-DetGrowth', 'IHME-CurveFit', 'IUPUI-HkPrMobiDyR',
        'JHUAPL-Bucky', 'Karlen-pypm', 'LANL-GrowthRate', 'LNQ-ens1',
        'MIT_CritData-GBCF', 'MOBS-GLEAM_COVID', 'Microsoft-DeepSTIA',
        'OliverWyman-Navigator', 'PSI-DRAFT', 'RobertWalraven-ESG',
        'SigSci-TS', 'SteveMcConnell-CovidComplete', 'TTU-squider',
        'UA-EpiCovDA', 'UCLA-SuEIR', 'UCSD_NEU-DeepGLEAM',
        'UMass-MechBayes', 'USC-SI_kJalpha', 'UT-Mobility',
        'WalmartLabsML-LogForecasting', 'epiforecasts-ensemble1'],
       dtype=object),
 '1 wk ahead inc case': array(['BPagano-RtDriven', 'CEID-Walk', 'COVIDhub-baseline',
        'CU-nochange', 'CU-scenario_high', 'CU-scenario_low',
        'CU-scenario_mid', 

## Training Data

For a test date d and a horizon of h weeks, the training window of forecast dates starts at d - 4 weeks - (h - 1) weeks and ends at d - h weeks.

In [94]:
# Maps each horizon h (in weeks) to [first, last] training forecast date for the test date:
# the window ends h weeks before the test date and starts three weeks before that end.
h_dict = {
    h: [
        test_date - pd.Timedelta(weeks=4) - pd.Timedelta(weeks=(h - 1)),
        test_date - pd.Timedelta(weeks=h),
    ]
    for h in range(1, 5)
}

In [95]:
h_dict

{1: [Timestamp('2021-02-22 00:00:00'), Timestamp('2021-03-15 00:00:00')],
 2: [Timestamp('2021-02-15 00:00:00'), Timestamp('2021-03-08 00:00:00')],
 3: [Timestamp('2021-02-08 00:00:00'), Timestamp('2021-03-01 00:00:00')],
 4: [Timestamp('2021-02-01 00:00:00'), Timestamp('2021-02-22 00:00:00')]}

In [96]:
def relevant_horizons(d, windows=None):
    """Return the horizons whose training window contains the date ``d``.

    Parameters
    ----------
    d : timestamp-like
        Date to look up; it is compared against the window bounds.
    windows : dict, optional
        Mapping ``horizon -> [start, end]`` with both bounds inclusive. When omitted, the
        notebook-level ``h_dict`` is used, so existing one-argument calls behave as before.

    Returns
    -------
    list
        The matching horizons in ascending order; empty if ``d`` is in no window.
    """
    if windows is None:
        # Looked up at call time so the function always sees the current h_dict.
        windows = h_dict
    return [h for h, (start, end) in sorted(windows.items()) if start <= d <= end]

In [98]:
relevant_horizons((test_date - pd.Timedelta(weeks=5)))

[2, 3, 4]

In [238]:
# Horizons for which each file's forecast week lies inside the training window.
horizons_per_file = df_files.timezero.apply(relevant_horizons)
df_files['horizons'] = horizons_per_file

# Only keep relevant training data, i.e. files that serve at least one horizon.
# NOTE: this rebinds df_files, so from here on it no longer contains the test-week files.
serves_some_horizon = horizons_per_file.apply(len) > 0
df_files = df_files[serves_some_horizon]

In [110]:
# Explicit column types and date columns passed to pd.read_csv for the forecast files.
dtype = {
    'target': str,
    'location': str,
    'type': str,
    'quantile': float,
    'value': float,
}
parse_dates = [
    'forecast_date',
    'target_end_date',
]

In [249]:
# Download every training file and keep only the targets whose horizon is relevant for
# that file's forecast week; each row is tagged with the model the file belongs to.
dfs = []
for _, row in df_files.iterrows():
    print(row['filename'])
    print(row['horizons'])
    # A local name is used on purpose: rebinding the global VALID_TARGETS in this loop would
    # leave it holding only the last file's horizons and silently change the test-data cell
    # if that cell is re-run afterwards.
    row_targets = [f"{horizon} wk ahead {quantity}"
                   for quantity in ("inc death", "cum death", "inc case", "cum case")
                   for horizon in row['horizons']]
    df_temp = pd.read_csv('https://github.com/reichlab/covid19-forecast-hub/raw/master/' + row['filename'],
                         dtype=dtype, parse_dates=parse_dates)
    # Copy the filtered rows so that adding a column does not trigger SettingWithCopyWarning.
    df_temp = df_temp[df_temp.target.isin(row_targets)].copy()
    df_temp['model'] = row['model']
    dfs.append(df_temp)

data-processed/AIpert-pwllnod/2021-02-01-AIpert-pwllnod.csv
[4]
data-processed/AIpert-pwllnod/2021-02-08-AIpert-pwllnod.csv
[3, 4]
data-processed/AIpert-pwllnod/2021-02-15-AIpert-pwllnod.csv
[2, 3, 4]
data-processed/AIpert-pwllnod/2021-02-22-AIpert-pwllnod.csv
[1, 2, 3, 4]
data-processed/AIpert-pwllnod/2021-03-01-AIpert-pwllnod.csv
[1, 2, 3]
data-processed/AIpert-pwllnod/2021-03-08-AIpert-pwllnod.csv
[1, 2]
data-processed/AIpert-pwllnod/2021-03-15-AIpert-pwllnod.csv
[1]
data-processed/BPagano-RtDriven/2021-01-31-BPagano-RtDriven.csv
[4]
data-processed/BPagano-RtDriven/2021-02-07-BPagano-RtDriven.csv
[3, 4]
data-processed/BPagano-RtDriven/2021-02-14-BPagano-RtDriven.csv
[2, 3, 4]
data-processed/BPagano-RtDriven/2021-02-21-BPagano-RtDriven.csv
[1, 2, 3, 4]
data-processed/BPagano-RtDriven/2021-02-28-BPagano-RtDriven.csv
[1, 2, 3]
data-processed/BPagano-RtDriven/2021-03-07-BPagano-RtDriven.csv
[1, 2]
data-processed/BPagano-RtDriven/2021-03-14-BPagano-RtDriven.csv
[1]
data-processed/CEID-Wa

data-processed/FAIR-NRAR/2021-02-23-FAIR-NRAR.csv
[1, 2, 3]
data-processed/FAIR-NRAR/2021-02-24-FAIR-NRAR.csv
[1, 2, 3]
data-processed/FAIR-NRAR/2021-02-25-FAIR-NRAR.csv
[1, 2, 3]
data-processed/FAIR-NRAR/2021-02-28-FAIR-NRAR.csv
[1, 2, 3]
data-processed/FAIR-NRAR/2021-03-01-FAIR-NRAR.csv
[1, 2, 3]
data-processed/FAIR-NRAR/2021-03-02-FAIR-NRAR.csv
[1, 2]
data-processed/FAIR-NRAR/2021-03-03-FAIR-NRAR.csv
[1, 2]
data-processed/FAIR-NRAR/2021-03-04-FAIR-NRAR.csv
[1, 2]
data-processed/FAIR-NRAR/2021-03-05-FAIR-NRAR.csv
[1, 2]
data-processed/FAIR-NRAR/2021-03-07-FAIR-NRAR.csv
[1, 2]
data-processed/FAIR-NRAR/2021-03-08-FAIR-NRAR.csv
[1, 2]
data-processed/FAIR-NRAR/2021-03-09-FAIR-NRAR.csv
[1]
data-processed/FAIR-NRAR/2021-03-10-FAIR-NRAR.csv
[1]
data-processed/FAIR-NRAR/2021-03-11-FAIR-NRAR.csv
[1]
data-processed/FAIR-NRAR/2021-03-12-FAIR-NRAR.csv
[1]
data-processed/FAIR-NRAR/2021-03-13-FAIR-NRAR.csv
[1]
data-processed/FAIR-NRAR/2021-03-15-FAIR-NRAR.csv
[1]
data-processed/FDANIHASU-Sweight/2

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


data-processed/IHME-CurveFit/2021-02-05-IHME-CurveFit.csv
[3, 4]
data-processed/IHME-CurveFit/2021-02-12-IHME-CurveFit.csv
[2, 3, 4]
data-processed/IHME-CurveFit/2021-02-22-IHME-CurveFit.csv
[1, 2, 3, 4]
data-processed/IHME-CurveFit/2021-02-25-IHME-CurveFit.csv
[1, 2, 3]
data-processed/IHME-CurveFit/2021-03-15-IHME-CurveFit.csv
[1]
data-processed/IUPUI-HkPrMobiDyR/2021-02-22-IUPUI-HkPrMobiDyR.csv
[1, 2, 3, 4]
data-processed/IUPUI-HkPrMobiDyR/2021-03-15-IUPUI-HkPrMobiDyR.csv
[1]
data-processed/IowaStateLW-STEM/2021-01-31-IowaStateLW-STEM.csv
[4]
data-processed/IowaStateLW-STEM/2021-02-07-IowaStateLW-STEM.csv
[3, 4]
data-processed/IowaStateLW-STEM/2021-02-14-IowaStateLW-STEM.csv
[2, 3, 4]
data-processed/IowaStateLW-STEM/2021-02-21-IowaStateLW-STEM.csv
[1, 2, 3, 4]
data-processed/IowaStateLW-STEM/2021-02-28-IowaStateLW-STEM.csv
[1, 2, 3]
data-processed/IowaStateLW-STEM/2021-03-07-IowaStateLW-STEM.csv
[1, 2]
data-processed/IowaStateLW-STEM/2021-03-14-IowaStateLW-STEM.csv
[1]
data-processed

data-processed/OliverWyman-Navigator/2021-02-21-OliverWyman-Navigator.csv
[1, 2, 3, 4]
data-processed/OliverWyman-Navigator/2021-02-28-OliverWyman-Navigator.csv
[1, 2, 3]
data-processed/OliverWyman-Navigator/2021-03-07-OliverWyman-Navigator.csv
[1, 2]
data-processed/OliverWyman-Navigator/2021-03-14-OliverWyman-Navigator.csv
[1]
data-processed/OneQuietNight-ML/2021-01-31-OneQuietNight-ML.csv
[4]
data-processed/OneQuietNight-ML/2021-02-07-OneQuietNight-ML.csv
[3, 4]
data-processed/OneQuietNight-ML/2021-02-14-OneQuietNight-ML.csv
[2, 3, 4]
data-processed/OneQuietNight-ML/2021-02-21-OneQuietNight-ML.csv
[1, 2, 3, 4]
data-processed/OneQuietNight-ML/2021-02-28-OneQuietNight-ML.csv
[1, 2, 3]
data-processed/OneQuietNight-ML/2021-03-07-OneQuietNight-ML.csv
[1, 2]
data-processed/PSI-DRAFT/2021-02-01-PSI-DRAFT.csv
[4]
data-processed/PSI-DRAFT/2021-02-08-PSI-DRAFT.csv
[3, 4]
data-processed/PSI-DRAFT/2021-02-15-PSI-DRAFT.csv
[2, 3, 4]
data-processed/PSI-DRAFT/2021-02-22-PSI-DRAFT.csv
[1, 2, 3, 4]
d

data-processed/USC-SI_kJalpha/2021-01-31-USC-SI_kJalpha.csv
[4]
data-processed/USC-SI_kJalpha/2021-02-07-USC-SI_kJalpha.csv
[3, 4]
data-processed/USC-SI_kJalpha/2021-02-14-USC-SI_kJalpha.csv
[2, 3, 4]
data-processed/USC-SI_kJalpha/2021-02-21-USC-SI_kJalpha.csv
[1, 2, 3, 4]
data-processed/USC-SI_kJalpha/2021-02-28-USC-SI_kJalpha.csv
[1, 2, 3]
data-processed/USC-SI_kJalpha/2021-03-07-USC-SI_kJalpha.csv
[1, 2]
data-processed/USC-SI_kJalpha/2021-03-14-USC-SI_kJalpha.csv
[1]
data-processed/UVA-Ensemble/2021-02-01-UVA-Ensemble.csv
[4]
data-processed/UVA-Ensemble/2021-02-15-UVA-Ensemble.csv
[2, 3, 4]
data-processed/UVA-Ensemble/2021-02-22-UVA-Ensemble.csv
[1, 2, 3, 4]
data-processed/UVA-Ensemble/2021-03-01-UVA-Ensemble.csv
[1, 2, 3]
data-processed/UVA-Ensemble/2021-03-08-UVA-Ensemble.csv
[1, 2]
data-processed/UVA-Ensemble/2021-03-15-UVA-Ensemble.csv
[1]
data-processed/UpstateSU-GRU/2021-02-01-UpstateSU-GRU.csv
[4]
data-processed/UpstateSU-GRU/2021-02-08-UpstateSU-GRU.csv
[3, 4]
data-processed

In [251]:
# Stack the per-file training frames into one frame; row labels are not reset, so they repeat across files.
df = pd.concat(dfs)

# Data Cleaning

In [259]:
# Keep rows with a two-character location code that is not on the exclusion list.
has_two_char_code = df.location.str.len() == 2
is_excluded_location = df.location.isin(locations_to_exclude)
df = df[has_two_char_code & ~is_excluded_location]

In [57]:
df.location.nunique() # US + 50 states

51

In [74]:
# Keep a (target, model, location) series only if it has exactly 4 distinct target end dates.
end_dates_per_series = df.groupby(['target', 'model', 'location'])['target_end_date'].transform('nunique')
df = df[end_dates_per_series == 4]

In [77]:
# Keep a (target, model, target_end_date) group only if it has exactly 51 distinct locations.
locations_per_group = df.groupby(['target', 'model', 'target_end_date'])['location'].transform('nunique')
df = df[locations_per_group == 51]

In [58]:
df.shape

(1677243, 8)

In [65]:
df.head()

Unnamed: 0,forecast_date,target,target_end_date,quantile,type,value,location,model
0,2021-02-01,4 wk ahead cum death,2021-02-27,0.025,quantile,500175.498495,US,AIpert-pwllnod
1,2021-02-01,4 wk ahead cum death,2021-02-27,0.25,quantile,522591.962402,US,AIpert-pwllnod
2,2021-02-01,4 wk ahead cum death,2021-02-27,0.75,quantile,565647.261165,US,AIpert-pwllnod
3,2021-02-01,4 wk ahead cum death,2021-02-27,0.975,quantile,625287.306473,US,AIpert-pwllnod
4,2021-02-01,4 wk ahead cum death,2021-02-27,,point,534353.948867,US,AIpert-pwllnod


In [66]:
df.target.unique()

array(['4 wk ahead cum death', '4 wk ahead inc death',
       '3 wk ahead cum death', '3 wk ahead inc death',
       '2 wk ahead cum death', '2 wk ahead inc death',
       '1 wk ahead cum death', '1 wk ahead inc death',
       '4 wk ahead inc case', '3 wk ahead inc case',
       '2 wk ahead inc case', '1 wk ahead inc case'], dtype=object)

In [75]:
df.shape

(1213800, 8)

In [87]:
df.groupby('target').model.nunique()

target
1 wk ahead cum death    29
1 wk ahead inc case     28
1 wk ahead inc death    31
2 wk ahead cum death    26
2 wk ahead inc case     25
2 wk ahead inc death    26
3 wk ahead cum death    27
3 wk ahead inc case     24
3 wk ahead inc death    27
4 wk ahead cum death    27
4 wk ahead inc case     23
4 wk ahead inc death    26
Name: model, dtype: int64

# Export

In [264]:
# Write the cleaned training frame to df_train.csv in the working directory, without the row index.
df.to_csv('df_train.csv', index=False)

In [2]:
# Reload the exported training data with the same column types that were used to build it.
# Without parse_dates the two date columns come back as plain strings, and without an explicit
# dtype pandas infers the types itself (NOTE(review): this may turn numeric-looking location
# codes into integers - confirm against the exported file).
df = pd.read_csv(
    'df_train.csv',
    dtype={'target': str, 'location': str, 'type': str, 'model': str,
           'quantile': float, 'value': float},
    parse_dates=['forecast_date', 'target_end_date'],
)