In [158]:
import glob
import os
import warnings

import pandas as pd
import requests
from tqdm.auto import tqdm
tqdm.pandas()

# Settings

In [105]:
# Forecast targets considered in this notebook: horizons of 1-4 weeks for each
# target type, ordered by target type first and by horizon second.
VALID_TARGETS = [
    f"{horizon} wk ahead {target_type}"
    for target_type in ("inc death", "cum death", "inc case")
    for horizon in range(1, 5)
]

# Models that are removed from the file listing before any forecast is loaded.
MODELS_TO_EXCLUDE = [
    'COVIDhub-ensemble',
    'COVIDhub-trained_ensemble',
    'CU-nochange',
    'CU-scenario_high',
    'CU-scenario_low',
    'CU-scenario_mid',
]

# Two-character location codes that are dropped during validation.
LOCATIONS_TO_EXCLUDE = [
    "11",  # DC, District of Columbia
    "60",  # AS, American Samoa
    "66",  # GU, Guam
    "69",  # MP, Northern Mariana Islands
    "72",  # PR, Puerto Rico
    "74",  # UM, U.S. Minor Outlying Islands
    "78",  # VI, Virgin Islands
]

# Load Files

In [107]:
def next_monday(date):
    """Return the first Monday on or after ``date``.

    ``date`` must support adding a pandas offset (e.g. a ``pd.Timestamp``).
    A date that already is a Monday is returned unchanged.
    """
    # A window of seven consecutive days always contains exactly one Monday.
    window_end = date + pd.offsets.Day(6)
    mondays_in_window = pd.date_range(start=date, end=window_end, freq='W-MON')
    return mondays_in_window[0]

In [221]:
def get_all_filepaths_and_dates(models_to_exclude, online=False):
    """List all forecast CSV files together with their model and dates.

    Parameters
    ----------
    models_to_exclude : list of str
        Model names (second path component) whose files are dropped.
    online : bool, default False
        If True, the file list is taken from the GitHub tree API of
        reichlab/covid19-forecast-hub; otherwise from a local clone at
        ``../covid19-forecast-hub/``.

    Returns
    -------
    pandas.DataFrame
        One row per file with the columns ``filename`` (path relative to the
        repository root), ``model``, ``forecast_date`` (parsed from the first
        10 characters of the file name) and ``timezero`` (first Monday on or
        after ``forecast_date``), sorted by ``filename``.

    Raises
    ------
    requests.HTTPError
        If the GitHub API answers with an error status (e.g. rate limiting).
    """
    if online:
        url = "https://api.github.com/repos/reichlab/covid19-forecast-hub/git/trees/master?recursive=1"
        # Fail fast and loudly: without these checks an error payload would only
        # surface later as an unrelated KeyError on "tree".
        r = requests.get(url, timeout=60)
        r.raise_for_status()
        res = r.json()

        # GitHub caps the size of recursive tree listings and flags this in the response.
        if res.get("truncated"):
            warnings.warn(
                "GitHub returned a truncated file tree; the list of forecast files "
                "is incomplete. Use a local clone (online=False) for a full listing."
            )

        files = [file["path"] for file in res["tree"]
                 if file["path"].startswith('data-processed/') and file["path"].endswith('.csv')]
    else:
        files = glob.glob('../covid19-forecast-hub/data-processed/**/*.csv', recursive=True)
        # Make local paths look like the repository-relative paths of the online listing.
        files = [f.replace('../covid19-forecast-hub/', '').replace('\\', '/') for f in files]

    df_files = pd.DataFrame({'filename': files})

    # Paths have the form data-processed/<model>/<YYYY-MM-DD>-<model>.csv
    path_parts = df_files.filename.str.split('/')
    df_files['model'] = path_parts.str[1]
    df_files['forecast_date'] = pd.to_datetime(path_parts.str[2].str[:10])
    df_files['timezero'] = df_files.forecast_date.apply(next_monday)

    # Filter and sort without in-place mutation of a slice.
    return (df_files[~df_files.model.isin(models_to_exclude)]
            .sort_values('filename', ignore_index=True))

In [222]:
# df_files = get_all_filepaths_and_dates(MODELS_TO_EXCLUDE, online=False)

## Test Data

In [109]:
def validate_submissions(df, locations_to_exclude, train_set = True, window_size = 4):
    """Keep only complete quantile forecasts for the US and the 50 states.

    Parameters
    ----------
    df : pandas.DataFrame
        Forecast rows with the columns ``location``, ``type``, ``value``,
        ``target``, ``model``, ``target_end_date`` and ``quantile``.
        The frame passed in is not modified.
    locations_to_exclude : list of str
        Two-character location codes that are removed.
    train_set : bool, default True
        If True, additionally require that every target/model pair provides
        ``window_size`` distinct target end dates for each of its locations.
    window_size : int, default 4
        Number of forecasts expected per target/model/location in the
        training set.

    Returns
    -------
    pandas.DataFrame
        The rows of all target/model pairs that pass every check.
    """
    n_quantiles_full = 23      # required number of quantile levels in general
    n_quantiles_inc_case = 7   # alternative that is accepted for 'inc case' targets
    n_locations = 51           # US + 50 states

    # Only two-character location codes (US + states) that are not excluded,
    # and only quantile rows with an actual value.
    keep = (
        (df['location'].str.len() == 2)
        & ~df['location'].isin(locations_to_exclude)
        & (df['type'] == 'quantile')
        & df['value'].notna()
    )
    # reset_index yields an independent frame, so the helper columns added below
    # never touch (or warn about) the caller's data.
    df = df[keep].reset_index(drop=True)

    if train_set:
        # Number of forecasts per target/model/location; the minimum over the
        # locations must reach window_size for the target/model pair to be kept.
        df = df.assign(
            no_forecasts=df.groupby(['target', 'model', 'location'])['target_end_date'].transform('nunique')
        )
        complete = df.groupby(['target', 'model'])['no_forecasts'].transform('min') == window_size
        df = df[complete].drop(columns='no_forecasts').reset_index(drop=True)

    # Number of quantile levels of each single forecast, then the minimum per target/model.
    df = df.assign(
        no_quantiles=df.groupby(['model', 'target', 'target_end_date', 'location'])['quantile'].transform('nunique')
    )
    min_quantiles = df.groupby(['target', 'model'])['no_quantiles'].transform('min')
    is_inc_case = df['target'].str.contains('inc case', na=False)

    df = df[(min_quantiles == n_quantiles_full) |
            (is_inc_case & (min_quantiles == n_quantiles_inc_case))].drop(columns='no_quantiles').reset_index(drop=True)

    # ensure that for all targets each model provides forecasts for all locations
    df = df[df.groupby(['target', 'model'])['location'].transform('nunique') == n_locations]

    return df

In [244]:
def load_test_data(files, test_date, valid_targets, locations_to_exclude, online=False):
    """Load and validate all forecasts whose timezero equals ``test_date``.

    Parameters
    ----------
    files : pandas.DataFrame
        File listing with the columns ``filename``, ``model`` and ``timezero``.
    test_date : str or datetime-like
        Date that is matched against ``files.timezero``.
    valid_targets : list of str
        Targets to keep from each forecast file.
    locations_to_exclude : list of str
        Location codes that are removed during validation.
    online : bool, default False
        If True, the files are read from GitHub, otherwise from the local
        clone at ``../covid19-forecast-hub/``.

    Returns
    -------
    pandas.DataFrame
        The validated forecasts with an additional ``model`` column.

    Raises
    ------
    ValueError
        If no file in ``files`` has the requested timezero.
    """
    test_date = pd.to_datetime(test_date)
    df_test_files = files[files.timezero == test_date]

    if df_test_files.empty:
        # pd.concat would raise a far less helpful ValueError on an empty list.
        raise ValueError(f"No forecast files found for test date {test_date.date()}")

    base_path = ('https://github.com/reichlab/covid19-forecast-hub/raw/master/' if online
                 else '../covid19-forecast-hub/')

    dfs = []
    for filename, model in tqdm(zip(df_test_files['filename'], df_test_files['model']),
                                total=df_test_files.shape[0], desc = 'Load test data'):
        df_temp = pd.read_csv(base_path + filename,
                              dtype = {'target': str, 'location': str, 'type': str, 'quantile': float, 'value': float},
                              parse_dates = ['forecast_date', 'target_end_date'])
        # Filter by the targets passed in (not the module-level constant) and
        # attach the model name without writing into a slice.
        dfs.append(df_temp[df_temp.target.isin(valid_targets)].assign(model=model))
    df_test = pd.concat(dfs)

    df_test = validate_submissions(df_test, locations_to_exclude, train_set = False)

    return df_test

In [245]:
# df_test = load_test_data(df_files, '2021-06-14', VALID_TARGETS, LOCATIONS_TO_EXCLUDE)

Load test data:   0%|          | 0/51 [00:00<?, ?it/s]

## Training Data

For a test date `d` and a forecast horizon of `h` weeks, the training window starts at `d - 4 weeks - (h - 1) weeks` and ends at `d - h weeks`.

In [111]:
def get_forecast_dates_by_horizon(test_date, window_size = 4):
    """Map each horizon (1-4 weeks) to its training window for ``test_date``.

    The value for horizon ``h`` is the two-element list ``[start, end]`` with
    ``start = test_date - window_size weeks - (h - 1) weeks`` and
    ``end = test_date - h weeks``.
    """
    return {
        h: [
            test_date - pd.Timedelta(weeks = window_size + h - 1),
            test_date - pd.Timedelta(weeks = h),
        ]
        for h in range(1, 5)
    }

In [112]:
def get_relevant_horizons(forecast_date, forecast_dates_by_horizon):
    """Return the horizons (1-4) whose training window contains ``forecast_date``.

    ``forecast_dates_by_horizon`` maps each horizon to a ``[start, end]`` pair;
    both ends of the window are inclusive.
    """
    return [
        h for h in range(1, 5)
        if forecast_date >= forecast_dates_by_horizon[h][0]
        and forecast_date <= forecast_dates_by_horizon[h][1]
    ]

In [246]:
def load_train_data(files, test_date, valid_targets, locations_to_exclude, window_size = 4, online = False):
    """Load and validate the training forecasts that precede ``test_date``.

    For every horizon only the forecasts from that horizon's training window
    (see ``get_forecast_dates_by_horizon``) are kept.

    Parameters
    ----------
    files : pandas.DataFrame
        File listing with the columns ``filename``, ``model`` and ``timezero``.
    test_date : str or datetime-like
        Test date the training windows are computed for.
    valid_targets : list of str
        Targets that may be loaded; targets outside this list are skipped.
    locations_to_exclude : list of str
        Location codes that are removed during validation.
    window_size : int, default 4
        Number of weekly forecast dates per horizon. Note that
        ``validate_submissions``, as called here, only keeps target/model pairs
        with exactly 4 forecast dates, so other window sizes yield no training
        rows unless that check is adjusted as well.
    online : bool, default False
        If True, the files are read from GitHub, otherwise from the local
        clone at ``../covid19-forecast-hub/``.

    Returns
    -------
    pandas.DataFrame
        The validated training forecasts with an additional ``model`` column.

    Raises
    ------
    ValueError
        If no file in ``files`` falls into any training window.
    """
    test_date = pd.to_datetime(test_date)
    forecast_dates_by_horizon = get_forecast_dates_by_horizon(test_date, window_size)

    # The earliest date needed is the start of the longest horizon's window.
    lower_bound = min(start for start, _ in forecast_dates_by_horizon.values())
    # Filter with the `files` argument only; never rely on a notebook-level variable.
    df_train_files = files[(files.timezero >= lower_bound) & (files.timezero < test_date)].copy()

    df_train_files['horizons'] = df_train_files.timezero.apply(get_relevant_horizons,
                                                               forecast_dates_by_horizon=forecast_dates_by_horizon)
    # Files that are relevant for no horizon would only produce empty frames; skip reading them.
    df_train_files = df_train_files[df_train_files['horizons'].map(len) > 0]

    if df_train_files.empty:
        # pd.concat would raise a far less helpful ValueError on an empty list.
        raise ValueError(f"No training forecast files found for test date {test_date.date()}")

    base_path = ('https://github.com/reichlab/covid19-forecast-hub/raw/master/' if online
                 else '../covid19-forecast-hub/')

    dfs = []
    for filename, model, horizons in tqdm(zip(df_train_files['filename'],
                                              df_train_files['model'],
                                              df_train_files['horizons']),
                                          total=df_train_files.shape[0], desc = 'Load train data'):
        candidate_targets = [f"{h} wk ahead inc death" for h in horizons] + \
                            [f"{h} wk ahead cum death" for h in horizons] + \
                            [f"{h} wk ahead inc case" for h in horizons]
        relevant_targets = [target for target in candidate_targets if target in valid_targets]
        df_temp = pd.read_csv(base_path + filename,
                              dtype = {'target': str, 'location': str, 'type': str, 'quantile': float, 'value': float},
                              parse_dates = ['forecast_date', 'target_end_date'])
        # assign() returns a new frame, so nothing is written into a filtered slice.
        dfs.append(df_temp[df_temp.target.isin(relevant_targets)].assign(model=model))
    df_train = pd.concat(dfs)

    df_train = validate_submissions(df_train, locations_to_exclude, train_set = True)

    return df_train

In [247]:
# df_train = load_train_data(df_files, '2021-06-14', VALID_TARGETS, LOCATIONS_TO_EXCLUDE, 4)

Load train data:   0%|          | 0/404 [00:00<?, ?it/s]

In [239]:
def load_train_test(test_date, valid_targets, locations_to_exclude, models_to_exclude, window_size = 4, online = False):
    """Build matching training and test sets for one test date.

    Parameters
    ----------
    test_date : str or datetime-like
        Date of the test forecasts.
    valid_targets : list of str
        Targets to load.
    locations_to_exclude : list of str
        Location codes that are removed during validation.
    models_to_exclude : list of str
        Models whose files are ignored.
    window_size : int, default 4
        Number of weekly forecast dates per horizon in the training set.
    online : bool, default False
        If True, both the file listing and the files are fetched from GitHub;
        otherwise the local clone is used for both.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(df_train, df_test)``, each restricted to the target/model pairs that
        are present in both sets.
    """
    # Use the same source for the listing as for the files themselves.
    files = get_all_filepaths_and_dates(models_to_exclude, online=online)

    df_test = load_test_data(files, test_date, valid_targets, locations_to_exclude, online)
    df_train = load_train_data(files, test_date, valid_targets, locations_to_exclude, window_size, online)

    # (target, model) key of every row; both are built before any filtering so
    # that each set is compared against the unfiltered other set.
    key_columns = ['target', 'model']
    train_keys = pd.MultiIndex.from_frame(df_train[key_columns])
    test_keys = pd.MultiIndex.from_frame(df_test[key_columns])

    # ensure models are available in both train and test set; a target that is
    # missing from one set simply matches nothing instead of raising a KeyError
    df_train = df_train[train_keys.isin(test_keys.unique())]
    df_test = df_test[test_keys.isin(train_keys.unique())]

    return df_train, df_test

In [240]:
# df_train, df_test = load_train_test('2021-06-14', VALID_TARGETS, LOCATIONS_TO_EXCLUDE, MODELS_TO_EXCLUDE, 4)

Load test data:


  0%|          | 0/51 [00:00<?, ?it/s]

Load train data:


  0%|          | 0/404 [00:00<?, ?it/s]

In [242]:
def compute_train_test_sets(test_dates, valid_targets, locations_to_exclude, models_to_exclude, window_size = 4, online = False):
    """Build the train/test sets for several test dates and save them as CSV.

    For every entry of ``test_dates`` the files ``data/<test_date>_df_train.csv``
    and ``data/<test_date>_df_test.csv`` are written (without the index). The
    remaining arguments are passed on to ``load_train_test``.
    """
    # Create the output folder up front so that a missing directory does not
    # abort the run only after the expensive loading step.
    os.makedirs('data', exist_ok=True)

    for test_date in test_dates:
        print(test_date)
        df_train, df_test = load_train_test(test_date, valid_targets, locations_to_exclude, models_to_exclude,
                                            window_size, online)
        df_train.to_csv(f'data/{test_date}_df_train.csv', index=False)
        df_test.to_csv(f'data/{test_date}_df_test.csv', index=False)

In [243]:
# Build and save the train/test CSVs for two test dates with window_size = 4;
# `online` keeps its default (False), so the local clone is used.
compute_train_test_sets(['2021-06-07', '2021-06-14'], VALID_TARGETS, LOCATIONS_TO_EXCLUDE, MODELS_TO_EXCLUDE, 4)

2021-06-07
Load test data:


  0%|          | 0/54 [00:00<?, ?it/s]

Load train data:


  0%|          | 0/412 [00:00<?, ?it/s]

2021-06-14
Load test data:


  0%|          | 0/51 [00:00<?, ?it/s]

Load train data:


  0%|          | 0/404 [00:00<?, ?it/s]