# Pull data

In [1]:
import requests
import numpy as np
import pandas as pd
from epiweeks import Week
from datetime import date, timedelta
import os

In [2]:
def pull_scenario_modeling_hub_predictions(model,dates):
    """Load the predictions a model submitted to the COVID-19 Scenario Modeling Hub.

    For every candidate date the file ``{date}-{model}`` is looked up in the
    hub's ``data-processed/{model}/`` folder, trying each known file extension
    in turn and stopping at the first one that can be read. When files exist
    for several candidate dates, the one listed last in ``dates`` is returned.

    Parameters
    ----------
    model : str
        Model name as used in the hub's ``data-processed`` folder.
    dates : list or string
        List of potential dates in the iso format, e.g., 'yyyy-mm-dd', for the submission.

    Returns
    -------
    pandas.DataFrame or None
        The predictions, with ``location`` read as string and
        ``target_end_date`` parsed as dates; ``None`` (after printing a
        message) when no file could be read for any candidate date.
    """
    base_url = "https://raw.githubusercontent.com/midas-network/covid19-scenario-modeling-hub/master/data-processed"
    extensions = (".csv",".gz",".zip",".csv.zip",".csv.gz")
    if isinstance(dates,str):
        dates = [dates]
    predictions = None
    # `submission_date` (not `date`) so the imported datetime.date is not shadowed.
    for submission_date in dates:
        url = f"{base_url}/{model}/{submission_date}-{model}"
        for ext in extensions:
            try:
                predictions = pd.read_csv(url+ext,dtype={'location':str},parse_dates=['target_end_date'])
            except Exception:
                # Best effort: a missing or unreadable file just means this
                # date/extension combination is not the right one. Catching
                # Exception (not a bare except) keeps Ctrl-C working.
                continue
            # One readable file per date is enough; skip the other extensions.
            break
    if predictions is None:
        print(f"Data for model {model} and date {dates} unavailable")
    return predictions


def pull_surveillance_data(target='death',incidence=True):
    """Download a truth (surveillance) table from the COVID-19 Forecast Hub.

    Parameters
    ----------
    target : str
        One of 'death', 'case' or 'hospitalization'; any other value raises KeyError.
    incidence : bool
        Truthy selects the 'Incident' file, falsy the 'Cumulative' one.

    Returns
    -------
    pandas.DataFrame
        The CSV content, with the ``location`` column read as string.
    """
    kind = 'Incident' if incidence else 'Cumulative'
    label_by_target = {'death':'Deaths', 'case':'Cases', 'hospitalization': 'Hospitalizations'}
    url = (
        "https://media.githubusercontent.com/media/reichlab/covid19-forecast-hub/master/data-truth/"
        f"truth-{kind}%20{label_by_target[target]}.csv"
    )
    return pd.read_csv(url, dtype={'location':str})

## Surveillance data

In [3]:
# Surveillance signal to download: 'hospitalization', 'death' or 'case'.
target = 'hospitalization'
# True -> incident counts, False -> cumulative counts.
incidence = True

In [4]:
# Download the truth table selected by `target` / `incidence`.
observations = pull_surveillance_data(target=target, incidence=incidence)

In [5]:
# Show the downloaded surveillance table.
observations

Unnamed: 0,date,location,location_name,value
0,2021-04-30,19,Iowa,26
1,2021-04-29,02,Alaska,8
2,2021-04-29,33,New Hampshire,8
3,2021-04-28,16,Idaho,13
4,2021-04-28,49,Utah,19
...,...,...,...,...
67411,2023-11-20,US,United States,2934
67412,2023-11-21,US,United States,3022
67413,2023-11-22,US,United States,2927
67414,2023-11-23,US,United States,2682


### Save data

In [6]:
# The dat folder is not tracked by git, so it may not exist on a fresh clone.
os.makedirs('./dat', exist_ok=True)
# File name encodes incident ('inc') vs cumulative ('cum') and the target.
observations.to_parquet(f"./dat/truth_{'inc' if incidence else 'cum'}_{target}.pq", index=False)

## Scenario projection data

### Pull multiple models and save data

In [7]:
# Round 4 settings (rd / dates / models are read by the download loop cell).
rd = 4  # smh round
# Potential submission dates.
dates = [
    '2021-03-27',
    '2021-03-29',
    '2021-03-28',
]
models = [
    "Ensemble",
    "Ensemble_LOP",
    "IHME-IHME_COVID_model_deaths_unscaled",
    "JHUAPL-Bucky",
    "JHU_IDD-CovidSP",
    "Karlen-pypm",
    "MOBS_NEU-GLEAM_COVID",
    "USC-SIkJalpha",
    "UVA-adaptive",
]

In [5]:
# Round 5 settings (rd / dates / models are read by the download loop cell).
rd = 5  # smh round
# Potential submission dates.
dates = [
    '2021-05-01',
    '2021-05-02',
    '2021-05-04',
]
models = [
    "Ensemble",
    "Ensemble_LOP",
    "IHME-IHME_COVID_model_deaths_unscaled",
    "JHUAPL-Bucky",
    "JHU_IDD-CovidSP",
    "Karlen-pypm",
    "MOBS_NEU-GLEAM_COVID",
    "UNCC-hierbin",
    "USC-SIkJalpha",
    "UVA-adaptive",
]

In [8]:
# Round 6 settings (rd / dates / models are read by the download loop cell).
rd = 6  # smh round
# Potential submission dates.
dates = [
    '2021-05-28',
    '2021-05-29',
    '2021-05-30',
    '2021-06-08',
    '2021-06-05',
]
models = [
    "Ensemble",
    "Ensemble_LOP",
    "JHUAPL-Bucky",
    "JHU_IDD-CovidSP",
    "Karlen-pypm",
    "MOBS_NEU-GLEAM_COVID",
    "UNCC-hierbin",
    "NCSU-COVSIM",
    "USC-SIkJalpha",
    "UVA-adaptive",
    "UTA-ImmunoSEIRS",
    "UVA-EpiHiper",
]

In [15]:
# Round 7 settings (rd / dates / models are read by the download loop cell).
rd = 7  # smh round
# Potential submission dates.
dates = [
    '2021-07-03',
    '2021-07-13',
    '2021-07-04',
]
models = [
    "Ensemble",
    "Ensemble_LOP",
    "JHUAPL-Bucky",
    "JHU_IDD-CovidSP",
    "Karlen-pypm",
    "MOBS_NEU-GLEAM_COVID",
    "UNCC-hierbin",
    "USC-SIkJalpha",
    "UVA-adaptive",
    "UVA-EpiHiper",
]

In [21]:

# Round 9 settings (rd / dates / models are read by the download loop cell).
rd = 9  # smh round
# Potential submission dates.
dates = [
    '2021-09-11',
    '2021-09-13',
    '2021-09-14',
    '2021-09-12',
]
models = [
    "Ensemble",
    "Ensemble_LOP",
    "Ensemble_LOP_untrimmed",
    "JHUAPL-Bucky",
    "JHU_IDD-CovidSP",
    "MOBS_NEU-GLEAM_COVID",
    "UNCC-hierbin",
    "NotreDame-FRED",
    "USC-SIkJalpha",
    "UVA-adaptive",
    "UVA-EpiHiper",
]

In [24]:

# Round 10 settings (rd / dates / models are read by the download loop cell).
rd = 10  # smh round
# Potential submission dates.
dates = [
    '2021-11-13',
    '2021-11-20',
    '2021-11-14',
    '2021-11-09',
    '2021-11-21',
]
models = [
    "Ensemble",
    "Ensemble_LOP",
    "Ensemble_LOP_untrimmed",
    "MOBS_NEU-GLEAM_COVID",
    "UNCC-hierbin",
    "NotreDame-FRED",
    "USC-SIkJalpha",
    "UVA-adaptive",
    "UVA-EpiHiper",
]

In [27]:
# Round 11 settings (rd / dates / models are read by the download loop cell).
rd = 11  # smh round
# Potential submission dates.
# NOTE(review): '2021-11-21' also appears in the round 10 date list and is
# listed last here, so a file with that date would be the one kept — confirm
# it is intended for round 11.
dates = [
    '2021-12-18',
    '2021-12-21',
    '2021-12-17',
    '2021-12-19',
    '2021-11-21',
]
models = [
    "Ensemble",
    "Ensemble_LOP",
    "Ensemble_LOP_untrimmed",
    "JHU_IDD-CovidSP",
    "MOBS_NEU-GLEAM_COVID",
    "UNCC-hierbin",
    "NotreDame-FRED",
    "USC-SIkJalpha",
    "UVA-adaptive",
    "UVA-EpiHiper",
]

In [31]:
# Round 10 again, restricted to a single model.
rd = 10
# Potential submission dates.
dates = [
    '2021-11-13',
    '2021-11-20',
    '2021-11-14',
    '2021-11-09',
    '2021-11-21',
]
models = [
    "NotreDame-FRED",
]

In [8]:
# Round 12 settings (rd / dates / models are read by the download loop cell).
rd = 12
# Potential submission dates.
dates = ['2022-01-09']
models = [
    "Ensemble",
    "Ensemble_LOP",
    "Ensemble_LOP_untrimmed",
    "JHU_IDD-CovidSP",
    "MOBS_NEU-GLEAM_COVID",
    "NCSU-COVSIM",
    "NotreDame-FRED",
    "UNCC-hierbin",
    "USC-SIkJalpha",
    "UTA-ImmunoSEIRS",
    "UVA-EpiHiper",
    "UVA-adaptive",
]

In [38]:
# Round 13 settings (rd / dates / models are read by the download loop cell).
rd = 13
# Potential submission dates.
dates = ['2022-03-13']
models = [
    "Ensemble",
    "Ensemble_LOP",
    "Ensemble_LOP_untrimmed",
    "JHU_IDD-CovidSP",
    "MOBS_NEU-GLEAM_COVID",
    "NCSU-COVSIM",
    "UNCC-hierbin",
    "USC-SIkJalpha",
    "UTA-ImmunoSEIRS",
    "UVA-EpiHiper",
    "UVA-adaptive",
]

In [42]:
# Round 14 settings (rd / dates / models are read by the download loop cell).
rd = 14
# Potential submission dates.
dates = [
    '2022-06-05',
    '2022-06-04',
]
models = [
    "Ensemble",
    "Ensemble_LOP",
    "Ensemble_LOP_untrimmed",
    "JHU_IDD-CovidSP",
    "MOBS_NEU-GLEAM_COVID",
    "MOBS_NEU-GLEAM_COVID_OT",
    "NCSU-COVSIM",
    "UNCC-hierbin",
    "USC-SIkJalpha",
    "USC-SIkJalpha-update",
    "UTA-ImmunoSEIRS",
    "UVA-adaptive",
]

In [45]:
# Round 15 settings (rd / dates / models are read by the download loop cell).
rd = 15
# Potential submission dates.
dates = ['2022-07-31']
models = [
    "Ensemble",
    "Ensemble_LOP",
    "Ensemble_LOP_untrimmed",
    "JHU_IDD-CovidSP",
    "MOBS_NEU-GLEAM_COVID",
    "MOBS_NEU-GLEAM_COVID_OT",
    "NCSU-COVSIM",
    "UNCC-hierbin",
    "USC-SIkJalpha",
    "USC-SIkJalpha-update",
    "UTA-ImmunoSEIRS",
    "UVA-adaptive",
]

In [50]:
# Round 16 settings (rd / dates / models are read by the download loop cell).
rd = 16
# Potential submission dates.
dates = ['2022-10-30']
models = [
    "Ensemble",
    "Ensemble_LOP",
    "Ensemble_LOP_untrimmed",
    "JHU_IDD-CovidSP",
    "MOBS_NEU-GLEAM_COVID",
    "NCSU-COVSIM",
    "UNCC-hierbin",
    "USC-SIkJalpha",
    "UTA-ImmunoSEIRS",
    "UVA-adaptive",
]

In [9]:
# Download every model of the currently configured round (`models`, `dates`,
# `rd`) and store each available one as ./dat/{model}_rd{rd}.pq.
# The dat folder is not tracked by git, so it may not exist on a fresh clone.
os.makedirs('./dat', exist_ok=True)
for model in models:
    print(model)
    predictions = pull_scenario_modeling_hub_predictions(model,dates)
    # None means no file was found for any candidate date; nothing to save.
    if predictions is not None:
        predictions.to_parquet(f'./dat/{model}_rd{rd}.pq', index=False)
        # predictions.to_csv(f'./dat/{model}_rd{rd}.csv', index=False)

Ensemble
Ensemble_LOP
Ensemble_LOP_untrimmed
JHU_IDD-CovidSP
MOBS_NEU-GLEAM_COVID
NCSU-COVSIM
NotreDame-FRED
UNCC-hierbin
Data for model UNCC-hierbin and date ['2022-01-09'] unavailable
USC-SIkJalpha
UTA-ImmunoSEIRS
UVA-EpiHiper
UVA-adaptive


# Pull COVID-19 Forecast Hub data

In [10]:
def _url_checker(url, timeout=30):
    """Raise if ``url`` does not answer a GET request with HTTP status 200.

    Parameters
    ----------
    url : str
        Address to probe.
    timeout : float
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` may wait indefinitely.

    Raises
    ------
    requests.exceptions.RequestException
        When the status code is not 200, or when the request itself fails
        (connection errors and timeouts are subclasses of this exception).
    """
    # stream=True defers the body download: only the status line is needed.
    # The context manager releases the connection afterwards.
    with requests.get(url, stream=True, timeout=timeout) as response:
        status_code = response.status_code
    if status_code != 200:
        raise requests.exceptions.RequestException(f"{url}: is Not reachable")

def pull_covid_forecast_hub_predictions(model,start_week,end_week):
    """Load the predictions of a model saved by the COVID-19 Forecast Hub.

    One file per epiweek in ``[start_week, end_week]`` is looked up in the
    hub's ``data-processed/{model}/`` folder. The expected file date is the day
    after the epiweek's start date; if that file is not reachable, the
    epiweek's start date itself is tried (some groups push one day earlier).

    Parameters
    ----------
    model : str
        Model name as used in the hub's ``data-processed`` folder.
    start_week : Week object
        First epiweek of the range.
    end_week : Week object
        Last epiweek of the range.

    Returns
    -------
    pandas.DataFrame or None
        Concatenation of all reachable files, with ``location`` read as string
        and ``target_end_date`` / ``forecast_date`` parsed as dates; ``None``
        (after printing a message) when no file is reachable or the range is
        empty.
    """
    def get_url(day):
        return f"https://raw.githubusercontent.com/reichlab/covid19-forecast-hub/master/data-processed/{model}/{day.isoformat()}-{model}.csv"

    # `<=` rather than `!=`: a start_week after end_week yields an empty range
    # instead of a loop that never terminates.
    week_list = []
    week = start_week
    while week <= end_week:
        week_list.append(week)
        week = week + 1

    #check which files are accessible
    url_list = []
    for week in week_list:
        pull_date = week.startdate()+timedelta(days = 1)
        #some group push date is on sundays, hence the one-day-earlier fallback
        for candidate in (pull_date, pull_date+timedelta(days = -1)):
            url = get_url(candidate)
            try:
                _url_checker(url)
            except requests.exceptions.RequestException:
                continue
            url_list.append(url)
            break
        else:
            print(f"Data for date {pull_date.isoformat()} is unavailable")

    # pd.concat raises on an empty list; report the absence of data with None,
    # which is what the calling cells test for.
    if not url_list:
        print(f"Data for model {model} between {start_week} and {end_week} unavailable")
        return None
    df_predictions = pd.concat([pd.read_csv(url,dtype={'location':str},
                                            parse_dates=['target_end_date','forecast_date']) for url in url_list])
    return df_predictions

def pull_surveillance_data(target='death',incidence=True):
    """Fetch a truth (surveillance) CSV from the COVID-19 Forecast Hub.

    NOTE: this re-defines the function of the same name that appears earlier
    in the notebook.

    Parameters
    ----------
    target : str
        'death', 'case' or 'hospitalization'; anything else raises KeyError.
    incidence : bool
        Truthy for the 'Incident' file, falsy for the 'Cumulative' file.

    Returns
    -------
    pandas.DataFrame
        The table, with the ``location`` column read as string.
    """
    prefix = {True: 'Incident', False: 'Cumulative'}[bool(incidence)]
    names = {'death':'Deaths', 'case':'Cases', 'hospitalization': 'Hospitalizations'}
    truth_url = f"https://media.githubusercontent.com/media/reichlab/covid19-forecast-hub/master/data-truth/truth-{prefix}%20{names[target]}.csv"
    return pd.read_csv(truth_url, dtype={'location':str})

### Pull a single model

In [None]:
# Forecast Hub model to pull.
model = "COVIDhub-baseline"

# Period (in epiweeks) that the selected predictions should cover.
start_week = Week(2020,47)
end_week = Week(2021,16)

# Prediction target: 'death' (alternative: 'hosp'), horizon in weeks, and
# incident ('inc') vs cumulative ('cum') counts.
target = 'death'
prediction_delay = 4
incidence = True
target_prediction = f"{prediction_delay} wk ahead {'inc' if incidence else 'cum'} {target}"

# Pull the predictions. The week range is shifted back by prediction_delay-1
# weeks so that the predictions are covered by start week and end week.
predictions = pull_covid_forecast_hub_predictions(
    model,
    start_week-prediction_delay+1,
    end_week-prediction_delay+1,
)
# Keep only the rows of the desired prediction target.
predictions = predictions[predictions['target'] == target_prediction]

### Pull multiple models and save data

In [None]:
# Pull several Forecast Hub models over one epiweek range and store each as
# ./dat/{model}_4wks_rd{rd}.pq.
# NOTE(review): the file name says "4wks" but no target/horizon filter is
# applied in this cell — confirm the saved content is what is intended.
models = ["COVIDhub-baseline", "COVIDhub-ensemble"]
start_week = Week(2023,16)
end_week =  Week(2023,31)
rd=17
# The dat folder is not tracked by git, so it may not exist on a fresh clone.
os.makedirs('./dat', exist_ok=True)
for model in models:
    print(model)
    predictions = pull_covid_forecast_hub_predictions(model,start_week,end_week) #by default, should cover 1 week ahead predictions
    if predictions is not None:
        predictions.to_parquet(f'./dat/{model}_4wks_rd{rd}.pq', index=False)
        # predictions.to_csv(f'./dat/{model}_rd{rd}.csv', index=False)

In [None]:
# Pull the 4-week-ahead Forecast Hub predictions of one model for the period
# attached to each SMH round, and store each as ./dat/{model}_4wks_rd{rd}.pq.
model = "COVIDhub-baseline"
#model = "COVIDhub-ensemble"

# startdates[k] / enddates[k] give the epiweek range used for round rds[k].
startdates = [ Week(2021,18), Week(2021,22), Week(2021,27), Week(2021,37), Week(2021,51),
             Week(2022,2), Week(2022,11), Week(2022,23), Week(2022,31), Week(2022,44)]


enddates = [ Week(2021,43), Week(2021,47), Week(2021,52), Week(2022,9),  Week(2022, 10),
           Week(2022, 13), Week(2023, 11), Week(2023, 23), Week(2023, 18), Week(2023, 17)]

rds = [ 5, 6, 7, 9,  11, 12, 13, 14 ,15, 16]

#rds = [5, 6]

# Prediction target: 'death' (alternative: 'hosp'), horizon in weeks, and
# incident ('inc') vs cumulative ('cum') counts.
target = 'death'
# target = 'hosp'
prediction_delay = 4
incidence = True
target_prediction = f"{prediction_delay} wk ahead {'inc' if incidence else 'cum'} {target}"

# The dat folder is not tracked by git, so it may not exist on a fresh clone.
os.makedirs('./dat', exist_ok=True)
for rd, start_week, end_week in zip(rds, startdates, enddates):
    print(rd)
    # The week range is shifted back by prediction_delay-1 weeks so that the
    # selected predictions fall between start_week and end_week.
    predictions = pull_covid_forecast_hub_predictions(model,start_week-prediction_delay+1,end_week-prediction_delay+1)
    # Test for missing data BEFORE indexing: the check used to come after the
    # filter, where it could never be false.
    if predictions is None:
        continue
    predictions = predictions[(predictions['target'] == target_prediction)] #select desired prediction
    predictions.to_parquet(f'./dat/{model}_4wks_rd{rd}.pq', index=False)
        
        

**NOTE**: anything saved in the `dat` folder is not tracked by git, so that no data ends up in the GitHub repository. The repository tracks code only.