# Pull data

In [1]:
import requests
import numpy as np
import pandas as pd
from epiweeks import Week
from datetime import date, timedelta
import os

In [2]:
def _url_checker(url, timeout=30):
    """Check that ``url`` answers with HTTP status 200.

    Parameters
    ----------
    url : str
        Address to probe.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        stalled connection would block the notebook indefinitely.

    Raises
    ------
    requests.exceptions.RequestException
        If the status code is not 200. Connection errors and timeouts raised
        by ``requests`` are subclasses of the same exception and propagate
        unchanged.
    """
    # stream=True defers the body download: only the status line and headers
    # are needed here, and the forecast files themselves can be large.
    with requests.get(url, stream=True, timeout=timeout) as response:
        status_code = response.status_code
    if status_code != 200:
        raise requests.exceptions.RequestException(f"{url}: is Not reachable")

def pull_covid_forecast_hub_predictions(model,start_week,end_week):
    """Load the forecasts of a model stored by the covid19 forecast hub.

    One forecast file is looked up for every epiweek between ``start_week``
    and ``end_week`` (both included). The file dated one day after the
    epiweek start date is tried first; if it is not reachable, the file dated
    on the epiweek start date itself is tried, because some groups push on
    Sundays. Weeks for which neither file is reachable are reported with
    ``print`` and skipped.

    Parameters
    ----------
    model : str
        Model name as used in the ``data-processed`` folder of the
        covid19-forecast-hub repository.
    start_week : Week object
        First epiweek of the range.
    end_week : Week object
        Last epiweek of the range.

    Returns
    -------
    pandas.DataFrame or None
        Concatenation of all forecast files that were found, with
        ``location`` read as ``str`` and ``target_end_date`` /
        ``forecast_date`` parsed as dates. ``None`` if no file was reachable.

    Raises
    ------
    ValueError
        If ``start_week`` is after ``end_week``.
    """
    # Guard against a reversed range: stepping forward from start_week would
    # otherwise never meet end_week.
    if start_week > end_week:
        raise ValueError(f"start_week ({start_week}) must not be after end_week ({end_week})")

    week_list = [start_week]
    while week_list[-1] < end_week:
        week_list.append(week_list[-1] + 1)
    pull_dates = [week.startdate() + timedelta(days=1) for week in week_list]

    def get_url(iso_date):
        return f"https://raw.githubusercontent.com/reichlab/covid19-forecast-hub/master/data-processed/{model}/{iso_date}-{model}.csv"

    # check which files are accessible
    url_list = []
    for pull_date in pull_dates:
        # Nominal date first, then the day before (some groups push on Sundays).
        for candidate in (pull_date, pull_date - timedelta(days=1)):
            url = get_url(candidate.isoformat())
            try:
                _url_checker(url)
            except requests.exceptions.RequestException:
                continue
            url_list.append(url)
            break
        else:
            # Neither candidate was reachable for this week.
            print(f"Data for date {pull_date.isoformat()} is unavailable")

    # pd.concat raises on an empty list; signal "nothing found" with None instead.
    if not url_list:
        return None
    df_predictions = pd.concat([pd.read_csv(url,dtype={'location':str},
                                            parse_dates=['target_end_date','forecast_date']) for url in url_list])
    return df_predictions

def pull_surveillance_data(target='death',incidence=True):
    """Download one truth table from the covid19 forecast hub ``data-truth`` folder.

    Parameters
    ----------
    target : str
        One of ``'death'``, ``'case'`` or ``'hospitalization'``. Any other
        value raises ``KeyError``.
    incidence : bool
        Truthy selects the "Incident" file, falsy the "Cumulative" file.

    Returns
    -------
    pandas.DataFrame
        Content of the CSV file, with the ``location`` column read as ``str``.
    """
    file_label = {'death':'Deaths', 'case':'Cases', 'hospitalization': 'Hospitalizations'}[target]
    count_kind = 'Incident' if incidence else 'Cumulative'
    url = f"https://media.githubusercontent.com/media/reichlab/covid19-forecast-hub/master/data-truth/truth-{count_kind}%20{file_label}.csv"
    return pd.read_csv(url, dtype={'location':str})

## Surveillance data

In [12]:
# True -> incident counts, False -> cumulative counts.
incidence = True
# Surveillance signal to download: 'death', 'case' or 'hospitalization'.
target = 'death'

In [13]:
# Download the truth table selected by `target` and `incidence`.
observations = pull_surveillance_data(target=target, incidence=incidence)

In [14]:
# Preview the first five rows (the default of `head`).
observations.head()

Unnamed: 0,date,location,location_name,value
0,2020-01-22,1001,Autauga County,0
1,2020-01-23,1001,Autauga County,0
2,2020-01-24,1001,Autauga County,0
3,2020-01-25,1001,Autauga County,0
4,2020-01-26,1001,Autauga County,0


### Save data

In [15]:
# Make sure the output folder exists; writing fails on a fresh checkout otherwise.
os.makedirs("./dat", exist_ok=True)
observations.to_parquet(f"./dat/truth_{'inc' if incidence else 'cum'}_{target}.pq", index=False)

## Forecast data

### Pull a single model

In [16]:
# Forecast hub model to download.
model = "COVIDhub-baseline"

# Epiweek range that the forecast targets should cover.
start_week, end_week = Week(2020, 47), Week(2021, 16)

# Forecast target, spelled "<prediction_delay> wk ahead <inc|cum> <target>".
prediction_delay = 4
incidence = True
target = 'death'
# target = 'hosp'
target_prediction = f"{prediction_delay} wk ahead {'inc' if incidence else 'cum'} {target}"

# Shift the submission weeks back by prediction_delay - 1 so that the
# predictions are covered by start week and end week.
predictions = pull_covid_forecast_hub_predictions(
    model,
    start_week - prediction_delay + 1,
    end_week - prediction_delay + 1,
)
# Keep only the rows of the desired prediction target.
predictions = predictions.loc[predictions['target'].eq(target_prediction)]

In [18]:
# Display the current `predictions` frame (pandas truncates the middle rows).
predictions

Unnamed: 0,forecast_date,target,target_end_date,location,type,quantile,value
138,2020-10-26,4 wk ahead inc death,2020-11-21,01,quantile,0.010,0.000000
139,2020-10-26,4 wk ahead inc death,2020-11-21,01,quantile,0.025,0.000000
140,2020-10-26,4 wk ahead inc death,2020-11-21,01,quantile,0.050,1.544856
141,2020-10-26,4 wk ahead inc death,2020-11-21,01,quantile,0.100,18.262344
142,2020-10-26,4 wk ahead inc death,2020-11-21,01,quantile,0.150,29.855927
...,...,...,...,...,...,...,...
21661,2021-03-29,4 wk ahead inc death,2021-04-24,US,quantile,0.900,11709.526617
21662,2021-03-29,4 wk ahead inc death,2021-04-24,US,quantile,0.950,13202.066792
21663,2021-03-29,4 wk ahead inc death,2021-04-24,US,quantile,0.975,14545.853942
21664,2021-03-29,4 wk ahead inc death,2021-04-24,US,quantile,0.990,16202.241991


### Pull multiple models and save data

In [38]:
models = ["COVIDhub-baseline"]
start_week = Week(2020,47)
end_week = Week(2021,16)
# Make sure the output folder exists; writing fails on a fresh checkout otherwise.
os.makedirs('./dat', exist_ok=True)
for model in models:
    print(model)
    # No shift of the week range: by default this should cover 1 week ahead predictions.
    predictions = pull_covid_forecast_hub_predictions(model,start_week,end_week)
    if predictions is not None:
        predictions.to_parquet(f'./dat/{model}_{start_week.isoformat()}-{end_week.isoformat()}.pq', index=False)

MOBS-GLEAM_COVID
JHU_IDD-CovidSP
UMass-MechBayes


**NOTE**: anything saved in the `dat` folder is not tracked by git, so that no data ends up in the GitHub repository. The repository tracks code only.

In [21]:
model = "COVIDhub-ensemble"

# Parallel lists: entry i of each list describes one pull.
# `rd` is only used for the progress print and the output file name
# (presumably a round id -- TODO confirm).
startdates = [ Week(2021,18), Week(2021,22), Week(2021,27), Week(2021,37), Week(2021,51),
             Week(2022,2), Week(2022,11), Week(2022,23), Week(2022,31), Week(2022,44)]

enddates = [ Week(2021,43), Week(2021,40), Week(2021,45), Week(2022,9),  Week(2022, 10),
           Week(2022, 13), Week(2022, 29), Week(2022, 41), Week(2022, 49), Week(2023, 18)]

rds = [ 5, 6, 7, 9,  11, 12, 13, 14 ,15, 16]

# zip() below would silently drop entries if the lists got out of sync.
assert len(startdates) == len(enddates) == len(rds), "startdates, enddates and rds must have the same length"

target = 'death'
# target = 'hosp'
prediction_delay = 4
incidence = True
target_prediction = f"{prediction_delay} wk ahead {'inc' if incidence else 'cum'} {target}"

# Make sure the output folder exists; writing fails on a fresh checkout otherwise.
os.makedirs('./dat', exist_ok=True)

for start_week, end_week, rd in zip(startdates, enddates, rds):
    print(rd)
    # Shift the submission weeks back by prediction_delay - 1 so that the
    # forecast targets span start_week..end_week.
    predictions = pull_covid_forecast_hub_predictions(model,start_week-prediction_delay+1,end_week-prediction_delay+1)
    # Check for missing data before indexing into the frame.
    if predictions is None:
        continue
    predictions = predictions[(predictions['target'] == target_prediction)] #select desired prediction
    # The horizon in the file name follows prediction_delay instead of a hard-coded 4.
    predictions.to_parquet(f'./dat/{model}_{prediction_delay}wks_rd{rd}.pq', index=False)
        
        

5
6
7
9
11
12
13
14
15
16


In [22]:
# Display the `predictions` frame left by the last iteration of the loop above.
predictions

Unnamed: 0,forecast_date,target,target_end_date,location,type,quantile,value
5640,2022-10-10,4 wk ahead inc death,2022-11-05,01,quantile,0.010,2
5641,2022-10-10,4 wk ahead inc death,2022-11-05,02,quantile,0.010,0
5642,2022-10-10,4 wk ahead inc death,2022-11-05,04,quantile,0.010,5
5643,2022-10-10,4 wk ahead inc death,2022-11-05,05,quantile,0.010,4
5644,2022-10-10,4 wk ahead inc death,2022-11-05,06,quantile,0.010,106
...,...,...,...,...,...,...,...
10927,2023-03-06,4 wk ahead inc death,2023-04-01,US,quantile,0.900,3624
10931,2023-03-06,4 wk ahead inc death,2023-04-01,US,quantile,0.950,4047
10935,2023-03-06,4 wk ahead inc death,2023-04-01,US,quantile,0.975,4459
10939,2023-03-06,4 wk ahead inc death,2023-04-01,US,quantile,0.990,4804
