# Ensemble Challenge (18-month Evaluation)

## Timepoint: July 19, 2021. 

## Setting: New York State upon the arrival of the Delta variant, vaccines available

In [None]:
# TODO: 
# - collect and process data DONE
# - collect 3 candidate models DONE
# - set parameter values/uncertainty DONE
# - create observables for cumulative cases, hospitalizations, and deaths DONE
# - calibrate models independently DONE
# - calibrate ensemble of multiple models DONE
# - improve calibration with more data DONE
# - plot and post-process results DONE!

### Load dependencies

In [1]:
import pandas as pd

import pyciemss
import pyciemss.visuals.plots as plots
import pyciemss.visuals.vega as vega
import pyciemss.visuals.trajectories as trajectories

# Process data

In [2]:
location = "New York"
# BETTER_NAMES = {'Cases': 'Infected',
#                 'Hospitalizations': 'Hospitalized',
#                 'Deaths': 'Dead'}
# ## instantiate, making sure no dates are skipped between start and end (could be found dynamically)
# full_dataset = pd.DataFrame({'date':pd.date_range(start='1/22/2020', end='3/29/2024')})

# ## fill the instantiated dataset
# for i in ['Cases','Hospitalizations','Deaths']:
#     all_data = pd.read_csv(f'https://media.githubusercontent.com/media/reichlab/covid19-forecast-hub/master/data-truth/truth-Incident%20{i}.csv')
#     subset   = all_data[all_data.location_name == location].groupby("date")["value"].sum().reset_index()
#     subset.date = pd.to_datetime(subset.date)
#     subset['cumsum'] = subset.value.cumsum()
#     full_dataset = full_dataset.merge(subset[['date','cumsum']], how='outer', on='date').rename(columns={'cumsum':BETTER_NAMES[i]})

# full_dataset[(full_dataset.date >= '2021-06-01') & (full_dataset.date < '2021-09-06')].reset_index(drop=True).reset_index(names=['Timestamp']).drop(['date'],axis=1)

In [3]:
# full_dataset.to_csv('full_dataset.csv', index=False)
full_dataset = pd.read_csv('full_dataset.csv')

### Define a function to select a subset of the data

In [4]:
def get_data_between(start='2021-06-01',end='2021-09-06'):
    return full_dataset[(full_dataset.date >= start) & (full_dataset.date < end)].reset_index(drop=True).reset_index(names=['Timestamp']).drop(['date'],axis=1)

dataset = get_data_between('2021-06-01', '2021-09-06')
# dataset

### Define a function to plot results

In [5]:
def plot_results_and_data(results, data):
    # Plot the result for cumulative cases, hospitalizations, and deaths
    nice_labels={"deceased_state": "Deaths", "Cumulative_hosp_state": "Cumul Hosp", "Cumulative_cases_state": "Cumul Cases"}
    schema = plots.trajectories(results["data"], 
                                keep=["deceased_state", "Cumulative_hosp_state", "Cumulative_cases_state"], 
                                relabel=nice_labels,
                                points=data.drop(columns=['Timestamp']).reset_index(drop=True)
                               )
    # plots.save_schema(schema, "_schema.json")
    # plots.ipy_display(schema, dpi=150)
    return schema

def plot_all_results(results):
    # Plot the result for all state variables
    schema = plots.trajectories(results["data"], keep=".*_state")
    return schema

# Gather models

In [6]:
# See `Model_Kitchen.ipynb` for model derivation
model1 = "SEIRHD_age_structured_petrinet.json"
model2 = "SEIRHD_vacc_var_petrinet.json"
model3 = "SEIRHD_time_varying_transmission_petrinet.json"

### Define solution mappings for each model

In [7]:
def solution_mapping(model_solution: dict) -> dict:
    # solution mapping for model1 and model2 and model3 (they all have the same set of observables)
    mapped_dict = {}
    mapped_dict["Susceptible"] = model_solution["susceptible"]
    mapped_dict["Exposed"] = model_solution["exposed"]
    mapped_dict["Infected"] = model_solution["infected"]
    mapped_dict["Recovered"] = model_solution["recovered"]
    mapped_dict["Hospitalized"] = model_solution["hospitalized"]
    mapped_dict["Deceased"] = model_solution["deceased"]
    mapped_dict["Cumulative_cases"] = model_solution["all_cases"]
    mapped_dict["Cumulative_hosp"] = model_solution["all_hosp"]
    return mapped_dict

### Set model paths and parameters for sampling

In [8]:
num_iterations = 10 # 500
num_samples = 100
start_time = 0.0
logging_step_size = 10.0

single_model_paths = [[model1], [model2], [model3]]
model_paths = [model1, model2]

single_solution_mapping = [lambda x : x]
solution_mappings = [solution_mapping, solution_mapping, solution_mapping]

single_data_mapping = {'Infected': 'Cumulative_cases', 'Hospitalized': 'Cumulative_hosp', 'Dead': 'deceased'}
data_mapping = {'Infected': 'Cumulative_cases', 'Hospitalized': 'Cumulative_hosp', 'Dead': 'Deceased'}

# (1) Forecast 1: 07/19/2021 - 08/16/2021

In [9]:
dataset = get_data_between('2021-06-01', '2021-07-18')
end_time = len(dataset) + 29.0

## (A) Calibrate each model as an ensemble of a single model 

### Model 1

In [10]:
## Calibrate the model to data
calibrated_results = pyciemss.ensemble_calibrate(single_model_paths[0], single_solution_mapping, dataset, 
                                                 data_mapping=single_data_mapping, num_iterations=num_iterations)
parameter_estimates = calibrated_results["inferred_parameters"]
# print(parameter_estimates())

## Sample the calibrated model
calibrated_ensemble_result = pyciemss.ensemble_sample(single_model_paths[0], single_solution_mapping, 
                                                      end_time, logging_step_size, num_samples, 
                                                      start_time=start_time, inferred_parameters=parameter_estimates)

## Display results
display(calibrated_ensemble_result['data'].head())
schema = plot_results_and_data(calibrated_ensemble_result, dataset)
# schema = plot_all_results(calibrated_ensemble_result)
plots.save_schema(schema, "_schema.json")
plots.ipy_display(schema, dpi=150)

  if not data_df.applymap(lambda x: isinstance(x, (int, float))).all().all():


Data printout: This dataset contains 46 rows of data. The first column, Timestamp, begins at 0 and ends at 46. The subsequent columns are named: Infected, Hospitalized, Dead


Unnamed: 0,timepoint_id,sample_id,timepoint_unknown,model_0/weight_param,model_0/persistent_beta_param,model_0/persistent_r_EI_param,model_0/persistent_r_IR_y_param,model_0/persistent_r_IR_m_param,model_0/persistent_r_IH_y_param,model_0/persistent_r_IH_m_param,...,S_o_state,S_y_state,susceptible_state,exposed_state,infected_state,recovered_state,hospitalized_state,deceased_state,all_cases_state,all_hosp_state
0,0,0,10.0,1.0,0.35488,0.197486,0.450925,0.15253,0.022434,0.038593,...,380124.25,73885.28125,580575.875,9841189.0,4417460.0,4172716.0,264446.0,57499.5625,9198429.0,480126.3
1,1,0,20.0,1.0,0.35488,0.197486,0.450925,0.15253,0.022434,0.038593,...,20.319042,0.026382,20.346952,1461309.0,2605424.0,12972942.0,705052.8125,125931.359375,18158868.0,1943336.0
2,2,0,30.0,1.0,0.35488,0.197486,0.450925,0.15253,0.022434,0.038593,...,0.704356,0.000327,0.704687,202806.2,569476.1,16094146.0,338380.1875,190499.78125,19417394.0,2418706.0
3,3,0,40.0,1.0,0.35488,0.197486,0.450925,0.15253,0.022434,0.038593,...,0.353872,0.000141,0.354015,28145.76,101729.1,16844350.0,108994.4375,217395.140625,19592054.0,2513402.0
4,4,0,50.0,1.0,0.35488,0.197486,0.450925,0.15253,0.022434,0.038593,...,0.313656,0.000123,0.31378,3906.112,16633.15,17008362.0,29643.355469,225829.515625,19616292.0,2529644.0


NameError: name 'schema' is not defined

In [11]:
display(calibrated_ensemble_result["ensemble_quantiles"])
from pyciemss.integration_utils.result_processing import cdc_format

q_ensemble_data = cdc_format(
    calibrated_ensemble_result["ensemble_quantiles"],
    solution_string_mapping={
        "infected_state": "cases",
        "hospitalized_state": "hosp.",
        "dead_state": "death",
    },
    forecast_start_date="2023-08-03",
    location="US",
    drop_column_names=[
        "timepoint_id",
        "number_days",
        "inc_cum",
        "output",
        "Forecast_Backcast",
    ],
    time_unit="days",
    train_end_point=10.,
)
print("CDC Format:")
display(q_ensemble_data)

Unnamed: 0,timepoint_id,number_None,inc_cum,output,type,quantile,value
0,0,10.0,cum,model_0/Cumulative_cases_state,quantile,0.010,7.433344e+06
1,0,10.0,cum,model_0/Cumulative_cases_state,quantile,0.025,8.001986e+06
2,0,10.0,cum,model_0/Cumulative_cases_state,quantile,0.050,8.136477e+06
3,0,10.0,cum,model_0/Cumulative_cases_state,quantile,0.100,8.346386e+06
4,0,10.0,cum,model_0/Cumulative_cases_state,quantile,0.150,8.457930e+06
...,...,...,...,...,...,...,...
9011,6,70.0,inc,all_hosp_state,quantile,0.850,2.588128e+06
9012,6,70.0,inc,all_hosp_state,quantile,0.900,2.608083e+06
9013,6,70.0,inc,all_hosp_state,quantile,0.950,2.667483e+06
9014,6,70.0,inc,all_hosp_state,quantile,0.975,2.687664e+06


CDC Format:


  q_ensemble_data["forecast_date"] = pd.to_datetime(


Unnamed: 0,type,quantile,value,target,forecast_date,target_end_date,location
23,quantile,0.010,1.771170e+07,10.0 days ahead cum model_0/Cumulative_cases_s...,2023-08-03,2023-08-13,US
24,quantile,0.025,1.779737e+07,10.0 days ahead cum model_0/Cumulative_cases_s...,2023-08-03,2023-08-13,US
25,quantile,0.050,1.783613e+07,10.0 days ahead cum model_0/Cumulative_cases_s...,2023-08-03,2023-08-13,US
26,quantile,0.100,1.787660e+07,10.0 days ahead cum model_0/Cumulative_cases_s...,2023-08-03,2023-08-13,US
27,quantile,0.150,1.792271e+07,10.0 days ahead cum model_0/Cumulative_cases_s...,2023-08-03,2023-08-13,US
...,...,...,...,...,...,...,...
9011,quantile,0.850,2.588128e+06,60.0 days ahead inc all_hosp_state,2023-08-03,2023-10-02,US
9012,quantile,0.900,2.608083e+06,60.0 days ahead inc all_hosp_state,2023-08-03,2023-10-02,US
9013,quantile,0.950,2.667483e+06,60.0 days ahead inc all_hosp_state,2023-08-03,2023-10-02,US
9014,quantile,0.975,2.687664e+06,60.0 days ahead inc all_hosp_state,2023-08-03,2023-10-02,US


### Model 2

In [None]:
## Calibrate the model to data
calibrated_results = pyciemss.ensemble_calibrate(single_model_paths[1], single_solution_mapping, dataset, 
                                                 data_mapping=single_data_mapping, num_iterations=num_iterations)
parameter_estimates = calibrated_results["inferred_parameters"]
# print(parameter_estimates())

## Sample the calibrated model
calibrated_ensemble_result = pyciemss.ensemble_sample(single_model_paths[1], single_solution_mapping, 
                                                      end_time, logging_step_size, num_samples, 
                                                      start_time=start_time, inferred_parameters=parameter_estimates)

## Display results
# display(calibrated_ensemble_result['data'].head())
schema = plot_results_and_data(calibrated_ensemble_result, dataset)
# schema = plot_all_results(calibrated_ensemble_result)
plots.save_schema(schema, "_schema.json")
plots.ipy_display(schema, dpi=150)

## (B) Calibrate an ensemble of multiple models

In [None]:
## Calibrate the model to data
calibrated_results = pyciemss.ensemble_calibrate(model_paths, solution_mappings, dataset, 
                                                 data_mapping=data_mapping, num_iterations=num_iterations)
parameter_estimates = calibrated_results["inferred_parameters"]
# print(parameter_estimates())

## Sample the calibrated model
ensemble_result = pyciemss.ensemble_sample(model_paths, solution_mappings, end_time, 
                                           logging_step_size, num_samples, start_time=start_time)

## Display results
# display(ensemble_result['data'].head())
schema = plot_results_and_data(calibrated_ensemble_result, dataset)
# schema = plot_all_results(calibrated_ensemble_result)
plots.save_schema(schema, "_schema.json")
plots.ipy_display(schema, dpi=150)

# (2) Forecast 2: 07/26/2021 - 08/23/2021

In [None]:
dataset = get_data_between('2021-06-01', '2021-07-25')
end_time = len(dataset) + 29.0

## (A) Calibrate an ensemble of a single model

In [None]:
## Calibrate the model to data
calibrated_results = pyciemss.ensemble_calibrate(single_model_paths[1], single_solution_mapping, dataset, 
                                                 data_mapping=single_data_mapping, num_iterations=num_iterations)
parameter_estimates = calibrated_results["inferred_parameters"]
# print(parameter_estimates())

## Sample the calibrated model
calibrated_ensemble_result = pyciemss.ensemble_sample(single_model_paths[1], single_solution_mapping, 
                                                      end_time, logging_step_size, num_samples, 
                                                      start_time=start_time, inferred_parameters=parameter_estimates)

## Display results
# display(calibrated_ensemble_result['data'].head())
schema = plot_results_and_data(calibrated_ensemble_result, dataset)
# schema = plot_all_results(calibrated_ensemble_result)
plots.save_schema(schema, "_schema.json")
plots.ipy_display(schema, dpi=150)

## (B) Calibrate an ensemble of multiple models

In [None]:
## Calibrate the model to data
calibrated_results = pyciemss.ensemble_calibrate(model_paths, solution_mappings, dataset, 
                                                 data_mapping=data_mapping, num_iterations=num_iterations)
parameter_estimates = calibrated_results["inferred_parameters"]
# print(parameter_estimates())

## Sample the calibrated model
ensemble_result = pyciemss.ensemble_sample(model_paths, solution_mappings, end_time, 
                                           logging_step_size, num_samples, start_time=start_time)

## Display results
# display(ensemble_result['data'].head())
schema = plot_results_and_data(calibrated_ensemble_result, dataset)
# schema = plot_all_results(calibrated_ensemble_result)
plots.save_schema(schema, "_schema.json")
plots.ipy_display(schema, dpi=150)

# (3) Forecast 3: 08/02/2021 - 08/30/2021

In [None]:
dataset = get_data_between('2021-06-01', '2021-08-01')
end_time = len(dataset) + 29.0

## (A) Calibrate an ensemble of a single model

In [None]:
## Calibrate the model to data
calibrated_results = pyciemss.ensemble_calibrate(single_model_paths[1], single_solution_mapping, dataset, 
                                                 data_mapping=single_data_mapping, num_iterations=num_iterations)
parameter_estimates = calibrated_results["inferred_parameters"]
# print(parameter_estimates())

## Sample the calibrated model
calibrated_ensemble_result = pyciemss.ensemble_sample(single_model_paths[1], single_solution_mapping, 
                                                      end_time, logging_step_size, num_samples, 
                                                      start_time=start_time, inferred_parameters=parameter_estimates)

## Display results
# display(calibrated_ensemble_result['data'].head())
schema = plot_results_and_data(calibrated_ensemble_result, dataset)
# schema = plot_all_results(calibrated_ensemble_result)
plots.save_schema(schema, "_schema.json")
plots.ipy_display(schema, dpi=150)

## (B) Calibrate an ensemble of multiple models

In [None]:
## Calibrate the model to data
calibrated_results = pyciemss.ensemble_calibrate(model_paths, solution_mappings, dataset, 
                                                 data_mapping=data_mapping, num_iterations=num_iterations)
parameter_estimates = calibrated_results["inferred_parameters"]
# print(parameter_estimates())

## Sample the calibrated model
ensemble_result = pyciemss.ensemble_sample(model_paths, solution_mappings, end_time, 
                                           logging_step_size, num_samples, start_time=start_time)

## Display results
# display(ensemble_result['data'].head())
schema = plot_results_and_data(calibrated_ensemble_result, dataset)
# schema = plot_all_results(calibrated_ensemble_result)
plots.save_schema(schema, "_schema.json")
plots.ipy_display(schema, dpi=150)

# (4) Forecast 4: 08/09/2021 - 09/06/2021

In [None]:
dataset = get_data_between('2021-06-01', '2021-08-08')
end_time = len(dataset) + 29.0

## (A) Calibrate an ensemble of a single model

In [None]:
## Calibrate the model to data
calibrated_results = pyciemss.ensemble_calibrate(single_model_paths[1], single_solution_mapping, dataset, 
                                                 data_mapping=single_data_mapping, num_iterations=num_iterations)
parameter_estimates = calibrated_results["inferred_parameters"]
# print(parameter_estimates())

## Sample the calibrated model
calibrated_ensemble_result = pyciemss.ensemble_sample(single_model_paths[1], single_solution_mapping, 
                                                      end_time, logging_step_size, num_samples, 
                                                      start_time=start_time, inferred_parameters=parameter_estimates)

## Display results
# display(calibrated_ensemble_result['data'].head())
schema = plot_results_and_data(calibrated_ensemble_result, dataset)
# schema = plot_all_results(calibrated_ensemble_result)
plots.save_schema(schema, "_schema.json")
plots.ipy_display(schema, dpi=150)

## (B) Calibrate an ensemble of multiple models

In [None]:
## Calibrate the model to data
calibrated_results = pyciemss.ensemble_calibrate(model_paths, solution_mappings, dataset, 
                                                 data_mapping=data_mapping, num_iterations=num_iterations)
parameter_estimates = calibrated_results["inferred_parameters"]
# print(parameter_estimates())

## Sample the calibrated model
ensemble_result = pyciemss.ensemble_sample(model_paths, solution_mappings, end_time, 
                                           logging_step_size, num_samples, start_time=start_time)

## Display results
# display(ensemble_result['data'].head())
schema = plot_results_and_data(calibrated_ensemble_result, dataset)
# schema = plot_all_results(calibrated_ensemble_result)
plots.save_schema(schema, "_schema.json")
plots.ipy_display(schema, dpi=150)