In [38]:
#
# notebook for calculating overall MAE for all models for a parameter set.
#

In [39]:
# Ensure notebook is being run from base repository directory
COVID_HOME = "/Users/efua/documents/uni/spring-2021/UROP/covid19-forecast-hub"
import os, sys
try:
    os.chdir(COVID_HOME)
except Exception as err:
    print(f"Warning: unable to change directory; {repr(err)}")
from src.utils.notebook_util import isnotebook
if isnotebook():
    # Autoreload packages that are modified
    %load_ext autoreload
    %autoreload 2
else:
    from argparse import ArgumentParser
    
# Computational libraries 
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import numpy.matlib
from collections import deque

# os libraries 
import shutil
from datetime import datetime, timedelta
from functools import partial
import copy
import pdb
import os

from utils import *
from file_util import *
from attributes import *

from zoo_of_experts import *
from zoo_of_hinters import *
from zoo_of_losses import *

import matplotlib.pyplot as plt

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [63]:
#
# Specify model parameters
#
model_name = "online_expert" 

if not isnotebook():
    # If notebook run as a script, parse command-line arguments
    parser = ArgumentParser()
    parser.add_argument("pos_vars",nargs="*")  # gt_id and horizon 
    parser.add_argument('--target_dates', '-t', default="std_future")
    args, opt = parser.parse_known_args()
    
    # Assign variables                                                                                                                                     
    gt_id = get_id_name(args.pos_vars[0]) # "contest_precip" or "contest_tmp2m"                                                                            
    horizon = get_th_name(args.pos_vars[1]) # "34w" or "56w"    
    target_date_str = args.target_dates # target date object
else:
    # Otherwise, specify arguments interactively
    gt_id = "incd_case" #"contest_precip", "contest_tmp2m". choices: "cumm_death", "incd_death", "cumm_case", "incd_case"     
    target_date_str = "std_weekly" #"contest_precip", "contest_tmp2m"
    location = '41' #US or FIPS code
    quantile = 0.5
    horizon = "4w"

In [64]:
"""
Read in ground truth and model predictions for task
"""
# Get location FIPS codes
fips_codes = get_fips_codes() 

# Read ground truth 
printf(f'Loading {gt_id} ground truth')
df_gt = get_ground_truth(gt_id, location, load_df=True)

# Get target dates
first_date, last_date = get_data_range(gt_id, location=location)
printf(f"Getting target dates from {first_date} to {last_date}.")
target_dates = get_target_dates(target_date_str, first_date, last_date)
target_date_objs = pd.Series(target_dates)

# Store delta between target date and forecast issuance date
start_delta = timedelta(days=get_start_delta(horizon, gt_id))

# Get model predicitons for the task
printf(f'Loading model predictions for {gt_id}, {horizon}, {location}, q{quantile}')
model_pred_df = get_model_predictions(gt_id, horizon, location=location, quantile=quantile, load_df=True)

# Get model list
persistant_models, all_models = get_persistant_models(model_pred_df)
expert_models = list(all_models)
expert_models.sort()
model_string = (',').join(expert_models)
printf(f"Predictions for models loaded:\n {expert_models}")

'\nRead in ground truth and model predictions for task\n'

Loading incd_case ground truth
Getting target dates from 2020-01-29 00:00:00 to 2021-05-10 00:00:00.
Loading model predictions for incd_case, 4w, 41, q0.5
Predictions for models loaded:
 ['BPagano-RtDriven', 'CEID-Walk', 'COVIDhub-baseline', 'COVIDhub-ensemble', 'COVIDhub-trained_ensemble', 'CU-nochange', 'CU-scenario_high', 'CU-scenario_low', 'CU-scenario_mid', 'CU-select', 'Covid19Sim-Simulator', 'CovidActNow-SEIR_CAN', 'CovidAnalytics-DELPHI', 'DDS-NBDS', 'Google_Harvard-CPF', 'IEM_MED-CovidProject', 'IHME-CurveFit', 'IQVIA_ACOE-STAN', 'IUPUI-HkPrMobiDyR', 'IowaStateLW-STEM', 'JCB-PRM', 'JHUAPL-Bucky', 'JHU_CSSE-DECOM', 'JHU_IDD-CovidSP', 'JHU_UNC_GAS-StatMechPool', 'Karlen-pypm', 'LANL-GrowthRate', 'LNQ-ens1', 'MIT_ISOLAT-Mixtures', 'MOBS-GLEAM_COVID', 'Microsoft-DeepSTIA', 'OneQuietNight-ML', 'RobertWalraven-ESG', 'SigSci-TS', 'UCLA-SuEIR', 'UCSB-ACTS', 'UMich-RidgeTfReg', 'USACE-ERDC_SEIR', 'USC-SI_kJalpha', 'USC-SI_kJalpha_RF', 'UVA-Ensemble', 'Wadhwani_AI-BayesOpt']


In [65]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [66]:
"""
ensemble MAE
"""

med_model = model_pred_df.groupby(['target_end_date']).median() 
mea_model = model_pred_df.groupby(['target_end_date']).mean()

        
#make median series
MED_copy = med_model
MED_single = MED_copy.reset_index()
MED_dict = {}
for index, row in MED_single.iterrows():
    MED_dict[row["target_end_date"]] = row["value"]

med_series = pd.Series(data=MED_dict, index=MED_single['target_end_date'])


#make mean series
MEA_copy = mea_model
MEA_single = MEA_copy.reset_index()
MEA_dict = {}
for index, row in MEA_single.iterrows():
    MEA_dict[row["target_end_date"]] = row["value"]

mea_series = pd.Series(data=MEA_dict, index=MEA_single['target_end_date'])

#make groundtruth series
GT_copy = df_gt
GT_single = GT_copy.reset_index()
GT_single
GT_dict = {}
for index, row in GT_single.iterrows():
    dat = row["date"].date()
    datestr = dat.strftime("%Y-%m-%d")
    GT_dict[datestr] = row["gt"]
        
GT_series = pd.Series(data=GT_dict, index=MED_single['target_end_date']) #matching gt values to ensemble dates due to missing dates?

#calculate cumulative overall error
losses_mea = MAELoss.loss(0,GT_series,mea_series)
entries = 0
tot = 0
for x in losses_mea:
    entries += 1
    tot += x
    
mean_avg = tot / entries



losses_med = MAELoss.loss(0,GT_series,med_series)
entries2 = 0
tot2 = 0
for x in losses_med:
    entries2 += 1
    tot2 += x
    
med_avg = tot2 / entries2



'\nensemble MAE\n'

Unnamed: 0,location,date,gt
0,41,2020-01-29,0.0
1,41,2020-01-30,0.0
2,41,2020-01-31,0.0
3,41,2020-02-01,0.0
4,41,2020-02-02,0.0
...,...,...,...
463,41,2021-05-06,5332.0
464,41,2021-05-07,5174.0
465,41,2021-05-08,5207.0
466,41,2021-05-09,5061.0


In [68]:
"""
iterate through models and collect MAE
"""

print(f"overall model MAE for: {location}, {gt_id}, {horizon} horizon")
print(f"mean ensemble overall error: {mean_avg}")
print(f"median ensemble overall error: {med_avg}")
print(f"")



for mod in expert_models:
        #make model series
        model_choice = str(mod)
        IND_model = model_pred_df['value'][model_choice]
        MOD_single = IND_model.reset_index()
        IND_dates = MOD_single['target_end_date'].tolist() 
        MOD_dict = {}
        for index, row in MOD_single.iterrows():
            MOD_dict[row["target_end_date"]] = row["value"]
        mod_series = pd.Series(data=MOD_dict, index=MOD_single['target_end_date'])

        #make groundtruth series
        GT_copy = df_gt
        GT_single = GT_copy.reset_index()
        GT_dict = {}
        for index, row in GT_single.iterrows():
            dat = row["date"].date()
            datestr = dat.strftime("%Y-%m-%d")
            if datestr in IND_dates:
                GT_dict[datestr] = row["gt"]

        GT_series = pd.Series(data=GT_dict, index=MOD_single['target_end_date'])
        
        #calculate cumulative overall errors
        losses_mod = MAELoss.loss(0,GT_series,mod_series)
        entries = 0
        tot = 0
        for x in losses_mod:
            entries += 1
            tot += x

        mod_avg = tot / entries
        print(f"{mod}: {mod_avg}")
        #print(f"{mod_avg}")

'\niterate through models and collect MAE\n'

overall model MAE for: 41, incd_case, 4w horizon
mean ensemble overall error: 1658.7484320934286
median ensemble overall error: 1463.606307533117

BPagano-RtDriven: 4362.00688047619
CEID-Walk: 2145.062103448275
COVIDhub-baseline: 1560.212765957447
COVIDhub-ensemble: 1935.8181818181818
COVIDhub-trained_ensemble: 2268.2
CU-nochange: 2982.967741935484
CU-scenario_high: 3371.5
CU-scenario_low: 2487.5666666666666
CU-scenario_mid: 3102.9666666666667
CU-select: 2795.5333333333333
Covid19Sim-Simulator: 2095.070122678213
CovidActNow-SEIR_CAN: 475.31000368588093
CovidAnalytics-DELPHI: 3025.885714285714
DDS-NBDS: 2306.6451612903224
Google_Harvard-CPF: 2696.87890625
IEM_MED-CovidProject: 2439.564085441674
IHME-CurveFit: 1889.4577848130125
IQVIA_ACOE-STAN: 2333.0
IUPUI-HkPrMobiDyR: 280.5
IowaStateLW-STEM: 2010.6470588235295
JCB-PRM: 6582.996513626967
JHUAPL-Bucky: 2993.848304862857
JHU_CSSE-DECOM: 3191.5992528299503
JHU_IDD-CovidSP: 2095.506211970739
JHU_UNC_GAS-StatMechPool: 2772.65625
Karlen-pypm

In [46]:
most_dates = []
total_mods = 0
for mod in expert_models:
        total_mods += 1
        #make model series
        mod_choice = str(mod)
        mod_tab = model_pred_df['value'][mod_choice]
        mod_tab = mod_tab.reset_index()
        fst_lst = mod_tab['target_end_date'].iloc[[0, -1]]
        first_date = fst_lst.iloc[0]
        last_date = fst_lst.iloc[1]
        #RANGE FOR INC_CASES: if (first_date <= "2020-07-11") and ("2021-02-27" <= last_date ):
        #RANGE FOR INC_DEATH: if (first_date <= "2020-05-30") and ("2021-02-27" <= last_date ):
        if (first_date <= "2020-07-11") and ("2021-02-27" <= last_date ):
            most_dates.append(mod)
        print(f"#{mod} date range: {first_date} to {last_date}")
        
inc_ct = len(most_dates)
exc_ct = total_mods - inc_ct
print(f"{total_mods} models for {gt_id}. {inc_ct} included for date range ({most_dates}, {exc_ct} excluded for missing dates.")



#BPagano-RtDriven date range: 2020-11-14 to 2021-04-03
#CEID-Walk date range: 2020-09-05 to 2021-04-03
#COVIDhub-baseline date range: 2020-05-02 to 2021-03-20
#COVIDhub-ensemble date range: 2020-08-15 to 2021-03-27
#COVIDhub-trained_ensemble date range: 2021-02-27 to 2021-03-27
#CU-nochange date range: 2020-08-29 to 2021-03-27
#CU-scenario_high date range: 2020-09-05 to 2021-03-27
#CU-scenario_low date range: 2020-09-05 to 2021-03-27
#CU-scenario_mid date range: 2020-09-05 to 2021-03-27
#CU-select date range: 2020-09-05 to 2021-03-27
#Columbia_UNC-SurvCon date range: 2020-08-01 to 2021-04-03
#Covid19Sim-Simulator date range: 2020-08-01 to 2021-03-27
#CovidAnalytics-DELPHI date range: 2020-08-01 to 2021-03-27
#DDS-NBDS date range: 2020-08-22 to 2021-03-27
#FDANIHASU-Sweight date range: 2021-02-20 to 2021-03-27
#IEM_MED-CovidProject date range: 2021-01-02 to 2021-04-03
#IHME-CurveFit date range: 2020-08-15 to 2020-09-05
#IQVIA_ACOE-STAN date range: 2021-01-09 to 2021-01-09
#IUPUI-HkPrMob

In [None]:
#view individual model
view_model = model_pred_df['value'][best_case]

In [None]:
"""
prediction, MAE, regret plots
"""
#best model (INC cases): USACE-ERDC_SEIR
#(IUPUI-HkPrMobiDyR) performed better for all sampled but TX, also less values.
#best model (INC deaths): YYG-ParamSearch
#(SWC-TerminusCM performed best but only one value)

best_case = 'USACE-ERDC_SEIR'
BC_model = model_pred_df['value'][best_case]

#copy groundtruth data
GT_copy = df_gt 
GT_single = GT_copy.reset_index()
GT_xvals = GT_single['date']
GT_yvals = GT_single['gt']

#get model values
BC_copy  = BC_model.reset_index()
BC_xvals = BC_copy['target_end_date']
BC_yvals = BC_copy['value']
BC_dates = BC_xvals.tolist() 

#filter GT dates
GT_mdates = []
GT_mvals = []
for index, row in GT_single.iterrows():
    date = row["date"].date()
    val = row["gt"]
    form = '{dat}'
    datestr = date.strftime("%Y-%m-%d")
    if datestr in BC_dates:
        GT_mdates.append(date)
        GT_mvals.append(val)
        
#filter mean and median to model dates
med_model = model_pred_df.groupby(['target_end_date']).median() 
MED_copy = med_model
MED_single = MED_copy.reset_index()
MED_xvals = MED_single['target_end_date']
MED_yvals = MED_single['value']

MED_mdates = []
MED_mvals = []
for index, row in MED_single.iterrows():
    date = row["target_end_date"]
    val = row["value"]
    if date in BC_dates:
        MED_mdates.append(date)
        MED_mvals.append(val)
        
mea_model = model_pred_df.groupby(['target_end_date']).mean()
MEA_copy = mea_model
MEA_single = MEA_copy.reset_index()
MEA_xvals = MEA_single['target_end_date']
MEA_yvals = MEA_single['value']

MEA_mdates = []
MEA_mvals = []
for index, row in MEA_single.iterrows():
    date = row["target_end_date"]
    val = row["value"]
    if date in BC_dates:
        MEA_mdates.append(date)
        MEA_mvals.append(val)

#combined plot of groundtruth and individual model.
plt.figure(figsize=(15,15))
plt.plot(BC_dates, GT_mvals, label = "groundtruth")
plt.plot(BC_dates, BC_yvals, label = f"{best_case} model")
plt.plot(BC_dates, MED_mvals, label = "median ensemble")
plt.plot(BC_dates, MEA_mvals, label = "mean ensemble")
plt.xticks(rotation=90)
plt.title(f'groundtruth vs {best_case} vs mean ensemble vs median ensemble: {gt_id} raw data')
plt.ylabel(f'{gt_id} count')
plt.xlabel('date')
plt.legend()
plt.show()

In [None]:
#------------- MODEL VS ENSEMBLE ERROR (single model)
#make model series
MOD_copy = BC_model
MOD_single = MOD_copy.reset_index()
MOD_dict = {}
for index, row in MOD_single.iterrows():
    MOD_dict[row["target_end_date"]] = row["value"]

mod_series = pd.Series(data=MOD_dict, index=MOD_single['target_end_date'])

#make median series
MED_copy = med_model
MED_single = MED_copy.reset_index()
MED_dict = {}
for index, row in MED_single.iterrows():
    dat = row["target_end_date"]
    if dat in BC_dates:
        MED_dict[dat] = row["value"]

med_series = pd.Series(data=MED_dict, index=MOD_single['target_end_date'])


#make mean series
MEA_copy = mea_model
MEA_single = MEA_copy.reset_index()
MEA_dict = {}
for index, row in MEA_single.iterrows():
    dat = row["target_end_date"]
    if dat in BC_dates:
        MEA_dict[dat] = row["value"]

mea_series = pd.Series(data=MEA_dict, index=MOD_single['target_end_date'])


#make groundtruth series
GT_copy = df_gt
GT_single = GT_copy.reset_index()
GT_dict = {}
for index, row in GT_single.iterrows():
    dat = row["date"].date()
    datestr = dat.strftime("%Y-%m-%d")
    if datestr in BC_dates:
        GT_dict[datestr] = row["gt"]
        
GT_series = pd.Series(data=GT_dict, index=MOD_single['target_end_date'])

#calculate cumulative overall errors
losses_mod = MAELoss.loss(0,GT_series,mod_series)
entries = 0
tot = 0
for x in losses_mod:
    entries += 1
    tot += x
    
mod_avg = tot / entries
print(f"{best_case} model overall error: {mod_avg}")

losses_mea = MAELoss.loss(0,GT_series,mea_series)
entries = 0
tot = 0
for x in losses_mea:
    entries += 1
    tot += x
    
mean_avg = tot / entries
print(f"mean overall error for {best_case} date range: {mean_avg}")


losses_med = MAELoss.loss(0,GT_series,med_series)
entries2 = 0
tot2 = 0
for x in losses_med:
    entries2 += 1
    tot2 += x
    
med_avg = tot2 / entries2
print(f"median overall error for {best_case} date range: {med_avg}")

#plot losses
modDates = BC_dates

plt.figure(figsize=(15,15))
plt.plot(modDates, losses_mod.values, label = f"{best_case} MAE")
plt.plot(modDates, losses_mea.values, label = "mean ensemble MAE")
plt.plot(modDates, losses_med.values, label = "median ensemble MAE")
plt.xticks(rotation=90)
plt.title(f'{best_case} vs mean ensemble vs mean ensemble: MAE')
plt.ylabel('error')
plt.xlabel('date')
plt.legend()
plt.show()


In [None]:
#------------- MODEL REGRET (single model)
#make regret function
def regretFind(MAE_ens, MAE_mod):  
        return (MAE_ens - MAE_mod)


#calculate cumulative overall errors
#losses_mod = MAELoss.loss(0,GT_series,mod_series)
regret_vals_med = regretFind(losses_med, losses_mod)
regret_vals_mea = regretFind(losses_mea, losses_mod)

#calculate average regrets
sum_med = 0
count_med = 0
for val in regret_vals_med.values:
    sum_med += val
    count_med += 1
    
med_reg_avg = sum_med / count_med
print(f'the average {best_case} and median ensemble regret is: {med_reg_avg}')

sum_mea = 0
count_mea = 0
for val in regret_vals_mea.values:
    sum_mea += val
    count_mea += 1
    
mea_reg_avg = sum_mea / count_mea
print(f'the average {best_case} and mean ensemble regret is: {mea_reg_avg}')
    

modDates = BC_dates
plt.figure(figsize=(15,15))
plt.plot(modDates, regret_vals_med.values, label = f"median and {best_case} regret")
plt.plot(modDates, regret_vals_mea.values, label = f"mean and {best_case} regret")
plt.xticks(rotation=90)
plt.title(f'{best_case} vs mean ensemble vs mean ensemble: Regret')
plt.ylabel('error')
plt.xlabel('date')
plt.legend()
plt.show()

#for inc cases, the USACE-ERDC_SEIR performed best, with average regret compared to median at 
#374.9132778186387 and average regret compared to mean at 7437.702504995208.
#for inc deaths, the average YYG-ParamSearch and median ensemble regret is: 339.13774474221003
#the average YYG-ParamSearch and mean ensemble regret is: 525.6756407469163

In [None]:
"""
iterate through all expert models and plot all
"""

GT_copy = df_gt 
GT_single = GT_copy.reset_index()
GT_dateform = []
for index, row in GT_single.iterrows():
    date = row["date"].date()
    val = row["gt"]
    form = '{dat}'
    datestr = date.strftime("%Y-%m-%d")
    GT_dateform.append(datestr)
GT_xvals = GT_single['date']
GT_yvals = GT_single['gt']

med_model = model_pred_df.groupby(['target_end_date']).median() 
MED_copy = med_model
MED_single = MED_copy.reset_index()
MED_xvals = MED_single['target_end_date']
MED_yvals = MED_single['value']
        
mea_model = model_pred_df.groupby(['target_end_date']).mean()
MEA_copy = mea_model
MEA_single = MEA_copy.reset_index()
MEA_xvals = MEA_single['target_end_date']
MEA_yvals = MEA_single['value']


fig, ax = plt.subplots(figsize=(50,50))
ax.plot(GT_dateform, GT_yvals, label='groundtruth data')
ax.plot(MED_xvals, MED_yvals, label='median ensemble data')
ax.plot(MEA_xvals, MEA_yvals, label='mean ensemble data')
for mod in expert_models:
        #make model series
        model_choice = str(mod)
        BC_model = model_pred_df['value'][model_choice]
        BC_copy  = BC_model.reset_index()
        BC_xvals = BC_copy['target_end_date']
        BC_yvals = BC_copy['value']
        ax.plot(BC_xvals, BC_yvals, label=f'{model_choice} data')
legend = ax.legend(loc='upper right', fontsize='x-large')
plt.xticks(rotation=90)
plt.xlabel('count')
plt.ylabel('date')
plt.title(f'groundtruth vs median vs mean ensemble: {gt_id}, raw data')
plt.show()

In [None]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [None]:
"""
collect models with the most dates
"""
#INC_CASES DATE RANGES
#GROUNDTRUTH: 2020-01-29 to current date.
#COVIDhub-baseline date range: 2020-04-11 to 2021-02-27
#Columbia_UNC-SurvCon date range: 2020-07-11 to 2021-03-13
#Covid19Sim-Simulator date range: 2020-07-11 to 2021-03-06
#CovidAnalytics-DELPHI date range: 2020-07-11 to 2021-03-06
#IowaStateLW-STEM date range: 2020-07-11 to 2021-03-06
#JHU_IDD-CovidSP date range: 2020-07-11 to 2021-03-06
#LANL-GrowthRate date range: 2020-07-11 to 2021-03-13
#OliverWyman-Navigator date range: 2020-07-11 to 2021-03-06
#RobertWalraven-ESG date range: 2020-07-11 to 2021-03-06
#UCLA-SuEIR date range: 2020-07-11 to 2021-03-06
#UMich-RidgeTfReg date range: 2020-07-11 to 2021-03-06
#----
#USACE-ERDC_SEIR date range: 2020-07-11 to 2020-12-05 [STOPPED]
#LNQ-ens1 date range: 2020-07-18 to 2021-03-06
#COVIDhub-ensemble date range: 2020-07-25 to 2021-03-06
#IHME-CurveFit date range: 2020-07-25 to 2020-08-15
#DDS-NBDS date range: 2020-08-01 to 2021-03-06
#Karlen-pypm date range: 2020-08-01 to 2021-03-06
#CU-nochange date range: 2020-08-08 to 2021-03-06
#CEID-Walk date range: 2020-08-15 to 2021-03-13
#CU-scenario_high date range: 2020-08-15 to 2021-03-06
#CU-scenario_low date range: 2020-08-15 to 2021-03-06
#CU-scenario_mid date range: 2020-08-15 to 2021-03-06
#CU-select date range: 2020-08-15 to 2021-03-06
#JHUAPL-Bucky date range: 2020-09-05 to 2021-03-06
#BPagano-RtDriven date range: 2020-10-24 to 2021-03-13
#JHU_CSSE-DECOM date range: 2020-10-31 to 2021-03-06
#JCB-PRM date range: 2020-11-07 to 2021-01-02
#USC-SI_kJalpha_RF date range: 2020-11-07 to 2020-11-14
#UChicagoCHATTOPADHYAY-UnIT date range: 2020-11-14 to 2021-03-13
#UCF-AEM date range: 2020-11-21 to 2021-03-06
#USC-SI_kJalpha date range: 2020-11-21 to 2021-03-13
#UVA-Ensemble date range: 2020-11-21 to 2021-03-06
#IEM_MED-CovidProject date range: 2020-12-12 to 2021-03-13
#IQVIA_ACOE-STAN date range: 2020-12-12 to 2020-12-19
#OneQuietNight-ML date range: 2020-12-12 to 2021-03-06
#QJHong-Encounter date range: 2020-12-12 to 2021-03-06
#Microsoft-DeepSTIA date range: 2020-12-26 to 2021-03-13
#MOBS-GLEAM_COVID date range: 2021-01-16 to 2021-03-13
#SigSci-TS date range: 2021-01-16 to 2021-03-13
#UpstateSU-GRU date range: 2021-01-16 to 2021-03-13
#MIT_ISOLAT-Mixtures date range: 2021-01-23 to 2021-03-13
#FDANIHASU-Sweight date range: 2021-01-30 to 2021-03-06
#COVIDhub-trained_ensemble date range: 2021-02-06 to 2021-03-06
#IUPUI-HkPrMobiDyR date range: 2021-02-27 to 2021-02-27


#INC_DEATHS DATE RANGES
#GROUNDTRUTH: 2020-01-29 to current date.
#COVIDhub-baseline date range: 2020-04-11 to 2021-02-27
#MOBS-GLEAM_COVID date range: 2020-04-18 to 2021-03-13
#GT-DeepCOVID date range: 2020-05-09 to 2021-03-06
#IHME-CurveFit date range: 2020-05-23 to 2021-03-06
#JHU_IDD-CovidSP date range: 2020-05-23 to 2021-03-06
#UMass-MechBayes date range: 2020-05-23 to 2021-03-06
#Covid19Sim-Simulator date range: 2020-05-30 to 2021-03-06
#---
#COVIDhub-ensemble date range: 2020-06-13 to 2021-03-06
#OliverWyman-Navigator date range: 2020-06-13 to 2021-03-06
#LANL-GrowthRate date range: 2020-06-20 to 2021-03-13
#UCLA-SuEIR date range: 2020-06-20 to 2021-03-06
#epiforecasts-ensemble1 date range: 2020-06-20 to 2021-03-06
#CU-nochange date range: 2020-06-27 to 2021-03-06
#CU-scenario_high date range: 2020-06-27 to 2021-03-06
#CU-scenario_low date range: 2020-06-27 to 2021-03-06
#CU-scenario_mid date range: 2020-06-27 to 2021-03-06
#CU-select date range: 2020-06-27 to 2021-03-06
#UA-EpiCovDA date range: 2020-06-27 to 2021-03-06
#---
#Imperial-ensemble1 date range: 2020-03-21 to 2020-07-04 [STOPPED]
#Imperial-ensemble2 date range: 2020-03-21 to 2020-07-04 [STOPPED]
#YYG-ParamSearch date range: 2020-04-18 to 2020-10-10 [STOPPED]
#UT-Mobility date range: 2020-04-25 to 2021-01-30 [STOPPED]
#USACE-ERDC_SEIR date range: 2020-06-06 to 2020-12-05 [STOPPED]
#NotreDame-mobility date range: 2020-06-27 to 2020-12-19 [STOPPED]
#MITCovAlliance-SIR date range: 2020-07-04 to 2021-01-16
#SWC-TerminusCM date range: 2020-07-04 to 2020-07-04
#IowaStateLW-STEM date range: 2020-07-11 to 2021-03-06
#PSI-DRAFT date range: 2020-07-11 to 2021-03-06
#RobertWalraven-ESG date range: 2020-07-11 to 2021-03-06
#UMich-RidgeTfReg date range: 2020-07-11 to 2021-03-06
#Columbia_UNC-SurvCon date range: 2020-07-25 to 2021-03-13
#LNQ-ens1 date range: 2020-07-25 to 2021-03-06
#DDS-NBDS date range: 2020-08-01 to 2021-03-06
#Karlen-pypm date range: 2020-08-01 to 2021-03-06
#UCM_MESALab-FoGSEIR date range: 2020-08-01 to 2021-03-13
#RPI_UW-Mob_Collision date range: 2020-08-08 to 2021-03-06
#CEID-Walk date range: 2020-08-15 to 2021-03-13
#SteveMcConnell-CovidComplete date range: 2020-08-29 to 2021-03-06
#JHUAPL-Bucky date range: 2020-09-05 to 2021-03-06
#UCSD_NEU-DeepGLEAM date range: 2020-09-12 to 2021-03-06
#CovidAnalytics-DELPHI date range: 2020-10-03 to 2021-03-06
#BPagano-RtDriven date range: 2020-10-10 to 2021-03-13
#MSRA-DeepST date range: 2020-10-24 to 2020-12-12
#JHU_CSSE-DECOM date range: 2020-10-31 to 2021-03-06
#JCB-PRM date range: 2020-11-07 to 2021-01-02
#USC-SI_kJalpha_RF date range: 2020-11-07 to 2020-11-14
#UChicagoCHATTOPADHYAY-UnIT date range: 2020-11-14 to 2021-03-13
#USC-SI_kJalpha date range: 2020-11-21 to 2021-03-13
#MIT_CritData-GBCF date range: 2020-12-12 to 2021-03-13
#Microsoft-DeepSTIA date range: 2020-12-12 to 2021-03-13
#QJHong-Encounter date range: 2020-12-12 to 2021-03-06
#SigSci-TS date range: 2021-01-16 to 2021-03-13
#UpstateSU-GRU date range: 2021-01-16 to 2021-03-13
#MIT_ISOLAT-Mixtures date range: 2021-01-23 to 2021-03-13
#COVIDhub-trained_ensemble date range: 2021-02-06 to 2021-03-06
#IUPUI-HkPrMobiDyR date range: 2021-02-27 to 2021-02-27



most_dates = []
total_mods = 0
for mod in expert_models:
        total_mods += 1
        #make model series
        mod_choice = str(mod)
        mod_tab = model_pred_df['value'][mod_choice]
        mod_tab = mod_tab.reset_index()
        fst_lst = mod_tab['target_end_date'].iloc[[0, -1]]
        first_date = fst_lst.iloc[0]
        last_date = fst_lst.iloc[1]
        #RANGE FOR INC_CASES: if (first_date <= "2020-07-11") and ("2021-02-27" <= last_date ):
        #RANGE FOR INC_DEATH: if (first_date <= "2020-05-30") and ("2021-02-27" <= last_date ):
        if (first_date <= "2020-05-30") and ("2021-02-27" <= last_date ):
            most_dates.append(mod)
        print(f"#{mod} date range: {first_date} to {last_date}")
        
inc_ct = len(most_dates)
exc_ct = total_mods - inc_ct
print(f"{total_mods} models for {gt_id}. {inc_ct} included for date range, {exc_ct} excluded for missing dates.")



In [None]:

"""
data plots of most persistent models
"""

fig, ax = plt.subplots(figsize=(50,50))
ax.plot(GT_dateform, GT_yvals, label='groundtruth data', linewidth=5)
ax.plot(MED_xvals, MED_yvals, label='median ensemble data')
ax.plot(MEA_xvals, MEA_yvals, label='mean ensemble data')
for mod in most_dates:
        #make model series
        model_choice = str(mod)
        BC_model = model_pred_df['value'][model_choice]
        BC_copy  = BC_model.reset_index()
        BC_xvals = BC_copy['target_end_date']
        BC_yvals = BC_copy['value']
        ax.plot(BC_xvals, BC_yvals, label=f'{model_choice} data')
legend = ax.legend(loc='upper right', fontsize='x-large')
plt.xticks(rotation=90)
plt.xlabel('count')
plt.ylabel('date')
plt.title(f'persistent plots vs ensembles: {gt_id}, raw data')
plt.show()

In [None]:

"""
MAE/performance plots of most persistent models
"""

#copy median series
MED_copy = med_model
MED_single = MED_copy.reset_index()
MED_dates = MED_single['target_end_date']
MED_dict = {}
for index, row in MED_single.iterrows():
    MED_dict[row["target_end_date"]] = row["value"]

med_series = pd.Series(data=MED_dict, index=MED_single['target_end_date'])


#copy groundtruth series
GT_copy = df_gt
GT_single = GT_copy.reset_index()
GT_dict = {}
for index, row in GT_single.iterrows():
    dat = row["date"].date()
    datestr = dat.strftime("%Y-%m-%d")
    GT_dict[datestr] = row["gt"]
        
GT_series = pd.Series(data=GT_dict, index=MED_single['target_end_date']) #filtered to ensemble range

#get median ensemble losses
losses_med = MAELoss.loss(0,GT_series,med_series)

#performance plots for most persistent models
fig, ax = plt.subplots(figsize=(50,50))
ax.plot(MED_dates, losses_med.values, label='median ensemble MAE', linewidth=5)
for mod in most_dates:
        #make model series
        model_choice = str(mod)
        IND_model = model_pred_df['value'][model_choice]
        MOD_single = IND_model.reset_index()
        IND_dates = MOD_single['target_end_date'].tolist() 
        MOD_dict = {}
        for index, row in MOD_single.iterrows():
            MOD_dict[row["target_end_date"]] = row["value"]
        mod_series = pd.Series(data=MOD_dict, index=MOD_single['target_end_date'])

        #make filtered groundtruth series
        GT_copy = df_gt
        GT_single = GT_copy.reset_index()
        GT_dict = {}
        for index, row in GT_single.iterrows():
            dat = row["date"].date()
            datestr = dat.strftime("%Y-%m-%d")
            if datestr in IND_dates:
                GT_dict[datestr] = row["gt"]

        GT_series = pd.Series(data=GT_dict, index=MOD_single['target_end_date'])
        
        #calculate MAE
        losses_mod = MAELoss.loss(0,GT_series,mod_series)
        ax.plot(IND_dates, losses_mod.values, label=f'{model_choice} MAE')
legend = ax.legend(loc='upper right', fontsize='x-large')
plt.xticks(rotation=90)
plt.xlabel('count')
plt.ylabel('date')
plt.title(f'persistent plots vs median ensemble: {gt_id}, MAE')
plt.show()


In [None]:
"""
#regret plots of most persistent models (vs median ensembles)
"""


fig, ax = plt.subplots(figsize=(50,50))
for mod in most_dates:
        #make model series
        model_choice = str(mod)
        SELECT_model = model_pred_df['value'][model_choice]
        SELECT_copy  = SELECT_model.reset_index()
        SELECT_dates = SELECT_copy['target_end_date']
        SELECT_vals = SELECT_copy['value']
        
        SELECT_dict = {}
        for index, row in SELECT_copy.iterrows():
            SELECT_dict[row["target_end_date"]] = row["value"]

        SELECT_series = pd.Series(data=SELECT_dict, index=SELECT_dates.to_numpy())
        
        #make gt series (for model dates)
        REAL_copy = df_gt
        REAL_single = REAL_copy.reset_index()
        REAL_dict = {}
        for index, row in REAL_single.iterrows():
            dat = row["date"].date()
            datestr = dat.strftime("%Y-%m-%d")
            if datestr in SELECT_dates.to_numpy():
                REAL_dict[datestr] = row["gt"]

        REAL_series = pd.Series(data=REAL_dict, index=SELECT_dates.to_numpy())
        
        #make median ensemble series (for model dates)
        MEDI_copy = med_model
        MEDI_single = MEDI_copy.reset_index()
        MEDI_xvals = MEDI_single['target_end_date']
        MEDI_dict = {}
        for index, row in MEDI_single.iterrows():
            dat = row["target_end_date"]
            if dat in SELECT_dates.to_numpy():
                MEDI_dict[dat] = row["value"]

        med_series = pd.Series(data=MEDI_dict, index=SELECT_dates.to_numpy())

        #calculate median ensemble loss
        LOSS_med = MAELoss.loss(0,REAL_series,med_series)

        #calculate model loss
        LOSS_mod = MAELoss.loss(0,REAL_series,SELECT_series)
        
        #calculate regret
        regret_vals_MOD = regretFind(LOSS_med, LOSS_mod)
        
        #check
        #print(mod)
        #regret_vals_MOD
        
        
        #plot regret
        ax.plot(regret_vals_MOD.index, regret_vals_MOD.values, label=f'{model_choice} regret')
        
legend = ax.legend(loc='upper right', fontsize='x-large')
plt.axhline(y=0, color='r', linestyle='-')
plt.xticks(rotation=90)
plt.xlabel('count')
plt.ylabel('date')
plt.title(f'persistent plots vs median ensemble: {gt_id}, regret')
plt.show()


In [None]:

"""
#regret plots of all models (vs median ensembles)
#NOT WORKING
#DATES ARE PLOTTING OUT OF ORDER?
"""

"""
fig, ax = plt.subplots(figsize=(50,50))
for mod in expert_models:
        #make model series
        model_choice = str(mod)
        SELECT_model = model_pred_df['value'][model_choice]
        SELECT_copy  = SELECT_model.reset_index()
        SELECT_dates = SELECT_copy['target_end_date']
        SELECT_vals = SELECT_copy['value']
        
        SELECT_dict = {}
        for index, row in SELECT_copy.iterrows():
            SELECT_dict[row["target_end_date"]] = row["value"]

        SELECT_series = pd.Series(data=SELECT_dict, index=SELECT_copy['target_end_date'])
        
        #make gt series (for model dates)
        REAL_copy = df_gt
        REAL_single = REAL_copy.reset_index()
        REAL_dict = {}
        for index, row in REAL_single.iterrows():
            dat = row["date"].date()
            datestr = dat.strftime("%Y-%m-%d")
            if datestr in SELECT_dates.to_numpy():
                REAL_dict[datestr] = row["gt"]

        REAL_series = pd.Series(data=REAL_dict, index=SELECT_dates.to_numpy())
        
        #make median ensemble series (for model dates)
        MEDI_copy = med_model
        MEDI_single = MEDI_copy.reset_index()
        MEDI_xvals = MEDI_single['target_end_date']
        MEDI_dict = {}
        for index, row in MEDI_single.iterrows():
            dat = row["target_end_date"]
            if dat in SELECT_dates.to_numpy():
                MEDI_dict[dat] = row["value"]

        med_series = pd.Series(data=MEDI_dict, index=SELECT_dates.to_numpy())

        #calculate median ensemble loss
        LOSS_med = MAELoss.loss(0,REAL_series,med_series)

        #calculate model loss
        LOSS_mod = MAELoss.loss(0,REAL_series,SELECT_series)
        
        #calculate regret
        regret_vals_MOD = regretFind(LOSS_med, LOSS_mod)
        
        #check
        #print(mod)
        #regret_vals_MOD
        
        
        #plot regret
        ax.plot(regret_vals_MOD.index, regret_vals_MOD.values, label=f'{model_choice} regret')
        
legend = ax.legend(loc='upper right', fontsize='x-large')
plt.xticks(rotation=90)
plt.xlabel('count')
plt.ylabel('date')
plt.title(f'all plots vs ensembles: {gt_id}, regret')
plt.show()
"""