In [1]:
# (C) Copyright 1996- ECMWF.
#
# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
# In applying this licence, ECMWF does not waive the privileges and immunities
# granted to it by virtue of its status as an intergovernmental organisation
# nor does it submit to any jurisdiction.

In [2]:
# standard library
import multiprocessing # parallel processing
import os # directory listing
from itertools import product

# basic libraries for data analysis
import numpy as np
import pandas as pd
import xarray as xr

import tqdm # timing

## Input variables

In [3]:
# root folder prepended to every input/output path built below ('' = relative to the working directory)
out_dir = ''

season_studied = 'Winter' # All, Winter or Summer!
observational_dataset = 'EOBS' # ERA5 or EOBS
rolling_days = 3 # what temporal resolution to analyse? in number of days; integer>=1
# NetCDF file read further down for the forecasted Mediterranean pattern allocations
frcst_med = out_dir+'Med_LocalizedPatterns_ForecastAllocations.nc'

# number of bootstrap samples drawn per area and lead time (the actual sample is added on top of these)
bootstr = 1000

In [4]:
# folder (relative to out_dir) holding the per-area input statistics; the outputs are written there as well
aux_name = f'{observational_dataset}_{season_studied}_Days{rolling_days}'
results_used = f'{aux_name}/Statistics_{aux_name}_Bootstraps_FullAreas_Data/'

# number of worker processes of the multiprocessing pool used for the bootstrap analysis
processors_used = 7
del(aux_name)

## Read data

In [5]:
# get the index values of the 5th, 95th and median number, when data are ordered (for the bootstrapping)
# with bootstr = 1000 these are the positions 50, 949 and 500 of the sorted values
l_min = int(bootstr*5/100)
l_max = int(bootstr*95/100)-1
l_med = int(bootstr/2)

In [6]:
# list the result files with pure Python instead of the IPython-only `!ls` shell escape: this also runs
# outside IPython and raises if the folder is missing (with `!ls` any error text printed by the shell
# would be captured and parsed below as if it were a file name)
all_files = sorted(i for i in os.listdir(out_dir+results_used) if not i.startswith('.')) # no hidden files, as ls
# the area name is the part of each file name before the first underscore
areas_used = sorted({i.split('_')[0] for i in all_files})

In [7]:
def normalize(data):
    """Rescale `data` to the 0-1 range along its 'time' dimension (min-max scaling).

    The minimum over 'time' is mapped to 0 and the maximum over 'time' to 1.
    """
    lowest = data.min('time')
    value_range = data.max('time')-lowest

    return (data-lowest)/value_range

In [8]:
# container for all inputs: one sub-dictionary per area, plus 'Med' for the Mediterranean pattern forecasts
input_dictionary = {i_area: {} for i_area in areas_used+['Med']}
for i_area in areas_used:
    # get data for reference connections (temporal & Med patterns) to localized extremes
    file_prefix = out_dir+results_used+i_area+'_ReferenceConnections_'
    file_data = xr.open_dataarray(file_prefix+'ConnTemp.nc')
    # unless the analysis is for summer: if there is no climatology for 29th Feb, give same values as 28th Feb
    if '0229' not in file_data.temporal.values and season_studied!='Summer':
        file_data = [file_data, file_data.sel(temporal='0228').assign_coords({'temporal': '0229'})]
        file_data = xr.concat(file_data, dim='temporal')
        
    # keep only the values of the actual (non-bootstrapped) sample
    input_dictionary[i_area]['RefTempCondProb'] = file_data.sel(bootstrap='Actual').drop('bootstrap') 
    
    # get data for Mediterranean patterns (new derivation) and localized extremes
    file_prefix = out_dir+results_used+i_area+'_MedPatterns_'
    file_data = xr.open_dataset(file_prefix+'CondProbs.nc')['CondProbs']
    file_data = file_data.sel(bootstrap='Actual', Constraints=0).drop(['bootstrap', 'Constraints'])
    input_dictionary[i_area]['MedCondProb'] = file_data
    
    # get data for areal precipitation
    file_name = f'{out_dir}../Data/Forecasts/{i_area}_precip.nc'
    file_data = xr.open_dataarray(file_name)
    # accumulate over rolling_days consecutive steps; the incomplete leading windows are NaN and dropped
    file_data = file_data.rolling(step=rolling_days).sum().dropna('step')
    input_dictionary[i_area]['PrcFrcst'] = file_data
    file_name = f'{out_dir}{results_used}{i_area}_PrecipObs_Timeseries.nc'
    file_data = xr.open_dataarray(file_name).dropna('time')
    # keep only the months of the studied half-year (Sep-Feb for winter, Mar-Aug for summer)
    if 'Winter' in results_used:
        file_data = file_data.isel(time=file_data.time.dt.month.isin([1, 2, 9, 10, 11, 12]))
    elif 'Summer' in results_used:
        file_data = file_data.isel(time=file_data.time.dt.month.isin([3, 4, 5, 6, 7, 8]))
    input_dictionary[i_area]['PrcObs'] = file_data
    file_name = f'{out_dir}{results_used}{i_area}_PrecipERA5_CondProbs.nc'
    file_data = xr.open_dataarray(file_name)
    file_data = file_data.sel(bootstrap='Actual', Constraints=0).drop(['bootstrap', 'Constraints'])
    input_dictionary[i_area]['PrcCondProb'] = file_data
    
    # get data for water vapour flux
    file_name = f'{out_dir}{results_used}{i_area}_WvfERA5_CondProbs.nc'
    file_data = xr.open_dataarray(file_name)
    file_data = file_data.sel(bootstrap='Actual', Constraints=0).drop(['bootstrap', 'Constraints'])
    input_dictionary[i_area]['WvfCondProb'] = file_data

    file_name = f'{out_dir}../Data/Forecasts/{i_area}_wvf.nc'
    file_data = xr.open_dataarray(file_name)
    file_data = file_data.rolling(step=rolling_days).mean()
    # magnitude computed from the 'NorthW' and 'EastW' entries, stored as the extra direction 'Total'
    total_area = np.sqrt(file_data.sel(wvf_direction='NorthW')**2+file_data.sel(wvf_direction='EastW')**2)
    total_area = total_area.assign_coords({'wvf_direction': 'Total'})
    file_data = [file_data.sel(wvf_direction=['NorthW', 'SouthW', 'EastW', 'WestW']), total_area]
    file_data = xr.concat(file_data, dim='wvf_direction')
    wvf_dircs = file_data.wvf_direction.values # NOTE(review): not used in the visible part of the notebook
    
    # relative humidity: the pressure levels are relabelled 'RH<level>' on the 'wvf_direction' dimension, so
    # that the selected level can be concatenated with the water vapour flux data
    file_name = f'{out_dir}../Data/Forecasts/{i_area}_relhum.nc'
    file_data2 = xr.open_dataarray(file_name).rename({'pressure_level': 'wvf_direction'})
    file_data2 = file_data2.assign_coords({'wvf_direction': 
                                           [f'RH{int(i)}' for i in file_data2.wvf_direction.values]})
    # keep the single level named by the 2nd entry of the 'Extra' coordinate of the conditional probabilities
    file_data2 = file_data2.sel(wvf_direction=input_dictionary[i_area]['WvfCondProb'].Extra.values[1])
    file_data2 = file_data2.rolling(step=rolling_days).mean()
    file_data = xr.concat([file_data, file_data2], dim='wvf_direction').dropna('step')
    input_dictionary[i_area]['WvfFrcst'] = file_data
    
# get auxiliary data
# i_area still holds the last area of the loop above, so lead steps and percentiles are taken from that area
# NOTE(review): presumably all areas share the same steps and percentiles - confirm
lead_steps_used = []
lead_steps_used.append(input_dictionary[i_area]['PrcFrcst'].step.values)
lead_steps_used.append(input_dictionary[i_area]['WvfFrcst'].step.values)

perc_used = input_dictionary[i_area]['PrcCondProb'].percentile.values.tolist()

del(i_area, file_prefix, file_data, file_name, total_area, file_data2)

In [9]:
# add data for 9 Mediterranean clusters
file_data = xr.open_dataarray(frcst_med)
file_data = file_data.isel(number=file_data.number>-1) # don't use ens. mean
# keep the allocations matching the temporal resolution analysed, based on anomalies, for 9 clusters
file_data = file_data.sel(rolling=rolling_days, variable_type='Anomalies', ClustersNumber=9)

# clim. based on ERA5 and not model-dependent gives better results for up to ~2nd week lead times
# file_data = file_data.sel(clim_type='Model')
file_data = file_data.sel(clim_type='ERA5')

# drop the scalar coordinates left behind by the selections above
input_dictionary['Med']['PatFrcst'] = file_data.reset_coords(drop=True) 

# lead steps of the pattern forecasts, needed for finding the steps common to all forecast datasets
lead_steps_used.append(input_dictionary['Med']['PatFrcst'].step.values)

del(file_data, frcst_med)

In [10]:
# keep only the lead steps that are available in every forecast dataset
final_steps = set(lead_steps_used[0])
for l in lead_steps_used[1:]:
    final_steps.intersection_update(l)

final_steps = sorted(final_steps) # sorted() returns the steps as an ordered list
del lead_steps_used

In [11]:
# values used for cost loss ratio for the economic value analysis
# cost_loss_ratio_values = (100-np.array(perc_used))/100
# cost_loss_ratio_values = np.linspace(cost_loss_ratio_values.min()*.8, cost_loss_ratio_values.max()*1.2, 6)
# finer sampling for ratios up to 0.1, coarser sampling from there up to 0.999
cost_loss_ratio_values = np.linspace(0, .1, 11)
cost_loss_ratio_values = np.append(cost_loss_ratio_values, np.linspace(cost_loss_ratio_values.max(), 0.999, 12))
# add the ratios that correspond to the percentiles analysed, (100-percentile)/100, and the value 0.001
cost_loss_ratio_values = np.append(cost_loss_ratio_values, (100-np.array(perc_used))/100)
cost_loss_ratio_values = np.append(cost_loss_ratio_values, [0.001])
# remove duplicates, sort, and drop the smallest value (the 0 that starts the first linspace, provided that
# no percentile above 100 is used)
cost_loss_ratio_values = sorted(set(cost_loss_ratio_values))[1:]
# 1-D DataArray along 'cost_ratio', so that the ratios broadcast against the other data
cost_loss_ratio_values = xr.DataArray(cost_loss_ratio_values, dims=['cost_ratio'], 
                               coords={'cost_ratio': cost_loss_ratio_values})
cost_loss_ratio_values

In [12]:
def temp_flagging(valid_dates, temp_subset):
    """Label every date with the temporal subset it belongs to.

    Parameters
    ----------
    valid_dates : array-like of datetime-like
        Dates to label; converted with ``pd.to_datetime``.
    temp_subset : str
        Type of label:
        'All' (same label 'All' for every date),
        'HalfYear' ('WinterHalf' for Sep-Feb, 'SummerHalf' for Mar-Aug),
        'Season' ('Winter' = DJF, 'Spring' = MAM, 'Summer' = JJA, 'Autumn' = SON),
        'Month' (month number as string, e.g. '1'),
        'DayMonth' (month and day as 'mmdd', e.g. '0229').

    Returns
    -------
    list, pandas.Index or numpy.ndarray
        One label per date, in the order of `valid_dates` (a list for 'All', a numpy array for
        'DayMonth', a pandas Index otherwise).

    Raises
    ------
    ValueError
        If `temp_subset` is not one of the supported values.
    """
    
    valid_dates = pd.to_datetime(valid_dates)
    
    if temp_subset == 'All':
        temporal_flag = ['All']*len(valid_dates)
    elif temp_subset in ('HalfYear', 'Season'):
        # index of the meteorological season: 1 = DJF, 2 = MAM, 3 = JJA, 4 = SON
        season_index = (valid_dates.month%12 + 3)//3
        if temp_subset == 'HalfYear':
            labels = {1: 'WinterHalf', 2: 'SummerHalf', 3: 'SummerHalf', 4: 'WinterHalf'}
        else:
            labels = {1: 'Winter', 2: 'Spring', 3: 'Summer', 4: 'Autumn'}
        temporal_flag = season_index.map(labels)
    elif temp_subset == 'Month':
        temporal_flag = valid_dates.month.astype(str)
    elif temp_subset == 'DayMonth':
        # last 4 characters of 'yyyymmdd', i.e. 'mmdd'
        temporal_flag = pd.Series([i[-4:] for i in valid_dates.strftime('%Y%m%d')])
        temporal_flag = temporal_flag.values
    else:
        # fail with a clear message instead of an UnboundLocalError at the return statement
        raise ValueError(f"temp_subset must be one of 'All', 'HalfYear', 'Season', 'Month', 'DayMonth'; "
                         f"got {temp_subset!r}")
        
    return temporal_flag

In [13]:
# get the climatological frequencies for the dates in each season used for the analysis
if 'Winter' in results_used:
    seasons_used = ['Autumn', 'Winter']
elif 'Summer' in results_used:
    seasons_used = ['Spring', 'Summer']
else:
    seasons_used = ['Winter', 'Spring', 'Summer', 'Autumn']

print(seasons_used)
# get the number of days per season for a full 4-year period (so that 1 leap year is also included)
aux_climatology = pd.date_range('20000101', '20031231')
aux_climatology = pd.DataFrame({'Dates': aux_climatology}, index=aux_climatology)
aux_climatology['Season'] = temp_flagging(aux_climatology.Dates.values, 'Season')

# fraction of days falling in each of the seasons used (the fractions sum to 1 over the seasons used)
ref_clim = aux_climatology.query('Season in @seasons_used').Season.value_counts(normalize=True)

del(seasons_used, aux_climatology)

['Autumn', 'Winter']


In [14]:
def define_extremes_obs(data, percentiles):
    """Flag the time steps at which `data` exceeds its own percentile thresholds.

    The thresholds are the quantiles of `data` along 'time' (linear interpolation), one per
    value in `percentiles` (given in the 0-100 scale). The result is a boolean array with a
    leading 'percentile' dimension, True where `data` is strictly above the threshold.
    """
    
    fractions = np.array(percentiles)/100 # percentiles expressed as quantile levels
    thresholds = (
        data.quantile(fractions, interpolation='linear', dim='time')
        .rename({'quantile': 'percentile'})
        .assign_coords({'percentile': percentiles})
    )

    # one boolean exceedance array per percentile, stacked along the new 'percentile' dimension
    exceedances = []
    for perc in percentiles:
        exceedances.append(data>thresholds.sel(percentile=perc))
    perc_index = pd.Index(percentiles, name='percentile')
   
    return xr.concat(exceedances, dim=perc_index)

In [15]:
# boolean exceedance of extremes for forecasted precip/wvf data (thresholds pooled over dates and members)
def define_extremes_frcst(data, percentiles):
    """Flag the forecasts that exceed the percentile thresholds of the forecast sample.

    The thresholds are the quantiles (linear interpolation) of `data` pooled over its 'time'
    and 'number' dimensions, one per value in `percentiles` (given in the 0-100 scale). The
    result is a boolean array with a leading 'percentile' dimension, True where `data` is
    strictly above the threshold.
    """
    
    pooled = data.stack(all_data=['time', 'number']) # all dates and members in one dimension
    
    fractions = np.array(percentiles)/100 # percentiles expressed as quantile levels
    thresholds = (
        pooled.quantile(fractions, interpolation='linear', dim='all_data', keep_attrs=True)
        .rename({'quantile': 'percentile'})
        .assign_coords({'percentile': percentiles})
    )

    # one boolean exceedance array per percentile, stacked along the new 'percentile' dimension
    exceedances = []
    for perc in percentiles:
        exceedances.append(data>thresholds.sel(percentile=perc))
    perc_index = pd.Index(percentiles, name='percentile')
    
    return xr.concat(exceedances, dim=perc_index)

In [16]:
def frcst_subset(area_used, key_used, lead_used):
    """Return the forecasts of one lead step, indexed by the time each forecast is valid for.

    The data are read from `input_dictionary[area_used][key_used]` at `step=lead_used`, and
    the 'time' coordinate is shifted by `lead_used`.
    """
    
    at_lead = input_dictionary[area_used][key_used].sel(step=lead_used)
    valid_time = at_lead.time+lead_used

    return at_lead.assign_coords({'time': valid_time})

In [17]:
def brierSS(data_used, ref_score):
    """Combine a Brier score with its skill score against the best reference forecast.

    Parameters
    ----------
    data_used : xarray.DataArray
        Brier score of the forecasts assessed. It is not modified: a renamed copy is used.
    ref_score : xarray.DataArray
        Brier scores of the reference forecasts, with a 'Method' dimension that includes
        'DayMonth_Temp' and 'Season_Temp'.

    Returns
    -------
    xarray.Dataset
        Variable 'BS' (the Brier score) and variable 'BSS' = 1 - BS/BS_ref, where BS_ref is
        the lower Brier score of the two reference methods.
    """
    
    # rename() returns a new object, so the caller's array keeps its own name
    brier_score = data_used.rename('BS')
    
    # the reference is the better (lower Brier score) of the two climatological methods
    brier_ref_clim = ref_score.sel(Method=['DayMonth_Temp', 'Season_Temp']).min('Method')
    brier_skill_score = (1 - brier_score/brier_ref_clim).rename('BSS')
    
    return xr.merge([brier_score, brier_skill_score])

In [18]:
def economic_value_calc(input_data):
    """Contingency-table counts, rates and economic value for one probability threshold.

    `input_data` is the tuple (frcst_data, obs_data, p_t): forecast probabilities, observed
    events (1/0) and the probability threshold at or above which a forecast counts as a warning.
    The function also reads the module-level `cost_loss_ratio`, `extremes_occur` and `ev_clim`,
    which must have been set before the call. Returns a Dataset with the variables 'EV', 'Hits',
    'FalseAlarms', 'Misses', 'HitRate', 'CorrectNegative' and 'FalseAlarmRate'; 'EV' carries the
    coordinate 'p_thr' = p_t.
    """
    
    frcst_data, obs_data, p_t = input_data
    
    # warning decision at this threshold and observed outcome
    warning_on = frcst_data>=p_t
    warning_off = frcst_data<p_t
    event_yes = obs_data==1
    event_no = obs_data==0
    
    # contingency table: number of time steps in each of the four categories
    hits = warning_on.where(event_yes).sum('time')
    false_alarms = warning_on.where(event_no).sum('time')
    misses = warning_off.where(event_yes).sum('time')
    cor_neg = warning_off.where(event_no).sum('time')
    for counts, label in ((hits, 'Hits'), (false_alarms, 'FalseAlarms'), 
                          (misses, 'Misses'), (cor_neg, 'CorrectNegative')):
        counts.name = label
    
    hit_rate = hits/(hits+misses)
    hit_rate.name = 'HitRate'
    false_alarm_rate = false_alarms/(false_alarms+cor_neg)
    false_alarm_rate.name = 'FalseAlarmRate'
    
    # expense term of the forecasts, then its reduction relative to ev_clim, scaled by the
    # reduction that the term extremes_occur*cost_loss_ratio would give
    expense_frcst = false_alarm_rate*cost_loss_ratio*(1-extremes_occur)
    expense_frcst = expense_frcst - hit_rate*extremes_occur*(1-cost_loss_ratio) + extremes_occur
    expense_best = extremes_occur*cost_loss_ratio
    ec_val_final = (ev_clim - expense_frcst)/(ev_clim-expense_best)
    ec_val_final = ec_val_final.assign_coords({'p_thr': p_t})
    ec_val_final.name = 'EV'
    
    return xr.merge([ec_val_final, hits, false_alarms, misses, hit_rate, cor_neg, false_alarm_rate])

In [19]:
def reliability(input_data):
    """Reliability-diagram data and the Brier score decomposition terms.

    `input_data` is the tuple (frcst_data, obs_data, thresholds): forecast probabilities,
    observed events and the upper edges of the probability bins (the first bin starts at -0.1,
    so that forecasts of exactly 0 fall in it). Returns a Dataset with, per bin ('p_thr' = upper
    edge), the number of forecasts, the sum of observations and the mean forecast/observation,
    plus the binned and total resolution and reliability terms and the total uncertainty.
    """
    
    frcst_data, obs_data, thresholds = input_data
    
    bin_edges = [-.1]+list(thresholds)
    
    def _bin_statistics(lower, upper):
        # statistics of the forecasts that fall in the bin (lower, upper]
        in_bin = (frcst_data>lower)&(frcst_data<=upper)
        obs_in_bin = obs_data.where(in_bin)
        
        sample_frcsts = in_bin.sum('time')
        sample_frcsts.name = 'SampleFrcsts'
        mean_frcsts = frcst_data.where(in_bin).mean('time')
        mean_frcsts.name = 'MeanFrcsts'
        sample_obs = obs_in_bin.sum('time')
        sample_obs.name = 'SampleObs'
        mean_obs = obs_in_bin.mean('time')
        mean_obs.name = 'MeanObs'
        
        return xr.merge([sample_frcsts, sample_obs, mean_frcsts, mean_obs])
    
    reliab_data = [_bin_statistics(lower, upper) for lower, upper in zip(bin_edges[:-1], bin_edges[1:])]
    reliab_data = xr.concat(reliab_data, dim=pd.Index(bin_edges[1:], name='p_thr'))
    
    bin_weights = reliab_data['SampleFrcsts'] # bins weighted by their number of forecasts
    base_rate = obs_data.mean('time')
    
    # resolution: squared distance of the observed frequency per bin from the base rate
    resolution_final = (reliab_data['MeanObs']-base_rate)**2
    resolution_final.name = 'Resolution_Bined'
    resolution_total = resolution_final.weighted(bin_weights).mean('p_thr')
    resolution_total.name = 'Resolution_Total'
    
    # reliability: squared distance of the mean forecast per bin from the observed frequency
    reliability_final = (reliab_data['MeanFrcsts']-reliab_data['MeanObs'])**2
    reliability_final.name = 'Reliability_Bined'
    reliability_total = reliability_final.weighted(bin_weights).mean('p_thr')
    reliability_total.name = 'Reliability_Total'
    
    uncertainty_total = base_rate*(1-base_rate)
    uncertainty_total.name = 'Uncertainty_Total'
    
    return xr.merge([reliab_data, uncertainty_total, resolution_final, resolution_total,
                     reliability_final, reliability_total])

In [20]:
def statistical_analysis_final(input_data):
    """Compute the Brier scores (and optionally economic value and reliability) for one sample of dates.

    Besides its argument, the function reads the module-level `input_data_analysis` (set by
    `analysis_bootstraps`), `input_dictionary`, `perc_used` and `cost_loss_ratio_values`. When the economic
    value is requested it also sets the module-level `extremes_occur`, `cost_loss_ratio` and `ev_clim`,
    which `economic_value_calc` reads.

    Parameters
    ----------
    input_data : tuple
        (boot_dates_used, calc_ecom_value): the dates of the sample (possibly with repetitions), and a
        boolean telling whether economic value and reliability data are computed as well.

    Returns
    -------
    dict
        Keys 'ReferenceConnections', 'precipERA5', 'precipForecasts', 'wvfERA5' and 'MedPatterns'; each
        value is a dict with the key 'Brier' (Dataset with BS and BSS) and, if requested, the key 'EV'
        (Dataset with economic value, contingency-table and reliability data), all cast to float32.
    """
    
    boot_dates_used, calc_ecom_value = input_data
    area_i = input_data_analysis['area']
    
    # define observational extremes
    precip_actual_i = input_data_analysis['obs'].sel(time=boot_dates_used)
    precip_actual_i = define_extremes_obs(precip_actual_i, perc_used)
    
    # get reference scores
    clim_cond_prob = input_dictionary[area_i]['RefTempCondProb']

    # reference forecasts: the probability stored for the season, or the day-of-year, of each date
    seasonal_i = clim_cond_prob.sel(temporal=temp_flagging(boot_dates_used, 'Season'))
    seasonal_i = seasonal_i.rename({'temporal': 'time'}).assign_coords({'time': boot_dates_used})
    daymonth_i = clim_cond_prob.sel(temporal=temp_flagging(boot_dates_used, 'DayMonth'))
    daymonth_i = daymonth_i.rename({'temporal': 'time'}).assign_coords({'time': boot_dates_used})
    
    ref_forecast_i = [seasonal_i, daymonth_i]
    ref_forecast_i = xr.concat(ref_forecast_i, dim=pd.Index(['Season_Temp', 'DayMonth_Temp'], name='Method'))
    brier_score_all_ref = ((ref_forecast_i-precip_actual_i)**2).sum('time')/len(boot_dates_used)
    
    # define forecasted precipitation extremes and get associated cond.prob. and brier score
    precip_frcsts_condprobs_i = input_dictionary[area_i]['PrcCondProb']
    precip_frcsts_i = input_data_analysis['prc_frc'].sel(time=boot_dates_used)
    precip_frcsts_i = define_extremes_frcst(precip_frcsts_i, precip_frcsts_condprobs_i.percentile.values)
    precip_frcsts_i = precip_frcsts_i#.rename({'percentile': 'ERA5_percentile'})
    
    # each member contributes the probability stored under extr_predictor=1 where it forecasts an exceedance,
    # and the one stored under extr_predictor=0 where it does not; the result is averaged over the members
    condprob_pos_precip = precip_frcsts_i.where(precip_frcsts_i==1)
    condprob_pos_precip = condprob_pos_precip*precip_frcsts_condprobs_i.sel(extr_predictor=1)
    condprob_neg_precip = (precip_frcsts_i.where(precip_frcsts_i==0)+1)
    condprob_neg_precip = condprob_neg_precip*precip_frcsts_condprobs_i.sel(extr_predictor=0)

    precip_frcsts_final_i = (condprob_pos_precip.fillna(0)+condprob_neg_precip.fillna(0)).mean('number')
    brier_precip_frcsts_i = ((precip_frcsts_final_i-precip_actual_i)**2).sum('time')/len(boot_dates_used)
    
    # get brier score when only considering boolean based on actual forecasts and not conditioned on ERA5
    precip_fr_actual_i = precip_frcsts_i.mean('number')
    precip_fr_actual_i = precip_fr_actual_i.sel(percentile=precip_actual_i.percentile.values) # only obs. perc.
    brier_precip_frcsts_actual_i = ((precip_fr_actual_i-precip_actual_i)**2).sum('time')/len(boot_dates_used) 
     
    # define forecasted water vapour flux extremes and get associated cond.prob. and brier score
    wvf_frcsts_condprobs_i = input_dictionary[area_i]['WvfCondProb']
    wvf_frcsts_i = input_data_analysis['wvf_frc'].sel(time=boot_dates_used)
    rh_level = wvf_frcsts_condprobs_i.Extra.values[1]
    
    # two predictor versions along 'Extra': the raw data ('Alone'), and the average of the data and the
    # relative humidity of level rh_level, both normalised to 0-1 over time
    rh_alone = wvf_frcsts_i.sel(wvf_direction=rh_level)
    norm = normalize(wvf_frcsts_i)    
    norm1 = (norm+normalize(rh_alone).drop('wvf_direction').reset_coords(drop=True))/2
    norm1 = norm1.expand_dims({'Extra': [rh_level]})
    wvf_frcsts_i = wvf_frcsts_i.expand_dims({'Extra': ['Alone']})
    wvf_frcsts_i = xr.concat([wvf_frcsts_i, norm1], dim='Extra')     
    wvf_frcsts_i_actual = wvf_frcsts_i
    wvf_frcsts_i = define_extremes_frcst(wvf_frcsts_i, wvf_frcsts_condprobs_i.percentile.values)
    
    # same combination of exceedance booleans and conditional probabilities as for precipitation above
    condprob_pos_wvf = wvf_frcsts_i.where(wvf_frcsts_i==1)
    condprob_pos_wvf = condprob_pos_wvf*wvf_frcsts_condprobs_i.sel(extr_predictor=1)
    condprob_neg_wvf = (wvf_frcsts_i.where(wvf_frcsts_i==0)+1)
    condprob_neg_wvf = condprob_neg_wvf*wvf_frcsts_condprobs_i.sel(extr_predictor=0)

    wvf_frcsts_final_i = (condprob_pos_wvf.fillna(0)+condprob_neg_wvf.fillna(0)).mean('number')
    brier_wvf_frcsts_i = ((wvf_frcsts_final_i-precip_actual_i)**2).sum('time')/len(boot_dates_used)   
   
    # create wvf_percentile = 0 for the next combined predictors of patterns and wvf
    # wvf_percentile=1 holds the exceedance of the 80th percentile; wvf_percentile=0 is True everywhere
    # (a boolean is always >=0), i.e. no water vapour flux condition is applied
    wvf_frcsts_i = define_extremes_frcst(wvf_frcsts_i_actual, [80]).rename({'percentile': 'wvf_percentile'})
    wvf_frcsts_i = wvf_frcsts_i.assign_coords({'wvf_percentile': [1]})
    wvf_frcsts_i_p0 = (wvf_frcsts_i.isel(wvf_percentile=0)>=0).assign_coords(wvf_percentile=0)
    wvf_frcsts_i = xr.concat([wvf_frcsts_i_p0, wvf_frcsts_i], dim='wvf_percentile')

    # define forecasted mediterranean patterns and get associated cond.prob. and brier score
    medpat_frcsts_condprobs_i = input_dictionary[area_i]['MedCondProb']
    medpat_frcsts_i = input_data_analysis['med_frc'].sel(time=boot_dates_used)
    
    # make labels boolean for each day and cluster
    # NOTE(review): assumes the forecasted cluster labels are the integers 0..n_clusters-1 - confirm
    alloc_i = [medpat_frcsts_i==i for i in range(len(medpat_frcsts_condprobs_i.cluster))]
    alloc_i = xr.concat(alloc_i, dim=medpat_frcsts_condprobs_i.cluster)

    brier_Med_wvf = []
    frcst_Med_wvf = []
    # loop over the warning areas, as the water vapour flux booleans differ per warning area
    for i_warnarea in medpat_frcsts_condprobs_i.WarnArea:
        # cluster allocations split by whether the water vapour flux condition is met (pos) or not (neg)
        allocs_i_warn_pos = alloc_i.where(wvf_frcsts_i.sel(WarnArea=i_warnarea)).fillna(0).astype(int)
        allocs_i_warn_neg = alloc_i.where(wvf_frcsts_i.sel(WarnArea=i_warnarea)==0).fillna(0).astype(int)

        allocs_i_warnarea = xr.concat([allocs_i_warn_pos, allocs_i_warn_neg], 
                                      dim=medpat_frcsts_condprobs_i.CondProbType)

        # probability of the allocated cluster/condition for each member, averaged over the members
        frcst_Med_i_warn = allocs_i_warnarea*medpat_frcsts_condprobs_i.sel(WarnArea=i_warnarea)
        frcst_Med_i_warn = frcst_Med_i_warn.sum(['cluster', 'CondProbType']).mean('number')

        brier_Med_i_warn = (frcst_Med_i_warn-precip_actual_i.sel(WarnArea=i_warnarea))**2
        brier_Med_i_warn = brier_Med_i_warn.sum('time')/len(boot_dates_used)
        brier_Med_wvf.append(brier_Med_i_warn)
        frcst_Med_wvf.append(frcst_Med_i_warn)

    brier_Med_wvf = xr.concat(brier_Med_wvf, dim='WarnArea')
    frcst_Med_wvf = xr.concat(frcst_Med_wvf, dim='WarnArea')
    
    # calculate BSS
    # the reference scores get the 'step' coordinate of the forecast-based scores assigned explicitly
    step_tag = brier_precip_frcsts_i.step.values
    brier_score_all_ref_final = brierSS(brier_score_all_ref, brier_score_all_ref)
    brier_score_all_ref_final = brier_score_all_ref_final.assign_coords({'step': step_tag})
    brier_precip_frcsts_actual_i = brierSS(brier_precip_frcsts_actual_i, brier_score_all_ref)
    brier_precip_frcsts_i = brierSS(brier_precip_frcsts_i, brier_score_all_ref)
    brier_wvf_frcsts_i = brierSS(brier_wvf_frcsts_i, brier_score_all_ref)
    brier_Med_wvf = brierSS(brier_Med_wvf, brier_score_all_ref)
    
    # calculate economic value (if needed)
    if calc_ecom_value:
        # generate auxiliary data needed for calculating the Economic Value for different Cost/Loss measures
        # these are module-level names because economic_value_calc reads them from the global scope
        global extremes_occur, cost_loss_ratio, ev_clim
        extremes_occur = precip_actual_i.mean('time')
        cost_loss_ratio = extremes_occur*0+cost_loss_ratio_values # generate remaining dims of the cost ratio
        extremes_occur = cost_loss_ratio*0+extremes_occur # generate "cost_loss_ratio" dim on "extremes_occur"
        ev_clim = xr.concat([cost_loss_ratio, extremes_occur], dim='clim').min('clim') # clim gain for each area
        
        threshold_bins = 25 # was 15 before
        
        # for each forecast type the probability thresholds run from 0 to the maximum available
        # probability (floored to 2 decimals), and are also used as bin edges for the reliability data
        
        # EV of reference forecasts
        thresholds_used_max = input_dictionary[area_i]['RefTempCondProb'].max().values
        thresholds_used = list(np.linspace(0, np.floor(thresholds_used_max*100)/100, threshold_bins))
        ec_val_ref = list(product([ref_forecast_i], [precip_actual_i], thresholds_used))
        ec_val_ref = [economic_value_calc(i) for i in ec_val_ref]
        ec_val_ref = xr.concat(ec_val_ref, dim='p_thr')
        ec_val_ref = ec_val_ref.assign_coords({'step': step_tag})
        rel_data = reliability([ref_forecast_i, precip_actual_i, thresholds_used])
        ec_val_ref = xr.merge([ec_val_ref, rel_data])
        
        # EV of direct forecasts (using forecasted precip and cond. probs based on ERA5)
        thresholds_used_max = input_dictionary[area_i]['PrcCondProb'].max().values
        thresholds_used = list(np.linspace(0, np.floor(thresholds_used_max*100)/100, threshold_bins))
        ec_val_precip = list(product([precip_frcsts_final_i], [precip_actual_i], thresholds_used))
        ec_val_precip = [economic_value_calc(i) for i in ec_val_precip]
        ec_val_precip = xr.concat(ec_val_precip, dim='p_thr')
        rel_data = reliability([precip_frcsts_final_i, precip_actual_i, thresholds_used])
        ec_val_precip = xr.merge([ec_val_precip, rel_data])
        
        # EV of direct forecasts (using forecasted precip and boolean based only on forecasts)
        # thresholds are all the member fractions possible with this ensemble size (0, 1/N, ..., 1)
        thresholds_used = list(np.linspace(0, 1, len(precip_frcsts_i.number)+1))
        ec_val_precip_act = list(product([precip_fr_actual_i], [precip_actual_i], thresholds_used))
        ec_val_precip_act = [economic_value_calc(i) for i in ec_val_precip_act]
        ec_val_precip_act = xr.concat(ec_val_precip_act, dim='p_thr')
        rel_data = reliability([precip_fr_actual_i, precip_actual_i, thresholds_used])
        ec_val_precip_act = xr.merge([ec_val_precip_act, rel_data])
        
        # EV of forecasts using wvf
        thresholds_used_max = input_dictionary[area_i]['WvfCondProb'].max().values
        thresholds_used = list(np.linspace(0, np.floor(thresholds_used_max*100)/100, threshold_bins))
        ec_val_wvf = list(product([wvf_frcsts_final_i], [precip_actual_i], thresholds_used))
        ec_val_wvf = [economic_value_calc(i) for i in ec_val_wvf]
        ec_val_wvf = xr.concat(ec_val_wvf, dim='p_thr')
        rel_data = reliability([wvf_frcsts_final_i, precip_actual_i, thresholds_used])
        ec_val_wvf = xr.merge([ec_val_wvf, rel_data])
        
        # EV of forecasts using Mediterranean patterns
        thresholds_used_max = input_dictionary[area_i]['MedCondProb'].sel(wvf_percentile=0).max().values
        thresholds_used = list(np.linspace(0, np.floor(thresholds_used_max*100)/100, threshold_bins))
        ec_val_Medpat = list(product([frcst_Med_wvf], [precip_actual_i], thresholds_used))
        ec_val_Medpat = [economic_value_calc(i) for i in ec_val_Medpat]
        ec_val_Medpat = xr.concat(ec_val_Medpat, dim='p_thr')
        rel_data = reliability([frcst_Med_wvf, precip_actual_i, thresholds_used])
        ec_val_Medpat = xr.merge([ec_val_Medpat, rel_data])
    
    # results are cast to float32 before being returned
    if calc_ecom_value==False:
        return {'ReferenceConnections': {'Brier': brier_score_all_ref_final.astype('float32')}, 
                'precipERA5': {'Brier': brier_precip_frcsts_i.astype('float32')},
                'precipForecasts': {'Brier': brier_precip_frcsts_actual_i.astype('float32')},
                'wvfERA5': {'Brier': brier_wvf_frcsts_i.astype('float32')}, 
                'MedPatterns': {'Brier': brier_Med_wvf.astype('float32')}}
    else:
        return {'ReferenceConnections': {'Brier': brier_score_all_ref_final.astype('float32'), 
                                         'EV': ec_val_ref.astype('float32')}, 
                'precipERA5': {'Brier': brier_precip_frcsts_i.astype('float32'), 
                               'EV': ec_val_precip.astype('float32')},
                'precipForecasts': {'Brier': brier_precip_frcsts_actual_i.astype('float32'), 
                                    'EV': ec_val_precip_act.astype('float32')},
                'wvfERA5': {'Brier': brier_wvf_frcsts_i.astype('float32'), 
                            'EV': ec_val_wvf.astype('float32')}, 
                'MedPatterns': {'Brier': brier_Med_wvf.astype('float32'), 
                                'EV': ec_val_Medpat.astype('float32')}}

In [21]:
def analysis_bootstraps(area_used, step_used):
    """Run the statistical analysis for one area and lead step, on the actual and bootstrapped samples.

    The dates common to forecasts and observations are subsampled per season so that the seasons keep
    their climatological relative frequencies (`ref_clim`). `bootstr` samples with replacement are then
    drawn, and `statistical_analysis_final` is run on the actual sample and on every bootstrap sample in
    a multiprocessing pool of `processors_used` workers.

    Side effects: sets the module-level `common_dates` and `input_data_analysis`; the latter is read
    by `statistical_analysis_final` in the worker processes.

    Parameters
    ----------
    area_used : str
        Key of the area in `input_dictionary`.
    step_used : numpy.timedelta64 or similar
        Lead step of the forecasts; it is selected on 'step' and added to the forecast 'time'.

    Returns
    -------
    list of dict
        Output of `statistical_analysis_final` per sample; the first entry is the actual sample,
        followed by the `bootstr` bootstrap samples.
    """
    
    precip_frcsts = input_dictionary[area_used]['PrcFrcst'].sel(step=step_used)
    precip_frcsts = precip_frcsts.assign_coords({'time': precip_frcsts.time+step_used})

    precip_actual = input_dictionary[area_used]['PrcObs']
     
    global common_dates
    common_dates = set(precip_frcsts.time.values) & set(precip_actual.time.values) # get common dates
    common_dates = sorted(common_dates) # convert to sorted list
    common_dates = pd.to_datetime(common_dates)

    precip_actual = precip_actual.sel(time=common_dates)

    wvf_frcsts = frcst_subset(area_used, 'WvfFrcst', step_used).sel(time=common_dates)
    medpat_frcsts = frcst_subset('Med', 'PatFrcst', step_used).sel(time=common_dates)
    precip_frcsts = precip_frcsts.sel(time=common_dates)
    
    # module-level, because the pool workers read it from the global scope; it must therefore be set
    # before the pool is created (presumably relies on fork-style process start - TODO confirm)
    global input_data_analysis
    input_data_analysis = {'obs': precip_actual, 'prc_frc': precip_frcsts, 'wvf_frc': wvf_frcsts,
                           'med_frc': medpat_frcsts, 'area': area_used}

    dates_df = pd.DataFrame({'Dates': common_dates}, index=common_dates)
    dates_df['Season'] = temp_flagging(common_dates, 'Season')
    
    # get bootstraps and actual dates of analysis that respect the climatological frequencies of seasons
    initial_inst = dates_df.Season.value_counts(normalize=False)
    ref_score = ref_clim.loc[initial_inst.index].values
    ratio_seasons = ref_score/ref_score[-1] # last value has lowest instances (value_counts sorted as default)
    seasons_len_final = (ratio_seasons*initial_inst.values[-1]).astype(int)

    # if a season would need more dates than it has, rescale all seasons based on the most
    # restrictive of those seasons instead
    check_len = initial_inst.values < seasons_len_final
    if check_len.sum()>0:
        min_loc = np.argmin(seasons_len_final[check_len])
        min_loc = (np.where(check_len==True)[0])[min_loc]
        ratio_seasons = ref_score/ref_score[min_loc]
        seasons_len_final = (ratio_seasons*initial_inst.values[min_loc]).astype(int)

    seasons_len_final = pd.Series(seasons_len_final, index=initial_inst.index)

    bbs_dates = []
    for i_season, i_len in seasons_len_final.items():
        all_dates_season = dates_df.query('Season==@i_season').index
        np.random.seed(10) # fixed seed for reproducibility (note: it is reset for every season)
        bbs_i_ssn = np.random.choice(all_dates_season, i_len*bootstr) # generate all bootstrapped values
        bbs_i_ssn = pd.to_datetime(bbs_i_ssn) # convert to datetime
        bbs_i_ssn = np.array_split(bbs_i_ssn, bootstr) # split into the number of subsets (samples)
        i_actual = pd.to_datetime(np.array(all_dates_season[:i_len])) # actual sample: first i_len dates
        bbs_i_ssn = np.insert(bbs_i_ssn, 0, i_actual, axis=0) # add actual at 1st place
        bbs_dates.append(bbs_i_ssn)

    # join the seasons, so that each row has the dates of one sample for all seasons
    bbs_dates = np.concatenate(bbs_dates, axis=1)
    # the economic value is calculated for every sample (actual and bootstraps)
    bbs_dates = [(j, True) for j in bbs_dates]
    
    # the context manager releases the worker processes even if one of the tasks raises
    with multiprocessing.Pool(processes=processors_used) as pool: # object for multiprocessing
        res_bbs = list(tqdm.tqdm(pool.imap(statistical_analysis_final, bbs_dates), 
                                 total=len(bbs_dates), position=0, leave=True))
    
    return res_bbs

In [22]:
def bootstrap_statistics(key_used, indicator_used):
    """Summarise the results of all samples into the Q5, Q95, Actual and Q50 values.

    NOTE: besides its arguments, the function reads the module-level `res_bbs` (list with the results per
    sample, where the 1st entry is taken as the actual sample) and `i_area` (area being processed); both
    must be set by the caller before the call. It also reads `input_dictionary`, `bootstr`, `l_min`,
    `l_max` and `l_med`.

    Parameters
    ----------
    key_used : str
        Forecast type, i.e. a key of the dictionaries in `res_bbs` (e.g. 'precipERA5').
    indicator_used : str
        'Brier' or 'EV'.

    Returns
    -------
    xarray.Dataset
        One variable per indicator, with a 'bootstrap' dimension of labels 'Q5', 'Q95', 'Actual' and 'Q50',
        concatenated along 'WarnArea'. For 'Brier' it also includes 'BSS_Sign', the fraction of samples
        with positive BSS.
    """
    
    data_final_all = [] # loop though warning zones, otherwise data get out of memory
    for i_wa in input_dictionary[i_area]['PrcObs'].WarnArea.values:
        
        bs_all_bootstraps = xr.concat([i[key_used][indicator_used].sel(WarnArea=i_wa) for i in res_bbs],
                                               dim=pd.Index(range(bootstr+1), name='bootstrap'))

        # process bootstraps for getting the results from the Q5, Q95, Median, and Actual bootstraps
        # 'bootstrap' is moved to the last axis, so that np.sort below can sort along axis=-1
        data_final = bs_all_bootstraps.to_array().rename({'variable': 'Indicator'})
        data_final = data_final.transpose(..., 'bootstrap')

        # get percent of bootstraps that BSS is positive (for checking significance of results)
        if indicator_used=='Brier':
            sign = (data_final.sel(Indicator='BSS')>0).sum('bootstrap')/(bootstr+1)
            sign = sign.expand_dims('Indicator').to_dataset('Indicator')
            sign = sign.rename({'BSS': 'BSS_Sign'})

        # get the quantiles of interest based on bootstraps
        # the actual sample (bootstrap=0) is excluded; the sorted values at positions l_min, l_max are kept
        data_quant = np.sort(data_final.isel(bootstrap=data_final.bootstrap>0), axis=-1)[..., [l_min, l_max]]
        # "*0+" puts the numpy values in a DataArray with the right dims/coords (bootstrap of length 2)
        # NOTE(review): NaN/inf in the first two samples would propagate to the result (NaN*0 is NaN)
        data_quant = data_final.sel(bootstrap=range(2))*0+data_quant
        data_quant = data_quant.assign_coords({'bootstrap': ['Q5', 'Q95']})

        # add results from original analysis
        data_actual = data_final.isel(bootstrap=0).assign_coords({'bootstrap': 'Actual'})

        # get median value based on bootstraps and actual data (so median is actual sorted index)
        data_median = np.sort(data_final, axis=-1)[..., l_med]
        data_median = data_final.sel(bootstrap=0)*0+data_median
        data_median = data_median.assign_coords({'bootstrap': 'Q50'})

        data_final = xr.concat([data_quant, data_actual, data_median], dim='bootstrap')
        if indicator_used=='Brier':
            data_final = data_final.to_dataset('Indicator')
            data_final = xr.merge([data_final, sign])
        else:
            data_final = data_final.to_dataset('Indicator')#.max('p_thr')
            # to_array() above broadcast every variable to the same dims; the 'cost_ratio' dim is
            # removed again from the variables below by keeping their 1st 'cost_ratio' entry
            data_final['Hits'] = data_final['Hits'].isel(cost_ratio=0, drop=True)
            data_final['FalseAlarms'] = data_final['FalseAlarms'].isel(cost_ratio=0, drop=True)
            data_final['HitRate'] = data_final['HitRate'].isel(cost_ratio=0, drop=True)
            data_final['FalseAlarmRate'] = data_final['FalseAlarmRate'].isel(cost_ratio=0, drop=True)
            data_final['SampleFrcsts'] = data_final['SampleFrcsts'].isel(cost_ratio=0, drop=True)
            data_final['MeanFrcsts'] = data_final['MeanFrcsts'].isel(cost_ratio=0, drop=True)
            data_final['SampleObs'] = data_final['SampleObs'].isel(cost_ratio=0, drop=True)
            data_final['MeanObs'] = data_final['MeanObs'].isel(cost_ratio=0, drop=True)
        
        data_final_all.append(data_final)
    
    data_final_all = xr.concat(data_final_all, dim='WarnArea')
    
    return data_final_all

In [23]:
# types of forecasts analysed (keys of the dictionaries returned by statistical_analysis_final)
tps_an = ['ReferenceConnections', 'precipERA5', 'precipForecasts', 'wvfERA5', 'MedPatterns']

results_all = {i_area: {i_type: {} for i_type in tps_an} for i_area in areas_used}
# NOTE: bootstrap_statistics reads `i_area` and `res_bbs` from the global scope, so the names of these
# two variables must not be changed in the loop below
for i_area in areas_used[:]:
    # (re)initialise the results of the area with empty lists, filled with one entry per lead step
    results_all[i_area] = {i_type: {'Brier': [], 'EV': []} for i_type in tps_an}
    # only the first 17 of the common lead steps are analysed
    for i_step in np.array(final_steps)[:17]:#[:8:3]:#[[2, 4, 7]][:1]:
        res_bbs = analysis_bootstraps(i_area, i_step)
        for i_type in tps_an:
            results_all[i_area][i_type]['Brier'].append( bootstrap_statistics(i_type, 'Brier') )     
            results_all[i_area][i_type]['EV'].append( bootstrap_statistics(i_type, 'EV') )
        del(res_bbs)
        
    # join the results of all lead steps and save one NetCDF file per forecast type and indicator
    for i_type in tps_an:
        results_all[i_area][i_type]['Brier'] = xr.concat(results_all[i_area][i_type]['Brier'], dim='step')
        file_name = f'{out_dir}{results_used}{i_area}_EPEsForecst_{i_type}_Brier.nc'
        results_all[i_area][i_type]['Brier'].to_netcdf(file_name)
        results_all[i_area][i_type]['EV'] = xr.concat(results_all[i_area][i_type]['EV'], dim='step')
        file_name = f'{out_dir}{results_used}{i_area}_EPEsForecst_{i_type}_EV.nc'
        results_all[i_area][i_type]['EV'].to_netcdf(file_name)
        
del(i_area, i_step, i_type, file_name)

100%|███████████████████████████████████████| 1001/1001 [36:56<00:00,  2.21s/it]
100%|███████████████████████████████████████| 1001/1001 [37:14<00:00,  2.23s/it]
100%|███████████████████████████████████████| 1001/1001 [37:15<00:00,  2.23s/it]
100%|███████████████████████████████████████| 1001/1001 [36:44<00:00,  2.20s/it]
100%|███████████████████████████████████████| 1001/1001 [37:47<00:00,  2.27s/it]
100%|███████████████████████████████████████| 1001/1001 [37:42<00:00,  2.26s/it]
100%|███████████████████████████████████████| 1001/1001 [37:26<00:00,  2.24s/it]
100%|███████████████████████████████████████| 1001/1001 [37:11<00:00,  2.23s/it]
100%|███████████████████████████████████████| 1001/1001 [37:42<00:00,  2.26s/it]
100%|███████████████████████████████████████| 1001/1001 [37:47<00:00,  2.27s/it]
100%|███████████████████████████████████████| 1001/1001 [37:37<00:00,  2.26s/it]
100%|███████████████████████████████████████| 1001/1001 [37:44<00:00,  2.26s/it]
100%|███████████████████████