In [1]:
# (C) Copyright 1996- ECMWF.
#
# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
# In applying this licence, ECMWF does not waive the privileges and immunities
# granted to it by virtue of its status as an intergovernmental organisation
# nor does it submit to any jurisdiction.

### SOS
This script takes a very long time to run (more than one week on a personal computer). It is recommended to run it on a cluster, preferably in a separate session for each lead time, so that multiple sessions can run in parallel.

In [2]:
import xarray as xr
import pandas as pd
import numpy as np

from itertools import product

from pathlib import Path
import multiprocessing # parallel processing
import tqdm # timing
import sys

In [3]:
# Bounding box of the analysis domain. The values are consumed below as
# latitude=slice(Area_used[0], Area_used[2]) and longitude=slice(Area_used[1], Area_used[3]),
# i.e. the order is [first latitude, first longitude, last latitude, last longitude].
Area_used = [48, -10, 27, 41]
P_used = [90, 95, 99] # percentiles used as thresholds for defining Extreme Precipitation Events (EPEs)
offset_days = 15 # days on each side of a calendar day used for the DayMonth reference climatology of EPEs
bootstraps = 1000 # number of bootstrap samples used in the Brier Score analysis

In [4]:
# Root folder holding the 'Data/' and 'ProcessedData/' subfolders ('' means the current working directory).
input_dir = ''
# Results go to a subfolder under the same root as the inputs. The previous value started with '/',
# which pointed at the filesystem root instead of the project folder that all inputs are read from.
output_dir = input_dir+'ProcessedData/ForecastsEPEs_Analysis/'

Path(output_dir).mkdir(parents=True, exist_ok=True) # generate subfolder for storing the results

In [5]:
# ERA5-based pattern allocations: first CSV column is the date index, 'Label' holds the cluster id
ActualClusters = pd.read_csv(
    input_dir+'ProcessedData/PatternAllocations_ERA5.csv',
    index_col=0,
)
ActualClusters.index = pd.to_datetime(ActualClusters.index)
n_clusters = len(ActualClusters['Label'].unique())

# forecast-based pattern allocations: the first CSV column is dropped, date columns are parsed to datetimes
AllocatedClusters = pd.read_csv(input_dir+'ProcessedData/ForecastsClusterAllocations.csv')
AllocatedClusters = AllocatedClusters.iloc[:, 1:]
AllocatedClusters = AllocatedClusters.assign(
    time=pd.to_datetime(AllocatedClusters['time']),
    valid_time=pd.to_datetime(AllocatedClusters['valid_time']),
)

In [6]:
# Calendar days ('MMDD') of a leap year, so that 29 February is included, in chronological order.
# The Summer Half runs from 16 April to 15 October (both inclusive); the two variables below hold
# the positions of these boundary days inside Sorted_Dates as one-element integer arrays.
Sorted_Dates = np.array(
    pd.date_range('20040101', '20041231').strftime('%m%d')
)
StartSummerHalf = np.flatnonzero(Sorted_Dates == '0416')
EndSummerHalf = np.flatnonzero(Sorted_Dates == '1015')

In [7]:
def temp_flagging(valid_dates, temp_subset):
    """Return a temporal flag for every date, according to the requested subsetting.

    Parameters
    ----------
    valid_dates : anything accepted by ``pd.to_datetime`` (e.g. DatetimeIndex, list of date strings)
    temp_subset : str
        'All'      -> list with the constant 'All'
        'HalfYear' -> array with 'WinterHalf' / 'SummerHalf' (positions StartSummerHalf..EndSummerHalf
                      of Sorted_Dates, both inclusive, are flagged as 'SummerHalf')
        'Season'   -> Index with 'Winter' (Dec-Feb), 'Spring' (Mar-May), 'Summer' (Jun-Aug), 'Autumn' (Sep-Nov)
        'Month'    -> Index with the month number as string ('1' ... '12')
        'DayMonth' -> array with the calendar day as 'MMDD'

    Raises
    ------
    ValueError
        If ``temp_subset`` is not one of the options above (previously this ended in an
        UnboundLocalError at the return statement).
    """

    valid_dates = pd.to_datetime(valid_dates)

    if temp_subset == 'All':
        temporal_flag = ['All']*len(valid_dates)
    elif temp_subset == 'HalfYear':
        # position of each calendar day within the (leap-year) sorted list of days
        temporal_flag_aux = pd.Series([i[-4:] for i in valid_dates.strftime('%Y%m%d')])
        temporal_flag_aux = temporal_flag_aux.map({i: i_c for i_c, i in enumerate(Sorted_Dates)})
        temporal_flag_aux = temporal_flag_aux.values
        temporal_flag = np.repeat(['WinterHalf'], len(temporal_flag_aux))
        temporal_flag[(temporal_flag_aux>=StartSummerHalf) & (temporal_flag_aux<=EndSummerHalf)] = 'SummerHalf'
    elif temp_subset == 'Season':
        # month%12 moves December next to January/February: 1=DJF, 2=MAM, 3=JJA, 4=SON
        temporal_flag = (valid_dates.month%12 + 3)//3
        temporal_flag = temporal_flag.map({1: 'Winter', 2: 'Spring', 3: 'Summer', 4: 'Autumn'})
    elif temp_subset == 'Month':
        temporal_flag = valid_dates.month.astype(str)
    elif temp_subset == 'DayMonth':
        temporal_flag = pd.Series([i[-4:] for i in valid_dates.strftime('%Y%m%d')])
        temporal_flag = temporal_flag.values
    else:
        raise ValueError(
            f"Unknown temp_subset '{temp_subset}'; expected one of "
            "'All', 'HalfYear', 'Season', 'Month', 'DayMonth'"
        )

    return temporal_flag

In [8]:
# add one column per temporal subsetting to the ERA5 allocations, holding the flag of each date
for i_tag in ('All', 'HalfYear', 'Season', 'Month', 'DayMonth'):
    ActualClusters[i_tag] = temp_flagging(ActualClusters.index, i_tag)

del i_tag

In [9]:
# read ERA5 rainfall data and drop the non-index coordinates
Precipitation = xr.open_dataarray(input_dir+'Data/ERA5/D1_Total_Precipitation.grb', engine='cfgrib')
Precipitation = Precipitation.reset_coords(drop=True)

# move every time stamp to 00:00 of its day
dates = pd.to_datetime(Precipitation.time.values).normalize()
Precipitation = Precipitation.assign_coords({'time': dates})

# keep only the area of interest and the full years 1979-2020
Precipitation = Precipitation.sel(
    longitude=slice(Area_used[1], Area_used[3]),
    latitude=slice(Area_used[0], Area_used[2]),
    time=slice('1979', '2020'),
)

precip_dates = Precipitation.time.values # dates of the precipitation data
Lons_all = Precipitation.longitude.values # longitudes; used later for splitting the domain due to memory limitations

del dates

In [10]:
# function for generating the boolean of exceedance of the extreme-precipitation thresholds
def exceed_boolean(data):
    """Flag, per percentile in P_used, where `data` exceeds its own quantile along 'time'.

    The thresholds are computed from `data` itself (linear interpolation, along 'time').
    Returns the boolean fields of all percentiles concatenated along a new 'percentile' dimension.
    """

    fractions = np.array(P_used)/100
    thresholds = (
        data.quantile(fractions, interpolation='linear', dim='time', keep_attrs=True)
        .rename({'quantile': 'percentile'})       # rename coordinate
        .assign_coords({'percentile': P_used})    # label by percentile instead of fraction
    )

    # strict exceedance of the threshold, evaluated separately for each percentile
    flags_per_level = []
    for level in P_used:
        flags_per_level.append(data > thresholds.sel(percentile=level))

    return xr.concat(flags_per_level, dim=pd.Index(P_used, name='percentile'))

In [11]:
# function for calculating "statistical" Brier Score based on conditional probabilities of EPEs at subsets
def statistical_brier_score(cond_probs, weights, dim_used='subset'):
    """Brier Score obtained when the conditional probabilities themselves are used as the forecast.

    cond_probs : probabilities; when more than one weight is given it must have dimension `dim_used`
    weights    : one weight per entry along `dim_used`, in the same order
    dim_used   : name of the subset dimension (only used when more than one weight is given)

    With a single weight the score p - p**2 is returned as is; otherwise the scores of the subsets
    are combined as a weighted average along `dim_used`. The result is cast to float32.
    """

    weight_values = np.array(weights)
    score = cond_probs - cond_probs**2 # brier score for climatological probabilities

    if len(weight_values) <= 1:
        return score.astype('float32')

    score = score.rename({dim_used: 'subsetting'})
    weighted_parts = []
    for position, weight in enumerate(weight_values):
        weighted_parts.append(score.isel(subsetting=position)*weight)
    weighted_total = xr.concat(weighted_parts, dim='subsetting').sum(dim='subsetting')
    score = weighted_total/weight_values.sum()

    return score.astype('float32')

In [12]:
# function for calculating conditional probabilities and relevant Brier Score for DayMonth temporal subsetting
def DayMonth_EPEs_conditioning(subset_extremes):
    """Probability of exceedance per calendar day, using a window of +-offset_days around each day.

    For every calendar day in Sorted_Dates, all instances of `subset_extremes` whose 'DayMonth' flag
    falls inside the window (wrapping around the turn of the year) are pooled and the fraction of
    exceedances is taken. Returns (probabilities along 'temporal', statistical Brier Score), both float32.
    """

    day_flags = ActualClusters.loc[subset_extremes.time.values, 'DayMonth'] # 'MMDD' flag of each instance
    flag_values = day_flags.values

    flagged = subset_extremes.assign_coords({'time': flag_values}) # time relabelled with the 'MMDD' flag

    n_calendar_days = len(Sorted_Dates)
    window_offsets = np.arange(-offset_days, offset_days+1)

    window_probs = []
    for centre_pos, centre_day in enumerate(Sorted_Dates):

        # calendar days inside the window; the modulo wraps positions before 1 Jan / after 31 Dec
        window_days = Sorted_Dates[(centre_pos + window_offsets) % n_calendar_days]

        # positions of all instances whose flag is one of the window days
        member_locs = np.array(
            [loc for day in window_days for loc in np.where(flag_values==day)[0].tolist()]
        )

        day_prob = flagged.isel(time=member_locs).sum('time')/len(member_locs)
        window_probs.append(day_prob.assign_coords({'temporal': centre_day}))

    ConnProb = xr.concat(window_probs, dim='temporal')

    # weights based on occurrence of each calendar day, in the same order as "ConnProb"
    weights_temp = day_flags.value_counts().reindex(Sorted_Dates).fillna(0)

    BS = statistical_brier_score(ConnProb, weights_temp.values, dim_used='temporal') # statistical Brier Score
    BS = BS.assign_coords({'Method': 'DayMonth_Temp'}) # tag with the temporal subsetting used

    return (ConnProb.astype('float32'), BS.astype('float32'))

In [13]:
# function for calculating conditional probabilities and relevant Brier Score for specific temporal subsetting
def temporal_conditioning_subset(subset_extremes, subset_type):
    """Probability of exceedance per temporal flag (column `subset_type` of ActualClusters).

    Returns (probabilities along 'temporal', statistical Brier Score tagged '<subset_type>_Temp'),
    both cast to float32.
    """

    flags = ActualClusters.loc[subset_extremes.time.values, subset_type] # temporal flag of each instance

    flagged = subset_extremes.assign_coords({'time': flags.values}) # time relabelled with the flag

    # fraction of exceedances within each flag
    exceed_count = flagged.groupby('time').sum('time')
    total_count = flagged.groupby('time').count()
    ConnProb = (exceed_count/total_count).rename({'time': 'temporal'})

    # weights based on occurrence of each flag, in the same order as "ConnProb"
    weights_temp = flags.value_counts().reindex(ConnProb.temporal.values)

    BS = statistical_brier_score(ConnProb, weights_temp.values, dim_used='temporal') # statistical Brier Score
    BS = BS.assign_coords({'Method': f'{subset_type}_Temp'}) # tag with the temporal subsetting used

    return (ConnProb.astype('float32'), BS.astype('float32'))

In [14]:
# function for calculating conditional probabilities and relevant Brier Score for specific pattern-temporal subsetting
def clusters_EPEs_conditioning(subset_extremes, subset_type):
    """Probability of exceedance conditioned on the cluster label and the temporal flag `subset_type`.

    Every instance gets the combined key '<Label>-<flag>'; the fraction of exceedances is computed per key
    and then rearranged into the dimensions ('temporal', 'cluster', ...).

    Returns (probabilities, statistical Brier Score tagged '<subset_type>_Patt'), both cast to float32.
    """

    nw_crd = subset_extremes.time.values
    # generate new coordinates values based on the cluster ID and the temporal flag of interest for each instance
    nw_crd = ActualClusters.loc[nw_crd, 'Label'].astype(str) + '-' + ActualClusters.loc[nw_crd, subset_type]

    DataUsed = subset_extremes.assign_coords({'time': nw_crd.values}) # new coordinate values

    DataUsed = DataUsed.groupby('time').sum('time')/DataUsed.groupby('time').count() # conditional prob.
    DataUsed = DataUsed.rename({'time': 'cluster'})

    weights_cluster = nw_crd.value_counts().reindex(DataUsed.cluster.values) # weights based on occurrence

    # Split every key at its LAST '-' into (label, flag). The label is parsed as a whole, so cluster
    # ids with more than one digit are kept intact (taking only the first character truncated e.g. '10' to 1).
    split_keys = {key: key.rsplit('-', 1) for key in DataUsed.cluster.values}

    temporal_splitting = ActualClusters[subset_type].unique() # get all available subsets of temporal flag
    ConnProb = []
    for i_temp in temporal_splitting:
        keys_used = [key for key, (_, flag) in split_keys.items() if flag == i_temp] # clusters available
        Subset_used = DataUsed.sel(cluster=keys_used) # subset only the available clusters at the temporal subset
        # int(float(...)) also accepts labels stored as e.g. '3.0'
        cluster_ids = [int(float(split_keys[key][0])) for key in keys_used]
        Subset_used = Subset_used.assign_coords({'cluster': cluster_ids}) # rename to the plain cluster id
        ConnProb.append(Subset_used) # append to final list

    ConnProb = xr.concat(ConnProb, dim=pd.Index(temporal_splitting, name='temporal'))

    BS = statistical_brier_score(DataUsed, weights_cluster.values, dim_used='cluster') # get Brier Score
    BS = BS.assign_coords({'Method': f'{subset_type}_Patt'}) # assign new coord with the temporal subsetting info

    return (ConnProb.astype('float32'), BS.astype('float32'))

In [15]:
def connections_stats(input_data):
    """Conditional probabilities of EPEs and their statistical Brier Scores for the ERA5 data.

    input_data : (dates, longitudes) used for subsetting the global `Precipitation`

    Returns a dict with
      'Conn_Clusters' : pattern-based probabilities for the 'All' and 'HalfYear' temporal flags
      'Conn_Temp'     : purely temporal probabilities ('DayMonth' and 'Season')
      'BS_All'        : Dataset with the Brier Score ('BS') and the skill score ('BSS') of all methods;
                        the reference is the lowest score between 'DayMonth_Temp' and 'Season_Temp'
    """

    subset_dates, longs_used = input_data

    Exceed_dataset = exceed_boolean(Precipitation.sel(time=subset_dates, longitude=longs_used))

    # pattern-based conditioning
    pattern_results = [
        clusters_EPEs_conditioning(subset_extremes=Exceed_dataset, subset_type=flag_type)
        for flag_type in ['All', 'HalfYear']
    ]
    Conn_Clusters = xr.concat([conn for conn, _ in pattern_results], dim='temporal')
    all_scores = [score for _, score in pattern_results]

    # temporal-only conditioning
    daymonth_conn, daymonth_score = DayMonth_EPEs_conditioning(subset_extremes=Exceed_dataset)
    season_conn, season_score = temporal_conditioning_subset(Exceed_dataset, 'Season')
    Conn_Temp = xr.concat([daymonth_conn, season_conn], dim='temporal')
    all_scores.extend([daymonth_score, season_score])

    BS_All = xr.concat(all_scores, dim='Method')
    BS_All.name = 'BS'

    reference_score = BS_All.sel(Method=['DayMonth_Temp', 'Season_Temp']).min('Method')
    BSS_All = 1 - BS_All/reference_score
    BSS_All.name = 'BSS'

    return {
        'Conn_Clusters': Conn_Clusters,
        'Conn_Temp': Conn_Temp,
        'BS_All': xr.merge([BS_All, BSS_All]),
    }

In [16]:
# compute the ERA5-based connections for the full period and domain, and store each result as netCDF
All_data = connections_stats([Precipitation.time.values, Lons_all])

Connections_Patterns = All_data['Conn_Clusters']
Connections_Temporal = All_data['Conn_Temp']
BS_All = All_data['BS_All']

Connections_Patterns.to_netcdf(output_dir+'Connections_Patterns.nc')
Connections_Temporal.to_netcdf(output_dir+'Connections_Temporal.nc')
BS_All.to_netcdf(output_dir+'BS_ERA5_All.nc')

del All_data

In [17]:
# function for generating a 2-d DF with the forecasted cluster allocation for each date (only for specific lead time)
def forecast_subset(lead_time):
    """Table of forecasted clusters at `lead_time`: one row per valid date, one column per ensemble member.

    Rows of `AllocatedClusters` with number == -1 (ensemble mean) are excluded because of their high bias.
    """

    is_lead = AllocatedClusters['step'] == lead_time
    is_member = AllocatedClusters['number'] != -1 # remove ens. mean data cause high bias
    members_at_lead = AllocatedClusters[is_lead & is_member]

    allocation_table = members_at_lead.pivot_table(index='valid_time', columns='number', values='Cluster')
    allocation_table.index = pd.to_datetime(allocation_table.index)

    return allocation_table

In [18]:
def frcst_precip_init_date(init_date_used, lead_time=None):
    """Get the daily reforecast precipitation of one initialization date, for a single lead time.

    There is no need to derive the ensemble mean for precipitation data, because the mean is very biased.

    Parameters
    ----------
    init_date_used : str
        Initialization date as used in the GRIB file names (e.g. '20190613').
    lead_time : int, optional
        Lead time in days to keep. When omitted, the module-level variable `i_lead` is used; it is set
        by the loop over lead times at the end of the notebook, so one of the two must be available.

    Returns
    -------
    DataArray (float32) with control and perturbed members along 'number', non-index coords dropped.
    """

    if lead_time is None:
        lead_time = i_lead # set by the loop over lead times (kept for calls that pass only the date)

    # get the data of the control member (cf)
    file_name = input_dir+'Data/Precipitation/cf/Precipitation_cf_'+init_date_used+'.grb'
    control_forecast = xr.open_dataarray(file_name, engine='cfgrib')
    control_forecast = control_forecast.astype('float32') # float32 for memory efficiency
    control_forecast = control_forecast.sel(longitude=slice(Area_used[1], Area_used[3]),
                                            latitude=slice(Area_used[0], Area_used[2]))
    control_forecast = control_forecast.assign_coords({'number': 0})

    # get the data of the ensemble members (pf)
    file_name = input_dir+'Data/Precipitation/pf/Precipitation_pf_'+init_date_used+'.grb'
    ensemble_forecast = xr.open_dataarray(file_name, engine='cfgrib')
    ensemble_forecast = ensemble_forecast.astype('float32') # float32 for memory efficiency
    ensemble_forecast = ensemble_forecast.sel(longitude=slice(Area_used[1], Area_used[3]),
                                              latitude=slice(Area_used[0], Area_used[2]))

    final = xr.concat([control_forecast, ensemble_forecast], dim='number') # combine cf and pf data

    # Precipitation is a cumulative variable, so for daily values we need differences of next with day of interest
    final = xr.concat([final.isel(step=0), final.diff('step')], dim='step')
    final = final.assign_coords({'step': final.step.values-np.timedelta64(1, 'D')}) # step is the min possible lag

    # slicing the data due to memory limitations: keep only the requested lead time
    final = final.sel(step=np.timedelta64(lead_time, 'D'))

    return final.reset_coords(drop=True)

In [19]:
def frcst_precip_all(dates):
    """Read the forecasted precipitation of all initialization dates in parallel and join them along 'time'.

    dates : sequence of initialization dates (strings), each passed to `frcst_precip_init_date`

    Returns the concatenated DataArray cast to float32.
    """

    # the context manager releases the worker processes even when reading one of the files fails
    with multiprocessing.Pool() as pool:
        Data_Pr = list(tqdm.tqdm(pool.imap(frcst_precip_init_date, dates),
                                 total=len(dates), position=0, leave=True))

    Data_Pr = xr.concat(Data_Pr, dim='time')

    return Data_Pr.astype('float32')

In [20]:
# function for generating the boolean of exceedance of extremes for the forecasted precipitation data
def exceed_boolean_frcst(data):
    """Flag, per percentile in P_used, where the forecasts exceed their own quantile.

    The thresholds are computed after pooling the 'time' and 'number' dimensions together, so all
    members and dates share the same threshold. The result keeps the original dimensions of `data`
    plus a new leading 'percentile' dimension.
    """

    pooled = data.stack(all_data=['time', 'number'])

    fractions = np.array(P_used)/100
    thresholds = (
        pooled.quantile(fractions, interpolation='linear', dim='all_data', keep_attrs=True)
        .rename({'quantile': 'percentile'})       # rename coordinate
        .assign_coords({'percentile': P_used})    # label by percentile instead of fraction
    )

    # strict exceedance of the threshold, evaluated separately for each percentile
    flags_per_level = []
    for level in P_used:
        flags_per_level.append(data > thresholds.sel(percentile=level))

    return xr.concat(flags_per_level, dim=pd.Index(P_used, name='percentile'))

In [21]:
# function for calculating the cond. probs. for all forecasted dates
def subset_cond_prob_clustering(data_sub, temp_subset, lons_used):
    """Probability of EPEs per date, derived from the clusters given in each row of `data_sub`.

    data_sub    : DataFrame indexed by date; each row holds the cluster(s) assigned to that date
    temp_subset : temporal subsetting passed to `temp_flagging`
    lons_used   : longitudes to select from `Connections_Patterns`

    For each date the probabilities of all clusters in the row are averaged. Returns float32 data
    with 'percentile' as leading dimension.
    """

    date_flags = temp_flagging(data_sub.index, temp_subset) # temporal flags of the forecasted dates

    per_date = []
    for row_clusters, date_flag in zip(data_sub.values, date_flags):
        selected = Connections_Patterns.sel(cluster=row_clusters, temporal=date_flag, longitude=lons_used)
        per_date.append(selected.mean('cluster'))

    time_index = pd.Index(data_sub.index, name='time')
    EPEsProb = xr.concat(per_date, dim=time_index).transpose('percentile', ...)

    return EPEsProb.astype('float32')

In [22]:
# function for getting cond. prob. based on temporal subsetting only
def subset_cond_prob_temporal(data_sub, temp_subset, lons_used):
    """Probability of EPEs per date of `data_sub.index`, based only on the temporal flag of the date.

    The probabilities are read from `Connections_Temporal`. Returns float32 data with 'percentile'
    as leading dimension and the non-index coordinates dropped.
    """

    date_flags = temp_flagging(data_sub.index, temp_subset)

    per_date = []
    for date_flag in date_flags:
        per_date.append(Connections_Temporal.sel(temporal=date_flag, longitude=lons_used))

    time_index = pd.Index(data_sub.index, name='time')
    CondProb = xr.concat(per_date, dim=time_index).transpose('percentile', ...)
    CondProb = CondProb.reset_coords(drop=True)

    return CondProb.astype('float32')

In [23]:
# function for calculating the brier score from actual data and not based on statistics as in the other function
def BS_calculation(forecasts, observations):
    """Brier Score: squared difference between forecasts and observations, summed over 'time'
    and divided by the number of time steps."""

    squared_error = (forecasts - observations)**2
    n_times = len(squared_error.time)

    return squared_error.sum('time')/n_times

In [24]:
# Use dates of Cycle 46r1: 11 June 2019 - 30 June 2020
start_date = '20190611'
end_date = '20200630'

initialization_dates = pd.date_range(start_date, end_date)

# keep Mondays (0) and Thursdays (3)
weekday_codes = initialization_dates.weekday
initialization_dates = initialization_dates[(weekday_codes == 0) | (weekday_codes == 3)]
initialization_dates = initialization_dates.strftime('%Y%m%d') # as 'YYYYMMDD' strings

del start_date, end_date, weekday_codes

In [25]:
# function for performing Brier Score analysis
def brier_score_analysis(input_data):
    """Brier Score (BS) and Brier Skill Score (BSS) of all methods for one lead time and one set of dates.

    input_data : (lead_time, dates_subset, lons_used)
        lead_time    : lead time used for selecting the forecasted cluster allocations
        dates_subset : dates to evaluate (e.g. a bootstrap sample), or None for the actual dates
        lons_used    : longitudes to evaluate

    Methods, in order along 'Method': 'EPEs_Direct', 'HalfYear_Patt', 'HalfYear_Patt_Perfect',
    'DayMonth_Temp', 'Season_Temp', 'All_Temp'. The BSS reference is the lowest BS among the three
    '*_Temp' methods. Returns a float32 DataArray with the extra dimension 'Var' ('BS', 'BSS').
    """

    lead_time, dates_subset, lons_used = input_data

    frcst_clusters = forecast_subset(lead_time) # dataframe with forecasted cluster allocations
    if dates_subset is None:
        # drop the dates that correspond to the last 5 initialization dates (and the same dates in each of
        # the 20 previous years), so that the dataset has a climatologically correct number of
        # Winter/Spring/Summer/Autumn dates, since the EPEs are based on such data
        last_valid = pd.to_datetime(initialization_dates[-5:])+np.timedelta64(lead_time, 'D')
        excluded = [day for years_back in range(1, 21) for day in last_valid-pd.DateOffset(years=years_back)]
        frcst_clusters = frcst_clusters[~frcst_clusters.index.isin(excluded)]
    else:
        frcst_clusters = frcst_clusters.loc[dates_subset]

    valid_days = frcst_clusters.index
    obs_extremes = exceed_boolean(Precipitation.sel(time=valid_days, longitude=lons_used))
    perfect_clusters = ActualClusters.loc[valid_days, ['Label']]

    def tagged_score(probabilities, method_name):
        # Brier Score of the given probabilities against the observed extremes, labelled with the method
        return BS_calculation(probabilities, obs_extremes).assign_coords({'Method': method_name})

    scores = []

    # direct EPEs BS based on the forecasted precipitation fields
    frcst_extremes = exceed_boolean_frcst(Precip_Frcst.sel(time=valid_days, longitude=lons_used))
    scores.append(tagged_score(frcst_extremes.mean('number'), 'EPEs_Direct'))

    # indirect EPEs BS: forecasted clusters (and perfect clusters) with the cluster & halfyear cond. probs.
    for i_type in ['HalfYear']: # ['All', 'HalfYear', 'Season']: no need to perform other temporal subsets
        scores.append(tagged_score(subset_cond_prob_clustering(frcst_clusters, i_type, lons_used),
                                   f'{i_type}_Patt'))
        scores.append(tagged_score(subset_cond_prob_clustering(perfect_clusters, i_type, lons_used),
                                   f'{i_type}_Patt_Perfect'))

    # BS for temporal climatological connections (reference scores)
    for flag_type, method_name in (('DayMonth', 'DayMonth_Temp'), ('Season', 'Season_Temp')):
        scores.append(tagged_score(subset_cond_prob_temporal(frcst_clusters, flag_type, lons_used),
                                   method_name))

    # BS of the constant climatological frequency of the evaluated dates
    clim_frequency = obs_extremes.sum('time')/len(obs_extremes.time)
    clim_score = statistical_brier_score(clim_frequency, [1], dim_used='subset')
    scores.append(clim_score.assign_coords({'Method': 'All_Temp'}))

    BS_Forecasts = xr.concat(scores, dim='Method')
    BS_Forecasts.name = 'BS'

    BS_Ref_min = BS_Forecasts.sel(Method=['DayMonth_Temp', 'Season_Temp', 'All_Temp']).min('Method')
    BSS_Forecasts = 1 - BS_Forecasts/BS_Ref_min
    BSS_Forecasts.name = 'BSS'

    combined = xr.merge([BS_Forecasts, BSS_Forecasts]).to_array().rename({'variable': 'Var'})

    return combined.astype('float32')

In [26]:
# split the longitudes in two parts, due to memory limitations
Lons_subsets = np.array_split(Lons_all, 2)

# positions of the 5th percentile, 95th percentile and median within the ordered bootstrap values
l_m = bootstraps*5//100
l_M = bootstraps*95//100 - 1
Md = bootstraps//2

In [27]:
# function for performing the full Brier Score analysis for a specific lead time (check all combinations of subsets)
def long_subset_bs_statistics(lons_used, lead_time=0):
    """Bootstrapped Brier Score statistics for one subset of longitudes and one lead time.

    For each season, dates are drawn with replacement from the global `Days_used`; every bootstrap
    sample (plus a final run on the actual dates) is scored with `brier_score_analysis`.

    Returns a float32 Dataset with the variables 'BS' and 'BSS' along 'bootstraps_frcst'
    ('P5', 'P50', 'Actual', 'P95') and the fractions 'Sign_Ref', 'Sign_Direct', 'Sign_Combo' of runs
    in which a method beats the reference, the direct forecast, or both.
    """

    BBS_dates = []
    # NOTE(review): the sample sizes 247*3, 252*3, 252*3, 249*3 presumably reflect the number of
    # dates per season in the evaluated dataset - confirm where these values come from.
    for i_ln, i_season in zip([247*3, 252*3, 252*3, 249*3], ['Winter', 'Spring', 'Summer', 'Autumn']):
        AllDates = Days_used[temp_flagging(Days_used, 'Season')==i_season]
        np.random.seed(10) # reseeded for every season, so the samples are reproducible
        BBS_dates_i = np.random.choice(AllDates, i_ln*bootstraps) # generate all bootstrapped values
        BBS_dates_i = np.array_split(BBS_dates_i, bootstraps) # split into the number of subsets (samples)
        BBS_dates.append(BBS_dates_i)

    BBS_dates = np.concatenate(BBS_dates, axis=1) # one row per bootstrap sample, all seasons side by side
    BBS_dates = list(BBS_dates)+[None] # add also the final bootstrap which is concerning the actual data
    del(i_ln, i_season, AllDates, BBS_dates_i)

    # generate bootstrapped statistics; the context manager releases the workers even if a sample fails
    BBS_data = list(product([lead_time], BBS_dates, [lons_used]))
    with multiprocessing.Pool() as pool:
        BBS_data = list(tqdm.tqdm(pool.imap(brier_score_analysis, BBS_data),
                                  total=len(BBS_data), position=0, leave=True))

    BBS_data = xr.concat(BBS_data, dim='bootstrapping_frcst') # concatenate bootstrapping samples

    # get the 5th, and 95th value for BS without considering the "Actual" subset
    BBS_data_ordered_values = np.sort(BBS_data.isel(bootstrapping_frcst=range(bootstraps)), axis=0) # don't use Actual
    # np.sort returns a plain array; "BBS_data[0]*0 + values" puts the values back on the xarray coordinates
    P5 = BBS_data[0]*0 + BBS_data_ordered_values[l_m]
    P95 = BBS_data[0]*0 + BBS_data_ordered_values[l_M]

    # get the P50 after using "Actual" subset as well
    BBS_data_ordered_values = np.sort(BBS_data, axis=0)
    P50 = BBS_data[0]*0 + BBS_data_ordered_values[Md]

    # combine all data to the final xarrays
    dim_name = pd.Index(['P5', 'P50', 'Actual', 'P95'], name='bootstraps_frcst')
    Final_BS = xr.concat([P5, P50, BBS_data.isel(bootstrapping_frcst=-1).reset_coords(drop=True), P95], dim=dim_name)

    # get percentage of methods outperforming Reference (Sign_Ref), and direct prediction of EPEs (Sign_Direct)
    Sign_Ref = ( BBS_data.sel(Var='BSS') > 0 ).reset_coords(drop=True)
    Sign_Dir = ( BBS_data.sel(Var='BS') < BBS_data.sel(Var='BS', Method='EPEs_Direct') ).reset_coords(drop=True)
    Sign_Combo = Sign_Dir & Sign_Ref
    Sign_Ref = Sign_Ref.sum('bootstrapping_frcst')
    Sign_Ref.name = 'Sign_Ref'
    Sign_Dir = Sign_Dir.sum('bootstrapping_frcst')
    Sign_Dir.name = 'Sign_Direct'
    Sign_Combo = Sign_Combo.sum('bootstrapping_frcst')
    Sign_Combo.name = 'Sign_Combo'
    Sign_Final = xr.merge([Sign_Ref, Sign_Dir, Sign_Combo])
    Sign_Final = Sign_Final/(bootstraps+1) # all bootstrap samples plus the run on the actual dates
    Final_BS = Final_BS.to_dataset('Var')
    Final_BS = xr.merge([Final_BS, Sign_Final])

    return Final_BS.astype('float32')

In [28]:
def final_bs_statistics(lead_time=0):
    """Run the bootstrapped Brier Score analysis on every longitude subset and join the results.

    The parts are concatenated along 'longitude' and tagged with the coordinate 'leaddays'.
    """
    per_subset = [long_subset_bs_statistics(lon_part, lead_time) for lon_part in Lons_subsets]

    joined = xr.concat(per_subset, dim='longitude')

    return joined.assign_coords({'leaddays': lead_time})

In [29]:
def EconomicValue(input_data):
    """Economic value of a probabilistic forecast when action is taken at probability >= p_t.

    input_data : (frcst_data, obs_data, p_t)
        frcst_data : forecasted probabilities
        obs_data   : observed occurrence (compared against 1 and 0)
        p_t        : probability threshold for taking action

    Uses the globals `CostRatio`, `Extr_Occur` and `EV_clim`, which are set by `EcVal_analysis`.
    The result is tagged with the coordinate 'p_thr'.
    """

    frcst_data, obs_data, p_t = input_data

    action = frcst_data >= p_t
    no_action = frcst_data < p_t
    event = obs_data == 1
    no_event = obs_data == 0

    # contingency table counts along time
    hits = action.where(event).sum('time')
    false_alarms = action.where(no_event).sum('time')
    misses = no_action.where(event).sum('time')
    correct_rejections = no_action.where(no_event).sum('time')

    HitRate = hits/(hits+misses)
    FalseAlarmRate = false_alarms/(false_alarms+correct_rejections)

    # expense of the forecast, then value relative to the climatological and the perfect forecast
    cost_false_alarms = FalseAlarmRate*CostRatio*(1-Extr_Occur)
    savings_hits = HitRate*Extr_Occur*(1-CostRatio)
    expense = cost_false_alarms-savings_hits+Extr_Occur
    value = (EV_clim - expense)/(EV_clim-Extr_Occur*CostRatio)

    return value.assign_coords({'p_thr': p_t})

In [30]:
def EcVal_analysis(lead_time=0):
    """Economic Value of the direct, pattern-based (indirect) and climatological EPE forecasts.

    For each method the Economic Value is computed for a set of probability thresholds and the
    maximum over the thresholds is kept. Returns the values of the four methods along 'Method'
    ('Direct', 'Indirect', 'Clim_DayMonth', 'Clim_Seasonal'), tagged with the coordinate 'leaddays'.

    The globals `Extr_Occur`, `CostRatio` and `EV_clim` are set for the duration of the call, because
    `EconomicValue` (executed in the worker processes) reads them; they are deleted at the end.
    """

    Subset_Frcst = forecast_subset(lead_time) # generate dataframe with forecasted pattern allocations
    # drop all reforecasts of last 5 initiation dates, so that seasonal frequencies are correct
    DropDates = pd.to_datetime(initialization_dates[-5:])+np.timedelta64(lead_time, 'D')
    DropDates = [DropDates-pd.DateOffset(years=i) for i in range(1,21)]
    DropDates = [j for i in DropDates for j in i]
    Subset_Frcst = Subset_Frcst[~Subset_Frcst.index.isin(DropDates)]

    # generate boolean with extremes for actual data, direct precip. forecast and indirect based on patterns
    Subset_Exceed = exceed_boolean(Precipitation.sel(time=Subset_Frcst.index))
    Frst_Extremes = exceed_boolean_frcst(Precip_Frcst.sel(time=Subset_Frcst.index)).mean('number')
    Cond_Prob_Frcst = subset_cond_prob_clustering(Subset_Frcst, 'HalfYear', Precipitation.longitude.values)
    CondProb_DayMonth = subset_cond_prob_temporal(Subset_Frcst, 'DayMonth', Precipitation.longitude.values)
    CondProb_Season = subset_cond_prob_temporal(Subset_Frcst, 'Season', Precipitation.longitude.values)

    # generate auxiliary data needed for calculating the Economic Value for different ratios of Cost/Gain measures
    global Extr_Occur, CostRatio, EV_clim
    Extr_Occur = Subset_Exceed.mean('time')
    CostRatio = xr.DataArray(np.linspace(0, 1, 100), dims=['cost_ratio'],
                             coords={'cost_ratio': np.linspace(0, 1, 100)})
    CostRatio = Extr_Occur*0+CostRatio # generate the remaining dimensions of the cost ratio
    Extr_Occur = CostRatio*0+Extr_Occur # generate the "cost_ratio" dimension of the extremes percentage occurrences
    EV_clim = xr.concat([CostRatio, Extr_Occur], dim='clim').min('clim') # climatological gain for each coordinate

    def _max_economic_value(prob_forecast, thresholds):
        # Economic Value for every threshold (in parallel), then the best value over all thresholds.
        # The pool is created here, i.e. after the globals above are set, and is always released.
        tasks = list(product([prob_forecast], [Subset_Exceed], thresholds))
        with multiprocessing.Pool() as pool:
            values = list(tqdm.tqdm(pool.imap(EconomicValue, tasks), total=len(tasks), position=0, leave=True))
        return xr.concat(values, dim='p_thr').max('p_thr')

    def _thresholds_up_to_max(prob_forecast):
        # 100 thresholds from 0 to the highest forecasted probability (rounded up to 2 decimals), plus 1
        upper = np.ceil(prob_forecast.max().values*100)/100
        return list(np.linspace(0, upper, 100))+[1]

    # calculate Economic Value for direct, indirect and climatological forecasting and combine the data
    # NOTE(review): the thresholds k/11 presumably match an 11-member ensemble - confirm
    EV_dir = _max_economic_value(Frst_Extremes, np.arange(12)/11)
    EV_indir = _max_economic_value(Cond_Prob_Frcst, _thresholds_up_to_max(Cond_Prob_Frcst))
    EV_DM = _max_economic_value(CondProb_DayMonth, _thresholds_up_to_max(CondProb_DayMonth))
    EV_Sea = _max_economic_value(CondProb_Season, _thresholds_up_to_max(CondProb_Season))

    EV_final = xr.concat([EV_dir, EV_indir, EV_DM, EV_Sea],
                         dim=pd.Index(['Direct', 'Indirect', 'Clim_DayMonth', 'Clim_Seasonal'], name='Method'))
    EV_final = EV_final.assign_coords({'leaddays': lead_time})
    del Extr_Occur, CostRatio, EV_clim

    return EV_final

In [31]:
# Run the Brier Score and Economic Value analysis for every available lead time and store the results.
# The loop variables i_lead, Precip_Frcst and Days_used are read as globals by the functions above.
LeadTimes = AllocatedClusters.step.unique()
BS_EPEs_All, EV_EPEs_All = [], []
for i_lead in tqdm.tqdm(LeadTimes):
    # read the forecasted precipitation data for the lead time of interest
    Precip_Frcst = frcst_precip_all(initialization_dates)

    # change the time coordinate from the initialization time to the valid time of the forecasts
    Days_used = Precip_Frcst.time.values+np.timedelta64(i_lead, 'D')
    Precip_Frcst = Precip_Frcst.assign_coords({'time': Days_used})
    Days_used = pd.to_datetime(Days_used)

    BS_subset = final_bs_statistics(lead_time=i_lead)
    EV_subset = EcVal_analysis(lead_time=i_lead)
    BS_EPEs_All.append(BS_subset)
    EV_EPEs_All.append(EV_subset)

BS_EPEs_All = xr.concat(BS_EPEs_All, dim='leaddays')
EV_EPEs_All = xr.concat(EV_EPEs_All, dim='leaddays')
BS_EPEs_All.to_netcdf(output_dir+'BS_leaddays.nc')
EV_EPEs_All.to_netcdf(output_dir+'EV_leaddays.nc')