In [1]:
# (C) Copyright 1996- ECMWF.
#
# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
# In applying this licence, ECMWF does not waive the privileges and immunities
# granted to it by virtue of its status as an intergovernmental organisation
# nor does it submit to any jurisdiction.

In [2]:
# basic libraries for data analysis
import numpy as np 
import pandas as pd
import geopandas as gpd
import xarray as xr

from itertools import product

# specialized libraries
import xagg as xa # spatial aggregation of data taken into consideration overlap of grid cells to shapefile
from scipy.stats import binom # binomial distribution for significance testing of extremes and predictors
from recombinator import moving_block_bootstrap # blocked bootstraps for checking significance of results

import multiprocessing # parallel processing
import tqdm # timing

# the below is for silencing warnings from the xagg package
from contextlib import contextmanager
import sys, os
@contextmanager
def suppress_stdout():
    """Context manager that discards everything written to ``sys.stdout``.

    While the ``with`` body runs, ``sys.stdout`` points to ``os.devnull``;
    the previous stream is restored on exit, also when the body raises.
    """
    saved_stream = sys.stdout
    sink = open(os.devnull, "w")
    sys.stdout = sink
    try:
        yield
    finally:
        # restore first, then release the null device handle
        sys.stdout = saved_stream
        sink.close()

### Input variables

In [3]:
# Locations of the input data. The directory prefixes are empty strings here,
# so every file name below is resolved relative to the working directory.
dir_loc = '' 
dir_loc_wvf = '' # prefix of the water vapour flux GRIB files (see read_wvf)
dir_loc_rh = '' # prefix of the relative humidity files
precip_EOBS_loc = 'rr_ens_mean_0.1deg_reg_v23.1e.nc' # observational precipitation file
precip_ERA5_loc = 'D1_Total_Precipitation.grb' # ERA5 precipitation file

# NOTE: `output_data_file` is reassigned in a later cell; `med_clusters_loc`
# is built from the (empty) value it has at this point.
output_data_file = ''
med_clusters_loc = f'{output_data_file}Med_LocalizedPatterns_Labels.nc' # path of 9 Med. pat. allocations

In [4]:
# User-tunable parameters of the analysis
bootstraps = 1000 # number of bootstrap subsamples used for the significance checks
rolling_days = 3 # what temporal resolution to analyse? in number of days; integer>=1
season_studied = 'Winter' # All, Winter or Summer!
observational_dataset = 'EOBS' # ERA5 or EOBS
perc_used = [90, 95, 97] # percentiles (in %) defining the extremes
eobs_end_year = 2019 # depending on data availability (for Calabria data are ok till 2019, for Israel till 2016)
prslvl_RH_value = 850 # pressure level used for overlaying the Relative Humidity predictor (700 or 850)

In [5]:
# read the warning areas and keep only the columns of interest (the 'id' column is dropped)
warn_areas_Calabria = (
    gpd.read_file('zip://../Shapefiles/Calabria/WarningAreas.zip')
       .drop(columns=['id'])
)
# for some areas (e.g. Israel) EOBS data have a temporal shift compared to ERA5 (based on correlation)
lag_Calabria = 0

### Create final format of shapefiles and data needed per analysed area

In [6]:
# Build the output directory name from the dataset, season and temporal resolution.
# NOTE: `med_clusters_loc` was built from the earlier (empty) value of
# `output_data_file`, so it is not affected by this reassignment.
# NOTE(review): the path starts at the filesystem root ('/'); confirm this is intended.
aux_name = f'{observational_dataset}_{season_studied}_Days{rolling_days}'
output_data_file = f'/{aux_name}'
output_data_file = f'{output_data_file}/Statistics_{aux_name}_Bootstraps_FullAreas_Data/'
# directory to store the outputs; pure-Python equivalent of `mkdir -p`, so the
# path is never passed unquoted through a shell
os.makedirs(output_data_file, exist_ok=True)
del(aux_name)

In [7]:
def prepros_shpf(shapefile_input):
    """Prepare a warning-areas GeoDataFrame and derive its bounding box.

    The input is reprojected to EPSG:4326, its two columns are renamed to
    ``name_area`` and ``geometry``, and one extra row named ``'Full'`` holding
    the union of all polygons is appended.

    Returns
    -------
    list
        ``[geodataframe, bounds]`` where ``bounds`` are the total bounds with
        the two minima floored and reduced by 0.1 and the two maxima ceiled
        and increased by 0.1.
    """
    areas = shapefile_input.to_crs("EPSG:4326")  # lat/lon coordinate system
    areas.columns = ['name_area', 'geometry']  # consistent column names

    # a constant helper column makes `dissolve` merge every polygon into one
    areas['Area'] = len(areas)
    merged = areas.dissolve(by='Area')
    merged['name_area'] = 'Full'

    areas = pd.concat([areas, merged]).drop(columns='Area')

    # bounding box for the gridded data, widened beyond the warning areas
    low_x, low_y, high_x, high_y = areas.total_bounds
    total_bounds = [np.floor(low_x)-.1, np.floor(low_y)-.1, np.ceil(high_x)+.1, np.ceil(high_y)+.1]

    return [areas, total_bounds]

In [8]:
# Dictionary of the subdomains used. Each key maps to a list holding:
# 1. warning areas, 2. precipitation subdomain (bounding box),
# 3. temporal lag between EOBS and ERA5 daily precipitation data
domains_used = {'Calabria': [*prepros_shpf(warn_areas_Calabria), lag_Calabria]}

del warn_areas_Calabria, lag_Calabria

In [9]:
# one (initially empty) results dictionary per analysed domain
results_all = {domain_name: dict() for domain_name in domains_used}

### Auxiliary functions for spatial aggregation & extremes identification

In [10]:
def generate_data_subset(data_full, dom_name_used):
    """Cut ``data_full`` to the bounding box stored for ``dom_name_used``.

    The box is read from ``domains_used[dom_name_used][1]``. Latitude is
    sliced from the fourth to the second box value, i.e. in the opposite
    direction to longitude.
    """
    x_low, y_low, x_high, y_high = domains_used[dom_name_used][1]
    return data_full.sel(longitude=slice(x_low, x_high), latitude=slice(y_high, y_low))

In [11]:
def spatial_aggreg(input_data):
    """Aggregate gridded data over the warning areas of one domain.

    Parameters
    ----------
    input_data : sequence of three items
        ``(data array, domain name, lag)``; the lag is the number of time
        steps the data are shifted before aligning them to the dates of the
        module-level ``precip_ERA5``.

    Returns
    -------
    The areal aggregates from ``xagg``, with the ``pix_idx`` dimension renamed
    to ``WarnArea``.
    """
    field, area_key, lag_steps = input_data

    polygons = domains_used[area_key][0]  # warning areas of the domain
    subset = generate_data_subset(field, area_key)

    # shift in case a lag exists, then keep the same dates as ERA5 precipitation
    subset = subset.shift(time=lag_steps).sel(time=precip_ERA5.time.values)

    # keep only cells with <1% NaNs (original note: xagg is wrong when only a few steps of a cell are NaN)
    nan_fraction = np.isnan(subset).sum('time') / len(subset.time)
    subset = subset.where(nan_fraction < 0.01)

    overlaps = xa.pixel_overlaps(subset, polygons)  # overlap of pixels & polygons
    areal_mean = xa.aggregate(subset, overlaps).to_dataset()[field.name]  # areal average as dataarray

    return areal_mean.rename({'pix_idx': 'WarnArea'})

In [12]:
def define_extremes(data, percentiles):
    """Flag the time steps at which ``data`` exceeds its own percentile thresholds.

    Parameters
    ----------
    data : array with a ``time`` dimension
    percentiles : list of percentiles in percent (e.g. ``[90, 95]``)

    Returns
    -------
    Array with an added ``percentile`` dimension that marks where ``data`` is
    strictly above the threshold; series with 1% or more NaNs along ``time``
    are masked out.
    """
    nan_fraction = np.isnan(data).sum('time') / len(data.time)  # share of NaNs per series

    # thresholds along time, labelled with the percentile values instead of the quantile fractions
    thresholds = (
        data.quantile(np.array(percentiles)/100, interpolation='linear', dim='time')
            .rename({'quantile': 'percentile'})
            .assign_coords({'percentile': percentiles})
    )

    # boolean flag per percentile, stacked along a new 'percentile' dimension
    flags = xr.concat([data > thresholds.sel(percentile=p) for p in percentiles],
                      dim=pd.Index(percentiles, name='percentile'))

    return flags.where(nan_fraction < 0.01)

### ERA5 Precipitation (as Predictor)

In [13]:
precip_ERA5 = xr.open_dataarray(precip_ERA5_loc, engine='cfgrib').reset_coords(drop=True)

# replace the time stamps by their calendar date (time of day removed)
dates_0UTC = pd.to_datetime(precip_ERA5.time.values).strftime('%Y%m%d')
precip_ERA5 = precip_ERA5.assign_coords({'time': pd.to_datetime(dates_0UTC)})

# ERA5 runs over the same period as the patterns; otherwise stop at the last usable EOBS year
precip_ERA5 = precip_ERA5.sel(
    time=slice('1979', '2019' if observational_dataset == 'ERA5' else str(eobs_end_year))
)

del dates_0UTC
precip_ERA5

In [14]:
precip_ERA5_spatial = {}
for i_key in results_all:
    with suppress_stdout():
        data_i = spatial_aggreg([precip_ERA5, i_key, 0])

    # rolling sum is possible because the dates are continuous
    data_i = data_i.rolling(time=rolling_days).sum()
    data_i.dropna('time').to_netcdf(f'{output_data_file}{i_key}_PrecipERA5_Timeseries.nc') # save all data
    # keep non-overlapping windows; the first (incomplete, NaN) steps are skipped
    data_i = data_i.isel(time=slice(rolling_days-1, None, rolling_days))

    if season_studied in ('Summer', 'Winter'):
        # each "season" is a six-month half of the year
        all_dates = data_i.time.values[data_i.time.dt.month.isin(
            [3, 4, 5, 6, 7, 8] if season_studied == 'Summer' else [9, 10, 11, 12, 1, 2])]
    else:
        all_dates = data_i.time.values # keep all dates used for the analysis

    precip_ERA5_spatial[i_key] = data_i.sel(time=all_dates)

# dates used in the precipitation dataset (taken from the last processed domain)
precip_dates = precip_ERA5_spatial[i_key].time.values
del i_key, data_i

### Mediterranean Patterns

In [15]:
# read the pattern allocations and keep the anomalies-based 9-cluster labels
# for the temporal resolution analysed
med_clusters = xr.open_dataarray(med_clusters_loc)
med_clusters = med_clusters.sel(variable_type='Anomalies', ClustersNumber='Clusters_9', rolling=rolling_days)
# number of clusters: the text after the first 9 characters of the selected name ('Clusters_9' -> 9)
med_clusters_max = int(med_clusters.ClustersNumber.values.tolist()[9:])
# one label per date; the index is left unnamed (later cells rename the resulting 'dim_0' to 'time')
med_clusters = pd.DataFrame({'Label': med_clusters.values}, index=med_clusters.time)
# keep only the dates analysed (alternative kept for reference: all ERA5 dates)
med_clusters = med_clusters.loc[all_dates] # med_clusters.loc[precip_ERA5.time.values]

In [16]:
# Positions, within a leap-year calendar, of the first and last day of the Summer Half
# (16th April - 15th October, both inclusive). The split is based on the climatological
# frequencies of the Mediterranean patterns and is used only for them.
sorted_dates = np.array(
    pd.date_range(start='20040101', end='20041231', freq='D').strftime('%m%d')  # leap year: every day-month exists
)
start_summerhalf, end_summerhalf = (np.where(sorted_dates == day)[0] for day in ('0416', '1015'))

In [17]:
def temp_flagging(valid_dates, temp_subset):
    """Return one temporal label per date for the requested subsetting scheme.

    Parameters
    ----------
    valid_dates : anything accepted by ``pd.to_datetime``
    temp_subset : str
        ``'All'``      -> the constant label ``'All'``;
        ``'HalfYear'`` -> ``'SummerHalf'`` / ``'WinterHalf'``, based on the
        module-level ``sorted_dates``, ``start_summerhalf`` and ``end_summerhalf``;
        ``'Season'``   -> ``'Winter'`` (Dec-Feb), ``'Spring'``, ``'Summer'``, ``'Autumn'``;
        ``'Month'``    -> month number as a string;
        ``'DayMonth'`` -> ``'mmdd'`` string.

    Raises
    ------
    ValueError
        If ``temp_subset`` is not one of the options above (previously this
        surfaced as an ``UnboundLocalError`` at the ``return`` statement).
    """
    valid_dates = pd.to_datetime(valid_dates)

    if temp_subset == 'All':
        temporal_flag = ['All']*len(valid_dates)
    elif temp_subset == 'HalfYear':
        # position of each day-month within the leap-year calendar
        day_position = {i: i_c for i_c, i in enumerate(sorted_dates)}
        temporal_flag_aux = pd.Series(valid_dates.strftime('%m%d')).map(day_position).values
        temporal_flag = np.repeat(['WinterHalf'], len(temporal_flag_aux))
        temporal_flag[(temporal_flag_aux>=start_summerhalf) & (temporal_flag_aux<=end_summerhalf)] = 'SummerHalf'
    elif temp_subset == 'Season':
        # months 12, 1, 2 -> 1; 3-5 -> 2; 6-8 -> 3; 9-11 -> 4
        temporal_flag = (valid_dates.month%12 + 3)//3
        temporal_flag = temporal_flag.map({1: 'Winter', 2: 'Spring', 3: 'Summer', 4: 'Autumn'})
    elif temp_subset == 'Month':
        temporal_flag = valid_dates.month.astype(str)
    elif temp_subset == 'DayMonth':
        temporal_flag = pd.Series(valid_dates.strftime('%m%d')).values
    else:
        raise ValueError(f"Unknown temp_subset {temp_subset!r}; expected one of "
                         "'All', 'HalfYear', 'Season', 'Month', 'DayMonth'")

    return temporal_flag

In [18]:
# add one column of temporal labels per subsetting scheme
med_clusters = med_clusters.assign(**{
    tag: temp_flagging(med_clusters.index, tag)
    for tag in ['All', 'HalfYear', 'Season', 'Month', 'DayMonth']
})

# the helpers are not needed any more
del sorted_dates, start_summerhalf, end_summerhalf, temp_flagging

### Generate bootstrap dates for checking significance of connections

In [19]:
# get the index values of the 5th, 25th, 75th, 95th and median, when data are ordered (for the bootstrapping)
l_Q5 = int(bootstraps*5/100)
l_Q25 = int(bootstraps*25/100)
l_Q50 = int(bootstraps/2)
l_Q75 = int(bootstraps*75/100)-1
l_Q95 = int(bootstraps*95/100)-1

# unique years available in the ERA5 precipitation data
# NOTE(review): the order of `years_all` comes from iterating a set; confirm it is stable across runs
years_all = np.array(list(set(pd.to_datetime(precip_ERA5.time.values).year)))
sub_len = int(len(years_all)*.8) # each bootstrap sample holds 80% of the years
np.random.seed(10) # fixed seed, set right before sampling, so the subsamples are reproducible
# `bootstraps` subsamples of years (block length 1, without replacement)
bbs_dates = moving_block_bootstrap(years_all, 1, bootstraps, replace=False, sub_sample_length=sub_len)

del(sub_len)

### Precipitation data (the predictand)

In [20]:
# Observational precipitation used as the predictand. When ERA5 itself is the
# observational dataset the E-OBS file is not needed, so it is not opened at all.
if observational_dataset == 'ERA5':
    precip_EOBS = precip_ERA5
else:
    precip_EOBS = xr.open_dataarray(precip_EOBS_loc)
    precip_EOBS.name = 'tp' # to be consistent with ERA5 naming
    precip_EOBS = precip_EOBS.sortby('latitude', ascending=False) # to be consistent with ERA5
    precip_EOBS = precip_EOBS.sel(time=precip_ERA5.time) # same dates as ERA5

precip_EOBS

In [1]:
precip_EOBS_spatial = {}
for i_key in results_all:
    # the last entry stored for a domain is its EOBS-vs-ERA5 lag
    with suppress_stdout():
        data_i = spatial_aggreg([precip_EOBS, i_key, domains_used[i_key][-1]])

    data_i = data_i.rolling(time=rolling_days).sum()
    data_i.dropna('time').to_netcdf(f'{output_data_file}{i_key}_PrecipObs_Timeseries.nc') # save all data

    # same dates as ERA5 precip (the main predictor)
    precip_EOBS_spatial[i_key] = data_i.sel(time=all_dates)

del i_key, data_i

In [22]:
# extreme-event flags of the observed precipitation, per domain
extremes_data_final = {
    domain_name: define_extremes(precip_EOBS_spatial[domain_name], perc_used)
    for domain_name in results_all
}

### Precipitation ERA5 (as predictor)

In [23]:
# The ERA5 predictor uses exactly the percentiles of the observations
# (`perc_used_ERA5` is the same list object as `perc_used`).
perc_used_ERA5 = perc_used

# extreme-event flags of ERA5 precipitation, per domain
extremes_ERA5 = {
    domain_name: define_extremes(precip_ERA5_spatial[domain_name], perc_used_ERA5)
    for domain_name in results_all
}

In [24]:
def final_dates_used(years_used):
    """Return the dates of ``precip_dates`` that fall in the given years.

    The result follows the order of ``years_used`` (every calendar day of each
    year in turn), filtered to the days present in the module-level
    ``precip_dates``.
    """
    candidate_days = []
    for year in years_used:
        # every calendar day of the selected year
        candidate_days.extend(pd.date_range(f'{year}0101', f'{year}1231'))

    candidate_days = pd.to_datetime(candidate_days)
    return candidate_days[candidate_days.isin(precip_dates)]  # dates available in precip data

In [25]:
def cond_prob_predictors(data_used, domain_used, years_used, derive_sign=False):
    """Conditional probability of extreme precipitation given a local predictor.

    Parameters
    ----------
    data_used : dict
        Predictor data per domain. When ``data_used[domain_used].name`` is
        ``'tp'`` that entry is used directly; otherwise the predictor is
        rebuilt with ``generate_final_wvf``.
    domain_used : key of ``data_used`` and ``precip_EOBS_spatial``
    years_used : iterable of years, forwarded to ``final_dates_used``
    derive_sign : bool, default False
        When True, ``cond_probs_sign`` is run in a process pool for every
        entry of ``bbs_dates`` plus ``years_all``. Connections flagged as
        significant in more than 95% of those runs keep their conditional
        probability; the others are replaced by a pooled probability. The
        constrained and unconstrained results are stacked along a new
        ``Constraints`` dimension (1 = constrained, 0 = unconstrained).

    Returns
    -------
    dict
        ``{'CondProbs': conditional probabilities, 'Brier': Brier score}``;
        the Brier score comes from using the conditional probabilities as
        forecasts for the same dates.
    """
    # get dates of interest and final data for predictor and predictand
    dts = final_dates_used(years_used)

    if data_used[domain_used].name=='tp':
        predictor = data_used[domain_used].sel(time=dts)
    else:
        predictor = generate_final_wvf(domain_used, dts)

    predictor = define_extremes(predictor, perc_used)

    predictant = precip_EOBS_spatial[domain_used].sel(time=dts)
    predictant = define_extremes(predictant, perc_used)

    # extreme precipitation counts when the predictor is extreme (Hit) / not extreme (FalseNeg)
    counts_Hit = predictant.where(predictor==1).sum('time')
    counts_FalseNeg = predictant.where(predictor==0).sum('time')

    cond_prb_Hit = counts_Hit/predictor.sum('time')
    cond_prb_FalseNeg = counts_FalseNeg/(predictor==0).sum('time')

    dim_name = pd.Index([1, 0], name='extr_predictor')
    counts_all = xr.concat([counts_Hit, counts_FalseNeg], dim=dim_name)
    cond_probs = xr.concat([cond_prb_Hit, cond_prb_FalseNeg], dim=dim_name)

    if derive_sign:
        cmbs_all = list(product([data_used], [domain_used], bbs_dates))
        cmbs_all = cmbs_all+[(data_used, domain_used, years_all)]
        # the context manager releases the worker processes even if a task raises
        with multiprocessing.Pool() as pool:
            res = list(tqdm.tqdm(pool.imap(cond_probs_sign, cmbs_all),
                                 total=len(cmbs_all), position=0, leave=True))

        res = xr.concat(res, dim='bootstraps').mean('bootstraps')
        res = res>.95 # 95% conf. interval (over 95% of bootstraps have statistically significant connections)

        # pooled probability used wherever the connection is not significant
        new_prbs = counts_all.where(res==0).sum('extr_predictor', skipna=False)/len(predictor.time)
        new_prbs = new_prbs.expand_dims({'extr_predictor': counts_all.extr_predictor.values})

        cond_probs_constrained = cond_probs.where(res).fillna(0)+new_prbs.fillna(0)

        cond_probs = xr.concat([cond_probs_constrained, cond_probs], dim=pd.Index([1, 0], name='Constraints'))

    # forecast for each time step: the probability matching the predictor state of that step
    cond_prb_Hit = cond_probs.sel(extr_predictor=1)
    cond_prb_CorNeg = cond_probs.sel(extr_predictor=0)
    perfect_forecasts = cond_prb_Hit.where(predictor).fillna(0)+cond_prb_CorNeg.where(predictor==0).fillna(0)

    perfect_brier = (perfect_forecasts - predictant)**2
    perfect_brier = perfect_brier.sum('time')/len(perfect_brier.time)
    perfect_brier.name = 'BS'

    return {'CondProbs': cond_probs, 'Brier': perfect_brier}

In [26]:
def cond_probs_sign(input_data):
    """Binomial significance flags of the predictor/extreme-precipitation connection.

    Worker function for the bootstrap pool in ``cond_prob_predictors``.

    Parameters
    ----------
    input_data : tuple
        ``(data_used, domain_used, years_used)`` with the same meaning as in
        ``cond_prob_predictors``.

    Returns
    -------
    Array shaped like the extreme counts (``extr_predictor`` first), non-zero
    where the count lies in either 2.5% tail of the binomial distribution.
    """
    data_used, domain_used, years_used = input_data

    dts = final_dates_used(years_used)

    if data_used[domain_used].name=='tp':
        predictor = data_used[domain_used].sel(time=dts)
    else:
        predictor = generate_final_wvf(domain_used, dts)

    predictor = define_extremes(predictor, perc_used)

    predictant = precip_EOBS_spatial[domain_used].sel(time=dts)
    predictant = define_extremes(predictant, perc_used)

    # extreme precipitation counts when the predictor is extreme / not extreme
    counts_Hit = predictant.where(predictor==1).sum('time')
    counts_FalseNeg = predictant.where(predictor==0).sum('time')

    dim_name = pd.Index([1, 0], name='extr_predictor')
    counts_all = xr.concat([counts_Hit, counts_FalseNeg], dim=dim_name)
    # share of time steps in each predictor state = success probability under no connection
    predictor_all = xr.concat([(predictor==1).sum('time'), (predictor==0).sum('time')], dim=dim_name)
    predictor_all = predictor_all/len(predictor.time)

    # both arrays get the same explicit dimension order before they are handed to scipy
    # (presumably because scipy aligns by position, not by dimension name)
    if predictor.name=='tp':
        predictor_all = predictor_all.transpose('extr_predictor', 'WarnArea', 'percentile')
        counts_all = counts_all.transpose('extr_predictor', 'WarnArea', 'percentile')
    else:
        predictor_all = predictor_all.transpose('extr_predictor',
                                                'Extra', 'wvf_direction', 'WarnArea', 'percentile')
        counts_all = counts_all.transpose('extr_predictor', 'Extra', 'wvf_direction', 'WarnArea', 'percentile')

    upper_sign = binom.cdf(k=counts_all-1, n=counts_all.sum('extr_predictor'), p=predictor_all)
    lower_sign = binom.cdf(k=counts_all, n=counts_all.sum('extr_predictor'), p=predictor_all)
    # check statistical significance considering 95% two-tailed confidence interval
    sign_test = (upper_sign>.975)+(upper_sign<.025)+(lower_sign>.975)+(lower_sign<.025)
    # put the flags back onto the labelled coordinates of the counts
    sign_test = counts_all.copy(deep=True).fillna(0)*0+sign_test

    return sign_test

In [27]:
# ERA5 precipitation as predictor, with bootstrap-based significance
for i_key, i_results in results_all.items():
    i_results['PrecipERA5'] = cond_prob_predictors(precip_ERA5_spatial, i_key, years_all, derive_sign=True)

del i_key, i_results

100%|███████████████████████████████████████| 1001/1001 [00:18<00:00, 53.37it/s]


### Extreme Water Vapour Flux

In [28]:
def read_wvf(variable):
    """Read the daily water vapour flux (wvf) field ``variable``.

    The file ``D1_Mean_<variable>.grb`` under ``dir_loc_wvf`` is opened, the
    array is named ``variable``, and it is cut to the dates of ``precip_ERA5``
    and to latitudes 47..29 / longitudes -10..40 (Mediterranean).
    """
    file_path = f'{dir_loc_wvf}D1_Mean_{variable}.grb'
    daily_data = xr.open_dataarray(file_path, engine='cfgrib').reset_coords(drop=True)
    daily_data.name = variable

    return (
        daily_data.sel(time=precip_ERA5.time.values)  # same as the precipitation data used
                  .sel(latitude=slice(47, 29), longitude=slice(-10, 40))  # only over Mediterranean
    )

In [29]:
# both flux components share the generic name 'wvf'
wvf_northwards, wvf_eastwards = read_wvf('WVFnorth'), read_wvf('WVFeast')
wvf_northwards.name = wvf_eastwards.name = 'wvf'

wvf_name = pd.Index(['NorthW', 'SouthW', 'EastW', 'WestW', 'Total'], name='wvf_direction')

wvf_spatial = {}
for i_dom in results_all:
    with suppress_stdout():
        i_dom_northw = spatial_aggreg([wvf_northwards, i_dom, 0])
    with suppress_stdout():
        i_dom_eastw = spatial_aggreg([wvf_eastwards, i_dom, 0])

    # rolling means of the two components; opposite directions are their negatives
    i_dom_northw = i_dom_northw.rolling(time=rolling_days).mean()
    i_dom_eastw = i_dom_eastw.rolling(time=rolling_days).mean()
    i_dom_southw, i_dom_westw = -i_dom_northw, -i_dom_eastw

    # magnitude of the flux vector
    i_dom_total = np.sqrt(i_dom_northw**2 + i_dom_eastw**2)

    # stacked in the order given by `wvf_name`
    i_dom_final = xr.concat([i_dom_northw, i_dom_southw, i_dom_eastw, i_dom_westw, i_dom_total],
                            dim=wvf_name)

    i_dom_final.dropna('time').to_netcdf(f'{output_data_file}{i_dom}_WvfERA5_Timeseries.nc')
    wvf_spatial[i_dom] = i_dom_final.sel(time=all_dates)

del (wvf_northwards, wvf_eastwards, i_dom, i_dom_final,
     i_dom_northw, i_dom_southw, i_dom_eastw, i_dom_westw, i_dom_total)

In [30]:
""" This section is only needed if hourly data are used as inputs so they are aggregated to daily """
pls = [700, 850]

# hourly relative humidity per pressure level -> daily means, stored as one netCDF file
relhum = xr.concat(
    [xr.open_dataarray(f'{dir_loc_rh}D1_Mean_RH{i}_hourly_calc.grb', engine='cfgrib') for i in pls],
    dim=pd.Index(pls, name='pressure_level'),
)
relhum = relhum.resample({'time': '1D'}).mean().reset_coords(drop=True)
relhum.to_netcdf(dir_loc_rh+'D1_Mean_RHcalc.nc')
del pls, relhum
""" End of section """

# daily relative humidity at the selected pressure level, on the ERA5 precipitation dates
relhum = (
    xr.open_dataarray(f'{dir_loc_rh}D1_Mean_RHcalc.nc')
      .sel(time=precip_ERA5.time.values, pressure_level=prslvl_RH_value)
      .reset_coords(drop=True)
)

relhum_spatial = {}
coords_new_names = f'RH{prslvl_RH_value}'
for i_dom in results_all:
    with suppress_stdout():
        i_dom_relhum = spatial_aggreg([relhum, i_dom, 0])

    # rolling mean, labelled with the pressure level through a length-1 dimension
    i_dom_relhum = (
        i_dom_relhum.rolling(time=rolling_days).mean()
                    .expand_dims({'pressure_level': [coords_new_names]})
    )
    i_dom_relhum.dropna('time').to_netcdf(f'{output_data_file}{i_dom}_RhERA5_Timeseries.nc')
    relhum_spatial[i_dom] = i_dom_relhum.sel(time=all_dates)

del relhum, i_dom, coords_new_names, i_dom_relhum

In [31]:
def normalize(data):
    """Min-max scale ``data`` along its ``time`` dimension."""
    lowest = data.min('time')
    value_range = data.max('time') - lowest
    return (data - lowest) / value_range

In [32]:
def generate_final_wvf(dom_used, dates_used):
    """Build the local predictors (water vapour flux and relative humidity) of a domain.

    Reads the module-level ``wvf_spatial``, ``relhum_spatial``, ``wvf_name``
    and ``prslvl_RH`` (the latter is assigned in a later cell, so it must
    exist before the first call).

    The result has an ``Extra`` dimension with two entries:
    ``'Alone'`` holds the raw predictors (the flux directions plus relative
    humidity appended along ``wvf_direction``), and ``prslvl_RH`` holds the
    average of each min-max normalised predictor with the normalised
    relative humidity.
    """
    
    # when combining various predictors, the data are normalized based on min-max values
    
    rh_original = relhum_spatial[dom_used].sel(time=dates_used)
    # relative humidity is appended as one more entry of the 'wvf_direction' dimension
    rh_rename_for_wvf = rh_original.rename({'pressure_level': 'wvf_direction'})
    wvf_original = wvf_spatial[dom_used].sel(wvf_direction=wvf_name).sel(time=dates_used)

    i_data_raw = xr.concat([wvf_original, rh_rename_for_wvf], dim='wvf_direction')
    i_data_raw = i_data_raw.expand_dims({'Extra': ['Alone']})

    i_data_norm = normalize(i_data_raw)
    
    # NOTE(review): 'pressure_level' is passed as the `drop` argument (not as `names`);
    # it looks like the 'pressure_level' dimension survives this call, since it is
    # only removed by the `isel` in the return statement - confirm the intent.
    i_data_norm1 = normalize(rh_original).reset_coords(drop='pressure_level')
    i_data_norm1 = (i_data_norm+i_data_norm1)/2 # mean of normalised predictor and normalised RH
    i_data_norm1 = i_data_norm1.assign_coords({'Extra': [prslvl_RH]})

    final_wvf = xr.concat([i_data_raw, i_data_norm1], dim='Extra')
    
    # take the first 'pressure_level' entry and drop the leftover non-index coordinates
    return final_wvf.isel(pressure_level=0).reset_coords(drop=True)

In [33]:
# label of the relative-humidity-combined predictors (read inside generate_final_wvf)
prslvl_RH = f'RH{prslvl_RH_value}'

# local predictors per domain for all analysed dates
final_local_predictors_spatial = {
    domain_name: generate_final_wvf(domain_name, all_dates) for domain_name in results_all
}

In [34]:
# the local predictors are additionally checked at the 80th percentile
perc_used_wvf = sorted([80]+perc_used)

extremes_final_local_pred = {
    domain_name: define_extremes(final_local_predictors_spatial[domain_name], perc_used_wvf)
    for domain_name in results_all
}

In [35]:
# local (water vapour flux / relative humidity) predictors, with bootstrap-based significance
for i_key, i_results in results_all.items():
    i_results['WvfERA5'] = cond_prob_predictors(final_local_predictors_spatial, i_key, years_all,
                                                derive_sign=True)

del i_key, i_results

100%|███████████████████████████████████████| 1001/1001 [00:33<00:00, 30.21it/s]


### Mediterranean Patterns as predictor

In [36]:
def calculate_cond_prob_patterns(area_used, years_used, derive_sign=False):
    """Conditional probability of extreme precipitation per Mediterranean pattern.

    The patterns are additionally split by whether the local predictors
    (``generate_final_wvf``) exceed their 80th percentile.

    Parameters
    ----------
    area_used : key of ``precip_EOBS_spatial``
    years_used : iterable of years, forwarded to ``final_dates_used``
    derive_sign : bool, default False
        When True, ``med_pat_cond_probs_sign`` is run in a process pool for
        every entry of ``bbs_dates`` plus ``years_all``. Connections flagged
        as significant in more than 95% of those runs keep their conditional
        probability; the others are replaced by a pooled probability. Both
        versions are stacked along a new ``Constraints`` dimension
        (1 = constrained, 0 = unconstrained).

    Returns
    -------
    dict
        ``{'CondProbs': dataset with 'CondProbs' and 'Samples', 'Brier': Brier score}``.
    """
    dts = final_dates_used(years_used)

    observations = precip_EOBS_spatial[area_used].sel(time=dts)
    observations = define_extremes(observations, perc_used)
    labels = xr.DataArray(med_clusters.Label).rename({'dim_0': 'time'})

    labels = labels.sel(time=observations.time) # keep only dates in precipitation data

    # make labels boolean for each day and cluster
    labels = [labels==i for i in range(med_clusters_max)]
    labels = xr.concat(labels, dim=pd.Index(range(med_clusters_max), name='cluster'))

    mask_data = labels.sum('time')!=0 # mask data & exclude clusters that do not occur in some combinations

    # keep extremes of wvf for further conditioning the Mediterranean patterns
    wvf_extremes_used = generate_final_wvf(area_used, dts)
    wvf_extremes_used = define_extremes(wvf_extremes_used, [80]).rename({'percentile': 'wvf_percentile'})
    wvf_extremes_used = wvf_extremes_used.assign_coords({'wvf_percentile': [1]})
    # entry 0 of 'wvf_percentile' is always True (no conditioning on the local predictor)
    wvf_extremes_used_p0 = (wvf_extremes_used.isel(wvf_percentile=0)>=0).assign_coords(wvf_percentile=0)
    wvf_extremes_used = xr.concat([wvf_extremes_used_p0, wvf_extremes_used], dim='wvf_percentile')

    # pattern occurrences with ('pos') and without ('neg') an extreme local predictor
    allocations_data_pos = labels.where(wvf_extremes_used).fillna(0).astype(int)
    allocations_data_neg = labels.where(wvf_extremes_used==0).fillna(0).astype(int)

    ext_totals_pos = observations.where(allocations_data_pos).sum('time')
    condprobs_pos = ext_totals_pos/allocations_data_pos.sum('time')
    condprobs_pos = condprobs_pos.where(mask_data)

    samples_pos = [allocations_data_pos.sum('time'), ext_totals_pos]
    samples_pos = xr.concat(samples_pos, dim=pd.Index(['All', 'Extremes'], name='Subset'))

    ext_totals_neg = observations.where(allocations_data_neg).sum('time')
    condprobs_neg = ext_totals_neg/allocations_data_neg.sum('time')
    condprobs_neg = condprobs_neg.where(mask_data)

    samples_neg = [allocations_data_neg.sum('time'), ext_totals_neg]
    samples_neg = xr.concat(samples_neg, dim=pd.Index(['All', 'Extremes'], name='Subset'))

    dim_name = pd.Index(['Pos', 'Neg'], name='CondProbType')
    condprobs = xr.concat([condprobs_pos, condprobs_neg], dim=dim_name)
    condprobs.name = 'CondProbs'
    counts_all = xr.concat([ext_totals_pos, ext_totals_neg], dim=dim_name)
    samples = xr.concat([samples_pos, samples_neg], dim=dim_name)
    samples.name = 'Samples'

    if derive_sign:
        cmbs_all = list(product([area_used], bbs_dates))
        cmbs_all = cmbs_all+[(area_used, years_all)]
        # the context manager releases the worker processes even if a task raises
        with multiprocessing.Pool() as pool:
            res = list(tqdm.tqdm(pool.imap(med_pat_cond_probs_sign, cmbs_all),
                                 total=len(cmbs_all), position=0, leave=True))

        res = xr.concat(res, dim='bootstraps').mean('bootstraps')
        res = res>.95 # significant in more than 95% of the runs

        # pooled probability used wherever the connection is not significant
        new_prbs = counts_all.where(res==0).sum(['CondProbType', 'cluster'])/len(observations.time)
        new_prbs = new_prbs.expand_dims({'CondProbType': dim_name, 'cluster': range(med_clusters_max)})

        cond_probs_constrained = condprobs.where(res).fillna(0)+new_prbs.where(res==0).fillna(0)

        condprobs = xr.concat([cond_probs_constrained, condprobs], dim=pd.Index([1, 0], name='Constraints'))
        condprobs.name = 'CondProbs'

    cond_prb_Hit = condprobs.sel(CondProbType='Pos')
    cond_prb_CorNeg = condprobs.sel(CondProbType='Neg')

    perf_frcst = (allocations_data_neg*cond_prb_CorNeg).fillna(0) + (allocations_data_pos*cond_prb_Hit).fillna(0)
    perf_frcst = perf_frcst.sum('cluster') # only 1 cluster per day, so sum works without problem (rest are 0)

    perfect_brier = (perf_frcst-observations)**2
    perfect_brier = perfect_brier.sum('time')/len(perfect_brier.time)
    perfect_brier.name = 'BS'

    condprobsall = xr.merge([condprobs, samples])

    return {'CondProbs': condprobsall, 'Brier': perfect_brier}

In [37]:
def med_pat_cond_probs_sign(input_data):
    """Binomial significance flags of the pattern/extreme-precipitation connection.

    Worker function for the bootstrap pool in ``calculate_cond_prob_patterns``.

    Parameters
    ----------
    input_data : tuple
        ``(area_used, years_used)`` with the same meaning as in
        ``calculate_cond_prob_patterns``.

    Returns
    -------
    Array shaped like the extreme counts (``CondProbType`` first), non-zero
    where the count lies in either 2.5% tail of the binomial distribution.
    """
    area_used, years_used = input_data

    dts = final_dates_used(years_used)

    observations = precip_EOBS_spatial[area_used].sel(time=dts)
    observations = define_extremes(observations, perc_used)
    labels = xr.DataArray(med_clusters.Label).rename({'dim_0': 'time'})

    labels = labels.sel(time=observations.time) # keep only dates in precipitation data

    # make labels boolean for each day and cluster
    labels = [labels==i for i in range(med_clusters_max)]
    labels = xr.concat(labels, dim=pd.Index(range(med_clusters_max), name='cluster'))

    # keep extremes of wvf for further conditioning the Mediterranean patterns
    wvf_extremes_used = generate_final_wvf(area_used, dts)
    wvf_extremes_used = define_extremes(wvf_extremes_used, [80]).rename({'percentile': 'wvf_percentile'})
    wvf_extremes_used = wvf_extremes_used.assign_coords({'wvf_percentile': [1]})
    # entry 0 of 'wvf_percentile' is always True (no conditioning on the local predictor)
    wvf_extremes_used_p0 = (wvf_extremes_used.isel(wvf_percentile=0)>=0).assign_coords(wvf_percentile=0)
    wvf_extremes_used = xr.concat([wvf_extremes_used_p0, wvf_extremes_used], dim='wvf_percentile')

    # pattern occurrences with ('pos') and without ('neg') an extreme local predictor
    allocations_data_pos = labels.where(wvf_extremes_used).fillna(0).astype(int)
    allocations_data_neg = labels.where(wvf_extremes_used==0).fillna(0).astype(int)

    ext_totals_pos = observations.where(allocations_data_pos).sum('time')
    ext_totals_neg = observations.where(allocations_data_neg).sum('time')

    dim_name = pd.Index(['Pos', 'Neg'], name='CondProbType')
    counts_all = xr.concat([ext_totals_pos, ext_totals_neg], dim=dim_name)

    # share of time steps in each pattern/predictor state = success probability under no connection
    predictor_all = [allocations_data_pos.sum('time'), allocations_data_neg.sum('time')]
    predictor_all = xr.concat(predictor_all, dim=dim_name)/len(observations.time)

    # both arrays get the same explicit dimension order before they are handed to scipy
    # (presumably because scipy aligns by position, not by dimension name)
    predictor_all = predictor_all.expand_dims({'percentile': observations.percentile.values})
    predictor_all = predictor_all.transpose('CondProbType', 'cluster', 'wvf_percentile',
                                            'Extra', 'wvf_direction', 'WarnArea', 'percentile')
    counts_all = counts_all.transpose('CondProbType', 'cluster', 'wvf_percentile', 'Extra',
                                      'wvf_direction', 'WarnArea', 'percentile')

    upper_sign = binom.cdf(k=counts_all-1, n=counts_all.sum(['CondProbType', 'cluster']), p=predictor_all)
    lower_sign = binom.cdf(k=counts_all, n=counts_all.sum(['CondProbType', 'cluster']), p=predictor_all)
    # check statistical significance considering 95% two-tailed confidence interval
    sign_test = (upper_sign>.975)+(upper_sign<.025)+(lower_sign>.975)+(lower_sign<.025)
    # put the flags back onto the labelled coordinates of the counts
    sign_test = counts_all.copy(deep=True).fillna(0)*0+sign_test

    return sign_test

In [38]:
# Mediterranean patterns as predictor, with bootstrap-based significance
for i_key, i_results in tqdm.tqdm(results_all.items()):
    i_results['MedPatterns'] = calculate_cond_prob_patterns(i_key, years_all, derive_sign=True)

del i_key, i_results

100%|███████████████████████████████████████| 1001/1001 [02:12<00:00,  7.53it/s]
100%|████████████████████████████████████████████| 1/1 [02:14<00:00, 134.87s/it]


### Climatological Connections & reference Mediterranean Patterns

In [39]:
def daymonth_extremes_conditioning(observations):
    """Day-of-year climatological probabilities of extremes and their Brier score.

    For every day-month present in ``observations`` the mean of the
    observations is taken over a window of +/- 15 days around that day-month
    in every year, restricted to dates that exist in the data.

    Returns
    -------
    tuple
        ``(cond_probs, perfect_brier)``: the climatological means along a
        ``temporal`` dimension labelled ``'mmdd'``, and the Brier score of
        using them as forecasts, tagged with ``Method='DayMonth_Temp'``.
    """
    offset_days = 15 # offset days before/after day of interest for calc. cond. probs. based on temporal info
    day_buffer = pd.DateOffset(days=offset_days)

    all_dates_atm_var = pd.to_datetime(observations.time.values)
    # one extra year on both sides so that windows crossing the first/last year are complete
    all_dates_atm_var_extd = pd.date_range(all_dates_atm_var[0] - pd.DateOffset(years=1),
                                           all_dates_atm_var[-1] + pd.DateOffset(years=1))
    # day-month labels are formatted once, instead of once per loop iteration
    extd_daymonth = all_dates_atm_var_extd.strftime('%m%d')
    obs_daymonth = all_dates_atm_var.strftime('%m%d')
    unique_daymonth = sorted(set(obs_daymonth))

    cond_probs = []
    for daymonth_used in unique_daymonth:
        # keep dates of interest (exact day month)
        dates_used = all_dates_atm_var_extd[extd_daymonth.isin([daymonth_used])]

        # add buffer days (before/after)
        all_dates_used = [pd.date_range(i_date-day_buffer, i_date+day_buffer, freq='D')
                          for i_date in dates_used]
        all_dates_used = np.array([j for i in all_dates_used for j in i]) # flatten data to have 1-d array
        all_dates_used = all_dates_used[pd.to_datetime(all_dates_used).isin(all_dates_atm_var)] # existing dates

        # keep all dates of interest and get climatology
        clim_mean = observations.sel(time=all_dates_used).mean('time')
        cond_probs.append(clim_mean.assign_coords({'time': daymonth_used}))

    cond_probs = xr.concat(cond_probs, dim='time').rename({'time': 'temporal'})

    # forecast for each date = climatological value of its day-month
    perfect_forecasts = cond_probs.sel(temporal=obs_daymonth).rename({'temporal': 'time'})
    perfect_forecasts = perfect_forecasts.assign_coords({'time': observations.time.values})

    perfect_brier = (perfect_forecasts - observations)**2
    perfect_brier = perfect_brier.sum('time')/len(perfect_brier.time)

    perfect_brier = perfect_brier.assign_coords({'Method': 'DayMonth_Temp'}) # coord with temp. subsetting info

    return (cond_probs, perfect_brier)

In [40]:
def subsetting_extremes_conditioning(observations, labels_used):
    """Conditional probabilities and Brier score for a pattern/temporal subsetting.

    Parameters
    ----------
    observations : xarray object with a 'time' dimension
        Data to be conditioned on the labels.
    labels_used : array-like
        One label per entry of ``observations.time``; it replaces the time
        coordinate so that the data can be grouped by label.

    Returns
    -------
    tuple
        ``(cond_probs, perfect_brier)``: the per-label sum divided by the
        per-label count (label dimension named 'subset'), and the Brier score
        obtained when each time step is forecast with the value of its label.
    """

    # swap the time coordinate for the labels and group once; reuse for sum and count
    relabelled = observations.assign_coords({'time': labels_used})
    by_label = relabelled.groupby('time')
    cond_probs = by_label.sum('time')/by_label.count() # conditional prob.

    # map the per-label values back on the original time axis
    perfect_forecasts = cond_probs.sel(time=labels_used).assign_coords({'time': observations.time.values})

    squared_error = (perfect_forecasts - observations)**2
    perfect_brier = squared_error.sum('time')/len(squared_error.time)

    return (cond_probs.rename({'time': 'subset'}), perfect_brier)

In [41]:
def general_connections_stats(observations):
    """Reference conditional probabilities and Brier (skill) scores for `observations`.

    Uses the module-level `med_clusters` table (columns 'Label', 'All',
    'HalfYear', 'Season' are read, indexed by the dates of `observations`).

    Returns
    -------
    dict
        'ConnMedClusters': conditional probabilities per "<Label>-<All|HalfYear>"
        subset; 'ConnTemp': day-month and seasonal conditional probabilities
        along 'temporal'; 'Brier': Dataset with 'BS' per 'Method' and 'BSS',
        computed against the lower of the 'DayMonth_Temp'/'Season_Temp' scores.
    """

    obs_dates = observations.time.values
    cluster_ids = med_clusters.loc[obs_dates, 'Label'].astype(str)

    # pattern-based subsetting: cluster ID combined with each temporal flag
    pattern_probs, brier_parts = [], []
    for temp_flag in ['All', 'HalfYear']:
        subset_labels = cluster_ids + '-' + med_clusters.loc[obs_dates, temp_flag]
        probs_k, brier_k = subsetting_extremes_conditioning(observations, subset_labels.values)
        pattern_probs.append(probs_k)
        brier_parts.append(brier_k.assign_coords({'Method': f'{temp_flag}_Patt'}))
    pattern_probs = xr.concat(pattern_probs, dim='subset')

    # purely temporal subsetting: day-month window and season
    daymonth_probs, daymonth_brier = daymonth_extremes_conditioning(observations)

    season_labels = med_clusters.loc[obs_dates, 'Season']
    season_probs, season_brier = subsetting_extremes_conditioning(observations, season_labels.values)
    season_probs = season_probs.rename({'subset': 'temporal'})
    season_brier = season_brier.assign_coords({'Method': 'Season_Temp'})
    temporal_probs = xr.concat([daymonth_probs, season_probs], dim='temporal')

    # order along 'Method': All_Patt, HalfYear_Patt, DayMonth_Temp, Season_Temp
    brier_parts += [daymonth_brier, season_brier]
    brier = xr.concat(brier_parts, dim='Method')
    brier.name = 'BS'

    # skill relative to the lower of the two temporal (climatological) scores
    clim_reference = brier.sel(Method=['DayMonth_Temp', 'Season_Temp']).min('Method')
    skill = 1 - brier/clim_reference
    skill.name = 'BSS'

    return {'ConnMedClusters': pattern_probs, 'ConnTemp': temporal_probs,
            'Brier': xr.merge([brier, skill])}

In [42]:
# reference connection statistics per area, for three sets of extremes
for i_key in tqdm.tqdm(results_all):
    area_results = results_all[i_key]
    area_results['ReferenceConnections'] = general_connections_stats(extremes_data_final[i_key])
    area_results['ReferenceConnectionsERA5'] = general_connections_stats(extremes_ERA5[i_key])
    area_results['ReferenceConnectionsWVF'] = general_connections_stats(extremes_final_local_pred[i_key])

del(i_key, area_results)

100%|█████████████████████████████████████████████| 1/1 [00:36<00:00, 36.13s/it]


In [43]:
def brierSS(data_used, ref_score):
    """Combine a Brier score with its skill scores against the reference scores.

    Parameters
    ----------
    data_used : xarray.DataArray or xarray.Dataset
        Brier score to evaluate. A Dataset holding a 'BS' variable (i.e. the
        output of a previous call) is also accepted, so that a cell storing
        the result back in place can be re-run. The input is not modified.
    ref_score : xarray.DataArray
        Reference Brier scores with a 'Method' dimension that includes
        'DayMonth_Temp' and 'Season_Temp'.

    Returns
    -------
    xarray.Dataset
        'BS' (the input score), 'BSS_clim' (skill against the lower of the
        'DayMonth_Temp'/'Season_Temp' references) and 'BSS_all' (skill against
        the lowest reference over all methods).
    """

    if isinstance(data_used, xr.Dataset):
        data_used = data_used['BS']

    # rename() returns a new object, so the caller's array keeps its own name
    brier_score = data_used.rename('BS')

    brier_ref_clim = ref_score.sel(Method=['DayMonth_Temp', 'Season_Temp']).min('Method')
    brier_skill_score_ref_clim = 1 - brier_score/brier_ref_clim
    brier_skill_score_ref_clim.name = 'BSS_clim'

    brier_ref_all = ref_score.min('Method')
    brier_skill_score_ref_all = 1 - brier_score/brier_ref_all
    brier_skill_score_ref_all.name = 'BSS_all'

    return xr.merge([brier_score, brier_skill_score_ref_clim, brier_skill_score_ref_all])

In [44]:
# add the skill scores (against the reference Brier scores of each area) to every predictor's Brier score
for i_key in results_all:
    rf_i = results_all[i_key]['ReferenceConnections']['Brier']['BS']
    for i_pred in ['PrecipERA5', 'WvfERA5', 'MedPatterns']:
        results_all[i_key][i_pred]['Brier'] = brierSS(results_all[i_key][i_pred]['Brier'], rf_i)

del(i_key, rf_i, i_pred)

In [45]:
def _brier_from_lookup(cond_probs, lookup_dim, lookup_keys, dates, observations, method):
    """Brier score of forecasts looked up per date from `cond_probs` along `lookup_dim`."""
    forecasts = cond_probs.sel({lookup_dim: lookup_keys}).rename({lookup_dim: 'time'})
    forecasts = forecasts.assign_coords({'time': dates})
    brier = ((forecasts-observations)**2).sum('time')/len(dates)
    return brier.assign_coords({'Method': method})


def _brier_from_binary_predictor(predictor_extremes, cond_probs, observations, n_dates):
    """Brier score when the forecast is cond_probs[extr_predictor=1] or [=0] per the 0/1 predictor flag."""
    forecast_pos = predictor_extremes.where(predictor_extremes==1)*cond_probs.sel(extr_predictor=1)
    # where(...==0) leaves zeros, hence +1 to turn them into a multiplicative mask
    forecast_neg = (predictor_extremes.where(predictor_extremes==0)+1)*cond_probs.sel(extr_predictor=0)
    forecasts = forecast_pos.fillna(0)+forecast_neg.fillna(0)
    return ((forecasts-observations)**2).sum('time')/n_dates


def bootstrap_brier(input_data):
    """Brier scores and conditional probabilities for one bootstrap sample.

    Parameters
    ----------
    input_data : tuple
        ``(i_key, i_bbs)``. `i_key` indexes `results_all` and the `*_spatial`
        containers; `i_bbs` is handed to the conditional-probability helpers
        and to `final_dates_used` (presumably the resampled dates of one
        bootstrap -- confirm against the caller).

    Returns
    -------
    dict
        Brier scores ('BrierReferenceConnections', 'BrierPrecipERA5',
        'BrierWvfERA5', 'BrierMedPatterns') and conditional probabilities
        ('ConnTempReferenceConnections', 'ConnMedClustersReferenceConnections',
        'CondProbsPrecipERA5', 'CondProbsWvfERA5', 'CondProbsMedPatterns').

    Notes
    -----
    The Brier scores use the conditional probabilities stored in `results_all`
    as forecasts and score them against the extremes of the sampled dates.
    Relies on module-level data and helpers defined elsewhere in the notebook.
    """

    i_key, i_bbs = input_data

    # conditional probabilities of EPEs on the temporal subsets for the predictors for quantifying uncertainty
    cond_probs_Med_wvf_i = calculate_cond_prob_patterns(i_key, i_bbs, False)['CondProbs']
    cond_probs_Precip_i = cond_prob_predictors(precip_ERA5_spatial, i_key, i_bbs, False)['CondProbs']
    cond_probs_WVF_i = cond_prob_predictors(final_local_predictors_spatial, i_key, i_bbs, False)['CondProbs']

    i_bbs = final_dates_used(i_bbs)
    n_dates = len(i_bbs)
    obs_bbs_i = define_extremes(precip_EOBS_spatial[i_key].sel(time=i_bbs), perc_used) # observational extremes

    # evaluate the reference statistics once and read both entries from the same result
    ref_connections_i = general_connections_stats(obs_bbs_i)
    cond_probs_RefTemp_i = ref_connections_i['ConnTemp']
    cond_probs_RefMed_i = ref_connections_i['ConnMedClusters']

    # climatological brier scores
    clim_cond_prob = results_all[i_key]['ReferenceConnections']['ConnTemp']
    brier_seasonal_i = _brier_from_lookup(clim_cond_prob, 'temporal', med_clusters.loc[i_bbs, 'Season'].values,
                                          i_bbs, obs_bbs_i, 'Season_Temp')
    brier_daymonth_i = _brier_from_lookup(clim_cond_prob, 'temporal', med_clusters.loc[i_bbs, 'DayMonth'].values,
                                          i_bbs, obs_bbs_i, 'DayMonth_Temp')

    # Mediterranean patterns reference brier scores
    med_cond_prob = results_all[i_key]['ReferenceConnections']['ConnMedClusters']
    cluster_ids = med_clusters.loc[i_bbs, 'Label'].astype(str)

    lbs_usd = cluster_ids + '-' + med_clusters.loc[i_bbs, 'All']
    brier_med_all_i = _brier_from_lookup(med_cond_prob, 'subset', lbs_usd.values,
                                         i_bbs, obs_bbs_i, 'All_Patt')

    lbs_usd = cluster_ids + '-' + med_clusters.loc[i_bbs, 'HalfYear']
    brier_med_halfyear_i = _brier_from_lookup(med_cond_prob, 'subset', lbs_usd.values,
                                              i_bbs, obs_bbs_i, 'HalfYear_Patt')

    # concatenate climatological scores and scores from reference Mediterranean patterns
    brier_score_all_ref = [brier_seasonal_i, brier_daymonth_i, brier_med_all_i, brier_med_halfyear_i]
    brier_score_all_ref = xr.concat(brier_score_all_ref, dim='Method')
    brier_score_all_ref.name = 'BS'

    brier_score_clim_ref = brier_score_all_ref.sel(Method=['DayMonth_Temp', 'Season_Temp']).min('Method')
    brier_skill_score_all_ref = 1 - brier_score_all_ref/brier_score_clim_ref
    brier_skill_score_all_ref.name = 'BSS'

    brier_score_all_ref = xr.merge([brier_score_all_ref, brier_skill_score_all_ref])

    # precipitation ERA5 brier score
    precip_ERA5_bbs_i = define_extremes(precip_ERA5_spatial[i_key].sel(time=i_bbs), perc_used_ERA5)
    brier_precip_ERA5_bbs_i = _brier_from_binary_predictor(precip_ERA5_bbs_i,
                                                           results_all[i_key]['PrecipERA5']['CondProbs'],
                                                           obs_bbs_i, n_dates)

    # wvf ERA5 brier score
    wvf_bbs_i_actual = generate_final_wvf(i_key, i_bbs)
    wvf_bbs_i = define_extremes(wvf_bbs_i_actual, perc_used)
    brier_wvf_bbs_i = _brier_from_binary_predictor(wvf_bbs_i,
                                                   results_all[i_key]['WvfERA5']['CondProbs'],
                                                   obs_bbs_i, n_dates)

    # create wvf_percentile = 0 for the next combined predictors of patterns and wvf
    # (the [80] threshold gets label 1; label 0 is the `>=0` comparison of that same flag)
    wvf_bbs_i_actual = define_extremes(wvf_bbs_i_actual, [80]).rename({'percentile': 'wvf_percentile'})

    wvf_bbs_i_actual = wvf_bbs_i_actual.assign_coords({'wvf_percentile': [1]})
    wvf_bbs_i_p0 = (wvf_bbs_i_actual.isel(wvf_percentile=0)>=0).assign_coords(wvf_percentile=0)
    wvf_bbs_i = xr.concat([wvf_bbs_i_p0, wvf_bbs_i_actual], dim='wvf_percentile')

    # Mediterranean patterns & wvf brier score
    medpat_wvf_cond_prob = results_all[i_key]['MedPatterns']['CondProbs']['CondProbs']
    labels_i = xr.DataArray(med_clusters.Label.loc[i_bbs]).rename({'dim_0': 'time'})
    # make labels boolean for each day and cluster
    # NOTE(review): compares against 0..n_clusters-1, i.e. assumes labels are consecutive integers from 0
    alloc_i = [labels_i==i for i in range(len(medpat_wvf_cond_prob.cluster))]
    alloc_i = xr.concat(alloc_i, dim=medpat_wvf_cond_prob.cluster)

    brier_Med_wvf_i = []
    for i_warnarea in medpat_wvf_cond_prob.WarnArea:
        # split the cluster allocation by whether the wvf flag is set or not for this area
        allocs_i_warn_pos = alloc_i.where(wvf_bbs_i.sel(WarnArea=i_warnarea)).fillna(0).astype(int)
        allocs_i_warn_neg = alloc_i.where(wvf_bbs_i.sel(WarnArea=i_warnarea)==0).fillna(0).astype(int)

        allocs_i_warnarea = xr.concat([allocs_i_warn_pos, allocs_i_warn_neg],
                                      dim=medpat_wvf_cond_prob.CondProbType)

        # the allocation is a 0/1 mask, so the sum picks the probabilities of the matching entries
        frcst_Med_i_warn = allocs_i_warnarea*medpat_wvf_cond_prob.sel(WarnArea=i_warnarea)
        frcst_Med_i_warn = frcst_Med_i_warn.sum(['cluster', 'CondProbType'])

        brier_Med_i_warn = ((frcst_Med_i_warn-obs_bbs_i.sel(WarnArea=i_warnarea))**2).sum('time')/n_dates
        brier_Med_wvf_i.append(brier_Med_i_warn)

    brier_Med_wvf_i = xr.concat(brier_Med_wvf_i, dim='WarnArea')

    # calculate BSS
    brier_precip_ERA5_bbs_i = brierSS(brier_precip_ERA5_bbs_i, brier_score_all_ref['BS'])
    brier_wvf_bbs_i = brierSS(brier_wvf_bbs_i, brier_score_all_ref['BS'])
    brier_Med_wvf_i = brierSS(brier_Med_wvf_i, brier_score_all_ref['BS'])

    return {'BrierReferenceConnections': brier_score_all_ref, 'BrierPrecipERA5': brier_precip_ERA5_bbs_i,
            'BrierWvfERA5': brier_wvf_bbs_i, 'BrierMedPatterns': brier_Med_wvf_i,
            'ConnTempReferenceConnections': cond_probs_RefTemp_i,
            'ConnMedClustersReferenceConnections': cond_probs_RefMed_i,
            'CondProbsPrecipERA5': cond_probs_Precip_i, 'CondProbsWvfERA5': cond_probs_WVF_i,
            'CondProbsMedPatterns': cond_probs_Med_wvf_i}

In [46]:
def combine_bootstraps(area_name, results_key, type_used, input_data_list):
    """Summarise the bootstrap results of one entry as quantiles, median and actual value.

    Parameters
    ----------
    area_name, results_key, type_used : str
        Address the actual (non-bootstrapped) value at
        ``results_all[area_name][results_key][type_used]``; the bootstrap
        results are read from key ``type_used + results_key``.
    input_data_list : list of dict
        One dict per bootstrap, each holding the key ``type_used + results_key``.

    Returns
    -------
    xarray object with a 'bootstrap' dimension labelled
    'Q5', 'Q25', 'Q75', 'Q95', 'Q50', 'Actual'; extra variables are merged in
    for 'Brier' results (see the branches at the end).

    Relies on module-level names defined elsewhere in the notebook:
    `results_all`, `bootstraps` and the sorted positions `l_Q5`, `l_Q25`,
    `l_Q50`, `l_Q75`, `l_Q95`.
    """
    
    key_name = type_used+results_key
    
    # concatenate data along bootstraps (new 'bootstrap' dimension labelled 0..n-1)
    data_final = [i[key_name] for i in input_data_list]
    data_final = xr.concat(data_final, dim=pd.Index(range(len(input_data_list)), name='bootstrap'))
    # Datasets are turned into a single array (variables along 'Indicator') so that one np.sort handles all
    original_dataarray = isinstance(data_final, xr.core.dataarray.DataArray)
    if original_dataarray:
        data_final = data_final.transpose(..., 'bootstrap')
    else:
        data_final = data_final.to_array('Indicator').transpose(..., 'bootstrap')

    # get the quantiles of interest based on bootstraps ('bootstrap' is the last axis after the transpose)
    data_quant = np.sort(data_final, axis=-1)[..., [l_Q5, l_Q25, l_Q75, l_Q95]]
    # np.sort returns a plain array: the first 4 bootstraps, zeroed, serve as a template carrying the coordinates
    # NOTE(review): requires bootstrap labels 0-3 to exist, i.e. at least 4 bootstraps
    data_quant = data_final.sel(bootstrap=range(4)).fillna(0)*0+data_quant
    if original_dataarray:
        data_quant = data_quant.assign_coords({'bootstrap': ['Q5', 'Q25', 'Q75', 'Q95']})
    else:
        data_quant = data_quant.to_dataset('Indicator')
        data_quant = data_quant.assign_coords({'bootstrap': ['Q5', 'Q25', 'Q75', 'Q95']})

    # prepend the actual (non-bootstrapped) result, labelled -1, to the bootstrap members
    data_actual = results_all[area_name][results_key][type_used].assign_coords({'bootstrap': -1})
    if original_dataarray:
        data_final = xr.concat([data_actual, data_final], dim='bootstrap')
        data_final = data_final.transpose(..., 'bootstrap')
    else:
        data_final = xr.concat([data_actual, data_final.to_dataset('Indicator')], dim='bootstrap')
        data_final = data_final.to_array('Indicator').transpose(..., 'bootstrap')

    # get median value based on bootstraps and actual data (so median is actual sorted index)
    data_median = np.sort(data_final, axis=-1)[..., l_Q50]
    # same template trick as for the quantiles, using bootstrap member 0
    data_median = data_final.sel(bootstrap=0).fillna(0)*0+data_median
    if original_dataarray==False: data_median = data_median.to_dataset('Indicator')
    data_median = data_median.assign_coords({'bootstrap': 'Q50'})

    data_actual = data_actual.assign_coords({'bootstrap': 'Actual'})

    # final order along 'bootstrap': Q5, Q25, Q75, Q95, Q50, Actual
    data_results = [data_quant, data_median, data_actual]
    data_results = xr.concat(data_results, dim='bootstrap')

    if key_name == 'CondProbsMedPatterns':
        # keep only the first 'Subset' entry of the probabilities, together with the sample counts
        data_results_probs = data_results['CondProbs'].isel(Subset=0).reset_coords(drop=True)
        data_results = xr.merge([data_results_probs, data_results['Samples']])

    if type_used == 'Brier':
        # fraction of members (bootstraps + actual) with a positive value, for every indicator except 'BS'
        # NOTE(review): the denominator uses the global `bootstraps`; assumes len(input_data_list) == bootstraps
        sign = (data_final.isel(Indicator=data_final.Indicator!='BS')>0).sum('bootstrap')/(bootstraps+1)
        sign = sign.to_dataset('Indicator')
        sign = sign.rename({i: i+'_Sign' for i in sign.keys()})

        data_results = xr.merge([data_results, sign])

    if key_name == 'BrierReferenceConnections':
        # fraction of members with BSS exactly 0, stored as 'BSS_ref_Best'
        ref_temp = (data_final.sel(Indicator='BSS')==0).sum('bootstrap')/(bootstraps+1)
        ref_temp = ref_temp.expand_dims('Indicator').to_dataset('Indicator')
        ref_temp = ref_temp.rename({'BSS': 'BSS_ref_Best'})
        data_results = xr.merge([data_results, ref_temp])

    return data_results

### Run the bootstraps, combine them with the actual results, and save all results

In [47]:
# run all bootstraps per domain in parallel and replace each result by its bootstrap summary
for i_dom in results_all:
    cmbs_all = list(product([i_dom], bbs_dates))
    # the context manager releases the worker processes even if a bootstrap raises;
    # all results are already collected by list() before the pool is torn down
    with multiprocessing.Pool(processes=8) as pool: # object for multiprocessing
        res_bbs = list(tqdm.tqdm(pool.imap(bootstrap_brier, cmbs_all), total=len(cmbs_all), position=0, leave=True))
    del(pool)
    
    # the ERA5/WVF reference connections are not bootstrapped, so they are left untouched
    for i_type in set(results_all[i_dom]) - set(['ReferenceConnectionsERA5', 'ReferenceConnectionsWVF']):
        for i_key_int in results_all[i_dom][i_type]:
            results_all[i_dom][i_type][i_key_int] = combine_bootstraps(i_dom, i_type, i_key_int, res_bbs)
    
        
del(i_dom, cmbs_all, res_bbs, i_type)

100%|███████████████████████████████████████| 1000/1000 [29:53<00:00,  1.79s/it]


In [48]:
# write every stored result to its own netCDF file: <output_data_file><domain>_<type>_<key>.nc
for i_dom, dom_results in results_all.items():
    for i_type, type_results in dom_results.items():
        for i_key, data_to_save in type_results.items():
            data_to_save.to_netcdf(f'{output_data_file}{i_dom}_{i_type}_{i_key}.nc')

del(i_dom, i_type, i_key, data_to_save, dom_results, type_results)

### Check that most of the actual Brier Scores (BS) lie within the 25-75 or 5-95 percentile range of the bootstraps

In [49]:
# per area and result type: number of actual BS values outside the Q25-Q75 and outside the Q5-Q95 range
for area in results_all:
    print('\n')
    print(area)
    for i_key in results_all[area]:
        # these two entries were not combined with bootstraps, so they carry no quantiles
        if i_key in ['ReferenceConnectionsERA5', 'ReferenceConnectionsWVF']:
            continue

        test = results_all[area][i_key]['Brier']['BS']
        actual_bs = test.sel(bootstrap='Actual')

        stats = (actual_bs > test.sel(bootstrap='Q75')) | (actual_bs < test.sel(bootstrap='Q25'))
        stats_extr = (actual_bs > test.sel(bootstrap='Q95')) | (actual_bs < test.sel(bootstrap='Q5'))

        print(i_key, stats.sum().values, stats_extr.sum().values)

del(area, i_key, test, stats, stats_extr, actual_bs)



Calabria
PrecipERA5 12 0
WvfERA5 162 53
MedPatterns 168 0
ReferenceConnections 49 8


In [52]:
# NOTE(review): ad-hoc inspection cell (its execution count, 52, precedes 51 in the saved notebook).
# Displays the differences along 'time' of precip_ERA5 at index 0 of latitude and longitude.
# Consider removing it or documenting its purpose before sharing the notebook.
precip_ERA5.isel(latitude=0, longitude=0).diff('time')

In [51]:
# NOTE(review): ad-hoc inspection cell; displays precip_ERA5 at index 0 of latitude and longitude.
# Consider removing it or documenting its purpose before sharing the notebook.
precip_ERA5.isel(latitude=0, longitude=0)