In [1]:
# (C) Copyright 1996- ECMWF.
#
# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
# In applying this licence, ECMWF does not waive the privileges and immunities
# granted to it by virtue of its status as an intergovernmental organisation
# nor does it submit to any jurisdiction.

### SOS
This script takes a long time to run. It is recommended to run it on a cluster, preferably in a separate session for each lead time, so that multiple sessions can run in parallel.

### Note
This is essentially the same as the "*Script4_PatternsForecastStatistics*" script. The only differences are the input data in cell 5 and the naming of the outputs in cell 28.

In [2]:
from pathlib import Path
import multiprocessing
import tqdm

import numpy as np
import pandas as pd
import xarray as xr

import sklearn.metrics as metrics

In [3]:
# Number of bootstrap resamples used for assessing the statistical significance of the results
bootstr = 1000

# Positions of the 5th percentile, the 95th percentile and the median inside the ordered bootstrap values
l_m = bootstr*5//100
l_M = bootstr*95//100 - 1
Md = bootstr//2

In [4]:
# --- analysis settings ---
# flex windows used for assessing the brier score
FlexWindows = [0, 1, 3, 5]
# months to offset before/after date of interest for defining "Monthly" moving_window clim/pers
offset_months = 1
# days to offset before/after date of interest for defining "DayMonth" moving-window clim/pers
offset_days = 45

# --- data locations ---
input_dir = '/ProcessedData/'
output_dir = '/ProcessedData/ForecastsPatterns/'

# generate subfolder for storing the results (no error if it already exists)
Path(output_dir).mkdir(parents=True, exist_ok=True)

In [5]:
# Pattern allocations from the ERA5 file; the first CSV column becomes the index,
# which is then converted from text to datetimes.
ActualClusters = pd.read_csv(input_dir+'/PatternAllocations_ERA5_0UTC.csv', index_col=0)
ActualClusters = ActualClusters.set_index(pd.to_datetime(ActualClusters.index))

In [6]:
# Forecast cluster allocations; the first CSV column is dropped and the two date columns are parsed.
AllocatedClusters = pd.read_csv(input_dir+'ForecastsClusterAllocations.csv')
AllocatedClusters = AllocatedClusters.iloc[:, 1:]
for date_col in ('time', 'valid_time'):
    AllocatedClusters[date_col] = pd.to_datetime(AllocatedClusters[date_col])
del date_col

In [7]:
# count of distinct member numbers that are >= 0 (negative member flags are excluded from the count)
ActualMem_len = (AllocatedClusters['number'].unique() >= 0).sum()
# every lead time ("step") present in the forecast allocations
LeadDays = AllocatedClusters['step'].unique()

In [8]:
# All unique clusters, in ascending order. The order is made explicit because later cells use the
# labels both as xarray coordinates and as positional indices into numpy arrays; relying on the
# iteration order of a set is not guaranteed to give a stable order.
Unique_States = sorted(set(ActualClusters.Label))

In [9]:
# 'mmdd' strings for the 366 days of a leap year (2004), so that 29 February is included
Sorted_Dates = np.array(pd.date_range('20040101', periods=366, freq='D').strftime('%m%d'))

# one-element index arrays with the positions of 16 April and 15 October in Sorted_Dates
StartSummerHalf, EndSummerHalf = (np.where(Sorted_Dates == i_date)[0] for i_date in ('0416', '1015'))

In [10]:
def temp_flagging(valid_dates, temp_subset):
    """Label every date with the temporal subset it belongs to.

    Parameters
    ----------
    valid_dates : array-like of dates
        Anything accepted by ``pd.to_datetime`` that yields a DatetimeIndex.
    temp_subset : str
        One of 'All', 'HalfYear', 'Season', 'Month', 'DayMonth'.

    Returns
    -------
    list, numpy.ndarray or pandas.Index
        One flag per date: 'All'; 'WinterHalf'/'SummerHalf'; 'Winter'/'Spring'/'Summer'/'Autumn';
        the month number as a string; or the 'mmdd' string of the date.

    Raises
    ------
    ValueError
        If ``temp_subset`` is not one of the supported options.
    """

    valid_dates = pd.to_datetime(valid_dates)

    if temp_subset == 'All':
        return ['All']*len(valid_dates)

    if temp_subset == 'HalfYear':
        # position of each date inside the leap-year calendar stored in Sorted_Dates
        date_locs = pd.Series(valid_dates.strftime('%m%d'))
        date_locs = date_locs.map({i: i_c for i_c, i in enumerate(Sorted_Dates)}).values
        temporal_flag = np.repeat(['WinterHalf'], len(date_locs))
        # dates between StartSummerHalf and EndSummerHalf (both inclusive) belong to the summer half
        temporal_flag[(date_locs>=StartSummerHalf) & (date_locs<=EndSummerHalf)] = 'SummerHalf'
        return temporal_flag

    if temp_subset == 'Season':
        # Dec-Feb -> 1, Mar-May -> 2, Jun-Aug -> 3, Sep-Nov -> 4
        temporal_flag = (valid_dates.month%12 + 3)//3
        return temporal_flag.map({1: 'Winter', 2: 'Spring', 3: 'Summer', 4: 'Autumn'})

    if temp_subset == 'Month':
        return valid_dates.month.astype(str)

    if temp_subset == 'DayMonth':
        return pd.Series(valid_dates.strftime('%m%d')).values

    # previously an unknown option fell through to an UnboundLocalError at the return statement
    raise ValueError('Wrong input for "temp_subset", should be from: '
                     "['All', 'HalfYear', 'Season', 'Month', 'DayMonth']")

In [11]:
# attach one temporal-flag column per subset type to the ERA5 allocations
for subset_name in ('HalfYear', 'Season', 'Month', 'DayMonth'):
    ActualClusters[subset_name] = temp_flagging(ActualClusters.index, subset_name)

del subset_name

In [12]:
def frequencies(data_flags=ActualClusters.Label.values, temporal_flags=ActualClusters.index,
                temporal_window=0, subset_used='All'):
    """Relative frequency of each cluster inside every temporal subset.

    A cluster counts as "occurring" for a start date when it appears on at least one of the days
    ``start, start+1, ..., start+temporal_window``. Only start dates for which every day of that
    window has data and belongs to the temporal subset are used.

    Parameters
    ----------
    data_flags : array-like
        Cluster label per date. The default is bound to ``ActualClusters`` when this cell is run.
    temporal_flags : pandas.DatetimeIndex
        Dates of ``data_flags``. The default is bound to ``ActualClusters`` when this cell is run.
    temporal_window : int
        Number of additional days after the start date that are inspected.
    subset_used : str
        One of 'All', 'HalfYear', 'Season', 'Month', 'DayMonth'. 'Month' pools the months within
        ``offset_months`` of the target month, 'DayMonth' pools the calendar days within
        ``offset_days`` of the target day (both wrapping around the year).

    Returns
    -------
    pandas.DataFrame
        Rows are the subset labels, columns are ``Unique_States``, values are fractions in [0, 1]
        (NaN when no start date qualifies for a subset).

    Raises
    ------
    ValueError
        If ``subset_used`` is not a supported option.
    """

    Subsets_used = ['All', 'HalfYear', 'Season', 'Month', 'DayMonth']
    if subset_used not in Subsets_used:
        # raise instead of printing and returning None, which only failed later in the callers
        raise ValueError('Wrong input for "subset_used", should be from: {}'.format(Subsets_used))

    clim_data = pd.Series(data_flags, index=temporal_flags) # labelled timeseries
    full_dates_range = pd.date_range(temporal_flags.min(), temporal_flags.max()) # range with all dates
    clim_data = clim_data.reindex(full_dates_range) # fill possible missing dates with NaN

    day_shifts = range(temporal_window+1)

    # target date for each temporal offset, so that the temporal subsetting can be applied per day
    Dates_targets = [pd.Series(clim_data.index + np.timedelta64(i_shift, 'D')) for i_shift in day_shifts]
    Dates_targets = pd.concat(Dates_targets, axis=1) # concat to 1 single dataframe
    Dates_targets.index = range(len(Dates_targets))

    # label observed at each temporal offset (one column per offset)
    clim_data = pd.concat([clim_data.shift(-i_shift) for i_shift in day_shifts], axis=1)
    clim_data.index = range(len(clim_data))

    Dates_targets.columns = clim_data.columns # same name of columns for masking later on
    Dates_targets = Dates_targets.apply(lambda x: temp_flagging(x.values, subset_used), axis=0)

    index_used = {'All': ['All'],
                  'HalfYear': ['WinterHalf', 'SummerHalf'],
                  'Season': ['Winter', 'Spring', 'Summer', 'Autumn'],
                  'Month': list(np.arange(1, 13).astype(str)),
                  'DayMonth': Sorted_Dates}[subset_used]

    # float initialisation: the cells receive fractions, not counts
    Freqs = pd.DataFrame(0.0, index=index_used, columns=Unique_States)
    for date_check in index_used:
        if subset_used == 'Month':
            # months inside the moving window, wrapped into [1, 12]
            months_window = (int(date_check) - 1 + np.arange(-offset_months, offset_months+1)) % 12 + 1
            dates_check_all = list(months_window.astype(str))
        elif subset_used == 'DayMonth':
            # calendar days inside the moving window, wrapped around the (leap) year
            central_loc = np.where(Sorted_Dates==date_check)[0][0]
            window_locs = (central_loc + np.arange(-offset_days, offset_days+1)) % len(Sorted_Dates)
            dates_check_all = list(Sorted_Dates[window_locs])
        else:
            dates_check_all = [date_check]

        # keep the final subset of interest, by replacing all data not belonging to the temporal subset by NaN
        clim_data_kept = clim_data.where(Dates_targets.isin(dates_check_all))
        clim_data_kept = clim_data_kept.dropna() # drop rows with at least 1 NaN (incomplete windows)

        # iterate over the actual labels (not over their positions) so that the column written
        # always matches the label compared
        for i_state in Unique_States:
            Counts_state = (clim_data_kept==i_state).any(axis=1)
            Freqs.loc[date_check, i_state] = Counts_state.sum()/len(clim_data_kept)

    return Freqs

In [13]:
def freqs_all(temporal_window):
    """Climatological cluster frequencies for the 'All', 'HalfYear' and 'DayMonth' subsets.

    Parameters
    ----------
    temporal_window : int
        Passed to ``frequencies`` as its ``temporal_window`` argument.

    Returns
    -------
    xarray.DataArray
        Dimensions ('temp_subset', 'cluster'); the coordinates are taken from the index and the
        columns of the stacked frequency tables.
    """
    Fr = pd.concat([frequencies(temporal_window=temporal_window, subset_used=i_subset)
                    for i_subset in ('All', 'HalfYear', 'DayMonth')])

    # dims must be a sequence of names; the coordinates are inferred from the DataFrame itself
    Fr = xr.DataArray(Fr, dims=['temp_subset', 'cluster'])
    return Fr

In [14]:
# Compute the frequencies of all flex windows in parallel. The context manager guarantees that the
# worker processes are released even if one of the tasks raises.
with multiprocessing.Pool() as pool: # object for multiprocessing
    FreqsAll = list(tqdm.tqdm(pool.imap(freqs_all, FlexWindows), total=len(FlexWindows), position=0, leave=True))
FreqsAll = xr.concat(FreqsAll, dim=pd.Index(FlexWindows, name='temp_window'))
del(pool)

In [15]:
def transitions(data_flags=ActualClusters.Label.values, temporal_flags=ActualClusters.index,
                      lead=1, temporal_window=0, subset_used='All'):
    """Empirical probability of observing each cluster around a lead time, given the current cluster.

    For every start date the labels at ``lead-temporal_window ... lead+temporal_window`` days ahead
    (restricted to the lead times present in ``LeadDays``) are collected. A transition
    "from -> to" counts when the start date has label ``from`` and ``to`` appears on at least one
    of those days. Only start dates for which all those days have data and fall in the temporal
    subset are used.

    Parameters
    ----------
    data_flags : array-like
        Cluster label per date. The default is bound to ``ActualClusters`` when this cell is run.
    temporal_flags : pandas.DatetimeIndex
        Dates of ``data_flags``. The default is bound to ``ActualClusters`` when this cell is run.
    lead : int
        Central lead time in days.
    temporal_window : int
        Half-width (days) of the window around ``lead``.
    subset_used : str
        One of 'All', 'HalfYear', 'Season', 'Month', 'DayMonth'.

    Returns
    -------
    xarray.DataArray
        Dimensions ('temp_subset', 'from_cluster', 'to_cluster'). Rows of a "from" cluster with no
        qualifying start date in a subset are NaN.

    Raises
    ------
    ValueError
        If ``subset_used`` is not a supported option.
    """

    Subsets_used = ['All', 'HalfYear', 'Season', 'Month', 'DayMonth']
    if subset_used not in Subsets_used:
        # raise instead of printing and returning None, which only failed later in the callers
        raise ValueError('Wrong input for "subset_used", should be from: {}'.format(Subsets_used))

    # all lead days of interest, keeping only the ones available in "LeadDays" for comparability
    shift_days = [i_shift for i_shift in range(lead-temporal_window, lead+temporal_window+1)
                  if i_shift in LeadDays]

    transitions_data = pd.Series(data_flags, index=temporal_flags) # labelled timeseries
    full_dates_range = pd.date_range(temporal_flags.min(), temporal_flags.max()) # range with all dates
    transitions_data = transitions_data.reindex(full_dates_range) # fill possible missing dates
    transitions_data = transitions_data.fillna(-1) # missing data flagged with -1, since clusters are >= 0
    States_actual = transitions_data.values # actual state (cluster) at each start date (row)

    # target date for each lead time, so that the temporal subsetting can be applied per day
    Dates_targets = [pd.Series(transitions_data.index + np.timedelta64(i_shift, 'D')) for i_shift in shift_days]
    Dates_targets = pd.concat(Dates_targets, axis=1) # concat to 1 single dataframe
    Dates_targets.index = States_actual # index by the actual state (cluster) of the start date

    # label observed at each lead time (one column per lead time)
    transitions_data = pd.concat([transitions_data.shift(-i_shift) for i_shift in shift_days], axis=1)
    transitions_data.index = States_actual # index by the actual state (cluster) of the start date
    transitions_data = transitions_data.replace({-1: np.nan}) # -1 flags back to NaN: those dates don't exist

    Dates_targets.columns = transitions_data.columns # same name of columns for masking later on
    Dates_targets = Dates_targets.apply(lambda x: temp_flagging(x.values, subset_used), axis=0)

    index_used = {'All': ['All'],
                  'HalfYear': ['WinterHalf', 'SummerHalf'],
                  'Season': ['Winter', 'Spring', 'Summer', 'Autumn'],
                  'Month': list(np.arange(1, 13).astype(str)),
                  'DayMonth': Sorted_Dates}[subset_used]

    observed_states = set(States_actual) - set([-1]) # all available clusters (excluding the -1 flag)

    TransitionsMatrix = np.zeros([len(index_used), len(Unique_States), len(Unique_States)])
    for i_date_check, date_check in enumerate(index_used):
        if subset_used == 'Month':
            # months inside the moving window, wrapped into [1, 12]
            months_window = (int(date_check) - 1 + np.arange(-offset_months, offset_months+1)) % 12 + 1
            dates_check_all = list(months_window.astype(str))
        elif subset_used == 'DayMonth':
            # calendar days inside the moving window, wrapped around the (leap) year
            central_loc = np.where(Sorted_Dates==date_check)[0][0]
            window_locs = (central_loc + np.arange(-offset_days, offset_days+1)) % len(Sorted_Dates)
            dates_check_all = list(Sorted_Dates[window_locs])
        else:
            dates_check_all = [date_check]

        # keep the final subset of interest, by replacing all data not belonging to the temporal subset by NaN
        transitions_data_kept = transitions_data.where(Dates_targets.isin(dates_check_all))
        transitions_data_kept = transitions_data_kept.dropna() # drop rows with at least 1 NaN

        # float initialisation: the cells receive fractions, not counts
        M = pd.DataFrame(0.0, columns=Unique_States, index=Unique_States) # DF to store transitions
        for from_state in observed_states:
            # boolean mask instead of .loc[label]: always a DataFrame, also for 0 or 1 matching rows
            subset_from = transitions_data_kept[transitions_data_kept.index == from_state]
            if len(subset_from) == 0:
                M.loc[from_state, :] = np.nan # no start date with this state in the subset
                continue
            for to_state in observed_states:
                Trans_FromTo = (subset_from==to_state).any(axis=1) # dates when analysed transition occurs
                M.loc[from_state, to_state] = Trans_FromTo.sum()/len(subset_from) # fraction of transitions

        TransitionsMatrix[i_date_check,:,:] = M

    xr_dims = {'temp_subset': index_used, 'from_cluster': Unique_States, 'to_cluster': Unique_States}
    TransitionsMatrix = xr.DataArray(TransitionsMatrix, coords=list(xr_dims.values()), dims=list(xr_dims.keys()))

    return TransitionsMatrix

In [16]:
def trans_temporal_subsets(input_data):
    """Transition probabilities for the 'All' and 'HalfYear' subsets, stacked along 'temp_subset'.

    ``input_data`` is a two-item sequence ``(temporal_window, lead_days)`` forwarded to ``transitions``.
    """
    temporal_window, lead_days = input_data

    per_subset = [transitions(temporal_window=temporal_window, subset_used=i_subset, lead=lead_days)
                  for i_subset in ('All', 'HalfYear')]

    return xr.concat(per_subset, dim='temp_subset')

In [17]:
def trans_all(lead_days):
    """Transition probabilities of one lead time for every flex window in ``FlexWindows``.

    The per-window results are stacked along a new 'temp_window' dimension.
    """
    per_window = [trans_temporal_subsets([i_flex, lead_days]) for i_flex in FlexWindows]
    return xr.concat(per_window, dim=pd.Index(FlexWindows, name='temp_window'))

In [18]:
# Compute the transition probabilities of all lead times in parallel. The context manager guarantees
# that the worker processes are released even if one of the tasks raises.
with multiprocessing.Pool() as pool: # object for multiprocessing
    TransAll = list(tqdm.tqdm(pool.imap(trans_all, LeadDays), total=len(LeadDays), position=0, leave=True))
TransAll = xr.concat(TransAll, dim=pd.Index(LeadDays, name='lead_days'))
del(pool)

In [19]:
def subset_data(lead_days=1, season_used='All', flex_window=0):
    """Collect forecasts, verifying ERA5 labels and persistence labels for one configuration.

    Parameters
    ----------
    lead_days : int
        Central lead time (must be one of the lead times available in ``LeadDays``).
    season_used : str
        One of 'All', 'WinterHalf', 'SummerHalf'.
    flex_window : int
        Half-width (days) of the flexible window around ``lead_days``; lead times not present in
        ``LeadDays`` are ignored.

    Returns
    -------
    tuple
        ``(Frcst_Data, Actuals, Persistence)``: a dict with one 3d numpy array per allocation
        method (valid time, member, lead day of the window); a 2d numpy array with the ERA5 label
        per valid time and lead day of the window; and a DataFrame with the ERA5 label observed
        ``lead_days`` before each central valid date.

    Raises
    ------
    ValueError
        If ``season_used`` is not a supported option.
    """

    Seasons_used = ['All', 'WinterHalf', 'SummerHalf']
    if season_used not in Seasons_used:
        # raise instead of printing and returning None, which callers cannot unpack into 3 items
        raise ValueError('Wrong input for "season_used", should be from: {}'.format(Seasons_used))

    temporal_flags_used = 'All' if season_used == 'All' else 'HalfYear' # get aux for generating temporal flags

    lead_days_all = list(range(lead_days-flex_window, lead_days+flex_window+1)) # get all lead_days of interest
    lead_days_all = [i_lead for i_lead in lead_days_all if i_lead in LeadDays] # keep lead_days available in the data

    Frcst_ALL = {i_days: AllocatedClusters.query('step == @i_days') for i_days in lead_days_all} # get all forecasts

    # keep only forecast data that belong to the studied temporal subset. Use only if all lead-days are in the subset
    # NOTE(review): the row mask of one lead day is applied to every lead day, which presumes that all
    # lead-day subsets have the same number of rows in the same order -- confirm against the input file
    used_rows = [temp_flagging(Frcst_ALL[i_days].valid_time.values, temporal_flags_used) for i_days in lead_days_all]
    used_rows = pd.concat([pd.Series(i) for i in used_rows], axis=1) # dataframe with temporal flag for each instance
    used_rows = ((used_rows == season_used).sum(axis=1)==len(lead_days_all)).values # only if all belong to the subset
    Frcst_ALL = {i_days: Frcst_ALL[i_days][used_rows] for i_days in lead_days_all} # get final subset of frcst data

    # subset the ERA5 data for getting the timeseries with the actual regime at each of the lead_days_all
    Frcst_Dates = {i_days: sorted(Frcst_ALL[i_days].valid_time.unique()) for i_days in lead_days_all} # dates needed
    ERA5_all = {i_days: ActualClusters.loc[dates] for i_days, dates in Frcst_Dates.items()} # ERA5 subset

    Actuals = np.concatenate([i_df[['Label']].values for i_df in list(ERA5_all.values())], axis=1)

    # dates at "lead_days" before date of interest, for being able to analyse forecasting based on persistence
    Persistence_Days = Frcst_Dates[lead_days] - np.timedelta64(lead_days, 'D')
    Persistence = ActualClusters.loc[Persistence_Days, ['Label']] # actual regime from ERA5 used for persistence

    Frcst_Data = {} # generate dictionary with the different methods for allocating the patterns (use numpy 3d array)
    for i_type in list(AllocatedClusters.columns[4:]): # loop through the pattern allocation methods (after column 4)
        i_frcst = {i_days: frcst_subset.pivot_table(index='valid_time', columns='number', values=i_type)
                   for i_days, frcst_subset in Frcst_ALL.items()}
        # predictions as 3d array (time, ens. mem., days of flexible window)
        i_frcst = np.concatenate([i_df.values[..., np.newaxis] for i_df in list(i_frcst.values())], axis=2)
        Frcst_Data[i_type] = i_frcst

    return (Frcst_Data, Actuals, Persistence)

In [20]:
def Freq_forecasts(input_data):
    """Frequency of occurrence (%) of each cluster in ERA5, in the forecast members and in the first member column.

    ``input_data`` is ``(Frcst_Data, Actuals, lead_days, season_used, flex_window)``. A cluster
    "occurs" for a valid time when it is found on at least one day of the flexible window.

    Returns an xarray.DataArray with dimensions ('Var', 'Cluster', 'Member', 'Method'), where 'Var'
    holds the frequencies ('Freq') and their relative difference (%) to ERA5 ('FreqBias').
    """

    Frcst_Data, Actuals, lead_days, season_used, flex_window = input_data

    # because of model clim., there is no reason to separate control member, but it is considered together with ens.
    coords_Member = ['ERA5', 'Reforecasts', 'Mean']

    Freq_Occur = np.zeros([len(Unique_States), len(coords_Member), len(Frcst_Data)])

    for k, (member_k, predict) in enumerate(Frcst_Data.items()): # loop through the clustering options
        members_part = predict[:, 1:, :] # all member columns except the first one
        mean_part = predict[:, 0, :] # first member column (the "mean" member)

        for j_cl in Unique_States:
            # ERA5: True if cluster exists in at least 1 day of the flex window
            y_true = (Actuals == j_cl).any(axis=1)
            Freq_Occur[j_cl, 0, k] = np.sum(y_true)/len(y_true)*100

            # forecast members (not mean): True per (time, member) if cluster is seen in the window
            frcst_mem = (members_part == j_cl).any(axis=2)
            Freq_Occur[j_cl, 1, k] = np.sum(frcst_mem)/frcst_mem.size*100

            # mean member: True per time if cluster is seen in the window
            mean_mem = (mean_part == j_cl).any(axis=1)
            Freq_Occur[j_cl, 2, k] = np.sum(mean_mem)/mean_mem.size*100

    # convert np array to xr dataarray and tag it with the configuration used
    Freq_Occur = xr.DataArray(Freq_Occur,
                              coords=[list(Unique_States), coords_Member, list(Frcst_Data.keys())],
                              dims=['Cluster', 'Member', 'Method'])
    Freq_Occur = Freq_Occur.assign_coords({'Lead_days':lead_days, 'Flex_win': flex_window, 'Season': season_used})

    # relative frequency biases (%) with respect to ERA5
    Freq_ERA5 = Freq_Occur.sel(Member='ERA5')
    Freq_Occur_Difs = (Freq_Occur-Freq_ERA5)/Freq_ERA5*100

    return xr.concat([Freq_Occur, Freq_Occur_Difs], dim=pd.Index(['Freq', 'FreqBias'], name='Var'))

In [21]:
def BS_decomposition(actual, prob, n_members=None):
    """Decompose the Brier score into reliability, resolution and uncertainty.

    The forecasts are binned by their probability value; with ``Ni`` forecasts and ``Mi`` observed
    events in bin ``i``, ``N`` forecasts in total and ``o = M/N`` the overall event frequency:

        Reliability = sum_i Ni/N * (p_i - Mi/Ni)**2
        Resolution  = sum_i Ni/N * (Mi/Ni - o)**2
        Uncertainty = o * (1 - o)

    so that ``Brier score = Reliability - Resolution + Uncertainty``.

    Parameters
    ----------
    actual : array-like
        Observed outcome per forecast (event when equal to 1 / True).
    prob : array-like
        Forecast probability per forecast; every value must be one of ``k/n_members``.
    n_members : int, optional
        Number of ensemble members defining the possible probabilities ``0, 1/n, ..., 1``.
        Defaults to the notebook-level ``ActualMem_len``.

    Returns
    -------
    tuple
        ``(ProbsStats, AgrStats)``: a DataFrame indexed by probability with the columns
        'Mi', 'Ni', 'RelFreq', 'Rel', 'Res', 'Rel_Weighted', 'Res_Weighted' (bins without
        forecasts have NaN in the derived columns), and a Series with the aggregated
        'Reliability', 'Resolution' and 'Uncertainty'.
    """

    if n_members is None:
        n_members = ActualMem_len

    Probs = pd.DataFrame({'Prob': prob, 'Actual': actual})

    # one row per possible forecast probability
    ProbsStats = pd.DataFrame({'Mi': np.nan, 'Ni': np.nan}, index=np.arange(n_members+1)/n_members)
    Mi = Probs['Actual'].eq(1).groupby(Probs['Prob']).sum() # observed events per probability bin
    Ni = Probs.groupby('Prob')['Actual'].count() # forecasts per probability bin
    ProbsStats.loc[Mi.index, 'Mi'] = Mi
    ProbsStats.loc[Ni.index, 'Ni'] = Ni
    ProbsStats = ProbsStats.fillna(0) # probabilities that were never forecast

    M_all = ProbsStats.Mi.sum()
    N_all = ProbsStats.Ni.sum()
    bin_weight = ProbsStats.Ni/N_all

    ProbsStats['RelFreq'] = ProbsStats.Mi/ProbsStats.Ni # NaN (0/0) for bins without forecasts
    ProbsStats['Rel'] = (ProbsStats.index.values - ProbsStats['RelFreq'])**2
    # unweighted term, as for 'Rel'; the bin weight is applied once, in 'Res_Weighted'
    # (it used to be applied here as well, which squared the weights in the resolution)
    ProbsStats['Res'] = (M_all/N_all - ProbsStats['RelFreq'])**2
    ProbsStats['Rel_Weighted'] = ProbsStats.Rel * bin_weight
    ProbsStats['Res_Weighted'] = ProbsStats.Res * bin_weight

    Reliability = ProbsStats['Rel_Weighted'].sum() # NaN bins are skipped by sum()
    Resolution = ProbsStats['Res_Weighted'].sum()
    Uncertainty = M_all/N_all*(1-M_all/N_all)

    AgrStats = pd.Series([Reliability, Resolution, Uncertainty], index=['Reliability', 'Resolution', 'Uncertainty'])

    return (ProbsStats, AgrStats)

In [22]:
def brier_score(input_data):
    """Brier scores (proper and fair) and Brier-score decomposition for one set of forecasts.

    Parameters
    ----------
    input_data : sequence
        ``(Frcst_Data, Actuals, Persistence, lead_days, season_used, flex_window)``.
        ``Frcst_Data`` maps each allocation method to a 3d array that is indexed here as
        (time, member, day of the flex window); ``Actuals`` is indexed as (time, day of the flex
        window); ``Persistence`` is indexed by date and holds the label used for the persistence
        reference.

    Returns
    -------
    dict
        'Clusters': Brier score per cluster and method; 'Combo': macro and occurrence-weighted
        averages over the clusters; 'Dec_Aggr' and 'Dec_Seg': aggregated and per-probability
        outputs of ``BS_decomposition`` for the ensemble methods. The methods are the keys of
        ``Frcst_Data``, the two references 'Frequencies' and 'Persistence', and a '<method>_Fair'
        entry for every key of ``Frcst_Data``.

    Notes
    -----
    Reads the notebook-level ``Unique_States``, ``ActualMem_len``, ``FreqsAll`` and ``TransAll``.
    Cluster labels are used directly as positional indices of the result arrays, so they are
    expected to be 0..len(Unique_States)-1.
    """
    
    Frcst_Data, Actuals, Persistence, lead_days, season_used, flex_window = input_data
    methods_used = list(Frcst_Data.keys())
    
    # create numpy arrays for storing the Brier Score Decomposition statistics
    probs_all = np.arange(ActualMem_len+1)/ActualMem_len
    BS_Dec_Se = np.zeros([len(probs_all), 7, len(Unique_States), len(methods_used)])
    BS_Dec_Ag = np.zeros([3, len(Unique_States), len(methods_used)])
    i_k_decomp = -1
    
    # create arrays and auxiliary elements for calculating Brier Score of ens. (proper & fair) and of reference
    methods_used = methods_used+['Frequencies', 'Persistence']
    methods_used_all = methods_used+[i+'_Fair'for i in methods_used[:-2]] # add Fair BS data for the ens. forecasts
    
    forecasted_dates = Persistence.index + np.timedelta64(lead_days, 'D') # central dates of forecast
    forecasted_dates_freqs_flag = temp_flagging(forecasted_dates, 'DayMonth') # temporal flags for climatology
    forecasted_dates_persi_flag = temp_flagging(forecasted_dates, 'HalfYear') # temporal flags for persistence

    BS_Clusters = np.zeros([len(Unique_States), len(methods_used_all)]) 
    
    for i_k, k_method in enumerate(methods_used): # loop through the allocation methods
        # the ensemble methods come first in methods_used, so for them i_k_decomp equals i_k
        i_k_decomp += 1 # add 1 for having the correct index on the Decomposition arrays
        if k_method in list(Frcst_Data.keys()):
            predict = Frcst_Data[k_method][:, 1:, :] # remove ens mean (1st column, as it has -1 flag)

        for j_cl in Unique_States: # loop through each cluster for calculating the brier score
            
            # get the boolean indicating whether the cluster is observed or not in at least 1 day on the flex win.
            y_true = (Actuals == j_cl).sum(axis=1)>0
            
            # get the probabilities of observing the cluster of interest
            if k_method == 'Frequencies':
                # climatological reference: frequency of the cluster for the day-of-year of each forecast date
                Freqs_used = FreqsAll.sel(temp_window=flex_window, cluster=j_cl)
                probs = Freqs_used.sel(temp_subset=forecasted_dates_freqs_flag).values

            elif k_method == 'Persistence':
                # persistence reference: transition probability from the label in Persistence to the
                # cluster, for the half-year of each forecast date
                Trans_used = TransAll.sel(lead_days=lead_days, temp_window=flex_window, to_cluster=j_cl)
                probs = [Trans_used.sel(from_cluster=Persistence.values.flatten()[i],
                                        temp_subset=forecasted_dates_persi_flag[i]).values 
                         for i in range(len(Actuals))]
                probs = np.array(probs).flatten()

            else:
                probs = (predict==j_cl).sum(axis=2) # number of days the cluster is observed for each ens. member
                mem_counts = (probs>0)*1 # Boolean; 1 if cluster is observed at least once for each ens. mem.
                mem_counts = mem_counts.sum(axis=1) # total number of ens. members indicating the cluster
                probs = mem_counts/probs.shape[1] # fraction of ens. members indicating the cluster

                # get the brier score decomposition statistics for the ens. forecasts
                DecompStats = BS_decomposition(y_true, probs)
                
                BS_Dec_Se[:, :, j_cl, i_k_decomp] = DecompStats[0].values
                BS_Dec_Ag[:, j_cl, i_k_decomp] = DecompStats[1].values

            # calculate proper brier score
            brier = metrics.brier_score_loss(y_true=y_true, y_prob=probs) # brier score
            BS_Clusters[j_cl, i_k] = brier
            
            # calculate fair brier score only for ens. forecasts
            if k_method in list(Frcst_Data.keys()):
                m_members = predict.shape[1]
                # mean of i*(m-i)/(m**2*(m-1)) over the forecasts (i = members showing the cluster) is
                # subtracted from the proper score
                adjustment = mem_counts*(m_members-mem_counts)/m_members**2/(m_members-1)
                adjustment_mean = np.mean(adjustment)
                brier_fair = brier - adjustment_mean
                # the '_Fair' columns are stored after all entries of methods_used
                BS_Clusters[j_cl, i_k+len(methods_used)] = brier_fair
            
    # convert np array to xr dataarray
    BS_Clusters = xr.DataArray(BS_Clusters, coords=[Unique_States, methods_used_all], dims=['Cluster', 'Method']) 

    # number of actual instances that each cluster is observed for calculating weighted Combo Brier Score
    Instances_all = []
    for j_cl in Unique_States: # loop through each cluster for calculating the brier score
        y_true = (Actuals == j_cl).sum(axis=1)>0 # True if cluster exists in at least 1 day on the flex win.
        Instances_all.append(y_true.sum()) # total observed instances
    Instances_all = np.array(Instances_all)

    # calculate average brier score: either consider equal weights for each cluster (macro), or weighted (weighted)
    BS_Combo = np.zeros([2, len(methods_used_all)]) # Type, Method

    BS_Combo[0, :] = np.average(BS_Clusters, axis=0)
    BS_Combo[1, :] = np.average(BS_Clusters, axis=0, weights=Instances_all)

    BS_Combo = xr.DataArray(BS_Combo, coords=[['Macro', 'Weighted'], methods_used_all], dims=['Type', 'Method'])
    
    
    # the 7 'Stat' entries follow the column order of the DataFrame returned by BS_decomposition
    BS_Dec_Se = xr.DataArray(BS_Dec_Se, dims=['Probs', 'Stat', 'Cluster', 'Method'], 
                             coords=[probs_all, ['Mi', 'Ni', 'RelFreq', 'Rel', 'Res', 'Rel_Weight', 'Res_Weight'], 
                                     Unique_States, list(Frcst_Data.keys())])
    BS_Dec_Ag = xr.DataArray(BS_Dec_Ag, dims=['Stat', 'Cluster', 'Method'], 
                             coords=[['Rel.', 'Res.', 'Unc.'], Unique_States, list(Frcst_Data.keys())])   
    
    # assign additional coordinates with the specified arguments
    BS_Clusters = BS_Clusters.assign_coords({'Lead_days':lead_days, 'Flex_win': flex_window, 'Season': season_used})
    BS_Combo = BS_Combo.assign_coords({'Lead_days':lead_days, 'Flex_win': flex_window, 'Season': season_used})
    BS_Dec_Ag = BS_Dec_Ag.assign_coords({'Lead_days':lead_days, 'Flex_win': flex_window, 'Season': season_used})
    BS_Dec_Se = BS_Dec_Se.assign_coords({'Lead_days':lead_days, 'Flex_win': flex_window, 'Season': season_used})
    
    return {'Combo': BS_Combo, 'Clusters': BS_Clusters, 'Dec_Aggr': BS_Dec_Ag, 'Dec_Seg': BS_Dec_Se}

In [23]:
def BS_bootstrap_summary_statistics(dict_key):
    """Percentiles of one bootstrapped statistic, together with its actual (non-resampled) value.

    Reads the notebook-level ``BS_BS_Statistics`` (list with one result dict per bootstrap sample)
    and ``Actual_BS_Statistics`` (result dict of the original data); ``dict_key`` selects the entry.
    The 5th and 95th percentiles are taken from the bootstrap samples alone, the median from the
    bootstrap samples plus the actual value (``l_m``, ``l_M`` and ``Md`` are the positions used in
    the sorted values).

    Returns
    -------
    xarray.DataArray
        Named 'BS', with a leading 'bootstrap' dimension labelled 'P5', 'P50', 'Actual', 'P95'.
    """

    BS_data = xr.concat([i[dict_key] for i in BS_BS_Statistics], dim='bootstrap')

    # Container with the coordinates of a single sample, refilled with the percentile values.
    # (The former "BS_data[0]*0 + values" construct turned every position where the first
    # bootstrap sample is NaN/inf into NaN, whatever the percentile was.)
    template = BS_data[0]

    # get the quantiles from the actual bootstrapping statistics (NaNs are sorted to the end)
    BS_sorted = np.sort(BS_data.values, axis=0)
    Lower_BS = template.copy(data=BS_sorted[l_m])
    Upper_BS = template.copy(data=BS_sorted[l_M])

    # append the actual data to the full bootstraps so now the median value can be extracted
    BS_data = xr.concat([BS_data, Actual_BS_Statistics[dict_key]], dim='bootstrap')
    BS_sorted = np.sort(BS_data.values, axis=0)
    Median_BS = template.copy(data=BS_sorted[Md])

    # combine all data to the final xarrays
    dim_name = pd.Index(['P5', 'P50', 'Actual', 'P95'], name='bootstrap')
    Final_BS = xr.concat([Lower_BS, Median_BS, Actual_BS_Statistics[dict_key], Upper_BS], dim=dim_name)
    Final_BS.name = 'BS'

    return Final_BS

In [24]:
def BSS_bootstrap_summary_statistics(dict_key):
    """Percentiles of the bootstrapped Brier skill score and the share of samples with positive skill.

    Reads the notebook-level ``BS_BS_Statistics`` and ``Actual_BS_Statistics``; ``dict_key`` selects
    the entry. The skill score is ``1 - BS/BS_ref``, where ``BS_ref`` is the smaller of the
    'Frequencies' and 'Persistence' Brier scores. The 5th and 95th percentiles are taken from the
    bootstrap samples alone, the median from the bootstrap samples plus the actual value.

    Returns
    -------
    tuple
        ``(Final_BSS, Wins)``: a DataArray named 'BSS' with a leading 'bootstrap' dimension labelled
        'P5', 'P50', 'Actual', 'P95'; and a DataArray named 'Sign' with the fraction of the
        ``bootstr + 1`` cases (bootstrap samples and actual data) whose skill score is positive.
    """

    def _skill_score(bs_values):
        # skill against the best (lowest Brier score) of the two reference forecasts
        bs_reference = bs_values.sel(Method=['Frequencies', 'Persistence']).min('Method')
        return 1 - bs_values/bs_reference

    BS_data = xr.concat([i[dict_key] for i in BS_BS_Statistics], dim='bootstrap')
    BSS_data = _skill_score(BS_data)

    # Container with the coordinates of a single sample, refilled with the percentile values.
    # (The former "BSS_data[0]*0 + values" construct turned every position where the first
    # bootstrap sample is NaN/inf into NaN, whatever the percentile was.)
    template = BSS_data[0]

    # get the quantiles from the actual bootstrapping statistics (NaNs are sorted to the end)
    BSS_sorted = np.sort(BSS_data.values, axis=0)
    Lower_BSS = template.copy(data=BSS_sorted[l_m])
    Upper_BSS = template.copy(data=BSS_sorted[l_M])

    # append the actual data to the full bootstraps so now the median value can be extracted
    BS_data = xr.concat([BS_data, Actual_BS_Statistics[dict_key]], dim='bootstrap')
    BSS_data = _skill_score(BS_data)
    BSS_sorted = np.sort(BSS_data.values, axis=0)
    Median_BSS = template.copy(data=BSS_sorted[Md])

    Actual_BSS = _skill_score(Actual_BS_Statistics[dict_key])

    # combine all data to the final xarrays
    dim_name = pd.Index(['P5', 'P50', 'Actual', 'P95'], name='bootstrap')
    Final_BSS = xr.concat([Lower_BSS, Median_BSS, Actual_BSS, Upper_BSS], dim=dim_name)
    Final_BSS.name = 'BSS'

    # fraction of cases (bootstraps + actual) in which the forecast beats the best reference
    Wins = (BSS_data>0).sum('bootstrap')
    Wins = Wins/(bootstr+1)
    Wins.name = 'Sign'

    return (Final_BSS, Wins)

In [25]:
def statistics_bootstrapped(lead_days=1, season_used='All', flex_window=0):
    """Bootstrapped Brier-score and frequency statistics for one lead time, season and flex window.

    The valid times returned by ``subset_data`` are resampled with replacement ``bootstr`` times
    (fixed seed, so the resampling is reproducible); ``brier_score`` and ``Freq_forecasts`` are
    evaluated in parallel for every resample and once for the original data.

    Returns
    -------
    dict
        'Combo' and 'Clusters': datasets with the 'BS', 'BSS' and 'Sign' variables;
        'Dec_Aggr' and 'Dec_Seg': percentiles of the Brier-score decomposition;
        'Freq': percentiles of the frequencies of occurrence and of their biases.

    Notes
    -----
    ``BS_BS_Statistics`` and ``Actual_BS_Statistics`` are created as notebook-level variables,
    because the summary functions read them from there, and are deleted again before returning.
    """

    global BS_BS_Statistics, Actual_BS_Statistics

    Frcst_Data, Actuals, Persistence = subset_data(lead_days, season_used, flex_window)

    np.random.seed(10)
    BS_indices = np.random.choice(len(Actuals), len(Actuals)*bootstr) # generate all bootstrapped values
    BS_indices = np.array_split(BS_indices, bootstr) # split into the number of subsets (samples)
    BS_indices = np.array(BS_indices)

    # resampled inputs: one entry per bootstrap sample
    Frcst_Data_BS = [{i_key: Frcst_Data[i_key][i] for i_key in Frcst_Data} for i in BS_indices]
    Actuals_BS = [Actuals[i] for i in BS_indices]
    Persistence_BS = [Persistence.iloc[i] for i in BS_indices]

    # ---------- Brier scores ----------
    Inputs = list(zip(Frcst_Data_BS, Actuals_BS, Persistence_BS,
                      [lead_days]*bootstr, [season_used]*bootstr, [flex_window]*bootstr))
    # the context manager releases the worker processes even if one of the tasks raises
    with multiprocessing.Pool() as pool: # object for multiprocessing
        BS_BS_Statistics = list(pool.imap(brier_score, Inputs))

    Actual_BS_Statistics = brier_score([Frcst_Data, Actuals, Persistence, lead_days, season_used, flex_window])

    BS_Combo = BS_bootstrap_summary_statistics('Combo')
    BSS_Combo, Sign_Combo = BSS_bootstrap_summary_statistics('Combo')
    BS_Combo = xr.merge([BS_Combo, BSS_Combo, Sign_Combo])
    BS_Clusters = BS_bootstrap_summary_statistics('Clusters')
    BSS_Clusters, Sign_Clust = BSS_bootstrap_summary_statistics('Clusters')
    BS_Clusters = xr.merge([BS_Clusters, BSS_Clusters, Sign_Clust])
    BS_Dec_Ag = BS_bootstrap_summary_statistics('Dec_Aggr')
    BS_Dec_Se = BS_bootstrap_summary_statistics('Dec_Seg')

    del(BS_BS_Statistics, Actual_BS_Statistics)

    # ---------- frequencies of occurrence ----------
    Inputs = list(zip(Frcst_Data_BS, Actuals_BS, [lead_days]*bootstr, [season_used]*bootstr, [flex_window]*bootstr))
    with multiprocessing.Pool() as pool: # object for multiprocessing
        BS_data = list(pool.imap(Freq_forecasts, Inputs))

    Actual_Freqs = Freq_forecasts([Frcst_Data, Actuals, lead_days, season_used, flex_window])

    BS_data = xr.concat(BS_data, dim='bootstrap')

    # Container with the coordinates of a single sample, refilled with the percentile values.
    # (The former "BS_data[0]*0 + values" construct turned every position where the first
    # bootstrap sample is NaN/inf into NaN, whatever the percentile was.)
    Freq_template = BS_data[0]

    # get the quantiles from the actual bootstrapping statistics (NaNs are sorted to the end)
    BS_sorted = np.sort(BS_data.values, axis=0)
    Lower_Freq = Freq_template.copy(data=BS_sorted[l_m])
    Upper_Freq = Freq_template.copy(data=BS_sorted[l_M])

    # append the actual data to the full bootstraps so now the median value can be extracted
    BS_data = xr.concat([BS_data, Actual_Freqs], dim='bootstrap')
    BS_sorted = np.sort(BS_data.values, axis=0)
    Median_Freq = Freq_template.copy(data=BS_sorted[Md])

    # combine all data to the final xarrays
    dim_name = pd.Index(['BS_5', 'BS_Median', 'Actual', 'BS_95'], name='bootstrap')
    FinFreq = xr.concat([Lower_Freq, Median_Freq, Actual_Freqs, Upper_Freq], dim=dim_name)

    return {'Combo': BS_Combo, 'Clusters': BS_Clusters,
            'Dec_Aggr': BS_Dec_Ag, 'Dec_Seg': BS_Dec_Se, 'Freq': FinFreq}

In [26]:
def forecasts_seasonal_statistics(input_data):
    """Collect the bootstrapped statistics of every season for one lead time and flex window.

    :param input_data: sequence whose first item is the lead days and second item the flex window
    :return: dictionary with keys 'Brier_Clusters', 'Brier_Combo', 'Freq_Occur', 'Decomp_Aggr' and
             'Decomp_Segm', each holding the per-season results concatenated along 'Season'
    """
    lead_days, flex_window = input_data[0], input_data[1]

    # one bootstrapped evaluation per season, in the order they are stacked along 'Season'
    per_season = [
        statistics_bootstrapped(lead_days=lead_days, season_used=season_name, flex_window=flex_window)
        for season_name in ('All', 'WinterHalf', 'SummerHalf')
    ]

    # name in the returned dictionary -> name in the dictionaries given by statistics_bootstrapped
    key_mapping = {'Brier_Clusters': 'Clusters', 'Brier_Combo': 'Combo', 'Freq_Occur': 'Freq',
                   'Decomp_Aggr': 'Dec_Aggr', 'Decomp_Segm': 'Dec_Seg'}

    return {out_key: xr.concat([result[in_key] for result in per_season], dim='Season')
            for out_key, in_key in key_mapping.items()}

In [27]:
def forecasts_flexwind_statistics(lead_days):
    """Collect the seasonal statistics of every flex window in FlexWindows for one lead time.

    :param lead_days: lead time passed on to forecasts_seasonal_statistics
    :return: dictionary with keys 'Brier_Clusters', 'Brier_Combo', 'Freq_Occur', 'Decomp_Aggr' and
             'Decomp_Segm', each holding the per-window results concatenated along 'Flex_win'
    """
    # seasonal statistics for each flex window, in the order of FlexWindows
    per_window = [forecasts_seasonal_statistics([lead_days, window]) for window in FlexWindows]

    stacked = {}
    for key in ('Brier_Clusters', 'Brier_Combo', 'Freq_Occur', 'Decomp_Aggr', 'Decomp_Segm'):
        stacked[key] = xr.concat([result[key] for result in per_window], dim='Flex_win')

    return stacked

In [28]:
# evaluate each lead time separately (every call covers all flex windows and seasons)
ForecastStatistics = []
for i_lead in tqdm.tqdm(LeadDays[:]):
    ForecastStatistics.append(forecasts_flexwind_statistics(i_lead))

# stack the per-lead-time results of each statistic along the 'Lead_days' dimension
Brier_Clusters = xr.concat([stats['Brier_Clusters'] for stats in ForecastStatistics], dim='Lead_days')
Brier_Combo = xr.concat([stats['Brier_Combo'] for stats in ForecastStatistics], dim='Lead_days')
Decomp_Aggr = xr.concat([stats['Decomp_Aggr'] for stats in ForecastStatistics], dim='Lead_days')
Decomp_Segm = xr.concat([stats['Decomp_Segm'] for stats in ForecastStatistics], dim='Lead_days')
Freq_Occur = xr.concat([stats['Freq_Occur'] for stats in ForecastStatistics], dim='Lead_days').to_dataset('Var')

# flag data that have significantly different frequencies: the product of the 5% and 95% frequency
# biases is positive only when both biases have the same sign
Check = (Freq_Occur['FreqBias'].sel(bootstrap='BS_5') * Freq_Occur['FreqBias'].sel(bootstrap='BS_95')) > 0
Check = Check.rename('SignDeviations')
Freq_Occur = xr.merge([Freq_Occur, Check])

# remove the temporary variables from the notebook namespace
del ForecastStatistics, i_lead, Check

In [29]:
# store all results as netCDF files in the output folder

# brier score results
Brier_Combo.to_netcdf(path=output_dir + 'BS_Combo_0UTC.nc')
Brier_Clusters.to_netcdf(path=output_dir + 'BS_Clusters_0UTC.nc')

# decomposition results
Decomp_Aggr.to_netcdf(path=output_dir + 'Decomp_Aggr_0UTC.nc')
Decomp_Segm.to_netcdf(path=output_dir + 'Decomp_Segm_0UTC.nc')

# frequencies of occurrence (forecasts) and the FreqsAll / TransAll data
Freq_Occur.to_netcdf(path=output_dir + 'FreqOccur_Forecasts_0UTC.nc')
FreqsAll.to_netcdf(path=output_dir + 'FreqsAll_ERA5_0UTC.nc')
TransAll.to_netcdf(path=output_dir + 'TransAll_ERA5_0UTC.nc')