In [1]:
# (C) Copyright 1996- ECMWF.
#
# This software is licensed under the terms of the Apache Licence Version 2.0
# which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
# In applying this licence, ECMWF does not waive the privileges and immunities
# granted to it by virtue of its status as an intergovernmental organisation
# nor does it submit to any jurisdiction.

In [2]:
from pathlib import Path
import multiprocessing
import tqdm

import numpy as np
import pandas as pd
import xarray as xr

## User inputs

In [3]:
# Number of bootstrap resamples used for assessing the statistical significance of the results
bootstr = 1000

# Positions of the 5th percentile, the 95th percentile and the median within the sorted
# bootstrap values (used when extracting quantiles from the bootstrapping)
l_min = bootstr * 5 // 100
l_max = bootstr * 95 // 100 - 1
l_med = bootstr // 2

In [4]:
# User-defined locations (left empty here): forecast data folder and output folder
forecasts_loc = ''
out_dir = ''

# Paths of the two NetCDF files opened in the data-reading step; both are built from out_dir
era5_med = f'{out_dir}Med_LocalizedPatterns_Labels.nc'
frcsts_med = f'{out_dir}Med_LocalizedPatterns_ForecastAllocations.nc'

# Number of days added before/after the central date when building the "DayMonth"
# moving-window Patterns climatology
offset_days = 22

## Data reading

In [5]:
# add data for 9 Mediterranean clusters
era5_med = xr.open_dataarray(era5_med)
frcsts_med = xr.open_dataarray(frcsts_med)
# make the forecasts carry the same 'ClustersNumber' coordinate values as the ERA5 labels
frcsts_med = frcsts_med.assign_coords({'ClustersNumber': era5_med.ClustersNumber.values})

# highest cluster label; converted to a plain int so that range() works whatever the stored dtype
# (a 0-d float array, e.g. labels saved as floats, would make range() raise a TypeError)
cl_max = int(era5_med.max().values)
cl_all = list(range(cl_max+1)) # all cluster labels, from 0 up to cl_max
frcsts_med.name = era5_med.name

## Data analysis - auxiliary functions

In [6]:
# function for getting temporal flags based on different subsetting (used later on for climatological freqs)
def temp_flagging(valid_dates, temp_subset):
    """Return one temporal label per date, according to the requested subsetting.

    Parameters
    ----------
    valid_dates : array-like of dates
        Converted with ``pd.to_datetime``; expected to yield a DatetimeIndex.
    temp_subset : str
        One of 'All', 'HalfYear', 'Season', 'Month' or 'DayMonth'.

    Returns
    -------
    list, pandas.Index or numpy.ndarray
        'All' gives a list of 'All' strings; 'HalfYear', 'Season' and 'Month' give a pandas Index
        of labels; 'DayMonth' gives a numpy array of 'mmdd' strings.

    Raises
    ------
    ValueError
        If `temp_subset` is not one of the supported options.
    """

    valid_dates = pd.to_datetime(valid_dates)

    if temp_subset == 'All':
        return ['All']*len(valid_dates)

    if temp_subset in ('HalfYear', 'Season'):
        # 1: Dec-Feb, 2: Mar-May, 3: Jun-Aug, 4: Sep-Nov
        season_index = (valid_dates.month%12 + 3)//3
        if temp_subset == 'HalfYear':
            labels = {1: 'WinterHalf', 2: 'SummerHalf', 3: 'SummerHalf', 4: 'WinterHalf'}
        else:
            labels = {1: 'Winter', 2: 'Spring', 3: 'Summer', 4: 'Autumn'}
        return season_index.map(labels)

    if temp_subset == 'Month':
        return valid_dates.month.astype(str)

    if temp_subset == 'DayMonth':
        return valid_dates.strftime('%m%d').to_numpy()

    # previously an unknown option fell through and failed with an obscure UnboundLocalError
    raise ValueError(f"Unknown temp_subset '{temp_subset}'; expected one of "
                     "'All', 'HalfYear', 'Season', 'Month', 'DayMonth'")

In [7]:
# Positions, within a full calendar of 'mmdd' strings, of the first and last day of the Summer Half
# (16th April - 15th October, both dates included); the period is based on the climatological
# frequencies of the Mediterranean patterns. A leap year is used so that every date is present.
sorted_dates = np.array(pd.date_range(start='20040101', end='20041231', freq='D').strftime('%m%d'))
start_summerhalf = np.flatnonzero(sorted_dates == '0416')
end_summerhalf = np.flatnonzero(sorted_dates == '1015')

In [8]:
def clim_freqs(dataset):
    """Fraction of time steps of `dataset` assigned to each cluster label.

    For every label from 0 to the module-level ``cl_max`` the share of entries equal to that label
    is averaged over the 'time' dimension; the results are stacked along a new 'cluster' dimension.
    """

    cluster_ids = range(cl_max+1)

    per_cluster = []
    for cluster_id in cluster_ids:
        is_cluster = dataset==cluster_id
        per_cluster.append(is_cluster.mean('time'))

    return xr.concat(per_cluster, dim=pd.Index(cluster_ids, name='cluster'))

In [9]:
def daymonthfreqs(input_data):
    """Cluster frequencies for one calendar day, using a moving window of neighbouring days.

    `input_data` is a ``(dataset, daymonth_used)`` pair, where `daymonth_used` is an 'mmdd' string.
    The function reads the module-level ``all_dates_atm_var``, ``all_dates_atm_var_extd`` and
    ``offset_days``. The frequencies come from ``clim_freqs`` and are returned with the 'time'
    coordinate set to `daymonth_used`.
    """

    dataset, daymonth_used = input_data

    # all occurrences of the requested calendar day within the extended period
    is_central_day = all_dates_atm_var_extd.strftime('%m%d').isin([daymonth_used])
    central_dates = all_dates_atm_var_extd[is_central_day]

    # widen every occurrence by offset_days on both sides, for a more robust climatology
    half_window = pd.DateOffset(days=offset_days)
    window_dates = []
    for central in central_dates:
        window_dates.extend(pd.date_range(central-half_window, central+half_window, freq='D'))
    window_dates = np.array(window_dates) # 1-d array with the dates of all windows

    # drop the dates that are not available in the actual data
    is_available = pd.to_datetime(window_dates).isin(all_dates_atm_var)
    window_dates = window_dates[is_available]

    # frequencies of the clusters over the retained dates
    window_freqs = clim_freqs(dataset.sel(time=window_dates))

    return window_freqs.assign_coords({'time': daymonth_used})

In [12]:
# get climatological frequencies (for DayMonth and considering All data)
all_dates_atm_var = pd.to_datetime(era5_med.time.values)
# extend the period by one year on each side, so that the moving windows around dates close to
# the start/end of the data can still be built (non-existing dates are dropped in daymonthfreqs)
all_dates_atm_var_extd = pd.date_range(all_dates_atm_var[0] - pd.DateOffset(years=1), 
                                       all_dates_atm_var[-1] + pd.DateOffset(years=1))
unique_daymonth = all_dates_atm_var.strftime('%m%d')
unique_daymonth = sorted(set(unique_daymonth))

# the context manager releases the worker processes even when a task fails; all results are
# collected by list() before the pool is terminated on exit
with multiprocessing.Pool() as pool: # object for multiprocessing
    era5_freqs = list(tqdm.tqdm(pool.imap(daymonthfreqs, [(era5_med, i) for i in unique_daymonth]), 
                                total=len(unique_daymonth), position=0, leave=True))
era5_freqs = xr.concat(era5_freqs, dim='time')

era5_freqs = xr.concat([clim_freqs(era5_med).assign_coords({'time': 'All'}), era5_freqs], dim='time') # all data
era5_freqs = era5_freqs.expand_dims({'clim_type': frcsts_med.clim_type.values}) # expand for same dims as frcsts

del(pool, all_dates_atm_var, all_dates_atm_var_extd, unique_daymonth)

100%|█████████████████████████████████████████| 366/366 [00:07<00:00, 46.63it/s]


In [11]:
def brier_score(dates_used):
    """Compute the Brier score (BS) and Brier skill score (BSS) for a set of valid dates.

    Relies on module-level state: ``era5_lead_all`` and ``frcst_lead_all`` (both set by
    ``brier_score_bootstrap``), ``era5_freqs`` (climatological cluster frequencies) and ``cl_all``.

    Parameters
    ----------
    dates_used : sequence of dates
        Labels selected along 'time' of both the observations and the forecasts; it may contain
        repeated dates, as the bootstrap samples are drawn with replacement.

    Returns
    -------
    xarray.Dataset
        Variables 'BS' and 'BSS' with (among others) the dimensions 'Season' ('All', 'Winter',
        'Summer'), 'forecast_type' ('frcst_fair', 'frcst', 'ref1', 'ref2') and 'cluster'; the
        cluster labelled -1 holds the average over clusters, weighted by the number of observed
        occurrences of each cluster.
    """

    observations_raw = era5_lead_all.sel(time=dates_used) # keep only common dates
    # drop_vars replaces the deprecated DataArray.drop (same result, no deprecation warning)
    forecasts_raw = frcst_lead_all.sel(time=dates_used).drop_vars('step') # keep only common dates
   
    # convert era5 and forecasts to boolean for each cluster
    observ = xr.concat([observations_raw==i_cl for i_cl in cl_all], dim=pd.Index(cl_all, name='cluster'))*1
    forecasts = xr.concat([forecasts_raw==i_cl for i_cl in cl_all], dim=pd.Index(cl_all, name='cluster'))*1 

    # generate reference forecasts
    # reference 1: frequencies from all data, repeated for every date
    ref_frcst_1 = era5_freqs.sel(time=['All']*len(observ.time))
    ref_frcst_1 = ref_frcst_1.assign_coords({'time': observ.time.values})

    # reference 2: frequencies of the calendar day ('mmdd') of every date
    ref_frcst_2 = era5_freqs.sel(time=temp_flagging(observ.time.values, 'DayMonth'))
    ref_frcst_2 = ref_frcst_2.assign_coords({'time': observ.time.values})
           
    # get brier score
    dim_name = pd.Index(['frcst_fair', 'frcst', 'ref1', 'ref2'], name='forecast_type')
    brier_ref1_all = ((observ-ref_frcst_1)**2) # calculate brier score of ref forecast 1
    brier_ref2_all = ((observ-ref_frcst_2)**2) # calculate brier score of ref forecast 2
    brier_frcst_all = ((observ-forecasts.mean('number'))**2) # calculate brier score of forecasts
    n_members = len(forecasts.number.values)
    # number of members that forecast the observed cluster (no 'cluster' dimension is left)
    # NOTE(review): this single count is used for the adjustment of every cluster; confirm that a
    # per-cluster member count was not intended for the fair score
    correct_counts = (forecasts==observ).sum('number').where(observ).sum('cluster')
    fair_adjustment_all = correct_counts*(n_members-correct_counts)/n_members**2/(n_members-1)
    fair_brier_frcst_all = brier_frcst_all - fair_adjustment_all
    
    # "Winter"/"Summer" are half-years defined by these months
    all_months = list(range(1,13))
    winter_months = [1,2,9,10,11,12]
    summer_months = [3,4,5,6,7,8]
    
    obs_months = pd.to_datetime(observ.time.values).month # same for all seasons, so computed once
    
    brier_all = []
    for i_mon, i_season in zip([all_months, winter_months, summer_months], ['All', 'Winter', 'Summer']):
        i_dates_kept = obs_months.isin(i_mon)
        brier_ref1 = brier_ref1_all.isel(time=i_dates_kept).mean('time')
        brier_ref2 = brier_ref2_all.isel(time=i_dates_kept).mean('time')
        brier_frcst = brier_frcst_all.isel(time=i_dates_kept).mean('time')
        fair_brier_frcst = fair_brier_frcst_all.isel(time=i_dates_kept).mean('time')

        brier_season = [fair_brier_frcst, brier_frcst, brier_ref1, brier_ref2]
        brier_season = xr.concat(brier_season, dim=dim_name).assign_coords({'Season': i_season})
        brier_all.append(brier_season)
        
    brier_all = xr.concat(brier_all, dim='Season')
    brier_all.name = 'BS'
    
    # get brier skill score (skill is measured against the better, i.e. lower-BS, reference)
    brier_ref_min = brier_all.sel(forecast_type=['ref1', 'ref2']).min('forecast_type')
    brier_skill_all = 1-brier_all/brier_ref_min
    brier_skill_all.name = 'BSS'
    
    brier_all = xr.merge([brier_all, brier_skill_all]) # combine brier score and brier skill score
    
    # calculate aggregated brier results and combine with the cluster-specific results
    instances_all = observ.sum('time') # count total instances of each cluster at observations
    brier_all_mean = brier_all.weighted(instances_all).mean('cluster').assign_coords({'cluster': -1})
    brier_all = xr.concat([brier_all_mean, brier_all], dim='cluster')

    return brier_all    

In [13]:
def brier_score_bootstrap(lead_used):
    """Brier score statistics for one lead time, including bootstrap-based uncertainty.

    Parameters
    ----------
    lead_used : int
        Lead time in days; selected along 'step' of the module-level ``frcst`` and added to the
        forecast initiation time for getting the valid time.

    Returns
    -------
    xarray.Dataset
        'BS' and 'BSS' along a 'bootstrap' dimension labelled 'Q5', 'Q95', 'Actual' and 'Q50', plus
        'BSS_Sign', the fraction of samples (actual data included) for which BSS is positive.

    Notes
    -----
    Reads the module-level ``era5``, ``frcst``, ``bootstr``, ``l_min``, ``l_max`` and ``l_med``.
    ``era5_lead_all``, ``frcst_lead_all`` and ``step_used`` are published as globals for
    ``brier_score`` and are removed again when the function exits, also in case of an error.
    """
    
    global era5_lead_all, frcst_lead_all, step_used # global variables for using on functions
    try:
        step_used = lead_used
        era5_lead_all = era5
        frcst_lead_all = frcst.sel(step=lead_used) # subset lead day of interest for forecasts
        # convert the time (which is initiation time) to the actual valid time, for proper comparison with obs.
        frcst_lead_all = frcst_lead_all.assign_coords({'time': frcst_lead_all.time+np.timedelta64(step_used, 'D')})

        common_dates = set(frcst_lead_all.time.values) & set(era5_lead_all.time.values) # get common dates
        common_dates = sorted(common_dates) # convert to sorted list
        common_dates = pd.to_datetime(common_dates)
        summer = common_dates[common_dates.month.isin([3,4,5,6,7,8])]
        winter = common_dates[common_dates.month.isin([1,2,9,10,11,12])]
        
        # generate bootstrap dates, with same number of winter/summer for all subsets for getting also seasonal stats
        bbs_dates = []
        for i_season in [winter, summer]:
            # NOTE(review): the same seed is set again for each season, so both seasons draw from the
            # same random stream; kept as is for reproducing existing results - confirm it is intended
            np.random.seed(10)
            bbs_i_season = np.random.choice(i_season, len(i_season)*bootstr) # generate all bootstrap dates
            bbs_i_season = np.array_split(bbs_i_season, bootstr) # split into number of subsets (samples)
            bbs_i_season = np.insert(np.array(bbs_i_season), 0, np.array(i_season), axis=0) # add actual at 1 place
            bbs_dates.append(bbs_i_season)

        final_bbs_dates = []
        for i_bss in range(bootstr+1): # +1 because the original dates are appended before
            i_dates = np.concatenate((bbs_dates[0][i_bss], bbs_dates[1][i_bss]))
            final_bbs_dates.append(sorted(i_dates))
        
        brier_score_all_bootstraps = [brier_score(i_dates) for i_dates in final_bbs_dates]
        bootstrap_dim = pd.Index(range(bootstr+1), name='bootstrap')
        brier_score_all_bootstraps = xr.concat(brier_score_all_bootstraps, dim=bootstrap_dim)
        
        # process bootstraps for getting the results from the Q5, Q95, Median, and Actual bootstraps
        data_final = brier_score_all_bootstraps.to_array().rename({'variable': 'Indicator'})
        data_final = data_final.transpose(..., 'bootstrap') # bootstrap as last axis, for the sorting below
        
        # get percent of bootstraps that BSS is positive (for checking significance of results)
        sign = (data_final.sel(Indicator='BSS')>0).sum('bootstrap')/(bootstr+1)
        sign = sign.expand_dims('Indicator').to_dataset('Indicator')
        sign = sign.rename({'BSS': 'BSS_Sign'})
        
        # get the quantiles of interest based on bootstraps (actual data, at position 0, excluded)
        data_quant = np.sort(data_final.isel(bootstrap=data_final.bootstrap>0), axis=-1)[..., [l_min, l_max]]
        # wrap the values in a DataArray with the right dims/coords; copy(data=...) is used instead of
        # "template*0 + values", which would turn the result into NaN wherever the template is NaN/inf
        data_quant = data_final.isel(bootstrap=slice(0, 2)).copy(data=data_quant)
        data_quant = data_quant.assign_coords({'bootstrap': ['Q5', 'Q95']})

        # add results from original analysis
        data_actual = data_final.isel(bootstrap=0).assign_coords({'bootstrap': 'Actual'})

        # get median value based on bootstraps and actual data (so median is actual sorted index)
        data_median = np.sort(data_final, axis=-1)[..., l_med]
        data_median = data_final.isel(bootstrap=0).copy(data=data_median)
        data_median = data_median.assign_coords({'bootstrap': 'Q50'})

        data_final = xr.concat([data_quant, data_actual, data_median], dim='bootstrap')
        data_final = data_final.to_dataset('Indicator')
        
        data_final = xr.merge([data_final, sign])
        
        return data_final
    finally:
        # remove the temporary globals whatever happened above (pop tolerates names not yet set)
        for i_name in ('era5_lead_all', 'frcst_lead_all', 'step_used'):
            globals().pop(i_name, None)

In [14]:
# keep only the entries whose 'number' coordinate is non-negative
# NOTE(review): presumably negative numbers flag non-member entries - confirm against the input file
frcst = frcsts_med.isel(number=(frcsts_med.number.values>=0))
# express the lead time ('step') as an integer number of days
frcst = frcst.assign_coords({'step': (frcst.step.values/np.timedelta64(1, 'D')).astype(int)})
steps_all = frcst.step.values
era5 = era5_med

# the context manager releases the worker processes even when a task fails; all results are
# collected by list() before the pool is terminated on exit
with multiprocessing.Pool() as pool: # object for multiprocessing
    bs_final = list(tqdm.tqdm(pool.imap(brier_score_bootstrap, steps_all), 
                              total=len(steps_all), position=0, leave=True))
bs_final = xr.concat(bs_final, dim=pd.Index(steps_all, name='step'))
del(pool)

bs_final.to_netcdf(out_dir+'Med_LocalizedPatterns_ForecastingBrier.nc')
del(frcst, steps_all, era5)

100%|███████████████████████████████████████████| 44/44 [43:49<00:00, 59.75s/it]
