# Examine the effect of the number of ensemble members on simulated interannual variability

### Author: Chris Wyburn-Powell, [github](https://github.com/chrisrwp/synthetic-ensemble/SIA/Time_period_and_sigma.ipynb)

**Input**: <br>
Sea ice area for 6 CLIVAR LE models (historical and RCP8.5 simulations) and for observations from HadISST1. <br>

**Output**: <br>
- $\sigma_{LE}$, $\sigma_{mem}$ and $\sigma_{obs}$ subsampled or bootstrapped to different ensemble sizes:
    - $\sigma_{LE}$ and $\sigma_{mem}$ bootstrapped 2-maximum members 1000 times
    - $\sigma_{mem}$ Subsampled scaled to 20 and 50 members 10,000 times
    - $\sigma_{mem}$ Subsampled 2-maximum members 100 times
- **Figures S1 and S2**

In [17]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.path as mpath
import matplotlib.cm as cm
from matplotlib.lines import Line2D
from matplotlib.patches import Patch
import matplotlib.patheffects as pe
import xarray as xr
import datetime
# Timestamp the run in UTC. A timezone-aware "now" replaces the deprecated
# datetime.utcnow(); the format string has no offset field, so the printed
# text has the same form as before.
print(datetime.datetime.now(datetime.timezone.utc).strftime("%H:%M UTC %a %Y-%m-%d"))

14:05 UTC Tue 2022-04-26


In [18]:
#full English month names; month_names[m-1] is calendar month m
month_names = [
    'January', 'February', 'March', 'April',
    'May', 'June', 'July', 'August',
    'September', 'October', 'November', 'December',
]

#one row per large ensemble:
#(dataset key, display name, number of members, start year, plot colour)
_model_table = [
    ('CanESM2',    'CanESM2',     50,  1950, 'm'),
    ('CESM1',      'CESM1',       40,  1920, 'b'),
    ('CSIRO_MK36', 'CSIRO MK3.6', 30,  1850, 'g'),
    ('GFDL_CM3',   'GFDL CM3',    20,  1920, 'orange'),
    ('GFDL_ESM2M', 'GFDL ESM2M',  30,  1950, 'k'),
    ('MPI_ESM1',   'MPI ESM1',    100, 1850, 'tab:olive'),
]

#unzip the table into the parallel lists used throughout the notebook
model_names, model_print_names, mem_len, start_yrs, colors = (
    list(column) for column in zip(*_model_table))

obs_names = ['HadISST1']

# Subsample the members

In [5]:
#open the individually-detrended SIA dataset
detrended_ind_file = (
    '/glade/campaign/univ/ucub0084/Synthetic_ensemble/SIA/SIA_detrended/'
    +'CLIVAR_SIA_detrended_individual_79-20.nc')
CLIVAR_det_ind = xr.open_dataset(detrended_ind_file)
#ensemble-detrended alternative (currently disabled):
# CLIVAR_det_ens = xr.open_dataset(
#     path+'SIA/SIA_detrended/CLIVAR_SIA_detrended_ensemble_79-20.nc')

#group the time axis by calendar month, then take the standard deviation
#along time within each month
monthly_groups = CLIVAR_det_ind.groupby('time.month')
ind_SD = monthly_groups.std('time')
# ens_SD = CLIVAR_det_ens.groupby('time.month').std('time')

## $\sigma_{LE}$

In [6]:
def bootstrap_month_model(sd_time, mem_n, resamp_n, replace=False):
    '''
    Resample ensemble members and average the spread across the drawn 
    members, for every ensemble size from 2 to mem_n.
    
    For each ensemble size, `resamp_n` random sets of members are drawn by 
    position from `sd_time`, the standard deviation across the drawn members 
    is computed, and the mean over all draws is recorded.
    
    Parameters
    ----------
    sd_time : xarray.DataArray,
        Values with a 'member' dimension, e.g. the standard deviation with 
        respect to time for each member
    mem_n : integer,
        Number of ensemble members to draw from (positions 0 to mem_n-1 of 
        the 'member' dimension); also the largest ensemble size evaluated
    resamp_n : int
        Number of random resamplings for each ensemble size
    replace : bool, optional
        False (default) subsamples without replacement, True bootstraps 
        with replacement. Every member position 0 to mem_n-1 can be drawn 
        in both modes.
    
    Returns
    ----------
        1D xarray.DataArray object of average standard deviation values by 
        number of resampled members, dimension 'n_members' with coordinate 
        values 2 to mem_n
    '''  
    
    #ensemble sizes to evaluate and the member positions available to draw
    member_counts = np.arange(2,mem_n+1,1)
    member_pool   = np.arange(0,mem_n)
    
    SD_LE = []
    
    #loop through 2 to the maximum number of members
    for mem_n_i in member_counts: 

        temp_SD_list = [] #record the spread from every resampling

        for resamp_i in range(resamp_n):
            #random member positions between 0 and the last member element
            rand_i_list = np.random.choice(member_pool, size=mem_n_i, 
                                           replace=replace) 
            
            temp_SD_list.append(sd_time.isel(member=rand_i_list).std('member'))

        #average over all resamplings for this ensemble size
        SD_LE.append(np.mean(temp_SD_list))

    #make an xarray object with properly named coordinates for members
    SD_LE = xr.DataArray(data   = SD_LE,
                         coords = {'n_members':member_counts},
                         dims   = ['n_members'])
    
    return SD_LE

In [7]:
#resample the per-member standard deviations for March and September, one
#model at a time
#(the ensemble-detrended variant would repeat this with ens_SD in place of
# ind_SD and collect the results in boostrapped_ens)
n_resamples = 1000
boostrapped_ind = []

for month_ in [3,9]:
    print(datetime.datetime.now(), month_)
    month_ind = []

    for model_i, model_name in enumerate(model_names):
        print(datetime.datetime.now(), model_name)
        #standard deviations of this model for the chosen month
        sd_this_month = ind_SD[model_name].sel(month=month_)
        month_ind.append(bootstrap_month_model(
            sd_this_month, mem_len[model_i], n_resamples))

    #stack the models of this month along a new 'model' dimension
    boostrapped_ind.append(xr.concat(month_ind, dim='model'))

#stack the two months, then label the 'month' and 'model' dimensions
boostrapped_ind = xr.concat(boostrapped_ind, dim='month')
boostrapped_ind['month'] = [3,9]
boostrapped_ind['model'] = model_names

## $\sigma_{mem}$

In [9]:
#load sigma_mem
#NOTE(review): `path` is not assigned in any cell visible in this notebook -
#confirm it is defined before this cell is run
sigma_mem_file = path+'SIA/SIA_resampled/Sigma_mem_individual_10000.nc'
sigma_mem = xr.open_dataset(sigma_mem_file)

In [16]:
#resample sigma_mem for March and September, one model at a time
n_resamples = 1000
boostrapped_mem = []

for month_ in [3,9]:
    print(datetime.datetime.now(), month_)
    month_mem = []

    for model_i, model_name in enumerate(model_names):
        print(datetime.datetime.now(), model_name)
        #sigma_mem values of this model for the chosen month
        sigma_this_month = sigma_mem[model_name].sel(month=month_)
        month_mem.append(bootstrap_month_model(
            sigma_this_month, mem_len[model_i], n_resamples))

    #stack the models of this month along a new 'model' dimension
    boostrapped_mem.append(xr.concat(month_mem, dim='model'))

#stack the two months, then label the 'month' and 'model' dimensions
boostrapped_mem = xr.concat(boostrapped_mem, dim='month')
boostrapped_mem['month'] = [3,9]
boostrapped_mem['model'] = model_names

In [15]:
#NOTE(review): these three statements repeat the end of the previous cell
#(In [16]); this cell's execution count (15) is lower, so it looks like a
#leftover from an earlier run - confirm whether it is still needed
#join along a 'month' dimension and label the 'month' and 'model' dimensions
boostrapped_mem = xr.concat((boostrapped_mem), dim='month')
boostrapped_mem['month'] = [3,9]
boostrapped_mem['model'] = model_names

# Figure S2 - Subsampled ensemble size on $\sigma_{LE}$

In [1]:
#1979-2020
#select the data to plot
data = boostrapped_ind.copy() #change for ind or ens

#x values are read from the data's own coordinate so that they always have 
#the same length as the plotted values
n_members = data['n_members'].values

#tick positions are set explicitly before their labels, otherwise the labels
#are attached to whatever ticks matplotlib chose automatically
sigma_ticks       = np.round(np.arange(2,9)/100, 2) #0.02, 0.03, ..., 0.08
deriv_ticks       = np.arange(0,0.013,0.002)
member_ticks_wide = np.arange(5,51,5)
member_ticks_zoom = np.arange(2,21,2)

fig, axes = plt.subplots(2,2,figsize=[15,8])

#one row per month: left panel sigma LE, right panel its derivative with 
#respect to the number of members
for i, month_ in enumerate([3,9]):
    ax_sigma, ax_deriv = axes[i]
    
    for model_i, model_name in enumerate(model_names):
        sigma_line = data.sel(model=model_name).sel(month=month_)
        ax_sigma.plot(n_members, sigma_line, c=colors[model_i], linewidth=2.5)
        ax_deriv.plot(n_members, np.gradient(sigma_line), c=colors[model_i], 
                      linewidth=2.5)
        
    ax_sigma.set_ylim(0.018,0.086) #max 0.083 for ind, 0.086 for ens
    ax_sigma.set_ylabel(r'$\sigma_{LE} \ SIA \ [10^6 \ km^2]$', fontsize=20);
    ax_sigma.set_yticks(sigma_ticks)
    ax_sigma.set_yticklabels(sigma_ticks, fontsize=16)
    ax_sigma.set_xlim(2,51)
    ax_sigma.set_xticks(member_ticks_wide)
    ax_sigma.set_xticklabels(np.round(member_ticks_wide), fontsize=16)
    ax_sigma.set_title(['March','September'][i], fontsize=22)
    ax_deriv.set_title(['March','September'][i], fontsize=22)
    ax_sigma.grid()
        
    ax_deriv.axhline(0, c='0.5', linewidth=0.25)
    ax_deriv.set_ylabel(r'$\sigma_{LE}$  Derivative', fontsize=20);
    ax_deriv.set_ylim(-0.0005,0.013)
    ax_deriv.set_yticks(deriv_ticks)
    ax_deriv.set_yticklabels(['0', '0.002', '0.004', '0.006', '0.008', 
                              '0.010', '0.012'], fontsize=16)
    ax_deriv.set_xlim(2,20)
    ax_deriv.set_xticks(member_ticks_zoom)    
    ax_deriv.set_xticklabels(np.round(member_ticks_zoom), fontsize=16)   
    ax_deriv.grid()
    
#legend entries use the display names so they agree with model_print_names
legend_elements = [Patch(facecolor=color, label=print_name)
                   for color, print_name in zip(colors, model_print_names)]

extra_legend = plt.legend(handles=legend_elements, bbox_to_anchor=(-0.11, -0.41),
                          loc='lower center', borderaxespad=0, ncol=6, 
                          fontsize=16)
plt.gca().add_artist(extra_legend);

axes[1][0].set_xlabel('Number of members', fontsize=20)
axes[1][1].set_xlabel('Number of members', fontsize=20)

#panel letters a-d, placed just outside the top-left corner of each panel
for panel_ax, panel_letter in zip(axes.flatten(), ['a', 'b', 'c', 'd']):
    panel_ax.text(-0.06, 1.08, panel_letter, transform=panel_ax.transAxes, 
                  fontsize=20, fontweight='bold', va='top', ha='right');

plt.tight_layout()
fig.subplots_adjust(bottom=0.2)
#change file name for ind or ens
fig.savefig('/glade/scratch/cwpowell/Synthetic_ensemble_revisions/SIA/figures/'\
            +'Member_number_and_sigma/Subsampling_Sigma_LE.pdf', 
            bbox_inches='tight') 
# fig.savefig('/glade/scratch/cwpowell/Synthetic_ensemble/SIA/figures/'\
    # +'Member_number_and_sigma/Bootstrapping_ind_larger.png', 
    # dpi=400, bbox_inches='tight')

## Bootstrap $\sigma_{mem}$ and $\sigma_{obs}$ 10,000 times for $n$ ensemble members for all models

In [5]:
#values to resample (swap the commented line to use Mu instead of Sigma)
#NOTE(review): `path` is not assigned in any cell visible in this notebook -
#confirm it is defined before this cell is run
sigma_mem = xr.open_dataset(path+'SIA/SIA_resampled/Sigma_mem_individual_10000.nc')
# sigma_mem = xr.open_dataset(path+'SIA/SIA_resampled/Mu_mem_individual_10000.nc')

mem_n = 50 #number of members drawn in every bootstrap sample

boostrapped_min = {}
boostrapped_max = {}

for model_i, model_name in enumerate(model_names):
    print(datetime.datetime.now(), model_name)
    n_available = mem_len[model_i]
    month_min = []
    month_max = []

    for month_ in np.arange(1,13):
        if month_ %6 == 0: print(datetime.datetime.now(), month_)

        #all members of this model for the current month
        month_values = sigma_mem[model_name].sel(month=month_)

        boot_mins = []
        boot_maxs = []
        for boot_i in range(10000):
            #random member labels from 1 to n_available, repeats allowed
            drawn_labels = np.random.randint(1,n_available+1,mem_n)
            temp_mems = month_values.sel(member=drawn_labels)
            boot_mins.append(temp_mems.min())
            boot_maxs.append(temp_mems.max())

        #average the minimum and the maximum over all bootstrap samples
        all_mins = xr.concat(boot_mins, dim='bootstrap_i')
        all_maxs = xr.concat(boot_maxs, dim='bootstrap_i')
        month_min.append(all_mins.mean('bootstrap_i'))
        month_max.append(all_maxs.mean('bootstrap_i'))

    boostrapped_min[model_name] = xr.concat(month_min, dim='month')
    boostrapped_max[model_name] = xr.concat(month_max, dim='month')

In [162]:
#save to NetCDF: one dataset for the minima and one for the maxima, with one
#variable per model
#NOTE(review): the file names say 1000 but the bootstrap loop in the cell
#above runs range(10000) - confirm which is intended
min_file = ('/glade/scratch/cwpowell/Synthetic_ensemble_revisions/SIA/bootstrap_50_'
            +'members_1000_sigma_mem_min.nc')
max_file = ('/glade/scratch/cwpowell/Synthetic_ensemble_revisions/SIA/bootstrap_50_'
            +'members_1000_sigma_mem_max.nc')

boostrapped_min_xr = xr.Dataset(boostrapped_min)
boostrapped_max_xr = xr.Dataset(boostrapped_max)

boostrapped_min_xr.to_netcdf(min_file)
boostrapped_max_xr.to_netcdf(max_file)

## Bootstrap $\sigma$ or $\mu$ for the full range of members

In [2]:
#load sigma_mem (swap the commented line to use Mu instead of Sigma)
#NOTE(review): `path` is not assigned in any cell visible in this notebook -
#confirm it is defined before this cell is run
sigma_mem = xr.open_dataset(path+'SIA/SIA_resampled/Sigma_mem_individual_10000.nc')
# sigma_mem = xr.open_dataset(path+'SIA/SIA_resampled/Mu_mem_individual_10000.nc')

boostrapped_min = {}
boostrapped_max = {}

for model_i, model_name in enumerate(model_names):
    print(datetime.datetime.now(), model_name)
    n_available = mem_len[model_i]
    month_min = []
    month_max = []

    for month_ in np.arange(1,13):
        if month_ %4 == 0: print(datetime.datetime.now(), month_)

        #all members of this model for the current month
        month_values = sigma_mem[model_name].sel(month=month_)

        mem_mins = []
        mem_maxs = []
        #every sample size from 2 up to the full ensemble
        for mem_n in np.arange(2,n_available+1):

            boot_mins = []
            boot_maxs = []
            for boot_i in range(1000):
                #random member labels from 1 to n_available, repeats allowed
                drawn_labels = np.random.randint(1,n_available+1,mem_n)
                temp_mems = month_values.sel(member=drawn_labels)
                boot_mins.append(temp_mems.min())
                boot_maxs.append(temp_mems.max())

            #average the minimum and the maximum over all bootstrap samples
            all_mins = xr.concat(boot_mins, dim='bootstrap_i')
            all_maxs = xr.concat(boot_maxs, dim='bootstrap_i')
            mem_mins.append(all_mins.mean('bootstrap_i'))
            mem_maxs.append(all_maxs.mean('bootstrap_i'))

        #one entry per sample size along 'member'
        month_min.append(xr.concat(mem_mins, dim='member'))
        month_max.append(xr.concat(mem_maxs, dim='member'))

    boostrapped_min[model_name] = xr.concat(month_min, dim='month')
    boostrapped_max[model_name] = xr.concat(month_max, dim='month')

In [134]:
#label the 'member' dimension with the sample sizes 2 to the model's member
#count, then save to NetCDF
for model_i, model_name in enumerate(model_names):
    for boot_dict in (boostrapped_min, boostrapped_max):
        boot_dict[model_name]['member'] = np.arange(2,mem_len[model_i]+1)

min_file = ('/glade/scratch/cwpowell/Synthetic_ensemble_revisions/SIA/'
            +'bootstrap_members_1000_sigma_mem_min.nc')
max_file = ('/glade/scratch/cwpowell/Synthetic_ensemble_revisions/SIA/'
            +'bootstrap_members_1000_sigma_mem_max.nc')

boostrapped_min_xr = xr.Dataset(boostrapped_min)
boostrapped_max_xr = xr.Dataset(boostrapped_max)

boostrapped_min_xr.to_netcdf(min_file)
boostrapped_max_xr.to_netcdf(max_file)

In [136]:
#open from NetCDF: bootstrapped minima and maxima for mu and for sigma
mu_min_file = ('/glade/scratch/cwpowell/Synthetic_ensemble_revisions/SIA/'
               +'bootstrap_members_1000_mu_mem_min.nc')
mu_max_file = ('/glade/scratch/cwpowell/Synthetic_ensemble_revisions/SIA/'
               +'bootstrap_members_1000_mu_mem_max.nc')
mu_min_boot = xr.open_dataset(mu_min_file)
mu_max_boot = xr.open_dataset(mu_max_file)

sigma_min_file = ('/glade/scratch/cwpowell/Synthetic_ensemble_revisions/SIA/'
                  +'bootstrap_members_1000_sigma_mem_min.nc')
sigma_max_file = ('/glade/scratch/cwpowell/Synthetic_ensemble_revisions/SIA/'
                  +'bootstrap_members_1000_sigma_mem_max.nc')
sigma_min_boot = xr.open_dataset(sigma_min_file)
sigma_max_boot = xr.open_dataset(sigma_max_file)

#sigma_obs file
sigma_obs_file = ('/glade/scratch/cwpowell/Synthetic_ensemble/SIA/'
                  +'SIA_resampled/Sigma_obs_individual_10000.nc')
sigma_obs = xr.open_dataset(sigma_obs_file)

## Subsample (without replacement) to 20 members
**This is for comparing across models with different ensemble sizes, used in Figure S5**

In [3]:
#values to subsample (swap the commented line to use Sigma instead of Mu)
#NOTE(review): `path` is not assigned in any cell visible in this notebook -
#confirm it is defined before this cell is run
# mem_values = xr.open_dataset(path+'SIA/SIA_resampled/Sigma_mem_individual_10000.nc')
mem_values = xr.open_dataset(path+'SIA/SIA_resampled/Mu_mem_individual_10000.nc')

subsample_n = 20 #number of distinct members in every subsample

subsample_min = {}
subsample_max = {}

for model_i, model_name in enumerate(model_names):
    print(datetime.datetime.now(), model_name)
    #member labels 1 to the model's member count
    member_labels = np.arange(1,mem_len[model_i]+1)
    month_min = []
    month_max = []

    for month_ in np.arange(1,13):
        if month_ %6 == 0: print(datetime.datetime.now(), month_)

        #all members of this model for the current month
        month_values = mem_values[model_name].sel(month=month_)

        mins = []
        maxs = []
        for boot_i in range(1000):
            #draw subsample_n different member labels
            drawn_labels = np.random.choice(member_labels, subsample_n, 
                                            replace=False)
            temp_mems = month_values.sel(member=drawn_labels)
            mins.append(temp_mems.min())
            maxs.append(temp_mems.max())

        #average the minimum and the maximum over all subsamples
        all_mins = xr.concat(mins, dim='subsample_i')
        all_maxs = xr.concat(maxs, dim='subsample_i')
        month_min.append(all_mins.mean('subsample_i'))
        month_max.append(all_maxs.mean('subsample_i'))

    subsample_min[model_name] = xr.concat(month_min, dim='month')
    subsample_max[model_name] = xr.concat(month_max, dim='month')

In [19]:
#save to NetCDF: one dataset for the minima and one for the maxima, with one
#variable per model
min_file = ('/glade/scratch/cwpowell/Synthetic_ensemble_revisions/SIA/subsample_20_'
            +'members_1000_mu_mem_min.nc')
max_file = ('/glade/scratch/cwpowell/Synthetic_ensemble_revisions/SIA/subsample_20_'
            +'members_1000_mu_mem_max.nc')

subsample_min_xr = xr.Dataset(subsample_min)
subsample_max_xr = xr.Dataset(subsample_max)

subsample_min_xr.to_netcdf(min_file)
subsample_max_xr.to_netcdf(max_file)

## Subsample for 2 - 100 members

In [21]:
#values to subsample (swap the commented line to use Mu instead of Sigma)
#NOTE(review): `path` is not assigned in any cell visible in this notebook -
#confirm it is defined before this cell is run
mem_values = xr.open_dataset(path+'SIA/SIA_resampled/Sigma_mem_individual_10000.nc')
# mem_values = xr.open_dataset(path+'SIA/SIA_resampled/Mu_mem_individual_10000.nc')

subsample_min = {}
subsample_max = {}

for model_i, model_name in enumerate(model_names):
    print(datetime.datetime.now(), model_name)
    n_available = mem_len[model_i]
    #member labels 1 to the model's member count
    member_labels = np.arange(1,n_available+1)
    month_min = []
    month_max = []

    for month_ in np.arange(1,13):
        if month_ %6 == 0: print(datetime.datetime.now(), month_)

        #all members of this model for the current month
        month_values = mem_values[model_name].sel(month=month_)

        mem_mins = []
        mem_maxs = []
        for mem_n in np.arange(2,101):

            if mem_n < n_available:
                #fewer members than the ensemble holds: average the extremes
                #over 100 random subsamples without replacement
                mins = []
                maxs = []
                for boot_i in range(100):
                    drawn_labels = np.random.choice(member_labels, mem_n, 
                                                    replace=False)
                    temp_mems = month_values.sel(member=drawn_labels)

                    mins.append(temp_mems.min())
                    maxs.append(temp_mems.max())

                all_mins = xr.concat(mins, dim='subsample_i')
                all_maxs = xr.concat(maxs, dim='subsample_i')
                mem_mins.append(all_mins.mean('subsample_i'))
                mem_maxs.append(all_maxs.mean('subsample_i'))

            else:
                #sample size reaches or exceeds the ensemble size: use the
                #extremes of the full ensemble
                mem_mins.append(month_values.min('member'))
                mem_maxs.append(month_values.max('member'))

        #one entry per sample size along 'member'
        month_min.append(xr.concat(mem_mins, dim='member'))
        month_max.append(xr.concat(mem_maxs, dim='member'))

    subsample_min[model_name] = xr.concat(month_min, dim='month')
    subsample_max[model_name] = xr.concat(month_max, dim='month')

In [80]:
#save to NetCDF 
#the file names carry the statistic that was subsampled; the cell above loads 
#Sigma_mem and the plotting cell below reads the '..._sigma_mem_...' files, so 
#the results are written under 'sigma' (use 'mu' here only when Mu_mem is the
#file loaded into mem_values)
subsample_min_xr = xr.Dataset(subsample_min)
subsample_max_xr = xr.Dataset(subsample_max)

subsample_min_xr.to_netcdf('/glade/scratch/cwpowell/Synthetic_ensemble_revisions/SIA/subsample_members_100_sigma_mem_min.nc')
subsample_max_xr.to_netcdf('/glade/scratch/cwpowell/Synthetic_ensemble_revisions/SIA/subsample_members_100_sigma_mem_max.nc')

In [22]:
#load data for plotting: sigma_obs, then the subsampled sigma_mem minima and
#maxima
sigma_obs_file = '/glade/scratch/cwpowell/Synthetic_ensemble/SIA/SIA_resampled/Sigma_obs_individual_10000.nc'
sigma_obs = xr.open_dataset(sigma_obs_file)
# mu_obs = xr.open_dataset('/glade/scratch/cwpowell/Synthetic_ensemble/SIA/SIA_resampled/Mu_obs_individual_10000.nc')

subsample_min_file = '/glade/scratch/cwpowell/Synthetic_ensemble_revisions/SIA/subsample_members_100_sigma_mem_min.nc'
subsample_max_file = '/glade/scratch/cwpowell/Synthetic_ensemble_revisions/SIA/subsample_members_100_sigma_mem_max.nc'
subsample_min = xr.open_dataset(subsample_min_file)
subsample_max = xr.open_dataset(subsample_max_file)

# Figure S1 - December maximum and minimum $\sigma_{mem}$ subsampled from 2-maximum members

In [6]:
fig = plt.figure(figsize=[9,5])

month_ = 12
#stack the data variables that remain after dropping 'Merged' and 'SII' along
#a 'variable' dimension
sigma_obs_month = sigma_obs.drop('Merged').drop('SII').to_array()

#smallest and largest sigma_obs across those variables for the chosen month
obs_min = sigma_obs_month.sel(month=month_).min('variable')
obs_max = sigma_obs_month.sel(month=month_).max('variable')

    
for model_i, model_name in enumerate(model_names):
    n_available = mem_len[model_i]
    #x axis: sample sizes 2 to the model's member count
    x_members = np.arange(2,n_available+1)
    
    #N.B. member does not have coordinates, starts from zero
    for subsample_extreme in (subsample_min, subsample_max):
        plt.plot(x_members, subsample_extreme[model_name].sel(
            month=month_).sel(member=slice(0,n_available-1)),
                           c=colors[model_i], linewidth=1.5)

plt.ylabel(r'$\sigma_{mem}$, $\sigma_{obs}$ [10$^6$ km$^2$]', fontsize=18)
            
#horizontal lines at the minimum and maximum sigma_obs
plt.plot([0,101],[obs_min, obs_min], ls='--', lw=2, c='r')

plt.plot([0,101],[obs_max, obs_max], ls='-.', lw=2, c='r')

#shade the range between the minimum and maximum sigma_obs; this uses month_ 
#because `months` and `ax_x` are not defined anywhere in this notebook
plt.fill_between([0,105], [obs_min, obs_min], [obs_max, obs_max], 
                 color='r', alpha=0.5)
        
plt.xlim(2,100)
plt.xticks(np.arange(10,101,10))
plt.tick_params(axis='both', which='major', labelsize=15)
# plt.grid(axis='x', which='both')
plt.xlabel('Number of members',fontsize=18)

# plt.title(month_names[month_-1],fontsize=18)

#one legend line per model, followed by the two sigma_obs line styles
legend_elements = [Line2D([0], [0], c=color, lw=2.5, label=print_name)
                   for color, print_name in zip(colors, model_print_names)]
legend_elements += [Line2D([0], [0], c='r', ls='-.', lw=2.5, 
                           label=r'Maximum $\sigma_{obs}$'),
                    Line2D([0], [0], c='r', ls='--', lw=2.5, 
                           label=r'Minimum $\sigma_{obs}$')]

extra_legend = plt.legend(handles=legend_elements, bbox_to_anchor=(1.03, 0.85), 
                          loc='upper left', borderaxespad=0, ncol=1, fontsize=13)
plt.gca().add_artist(extra_legend);
fig.savefig('/glade/scratch/cwpowell/Synthetic_ensemble_revisions/SIA/figures/'\
            +'Member_number_and_sigma/December_subsampling_100_sigma_mem_obs_'\
            +'stop_max.pdf', bbox_inches='tight')