# Resample detrended filtered and different time period linear detrended modeled and observed SIC

### Author: Chris Wyburn-Powell, [github](https://github.com/chrisrwp/synthetic-ensemble/SIC/Resample_filtered_models_obs.ipynb)

**Input**: <br>
- Observations: Detrended relative to a 2 year lowpass Butterworth filter
- Detrended CLIVAR LE Archive model output from CanESM2, CESM1, CSIRO MK3.6, GFDL CM3, GFDL ESM2M, MPI ESM1. Detrended relative to a 2 year lowpass Butterworth filter.

**Output**: <br>
- Mean and standard deviation of 1000 resamplings. Each resampling is itself the standard deviation (with respect to time) of SIC with a 2 year block bootstrap size. Separate files for each model and month

In [1]:
import numpy as np
import xarray as xr
import datetime
import dask

#record (in UTC) when the notebook was run
run_time_utc = datetime.datetime.utcnow()
print(run_time_utc.strftime("%H:%M UTC %a %Y-%m-%d"))

17:27 UTC Fri 2022-04-15


In [2]:
#for running on Cheyenne, takes ~2.5 minutes per month for models, ~5 seconds for HadISST1 per month, ~50 seconds for NSIDC datasets per month
from dask_jobqueue import PBSCluster
from dask.distributed import Client

#resources requested for each PBS worker job
worker_spec = dict(cores    = 1,
                   memory   = '1GB',
                   queue    = 'casper', #'economy' for Cheyenne, 'casper' for Casper
                   walltime = '00:10:00')

cluster = PBSCluster(**worker_spec)
cluster.scale(16) #request 16 workers

#connect this notebook to the cluster, the last line displays the client summary
client = Client(cluster)
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 34518 instead
  http_address["port"], self.http_server.port


0,1
Client  Scheduler: tcp://10.12.206.47:37903  Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/cwpowell/proxy/{port}/status,Cluster  Workers: 14  Cores: 14  Memory: 14.00 GB


In [2]:
#base path for the synthetic ensemble data
data_path = '/glade/scratch/cwpowell/Synthetic_ensemble/'

#one row per model: (model name, number of members, latitude label, longitude label)
_model_table = [
    ('CanESM2',    50,  'lat',  'lon' ),
    ('CESM1',      40,  'j',    'i'   ),
    ('CSIRO_MK36', 30,  'lat',  'lon' ),
    ('GFDL_CM3',   20,  'rlat', 'rlon'),
    ('GFDL_ESM2M', 30,  'lat',  'lon' ),
    ('MPI_ESM1',   100, 'j',    'i'   ),
]

#split the table into the parallel lists that are indexed by model_i elsewhere
model_names, mem_len, lat_labs, lon_labs = (list(column) for column in zip(*_model_table))

#month_names[month-1] gives the name of a 1-based month number
month_names = ('January February March April May June July '
               'August September October November December').split()

# Define the resampling function with a 2 year block bootstrap size

In [3]:
def resample_boot2_mem(data, time_period, resamp_n, lat_lab, lon_lab):
    '''
    Resample a 3D time series using a 2 year block bootstrap size with replacement
    3D so can only resample one member at a time
    
    Parameters
    ----------
    data: xarray DataArray,
        Detrended SIC dataset with lat, lon and time (single member). Time indexes
        0 to time_period-1 are selected from the 'time' dimension, so it must 
        contain at least time_period time steps
    time_period : integer,
        For 1979-2020 use 42 as the total number of years in that time period.
        Must be at least 2. Odd values are supported, the final 2 year block is
        then truncated to its first year
    resamp_n : integer,
        The number of times for resampling to take place
    lat_lab: string,
        Name of coordinate for latitude, e.g. 'lat', 'rlat', 'j'
    lon_lab: string,
        Name of coordinate for longitude, e.g. 'lon', 'rlon', 'i'
    
    Returns
    ----------
        Tuple of two 2D xarray DataArray objects with shape (lat, lon):
        (mean across resamplings, standard deviation across resamplings) of 
        the standard deviation with respect to time of each resampling
        
    Raises
    ----------
    ValueError
        If time_period is less than 2, as no 2 year block can be drawn
    '''  
    if time_period < 2:
        raise ValueError('time_period must be at least 2 for a 2 year block bootstrap, got {}'.format(time_period))
    
    #number of 2 year blocks needed to cover the time period, rounded up for odd time periods
    n_blocks = (time_period + 1) // 2
    
    #random index of the first year of each block. np.random.randint excludes the upper bound,
    #so time_period-1 gives start indexes 0...time_period-2 and allows the final year 
    #(index time_period-1) to be drawn as the second year of a block
    block_starts = np.random.randint(0, time_period-1, (resamp_n, n_blocks))
    
    #interleave each start index with its following year: [s0, s0+1, s1, s1+1, ...]
    block_inds = np.stack((block_starts, block_starts+1), axis=-1).reshape(resamp_n, 2*n_blocks)
    block_inds = block_inds[:, :time_period] #only trims anything when time_period is odd
    
    #label the indexes so that each resampling can be used to select along the time dimension
    all_boot_2_ind = xr.DataArray(data   = block_inds,
                                  coords = {'resampling':np.arange(1,resamp_n+1,1), 'year_i':np.arange(1,time_period+1,1)},
                                  dims   = ['resampling', 'year_i'])
    
    #initialize a numpy array for all the resampled data after standard deviations have been applied
    resampled = np.empty((resamp_n, len(data[lat_lab]), len(data[lon_lab])))
    for resamp_i in range(resamp_n): #loop through all of the resamplings
        #indexing time with a 'year_i' array replaces the time dimension with 'year_i'
        resampled[resamp_i] = data.isel(time=all_boot_2_ind.isel(resampling=resamp_i)).std('year_i')
    
    #convert the numpy array into an xarray
    resampled_xr = xr.DataArray(data   = resampled, 
                                coords = {'resampling':np.arange(1,resamp_n+1), lat_lab:data[lat_lab], lon_lab:data[lon_lab]}, 
                                dims   = ['resampling', lat_lab, lon_lab])    
    
    #compute the final product of the mean and standard deviations across resamplings
    mean_resampled = resampled_xr.mean('resampling', skipna=True)
    SD_resampled   = resampled_xr.std('resampling', skipna=True)

    return(mean_resampled, SD_resampled)

# Resample models

In [None]:
#first and last year (inclusive) of the detrended model data to resample
start_yr = 1979
end_yr = 2010

for month_ in np.arange(1,13):
    print(datetime.datetime.now(), month_)
    for model_i, model_name in enumerate(model_names):

        #load the correct detrended data file
        # detrended = xr.open_dataarray('/glade/scratch/cwpowell/Synthetic_ensemble_revisions/SIC/Detrend_filter/'+
        #                               '{}_detrended_2yr_filter_1979_2020_{}.nc'.format(model_name, str(month_).zfill(2)))  
        detrended = xr.open_dataarray('/glade/scratch/cwpowell/Synthetic_ensemble_revisions/SIC/Detrended/'+
                                      '{}_detrended_{}_individual_{}_{}.nc'.format(model_name, 
                                          str(month_).zfill(2), start_yr, end_yr,))
        
        ###############################################################################
        # compute resampling of all members at once using dask
        all_resamp = [] #initialize a list to store the delayed objects
        for mem in np.arange(1,mem_len[model_i]+1): #loop through all members
            #append delayed objects for computation simultaneously later on
            all_resamp.append(dask.delayed(resample_boot2_mem)(detrended.sel(member=mem), end_yr-start_yr+1,
                                                               1000, lat_labs[model_i], lon_labs[model_i]))

        results_resamp = dask.compute(*all_resamp) #do the simultaneous computation on all members 

        ###############################################################################
        # convert the xarray object tuples into an xarray dataset with correct coordinates
        # each result is a (mean, SD) tuple for one member, in member order
        all_means = [member_result[0] for member_result in results_resamp]
        all_SDs   = [member_result[1] for member_result in results_resamp]

        save_resamp = xr.Dataset({'mean':xr.concat((all_means), dim='member'), 'SD':xr.concat((all_SDs), dim='member')})
        save_resamp['member'] = np.arange(1,mem_len[model_i]+1) #label the member dimension

        ###############################################################################
        # save the xarray dataset to NetCDF
        # the literals are joined inside parentheses before .format() is called, otherwise 
        # .format() only applies to the final literal and the {} placeholders are left unfilled
        # (for the 2 year lowpass Butterworth filter version the anomalies sentence was instead
        #  'SIC anomalies were calculated from a 2 year lowpass Butterworth filter on the individual member and ')
        description = ('Resampled standard deviations with respect to time of Arctic sea ice concentrations (SIC) for model {}. '
                       'Mean - mean standard deviation across the 1000 resamplings, SD - standard deviation across the 1000 resamplings. '
                       'Years {}-{} for the month of {} are resampled 1000 times with a 2 year bootstrap size. '
                       'SIC anomalies were calculated from linear detrending on the individual member at '
                       'each grid cell.').format(model_name, start_yr, end_yr, month_names[month_-1])

        save_resamp.attrs = {'Description': description,
                             'Units'      : '%',
                             'Timestamp'  : str(datetime.datetime.utcnow().strftime("%H:%M UTC %a %Y-%m-%d")),                
                             'Data source': 'CLIVAR Large Ensemble Archive (doi: 10.1038/s41558-020-0731-2)',
                             'Analysis'   : 'Python 3.7.9 - https://github.com/chrisrwp/synthetic-ensemble/SIC/Resample_filtered_models_obs.ipynb',
                        }

        save_resamp.to_netcdf('/glade/scratch/cwpowell/Synthetic_ensemble_revisions/SIC/Resampled'+
                              # '_filter/{}_resampled_2yr_filter_1979_2020_{}.nc'.format(model_name, str(month_).zfill(2)))
                              '/{}_resampled_individual_{}_{}_{}.nc'.format(model_name, start_yr, end_yr, str(month_).zfill(2)))

2022-04-14 18:39:41.717095 1
2022-04-14 18:42:22.056827 2
2022-04-14 18:44:27.531821 3
2022-04-14 18:46:32.424045 4
2022-04-14 18:48:37.786059 5
2022-04-14 18:50:43.054911 6
2022-04-14 18:52:47.508708 7
2022-04-14 18:54:51.405994 8
2022-04-14 18:56:54.978755 9
2022-04-14 18:58:59.360294 10


# Resample Observations

In [13]:
#first and last year (inclusive) of the detrended observational data to resample
start_yr = 1989
end_yr = 2020

for dataset_name in ['NSIDC_NT', 'NSIDC_BT', 'HadISST1']: #'NSIDC_CDR', 
    print(datetime.datetime.now(), dataset_name)
    
    #spatial coordinate names and data citation differ between HadISST1 and the NSIDC datasets
    if dataset_name == 'HadISST1':
        lat_lab = 'latitude'
        lon_lab = 'longitude'
        data_doi = 'doi:10.1029/2002JD002670'
    else:
        lat_lab = 'ygrid'
        lon_lab = 'xgrid'
        data_doi = 'doi:10.7265/efmz-2t65'
    
    #load the correct detrended data file
    detrended = xr.open_dataarray('/glade/scratch/cwpowell/Synthetic_ensemble_revisions/SIC/'\
                                  # +'Detrend_filter/{}_SIC_2yr_filter_1979_2020.nc'.format(dataset_name))  
                                  +'Detrended/{}_detrended_individual_{}_{}.nc'.format(dataset_name, start_yr, end_yr))  
            
#     ############################## WITH DASK ###############################
#     all_resamp = []
#     for month_ in np.arange(1,13):

#         month_data = detrended.sel(time=detrended['time.month']==month_)

#         ###############################################################################
#         # compute resampling of each detrending technique
#         with dask.config.set(**{'array.slicing.split_large_chunks': True}):
#             all_resamp.append(dask.delayed(resample_boot2_mem)(month_data, end_yr-start_yr+1, 
#                                                                1000, lat_lab, lon_lab))
#     results_resamp = dask.compute(*all_resamp)
    
#     ###############################################################################
#     # convert the xarray object tuples into an xarray dataset with correct coordinates
#     all_means, all_SDs = [], []
#     for month_ in np.arange(1,13):
#         all_means.append(results_resamp[month_-1][0])
#         all_SDs.append(results_resamp[month_-1][1])
#     ############################## WITH DASK ###################################
    
    ############################## WITHOUT DASK ###################################
    all_means = [] 
    all_SDs = [] 
    
    for month_ in np.arange(1,13):
        print(datetime.datetime.now(), month_)
        
        #select only the time steps that fall in this calendar month
        month_data = detrended.sel(time=detrended['time.month']==month_)
        
        #resample this month, returns (mean, SD) across the 1000 resamplings
        month_mean, month_SD = resample_boot2_mem(month_data, end_yr-start_yr+1, 1000, lat_lab, lon_lab)
        all_means.append(month_mean)
        all_SDs.append(month_SD)
    
    ############################## WITHOUT DASK ###################################
                              
    save_resamp = xr.Dataset({'mean':xr.concat((all_means), dim='month'), 'SD':xr.concat((all_SDs), dim='month')})
    save_resamp['month'] = np.arange(1,13) #label the month dimension

    ###############################################################################
    # save the xarray dataset to NetCDF
    # the literals are joined inside parentheses before .format() is called, otherwise 
    # .format() only applies to the final literal and the {} placeholders are left unfilled
    # (for the filtered version the anomalies were calculated from 'a 2 year lowpass Butterworth filter ')
    description = ('Resampled standard deviations with respect to time of Arctic sea ice concentrations '
                   '(SIC) for the observational dataset {}. Mean - mean standard deviation across the 1000 resamplings, '
                   'SD - standard deviation across the 1000 resamplings. All months for the years {}-{} are '
                   'resampled 1000 times with a 2 year bootstrap size. SIC anomalies were calculated from '
                   'an ordinary least squares regression linear trend '
                   'on each individual grid cell.').format(dataset_name, start_yr, end_yr)

    save_resamp.attrs = {'Description': description,
                         'Units'      : '%',
                         'Timestamp'  : str(datetime.datetime.utcnow().strftime("%H:%M UTC %a %Y-%m-%d")),                
                         'Data source': data_doi,
                         'Analysis'   : 'Python 3.7.9 - https://github.com/chrisrwp/synthetic-ensemble/SIC/Resample_filtered_models_obs.ipynb'}

    save_resamp.to_netcdf('/glade/scratch/cwpowell/Synthetic_ensemble_revisions/SIC/'\
                          # +'Resampled_filter/{}_resampled_SIC_2yr_filter_1979_2020.nc'.format(dataset_name))
                          +'Resampled/{}_resampled_SIC_individual_{}_{}.nc'.format(dataset_name, start_yr, end_yr))