# Detrend Model SIC by its ensemble mean

### Author: Chris Wyburn-Powell, [github](https://github.com/chrisrwp/synthetic-ensemble/SIC/Detrend_SIC_models.ipynb)

**Input**: <br>
- CLIVAR LE Archive model output from CanESM2, CESM1, CSIRO MK3.6, GFDL CM3, GFDL ESM2M, MPI ESM1

**Output**: <br>
- Reduced datasets for all members of the same model for a given month
- Detrended data based on:
  * Ensemble mean, i.e. the linear trend of the mean of all members
  * Ensemble mean with adjustments so the ensemble mean trend does not reach below 0% or above 100% SIC
  * Individual mean, i.e. the linear trend of the member which is being detrended 
  * Individual mean with adjustments to within 0-100% SIC
<br>
  
**Method**: <br>
- Use `dask` to loop through all the model output data and reconstitute it for >30N and 1979-2020, this time by month rather than by member
- Use a linear trend as calculated by `xarray.DataArray.polyfit` to detrend the data

In [1]:
import numpy as np
import xarray as xr
import scipy.signal as sig
import datetime
import dask

#print the UTC time at which the notebook was run
print("{:%H:%M UTC %a %Y-%m-%d}".format(datetime.datetime.utcnow()))

16:57 UTC Wed 2021-08-04


In [2]:
#directory that the reduced and detrended datasets are written to and read from
data_path = '/glade/scratch/cwpowell/Synthetic_ensemble/'

#model_names and mem_len are parallel lists: mem_len[k] is the number of members of model_names[k]
model_names = [
    'CanESM2',
    'CESM1',
    'CSIRO_MK36',
    'GFDL_CM3',
    'GFDL_ESM2M',
    'MPI_ESM1',
]
mem_len = [50, 40, 30, 20, 30, 100]

#calendar month names, month_names[m-1] is the name of month number m
month_names = [
    'January', 'February', 'March', 'April',
    'May', 'June', 'July', 'August',
    'September', 'October', 'November', 'December',
]

In [1]:
#create dask workers through the PBS job scheduler and connect this session to them
from dask.distributed import Client
from dask_jobqueue import PBSCluster

#resources requested for each PBS job: 2 cores and 5GB on the economy queue for 20 minutes
cluster = PBSCluster(cores=2, memory='5GB', queue='economy', walltime='00:20:00')

#scale the cluster to 8, then attach a client so that later dask calls use it
cluster.scale(8)
client = Client(cluster)
client

# Compute a reduced dataset for 1979-2020 for >30N for all members
## Define a function to load the correct model output files

In [7]:
def load_member(model, i, sic_sit, chunk_size=False, historical=False):
    '''
    Open a single member file of either sea ice concentration or thickness from the CLIVAR LE archive using xarray.open_dataset
    
    Parameters
    ----------
    model : string,
        Choose from ['CanESM2', 'CESM1', 'GFDL_CM3', 'GFDL_ESM2M', 'CSIRO_MK36', 'MPI_ESM1']
    i : integer,
        Member number e.g. 1
    sic_sit : string,
        Variable concentration or thickness, choose from ['sic', 'sit']
    chunk_size : integer, optional
        Choose an int e.g. 50 to use dask chunks of that length along time to open the data, defaults to not use dask
    historical : boolean, optional
        Only used for MPI_ESM1: True opens the historical file (185001-200512), the default opens the RCP85 file (200601-209912)
    
    Returns
    ----------
        xarray.Dataset object from the CLIVAR LE archive sea ice output.
        The time axis is left undecoded (decode_times=False) for GFDL_ESM2M and for GFDL_CM3 'sit'
    
    Raises
    ----------
    AssertionError
        If sic_sit is not 'sic' or 'sit'
    ValueError
        If model is not one of the supported model names
    '''  
    
    base_path = '/glade/collections/cdg/data/CLIVAR_LE/'
    
    assert sic_sit in ['sic', 'sit'], 'invalid variable name'
    
    #time is not recognized by xarray for these files so it must not be decoded
    undecodable_time = model == 'GFDL_ESM2M' or (model == 'GFDL_CM3' and sic_sit == 'sit')
    decode_bool = not undecodable_time
    
    ############### generate the file path ###############
    if model == 'CanESM2':
        path = base_path+'canesm2_lens/OImon/{}/{}_OImon_CanESM2_historical_rcp85_r{}i1p1_195001-210012.nc'.format(sic_sit, sic_sit, i)
        
    elif model == 'CESM1':
        if i == 1: #the first CESM1 member starts in 1850 rather than 1920
            path = base_path+'cesm_lens/OImon/{}/{}_OImon_CESM1-CAM5_historical_rcp85_r1i1p1_185001-210012.nc'.format(sic_sit, sic_sit)
        else:
            path = base_path+'cesm_lens/OImon/{}/{}_OImon_CESM1-CAM5_historical_rcp85_r{}i1p1_192001-210012.nc'.format(sic_sit, sic_sit, i)
            
    elif model == 'GFDL_ESM2M':
        path = base_path+'gfdl_esm2m_lens/OImon/{}/{}_OImon_GFDL-ESM2M_historical_rcp85_r{}i1p1_195001-210012.nc'.format(sic_sit, sic_sit, i)
    
    elif model == 'GFDL_CM3':
        path = base_path+'gfdl_cm3_lens/OImon/{}/{}_OImon_GFDL-CM3_historical_rcp85_r{}i1p1_192001-210012.nc'.format(sic_sit, sic_sit, i)
        
    elif model == 'CSIRO_MK36':
        path = base_path+'csiro_mk36_lens/OImon/{}/{}_OImon_CSIRO-Mk3-6-0_historical_rcp85_r{}i1p1_185001-210012.nc'.format(sic_sit, sic_sit, i)
        
    elif model == 'MPI_ESM1':
        #MPI stores the historical (2005-12 or previous) and RCP85 periods in separate files
        if historical:
            experiment, suffix = 'historical', '1850p3_185001-200512'
        else:
            experiment, suffix = 'rcp85', '2005p3_200601-209912'
        
        path = base_path+'mpi_lens/OImon/{}/{}_OImon_MPI-ESM_{}_r{}i{}.nc'.format(sic_sit, sic_sit, experiment, str(i).zfill(3), suffix)
        
    else:
        #stop here, otherwise no path is defined and the open call below fails with an unrelated error
        raise ValueError('invalid model name: {!r}'.format(model))
       
    ############### use the file path to open the NetCDF file using xarray ###############
    if chunk_size:
        data = xr.open_dataset(path, chunks={'time':(chunk_size)}, decode_times=decode_bool)
    else:
        data = xr.open_dataset(path, decode_times=decode_bool)
            
    return data

## Make reduced dataset with each month being separate but for all members

In [8]:
def reduce_data(model_name, mem_len_, month_, start_yr, end_yr, chunk_size):
    '''
    Combine the sea ice concentration of all members of one model for a single calendar month,
    keeping only the northern hemisphere (>30N) and the requested years, and save the result to NetCDF
    
    Parameters
    ----------
    model_name : string,
        Choose from ['CanESM2', 'CESM1', 'GFDL_CM3', 'GFDL_ESM2M', 'CSIRO_MK36', 'MPI_ESM1']
    mem_len_ : integer,
        Number of members e.g. 50
    month_ : integer,
        Number corresponding to the month e.g. 1 is January
    start_yr : integer
        Calendar year of the start of required time period e.g. 1950
    end_yr : integer
        Calendar year of the end of required time period (inclusive) e.g. 2020
    chunk_size : integer
        Choose an int e.g. 100 to use dask chunks to open the member files, a false value opens them without dask
    
    Returns
    ----------
        None. The combined xarray.DataArray with dimensions (member, time, and the two horizontal grid dimensions)
        is written to data_path+'SIC/Reduced_datasets/' as <model>_reduced_<start_yr>-<end_yr>_<month>.nc
    
    Raises
    ----------
    ValueError
        If mem_len_ is smaller than 1, as there would be no member to combine
    '''  
    
    if mem_len_ < 1:
        raise ValueError('mem_len_ must be at least 1')
    
    #define characteristics: the names of the horizontal dimensions differ between models
    if model_name in ['CESM1', 'MPI_ESM1']:
        lat_lon = ['j', 'i']
    elif  model_name == 'GFDL_CM3':
        lat_lon = ['rlat', 'rlon']
    else:
        lat_lon = ['lat', 'lon']
    
    #the replacement time axes are the same for every member, so open those files once rather than once per member
    if model_name in ['CESM1', 'GFDL_ESM2M']:
        time_CSIRO = load_member('CSIRO_MK36', 1, 'sic', chunk_size)['time']
    if model_name == 'GFDL_CM3':
        time_GFDL_CM3_mem_9 = load_member('GFDL_CM3', 9, 'sic')['time']
            
    ##############################################################################################
    for member_i in np.arange(1,mem_len_+1):

        #open member dataset using dask and xarray
        #need to combine historical (pre 2005-12) and RCP8.5 to 2020-12 for MPI
        if model_name == 'MPI_ESM1':
            member_hist = load_member(model_name, member_i, 'sic', chunk_size, historical=True)
            member_fut  = load_member(model_name, member_i, 'sic', chunk_size)
            member = xr.concat((member_hist, member_fut), dim='time')

        else:
            member = load_member(model_name, member_i, 'sic', chunk_size)

        #need to change CESM1 dates from following month to mid-month as per CSIRO_MK36
        #GFDL_ESM2M needs to be changed as xarray cannot decode these dates
        if model_name == 'CESM1':
            if member_i == 1:
                member['time'] = time_CSIRO #the first CESM1 member starts in 1850
            else:
                member['time'] = time_CSIRO.sel(time=slice('1920-01','2100-12'))
        elif model_name == 'GFDL_ESM2M':
            member['time'] = time_CSIRO.sel(time=slice('1950-01','2100-12'))
        
        #only want northern hemisphere
        if model_name == 'GFDL_CM3':
            member_NH = (member.where(member['rlat']>30,drop=True))['sic']
            #also correct members 1-8, from skipping 2005-12-16 to 2006-02-21 not 2006-01-16
            #replace with time from member 9 which doesn't have the offset
            member_NH['time'] = time_GFDL_CM3_mem_9
        else:
            member_NH = (member.where(member['lat']>30,drop=True))['sic']
        
        #select the required calendar month, then the required years
        is_month = member_NH['time.month']==month_
        member_NH_yrs = member_NH.sel(time=is_month).sel(time=slice('{}'.format(start_yr),'{}'.format(end_yr)))

        #define the shape of the reduced dataset from the first member: (member, time, y, x)
        if member_i == 1:
            all_mem = np.empty((mem_len_,) + tuple(member_NH_yrs.shape[:3]))
        
        all_mem[member_i-1] = member_NH_yrs #add this data to the correct position in the array according to its member

    ##############################################################################################
    #convert to xarray dataarray and save to NetCDF, time and grid coordinates are taken from the last member opened
    all_mem_xr = xr.DataArray(data   = all_mem,
                              coords = {'member'       : np.arange(1,mem_len_+1),
                                        'time'         : member_NH_yrs['time'],
                                        str(lat_lon[0]): member_NH_yrs[lat_lon[0]],
                                        str(lat_lon[1]): member_NH_yrs[lat_lon[1]]},
                              dims   = ['member', 'time', str(lat_lon[0]), str(lat_lon[1])])

    all_mem_xr.load()
    
    #NOTE(review): 'Analysis' points to the obs-ensemble repository whereas the other cells of this notebook cite synthetic-ensemble - confirm which is intended
    all_mem_xr.attrs = {'Description': 'Reduced dataset of sea ice concentrations (SIC) for the model {}, for all ensemble members in the month of {} for the period {}-{}'.format(model_name, month_names[month_-1], start_yr, end_yr),
                        'Units'      : '% sea ice concentration',
                        'Timestamp'  : str(datetime.datetime.utcnow().strftime("%H:%M UTC %a %Y-%m-%d")),                
                        'Data source': 'CLIVAR Large Ensemble Archive (doi: 10.1038/s41558-020-0731-2)',
                        'Analysis'   : 'Python 3.7.9 - https://github.com/chrisrwp/obs-ensemble/Time_period_and_sigma.ipynb'}

    all_mem_xr.to_netcdf(data_path+'SIC/Reduced_datasets/{}_reduced_{}-{}_{}.nc'.format(model_name, start_yr, end_yr, str(month_).zfill(2)))

In [4]:
#compute the reduced datasets: one NetCDF file per model and calendar month
start_yr = 1979
end_yr   = 2020
chunk_size = 100

for month_ in np.arange(1,13):
    print(month_)
    #model_names and mem_len are parallel lists, so walk through them together
    for model_name, n_members in zip(model_names, mem_len):
        print(datetime.datetime.now(), model_name)
        reduce_task = dask.delayed(reduce_data)(model_name, n_members, month_, start_yr, end_yr, chunk_size)
        dask.compute(reduce_task)

# Use the reduced datasets to detrend relative to the ensemble mean and the individual members

In [18]:
#use matrix operations for both ensemble and individual
start_yr = 1979
end_yr   = 2020

def _linear_trend(data):
    '''Fit a straight line along 'time' in each grid cell of data and return the value of that line at every time step.
    The result is an xarray.Dataset holding the variable 'polyfit_coefficients', as produced by arithmetic with the polyfit output'''
    coefs = data.polyfit(dim='time', deg=1, skipna=True)
    #broadcasting the time coordinate against the coefficients replaces a hand-built matrix of year values
    return data['time'] * coefs.sel(degree=1) + coefs.sel(degree=0)

def _limit_to_physical(trend):
    '''Limit a trend to physically possible values (0-100% SIC), leaving cells where the trend is nan as nan'''
    limited = trend.where(trend>=0,0) #if the trend goes negative, limit it at 0%
    limited = limited.where(limited<=100,100) #cap any trend values >100% to 100%
    #put any nan values back in. The mask must be notnull(): using the trend itself as the mask treats nan
    #as True (so nan is not restored) and a trend of exactly 0 as False (so valid cells are set to nan)
    return limited.where(trend.notnull())

def _detrend(data, trend, original_time):
    '''Subtract trend from data, put back the original time coordinates and return the anomalies as a Dataset with the variable 'SIC' '''
    anomalies = data - trend
    anomalies['time'] = original_time #now calculations have taken place revert to the original time coordinates
    return xr.Dataset({'SIC': anomalies['polyfit_coefficients']})

for model_name in model_names:
    print(model_name)
    for month_ in np.arange(1,13):
        print(datetime.datetime.now(), month_)
        month_str = str(month_).zfill(2)
    
        ##############################################################################################   
        #load the reduced dataset for the model and month in question
        month_data_ = xr.open_dataarray(data_path+'SIC/Reduced_datasets/{}_reduced_1979-2020_{}.nc'.format(model_name, month_str))

        #change the time to whole numbers for ease of trend calculations
        month_data = month_data_.copy()
        month_data['time'] = np.arange(start_yr,end_yr+1)
        
        #GFDL ESM2M uses fraction not percentage for SIC, change to % for this model
        if model_name == 'GFDL_ESM2M': month_data = month_data * 100
        
        ##############################################################################################
        #linear trend of the ensemble mean, and of each member individually, evaluated at every year
        ens_trend = _linear_trend(month_data.mean('member'))
        ind_trend = _linear_trend(month_data)

        ##############################################################################################
        #attributes shared by all four output files, only the description differs
        description   = 'Detrended Arctic sea ice concentrations (SIC) the model {}. Years 1979-2020, month of {}. '.format(model_name, month_names[month_-1])
        ens_reference = 'Detrended relative to the linear trend of the ensemble mean.'
        ind_reference = 'Detrended relative to the individual ensemble member linear trend.'
        adj_note      = ' The trend in each grid cell is limited to physical values of between 0 and 100% SIC'

        attrs_dict = {'Description': description + ens_reference, 
                      'Units'      : '%',
                      'Timestamp'  : str(datetime.datetime.utcnow().strftime("%H:%M UTC %a %Y-%m-%d")),
                      'Data source': 'CLIVAR Large Ensemble Archive, doi:10.1038/s41558-020-0731-2',
                      'Analysis'   : 'https://github.com/chrisrwp/synthetic-ensemble/SIC/Detrend_SIC_models.ipynb'}

        #(file name tag for adjusted trends, file name tag for the reference, trend to remove, description of the method)
        outputs = [('',     'ensemble',   ens_trend,                     ens_reference),            #ensemble mean, without adjustment to physical values
                   ('adj_', 'ensemble',   _limit_to_physical(ens_trend), ens_reference + adj_note), #ensemble mean, adjusted to physical values
                   ('',     'individual', ind_trend,                     ind_reference),            #individual member trend, without adjustment
                   ('adj_', 'individual', _limit_to_physical(ind_trend), ind_reference + adj_note)] #individual member trend, adjusted to physical values

        ##############################################################################################
        #compute the anomalies and save individual and ensemble detrended data to NetCDF 
        for adj, ind_ens, trend, reference in outputs:
            detrended = _detrend(month_data, trend, month_data_['time'])

            detrended_attrs = attrs_dict.copy()
            detrended_attrs['Description'] = description + reference
            detrended.attrs = detrended_attrs

            detrended.to_netcdf(data_path+'SIC/Detrended/{}_detrended_{}{}_{}_1979_2020.nc'.format(model_name, adj, month_str, ind_ens))

# Compute $\sigma_{LE}$

In [13]:
#for each model, combine the 12 monthly detrended files of every detrending method into one file of sigma_LE
for model_name in model_names:
    print(datetime.datetime.now(), model_name)

    sigma_LE_model = {}
    for adj in ['', 'adj_']:
        for ind_ens in ['ensemble', 'individual']:

            #standard deviation over time for each member, then the standard deviation of those values across members
            sigma_LE_model_type = []
            for month_ in np.arange(1,13):
                file_name = '{}_detrended_{}{}_{}_1979_2020.nc'.format(model_name, adj, str(month_).zfill(2), ind_ens)
                month_detrended = xr.open_dataset(data_path+'SIC/Detrended/'+file_name)
                std_in_time = month_detrended['SIC'].std('time')
                sigma_LE_model_type.append(std_in_time.std('member'))

            #stack the months along a new dimension labelled 1-12
            sigma_LE_model_type = xr.concat(sigma_LE_model_type, dim='month').assign_coords(month=np.arange(1,13))

            #one variable per detrending method e.g. 'ensemble' or 'adj_individual'
            sigma_LE_model[adj+ind_ens] = sigma_LE_model_type

    sigma_LE_model = xr.Dataset(sigma_LE_model)
    sigma_LE_model.attrs = {
        'Description': 'Standard deviation between ensemble members for detrended sea ice concentration (SIC). Detrended 1979-2020 relative to the ensemble or individual members, with adj meaning unphysical values of the detrended data are correct to physical bounds',
        'Units'      : '%',
        'Timestamp'  : str(datetime.datetime.utcnow().strftime("%H:%M UTC %a %Y-%m-%d")),
        'Data source': 'CLIVAR Large Ensemble Archive (doi: 10.1038/s41558-020-0731-2)',
        'Analysis'   : 'https://github.com/chrisrwp/synthetic-ensemble/SIC/Detrend_SIC_models.ipynb',
    }

    sigma_LE_model.to_netcdf(data_path+'SIC/Detrended/Sigma_LE_{}.nc'.format(model_name))