# Detrend Observational SIC by individual trends and jackknifed trends

### Author: Chris Wyburn-Powell, [github](https://github.com/chrisrwp/synthetic-ensemble/SIC/Detrend_SIC_obs.ipynb)

**Input datasets, as created in this [notebook](https://github.com/chrisrwp/synthetic-ensemble/SIA/SIA_calculations_observations.ipynb1):** <br>

- **NOAA/NSIDC CDR version 4 (CDR, BT, NT)**: Pole hole is filled using the average SIC of the surrounding grid cells (built in). Missing months (1984-07, 1987-12, 1988-01) are filled by looking at the closest valid months for SIA (CDR), identifying whether the previous or following year's SIA for those valid months is closest to that of the year with missing data, then selecting the previous or following year's SIC data to fill the missing data. E.g. for 1984-07: SIA for 1983-06 and 1985-06 is compared with 1984-06, and SIA for 1983-08 and 1985-08 is compared with 1984-08. 1985 is found to be closer to 1984 than 1983 is, so 1985-07 is copied to fill 1984-07. Similarly, SIC values for 1988-12 and 1989-01 are used to fill 1987-12 and 1988-01.
- **HadISST1**: Discontinuities with extreme negative anomalies, which do not appear in other datasets, were found for months 2009-03 and 2009-04. SIC from 2007-03 is used for 2009-03, and SIC from 2008-04 is used for 2009-04.
- **Merged Hadley OI**: Data interpolated over land is masked using the land mask for HadISST1. 2009-03 and 2009-04 are filled with data from 2007-03 and 2008-04 respectively similarly to HadISST1. 2009-02 and 2009-05 are filled with 2010-02 and 2010-05 respectively.

In [1]:
import numpy as np
import xarray as xr
import datetime

# Print the UTC time at which the notebook was run. A timezone-aware "now" is used
# because datetime.utcnow() is deprecated; the formatted text is the same.
print(datetime.datetime.now(datetime.timezone.utc).strftime("%H:%M UTC %a %Y-%m-%d"))

23:40 UTC Sat 2021-08-14


In [2]:
# Root directory under which the input and output files used by this notebook live
data_path = '/glade/scratch/cwpowell/Synthetic_ensemble/'

# English month names, January first, so that month number m maps to month_names[m-1]
month_names = ('January February March April May June July '
               'August September October November December').split()

# Detrend HadISST1 relative to each cell's trend and 50 jackknifed trends

## Detrend HadISST1 on individual cell trends

In [6]:
#analysis period (both years inclusive)
start_yr, end_yr = 1979, 2020

#jackknife settings
jackknife_n = 50
len_rand = 32 #number of randomly chosen elements to be deleted

##############################################################################################
#dataset selection: Merged Hadley OI is active. To process another dataset replace the
#five values below (obs_file, dataset_name, var_name, data_doi, lat_lon) with one of:
#
#  HadISST1
#      obs_file     = 'Raw_data/observations/HadISST/HadISST1_NH_79-20_filled.nc'
#      dataset_name = 'HadISST1'
#      var_name     = 'sic'
#      data_doi     = 'doi:10.1029/2002JD002670'
#      lat_lon      = ['latitude', 'longitude']
#
#  NSIDC CDR version 4 (run once for each of var_name = 'CDR', 'BT', 'NT')
#      obs_file     = 'Raw_data/observations/NSIDC_CDR_v4/SIC_CDR_BT_NT_79-20_filled.nc'
#      dataset_name = 'NSIDC_{}'.format(var_name)
#      var_name     = 'CDR'
#      data_doi     = 'doi:10.7265/efmz-2t65'
#      lat_lon      = ['ygrid', 'xgrid']
obs_file = 'Raw_data/observations/merged_Hadley_OI/merged_Hadley_OI_SIC_79-20.nc'
data = xr.open_dataset(data_path + obs_file)

dataset_name = 'Merged_Hadley_OI'
var_name     = 'sic' #HadISST1 and Merged
data_doi     = 'doi:10.5065/r33v-sv91' #Merged
lat_lon      = ['latitude', 'longitude'] #for HadISST1 and merged

#one list per type of detrending, filled with one entry per month by the loop below
ind, ind_adj, jak, jak_adj = [], [], [], []

#name that Dataset.polyfit gives to the coefficients fitted from `var_name`
coef_name = '{}_polyfit_coefficients'.format(var_name)


def _limit_trend_to_physical(trend):
    '''Clamp trend values to 0-100% SIC, leaving NaN where the trend itself is NaN.'''
    limited = trend.where(trend >= 0, 0)       #negative (and NaN) values -> 0%
    limited = limited.where(limited <= 100, 100) #cap any trend values >100% to 100%
    #put the NaN values back in. The mask must be an explicit not-null test: passing the
    #float trend itself as the condition treats NaN as True and an exactly-zero trend as
    #False, which blanks valid zero-trend cells instead of the missing ones.
    return limited.where(trend.notnull())


for month_ in np.arange(1,13):
    print(datetime.datetime.now(), month_)

    #select this calendar month within the analysis period (original datetime axis)
    month_data_ = data.sel(time=data['time.month']==month_).sel(time=slice(str(start_yr), str(end_yr)))

    #work on a copy so that month_data_ preserves the original time dimension
    month_data = month_data_.copy()
    if var_name in ['CDR', 'BT', 'NT']:
        #only keep the variable being detrended, drop the other two
        month_data = month_data.drop_vars([var_ for var_ in ['CDR', 'BT', 'NT'] if var_ != var_name])

    #change the time to whole numbers (years) for ease of trend calculations
    month_data['time'] = np.arange(start_yr,end_yr+1)
    years = month_data['time']

    ##############################################################################################
    #calculate the linear trend coefficients and the corresponding values each year for the individual trend
    #the year coordinate broadcasts against the per-cell coefficients, so no dense matrix of
    #year values has to be allocated
    ind_coefs = month_data.polyfit(dim='time', deg=1, skipna=True)
    ind_trend = years * ind_coefs.sel(degree=1) + ind_coefs.sel(degree=0)

    #now member data and the trends are in the same time coordinates, compute anomalies
    detrended_ind = month_data[var_name] - ind_trend
    detrended_ind['time'] = month_data_['time'] #now calculations have taken place revert to the original time coordinates

    #adjust the trend so that it only contains physically possible values (0-100%)
    detrended_ind_adj = month_data - _limit_trend_to_physical(ind_trend[coef_name])
    detrended_ind_adj['time'] = month_data_['time'] #revert to the original time coordinates

    ##############################################################################################
    #compute a jackknife of each grid cell, note all grid cells in the month have the same years deleted
    #the retained years are drawn from the data's own year axis rather than hard-coded years
    n_keep = len(years) - len_rand
    jackknife_data = [
        month_data.sel(time=np.sort(np.random.choice(years.values, size=n_keep, replace=False)))
        for _ in range(jackknife_n)
    ]

    jackknife_data = xr.concat((jackknife_data), dim='jackknife')
    jackknife_data['jackknife'] = np.arange(1, jackknife_n+1)

    #calculate the linear trends, one per jackknife sample
    jak_coefs = jackknife_data.polyfit(dim='time', deg=1, skipna=True)
    jak_trend = years * jak_coefs.sel(degree=1) + jak_coefs.sel(degree=0)

    #now member data and the trends are in the same time coordinates, compute anomalies
    detrended_jak = month_data[var_name] - jak_trend
    detrended_jak['time'] = month_data_['time'] #now calculations have taken place revert to the original time coordinates

    #adjust the trend so that it only contains physically possible values (0-100%)
    detrended_jak_adj = month_data - _limit_trend_to_physical(jak_trend[coef_name])
    detrended_jak_adj['time'] = month_data_['time'] #revert to the original time coordinates

    ##############################################################################################
    #collect this month's results for the 4 types of detrending
    ind.append(detrended_ind)
    ind_adj.append(detrended_ind_adj)
    jak.append(detrended_jak)
    jak_adj.append(detrended_jak_adj)

#join the monthly results along time (months are appended in loop order, not sorted by date)
ind     = xr.concat((ind), dim='time')
ind_adj = xr.concat((ind_adj), dim='time')
jak     = xr.concat((jak), dim='time')
jak_adj = xr.concat((jak_adj), dim='time')

##############################################################################################
#attach metadata to the four month-concatenated results and save each one to NetCDF
#
#The descriptions state "all months" because the detrending loop above runs over months 1-12;
#the previous text ("March and September") was left over from a two-month run.
#NOTE(review): the output file names still end in '_03_09' although all months are written.
#They are left unchanged here so existing downstream paths keep working - confirm before renaming.
desc_head  = 'Detrended Arctic sea ice concentrations (SIC) for the observational dataset {}. Years {}-{}, all months.'.format(dataset_name, start_yr, end_yr)
desc_ind   = ' Detrended relative to the linear trend of the each month.'
desc_jak   = ' Detrended relative to the {} jackknifed trends, computed by removing {} data points for each grid cell.'.format(jackknife_n, len_rand)
desc_limit = ' The trend in each grid cell is limited to physical values of between 0 and 100% SIC'

attrs_dict = {'Description': desc_head + desc_ind,
              'Units'      : '%',
              #timezone-aware replacement for the deprecated datetime.utcnow(); same formatted text
              'Timestamp'  : datetime.datetime.now(datetime.timezone.utc).strftime("%H:%M UTC %a %Y-%m-%d"),
              'Data source': '{}, {}'.format(dataset_name, data_doi),
              'Analysis'   : 'https://github.com/chrisrwp/synthetic-ensemble/SIC/Detrend_SIC_obs.ipynb'}

#detrended by the linear trend, without adjustment to physical values
#(the data variable carries the polyfit coefficient name, so that is what gets renamed)
ind.attrs = attrs_dict
ind = ind.rename({'{}_polyfit_coefficients'.format(var_name):'SIC'})
ind.to_netcdf(data_path+'SIC/Detrended/{}_detrended_individual_1979_2020_03_09.nc'.format(dataset_name))

#detrended by the linear trend, adjusted to physical values
ind_adj_attrs = attrs_dict.copy()
ind_adj_attrs['Description'] = desc_head + desc_ind + desc_limit
ind_adj.attrs = ind_adj_attrs
ind_adj = ind_adj.rename({var_name:'SIC'})
ind_adj.to_netcdf(data_path+'SIC/Detrended/{}_detrended_adj_individual_1979_2020_03_09.nc'.format(dataset_name))

#detrended by the jackknife trends, without adjustment to physical values
jak_attrs = attrs_dict.copy()
jak_attrs['Description'] = desc_head + desc_jak
jak.attrs = jak_attrs
jak = jak.rename({'{}_polyfit_coefficients'.format(var_name):'SIC'})
jak.to_netcdf(data_path+'SIC/Detrended/{}_detrended_jackknife_1979_2020_03_09.nc'.format(dataset_name))

#detrended by the jackknife trends, adjusted to physical values
jak_adj_attrs = attrs_dict.copy()
jak_adj_attrs['Description'] = desc_head + desc_jak + desc_limit
jak_adj.attrs = jak_adj_attrs
jak_adj = jak_adj.rename({var_name:'SIC'})
jak_adj.to_netcdf(data_path+'SIC/Detrended/{}_detrended_adj_jackknife_1979_2020_03_09.nc'.format(dataset_name))

2021-08-14 17:46:19.767859 1
2021-08-14 17:47:28.666549 2
2021-08-14 17:48:35.689070 3
2021-08-14 17:49:43.816294 4
2021-08-14 17:50:51.415648 5
2021-08-14 17:51:58.229470 6
2021-08-14 17:53:04.762056 7
2021-08-14 17:54:11.641346 8
2021-08-14 17:55:18.706962 9
2021-08-14 17:56:25.485116 10
2021-08-14 17:57:36.559625 11
2021-08-14 17:58:57.160713 12


# Use Dask

In [None]:
def _limit_trend_to_physical(trend):
    '''Clamp trend values to 0-100% SIC, leaving NaN where the trend itself is NaN.'''
    limited = trend.where(trend >= 0, 0)       #negative (and NaN) values -> 0%
    limited = limited.where(limited <= 100, 100) #cap any trend values >100% to 100%
    #put the NaN values back in. The mask must be an explicit not-null test: passing the
    #float trend itself as the condition treats NaN as True and an exactly-zero trend as
    #False, which blanks valid zero-trend cells instead of the missing ones.
    return limited.where(trend.notnull())


def SIC_detrend_obs(data, month_, dataset_name, data_doi, var_name, lat_lon, jackknife_n, len_rand):
    '''
    Detrend one calendar month of gridded SIC (1979-2020) relative to its own linear trend and
    relative to jackknifed linear trends, with and without adjusting the trend to the physical
    limits of 0 and 100% SIC.

    Parameters
    ----------
    data : xarray dataset,
        Must have a datetime 'time' coordinate and contain the variable `var_name`.
        After selecting `month_` there must be exactly one time step per year for 1979-2020,
        because the time axis is replaced by those 42 whole years.
    month_ : integer,
        Calendar month to process, compared against data['time.month']
    dataset_name : string,
        Not used by the calculation, kept so existing calls keep working
    data_doi : string,
        Not used by the calculation, kept so existing calls keep working
    var_name : string,
        Name of the SIC variable. If it is one of 'CDR', 'BT', 'NT' the other two
        variables are dropped before fitting
    lat_lon : list of 2 strings,
        Names of the spatial dimensions. No longer needed because the trend is evaluated
        by broadcasting the year coordinate; kept so existing calls keep working
    jackknife_n : integer,
        Number of jackknife samples to draw
    len_rand : integer,
        Number of years randomly deleted in each jackknife sample

    Returns
    ----------
        Tuple of 4 xarray datasets on the original time coordinates:
        (detrended_ind, detrended_ind_adj, detrended_jak, detrended_jak_adj).
        The unadjusted results hold their values in '<var_name>_polyfit_coefficients',
        the adjusted results hold them in `var_name`. The jackknife results have an
        additional 'jackknife' dimension labelled 1..jackknife_n.

    Raises
    ----------
    ValueError
        If fewer than 2 years would remain in a jackknife sample, as a linear trend
        cannot be fitted to them.
    '''
    start_yr = 1979
    end_yr   = 2020

    #name that Dataset.polyfit gives to the coefficients fitted from `var_name`
    coef_name = '{}_polyfit_coefficients'.format(var_name)

    #select this calendar month within the analysis period (original datetime axis)
    month_data_ = data.sel(time=data['time.month']==month_).sel(time=slice(str(start_yr), str(end_yr)))

    #work on a copy so that month_data_ preserves the original time dimension
    month_data = month_data_.copy()
    if var_name in ['CDR', 'BT', 'NT']:
        #only keep the variable being detrended, drop the other two
        month_data = month_data.drop_vars([var_ for var_ in ['CDR', 'BT', 'NT'] if var_ != var_name])

    #change the time to whole numbers (years) for ease of trend calculations
    month_data['time'] = np.arange(start_yr,end_yr+1)
    years = month_data['time']

    n_keep = len(years) - len_rand
    if n_keep < 2:
        raise ValueError('len_rand={} leaves {} of {} years, at least 2 are needed to fit a trend'.format(
            len_rand, n_keep, len(years)))

    ##############################################################################################
    #calculate the linear trend coefficients and the corresponding values each year for the individual trend
    #the year coordinate broadcasts against the per-cell coefficients, so no dense matrix of
    #year values has to be allocated
    ind_coefs = month_data.polyfit(dim='time', deg=1, skipna=True)
    ind_trend = years * ind_coefs.sel(degree=1) + ind_coefs.sel(degree=0)

    #now member data and the trends are in the same time coordinates, compute anomalies
    detrended_ind = month_data[var_name] - ind_trend
    detrended_ind['time'] = month_data_['time'] #now calculations have taken place revert to the original time coordinates

    #adjust the trend so that it only contains physically possible values (0-100%)
    detrended_ind_adj = month_data - _limit_trend_to_physical(ind_trend[coef_name])
    detrended_ind_adj['time'] = month_data_['time'] #revert to the original time coordinates

    ##############################################################################################
    #compute a jackknife of each grid cell, note all grid cells in the month have the same years deleted
    #the retained years are drawn from the data's own year axis rather than hard-coded years
    jackknife_data = [
        month_data.sel(time=np.sort(np.random.choice(years.values, size=n_keep, replace=False)))
        for _ in range(jackknife_n)
    ]

    jackknife_data = xr.concat((jackknife_data), dim='jackknife')
    jackknife_data['jackknife'] = np.arange(1, jackknife_n+1)

    #calculate the linear trends, one per jackknife sample
    jak_coefs = jackknife_data.polyfit(dim='time', deg=1, skipna=True)
    jak_trend = years * jak_coefs.sel(degree=1) + jak_coefs.sel(degree=0)

    #now member data and the trends are in the same time coordinates, compute anomalies
    detrended_jak = month_data[var_name] - jak_trend
    detrended_jak['time'] = month_data_['time'] #now calculations have taken place revert to the original time coordinates

    #adjust the trend so that it only contains physically possible values (0-100%)
    detrended_jak_adj = month_data - _limit_trend_to_physical(jak_trend[coef_name])
    detrended_jak_adj['time'] = month_data_['time'] #revert to the original time coordinates

    return detrended_ind, detrended_ind_adj, detrended_jak, detrended_jak_adj