# Calculate persistence correlation coefficients as a benchmark of predictability

### Author - Chris Wyburn-Powell, see the latest version on [github](https://github.com/chrisrwp/low-frequency-variability/blob/main/null_model/Null_model_persistence.ipynb)


**Input:**
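- Regional sea ice concentration (SIC) 1920-2014 for all CMIP6 GCMs which appear in the CVDP historical groups A, B, C. Calculations are based on the raw SIC data, the 2-year lowpass filtered data, and the ensemble-mean detrended data. The same calculation is also applied to linearly detrended, 2-year lowpass filtered HadISST1 observations for 1960-2022.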

**Output:**
- Pearson correlation coefficients for all GCM members, by region, month of the year, and lag time of 1-20 years

In [1]:
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
import datetime
import dask

In [2]:
# Metadata file for the CMIP6 models; the cells below read its 'model' and
# 'doi' variables.
cmip6_info_path = (
    '/glade/work/cwpowell/low-frequency-variability/raw_data/CMIP6_info/'
    'CMIP6_modeling_center_members_doi.nc'
)
CMIP6_info = xr.open_dataset(cmip6_info_path)

## Compute persistence correlation based on raw SIC data (not lowpass filtered)

In [7]:
# Persistence of the raw regional data: for every model, member, region and
# calendar month, correlate the 1920 to (2014 - lag) series with the same
# series shifted forward by `lag` years, for lags of 1-20 years.
var_ = 'regional_SIC'

# Regions, calendar months and lags (in years) for which r is computed
region_ids = [1,2,3,4,5,6,11]
month_ids = np.arange(1,13)
lag_years = np.arange(1,21)

# Correlation coefficients keyed by model name; each entry is indexed by
# member, region, month and lag
corr_SIC = {}

for model_name in CMIP6_info['model'].drop_sel(model=['CAS-ESM2-0']).values:
    print(datetime.datetime.now(), model_name)

    try:
        model_data = xr.open_dataset(
            f'/glade/work/cwpowell/low-frequency-variability/raw_data/'
            f'regional_sea_ice_CMIP6/Regional_SIC_SIT_all_mem_{model_name}_'
            f'1850_2014.nc'
        )
    except FileNotFoundError:
        # models without a regional file are skipped rather than aborting
        print(model_name,'FILE NOT FOUND')
        continue

    # These do not depend on member, region or lag, so look them up once per
    # model instead of once per correlation
    month_of_time = model_data['time.month']
    member_ids = np.sort(model_data['member'].values)

    all_mems = []
    for mem_ in member_ids:
        print(datetime.datetime.now(), mem_)
        member_data = model_data[var_].sel(member=mem_)

        all_regions = []
        for region_ in region_ids:
            region_data = member_data.sel(region=region_)

            all_months = []
            for month_ in month_ids:
                # one value per year for this calendar month; shared by
                # every lag below (.sel returns new objects, so relabelling
                # the slices does not modify month_region)
                month_region = region_data.sel(time=month_of_time==month_)

                all_lags = []
                for lag_ in lag_years:
                    #select the base time period
                    sea_ice_init = month_region.sel(
                        time=slice('1920',str(2014-lag_)))
                    sea_ice_init['time'] = range(len(sea_ice_init['time']))
                    #select a lagged time period but use same time
                    #coordinates, so the two series are paired by position
                    sea_ice_lagged = month_region.sel(
                        time=slice(str(1920+lag_),'2014'))
                    sea_ice_lagged['time'] = range(len(sea_ice_lagged['time']))
                    all_lags.append(xr.corr(sea_ice_init, sea_ice_lagged))

                all_lags_xr = xr.concat(all_lags,dim='lag')
                all_lags_xr['lag'] = lag_years
                all_months.append(all_lags_xr)

            all_months_xr = xr.concat(all_months,dim='month')
            all_months_xr['month'] = month_ids
            all_regions.append(all_months_xr)

        all_regions_xr = xr.concat(all_regions,dim='region')
        all_regions_xr['region'] = region_ids
        all_mems.append(all_regions_xr)

    all_mems_xr = xr.concat(all_mems,dim='member')
    all_mems_xr['member'] = member_ids
    corr_SIC[model_name] = all_mems_xr
    
    
            

In [29]:
# Write the correlation coefficients of each model held in corr_SIC to its
# own netCDF file, with descriptive global attributes.
for model_name in sorted(corr_SIC):
    # Extract the DOI as plain text: formatting the DataArray itself would
    # write its full repr into the 'Data source' attribute
    doi_model = str(CMIP6_info['doi'].sel(model=model_name).values)
    model_corr = xr.Dataset({'r_value':corr_SIC[model_name]})
    # NOTE(review): the description and the file name below refer to sea ice
    # thickness (SIT), whereas the cell that fills corr_SIC uses
    # var_ = 'regional_SIC' - confirm which variable was intended.
    model_corr.attrs = {
        'Description': 'Pearson correlation coefficient between sea ice '\
            f'thickness in {model_name} historical forcing 1920-2014 and '\
            'the same data lagged by 1-20 years, by region, lag and member',
        # timezone-aware replacement for the deprecated datetime.utcnow()
        'Timestamp'  : datetime.datetime.now(datetime.timezone.utc).strftime(
            "%H:%M UTC %a %Y-%m-%d"),
        'Data source': '{}, doi:{} . '.format(model_name, doi_model), 
        'Analysis'   : 'https://github.com/chrisrwp/low-frequency-variability/'\
            'blob/main/null_model/Null_model_persistence.ipynb'
    }
    
    model_corr.to_netcdf(
        '/glade/work/cwpowell/low-frequency-variability/null_model/'
        f'Pearson_correlation_SIT_lagged_1_20_years_{model_name}.nc'
    )

## Now compute Pearson correlation coefficient for lowpass filtered data

In [5]:
# Persistence of the detrended, lowpass filtered SIC: for every model, member,
# region and calendar month, correlate the 1920 to (2014 - lag) series with the
# same series shifted forward by `lag` years, then save one file per model.

# Regions, calendar months and lags (in years) for which r is computed
region_ids = [1,2,3,4,5,6,11]
month_ids = np.arange(1,13)
lag_years = np.arange(1,21)

for model_name in CMIP6_info['model'].drop_sel(model=[
    'CAS-ESM2-0','FGOALS-f3-L','FGOALS-g3']).values:
    print(datetime.datetime.now(), model_name)
    
    try:
        model_data = xr.open_dataset(
            f'/glade/work/cwpowell/low-frequency-variability/input_data/'
            f'Regional_SIC_detrended_lowpass_filter_{model_name}_1920_2014.nc'
        )
    except FileNotFoundError:
        # models without a filtered file are skipped rather than aborting
        print(model_name,'FILE NOT FOUND')
        continue

    member_ids = np.sort(model_data['member'].values)

    all_mems = []
    for mem_ in member_ids:
        # each selection is made once at the loop level it depends on,
        # instead of being repeated for every lag
        member_data = model_data['SIC'].sel(member=mem_)

        all_regions = []
        for region_ in region_ids:
            region_data = member_data.sel(region=region_)

            all_months = []
            for month_ in month_ids:
                # shared by every lag below (.sel returns new objects, so
                # relabelling the slices does not modify month_region)
                month_region = region_data.sel(month=month_)

                all_lags = []
                for lag_ in lag_years:
                    #select the base time period
                    sea_ice_init = month_region.sel(
                        year=slice('1920',str(2014-lag_)))
                    sea_ice_init['year'] = range(len(sea_ice_init['year']))
                    #select a lagged time period but use same time
                    #coordinates, so the two series are paired by position
                    sea_ice_lagged = month_region.sel(
                        year=slice(str(1920+lag_),'2014'))
                    sea_ice_lagged['year'] = range(len(sea_ice_lagged['year']))
                    all_lags.append(xr.corr(sea_ice_init, sea_ice_lagged))

                all_lags_xr = xr.concat(all_lags,dim='lag')
                all_lags_xr['lag'] = lag_years
                all_months.append(all_lags_xr)

            all_months_xr = xr.concat(all_months,dim='month')
            all_months_xr['month'] = month_ids
            all_regions.append(all_months_xr)

        all_regions_xr = xr.concat(all_regions,dim='region')
        all_regions_xr['region'] = region_ids
        all_mems.append(all_regions_xr)

    all_mems_xr = xr.concat(all_mems,dim='member')
    all_mems_xr['member'] = member_ids

    # Extract the DOI as plain text: formatting the DataArray itself would
    # write its full repr into the 'Data source' attribute
    doi_model = str(CMIP6_info['doi'].sel(model=model_name).values)
    model_corr = xr.Dataset({'r_value':all_mems_xr})
    model_corr.attrs = {
        'Description': 'Pearson correlation coefficient between sea ice '\
            f'concentration in {model_name} historical forcing 1920-2014 '\
            'with a 2 year lowpass filter. The same data is lagged by 1-20 '\
            'years, evaluated by region and member',
        # timezone-aware replacement for the deprecated datetime.utcnow()
        'Timestamp'  : datetime.datetime.now(datetime.timezone.utc).strftime(
            "%H:%M UTC %a %Y-%m-%d"),
        'Data source': '{}, doi:{} . '.format(model_name, doi_model), 
        'Analysis'   : 'https://github.com/chrisrwp/low-frequency-variability/'\
            'blob/main/null_model/Null_model_persistence.ipynb'
    }
    
    model_corr.to_netcdf(
        '/glade/work/cwpowell/low-frequency-variability/null_model/'
        f'Pearson_correlation_SIC_lagged_1_20_years_lowpass_{model_name}.nc'
    )

## Compute persistence on ensemble mean detrended data

In [4]:
# Persistence of the ensemble-mean detrended SIC: for every listed model,
# member, region and calendar month, correlate the 1920 to (2014 - lag) series
# with the same series shifted forward by `lag` years, then save one file per
# model.

# Regions, calendar months and lags (in years) for which r is computed
region_ids = [1,2,3,4,5,6,11]
month_ids = np.arange(1,13)
lag_years = np.arange(1,21)

for model_name in [
    'CanESM5', 'CESM2-LENS', 'MIROC6', 'GISS-E2-1-G', 'ACCESS-ESM1-5', 
    'IPSL-CM6A-LR', 'MIROC-ES2L', 'MPI-ESM1-2-LR', 'NorCPM1', 'GISS-E2-1-H',
    'EC-Earth3', 'CNRM-CM6-1', 'UKESM1-0-LL'
]:
    print(datetime.datetime.now(), model_name)
    
    try:
        model_data = xr.open_dataset(
            f'/glade/work/cwpowell/low-frequency-variability/input_data/'
            f'Regional_SIC_detrended_ensemble_mean_{model_name}_1920_2014.nc'
        )
    except FileNotFoundError:
        # models without a detrended file are skipped rather than aborting
        print(model_name,'FILE NOT FOUND')
        continue

    member_ids = np.sort(model_data['member'].values)

    all_mems = []
    for mem_ in member_ids:
        # each selection is made once at the loop level it depends on,
        # instead of being repeated for every lag
        member_data = model_data['SIC'].sel(member=mem_)

        all_regions = []
        for region_ in region_ids:
            region_data = member_data.sel(region=region_)

            all_months = []
            for month_ in month_ids:
                # shared by every lag below (.sel returns new objects, so
                # relabelling the slices does not modify month_region)
                month_region = region_data.sel(month=month_)

                all_lags = []
                for lag_ in lag_years:
                    #select the base time period
                    sea_ice_init = month_region.sel(
                        year=slice('1920',str(2014-lag_)))
                    sea_ice_init['year'] = range(len(sea_ice_init['year']))
                    #select a lagged time period but use same time
                    #coordinates, so the two series are paired by position
                    sea_ice_lagged = month_region.sel(
                        year=slice(str(1920+lag_),'2014'))
                    sea_ice_lagged['year'] = range(len(sea_ice_lagged['year']))
                    all_lags.append(xr.corr(sea_ice_init, sea_ice_lagged))

                all_lags_xr = xr.concat(all_lags,dim='lag')
                all_lags_xr['lag'] = lag_years
                all_months.append(all_lags_xr)

            all_months_xr = xr.concat(all_months,dim='month')
            all_months_xr['month'] = month_ids
            all_regions.append(all_months_xr)

        all_regions_xr = xr.concat(all_regions,dim='region')
        all_regions_xr['region'] = region_ids
        all_mems.append(all_regions_xr)

    all_mems_xr = xr.concat(all_mems,dim='member')
    all_mems_xr['member'] = member_ids

    # Extract the DOI as plain text: formatting the DataArray itself would
    # write its full repr into the 'Data source' attribute
    doi_model = str(CMIP6_info['doi'].sel(model=model_name).values)
    model_corr = xr.Dataset({'r_value':all_mems_xr})
    model_corr.attrs = {
        'Description': 'Pearson correlation coefficient between sea ice '\
            f'concentration in {model_name} historical forcing 1920-2014 '\
            'detrended with the ensemble mean. The same data is lagged by '\
            '1-20 years, evaluated by region and member',
        # timezone-aware replacement for the deprecated datetime.utcnow()
        'Timestamp'  : datetime.datetime.now(datetime.timezone.utc).strftime(
            "%H:%M UTC %a %Y-%m-%d"),
        'Data source': '{}, doi:{} . '.format(model_name, doi_model), 
        'Analysis'   : 'https://github.com/chrisrwp/low-frequency-variability/'\
            'blob/main/null_model/Null_model_persistence.ipynb'
    }
    
    model_corr.to_netcdf(
        '/glade/work/cwpowell/low-frequency-variability/null_model/'\
        'Pearson_correlation_SIC_lagged_1_20_years_ensemble_detrended_'\
        f'{model_name}.nc'
    )

2023-02-28 15:21:18.344229 CanESM5
2023-02-28 15:48:23.367934 CESM2-LENS
2023-02-28 16:09:16.365957 MIROC6
2023-02-28 16:24:09.991388 GISS-E2-1-G
2023-02-28 16:36:48.583729 ACCESS-ESM1-5
2023-02-28 16:44:56.709812 IPSL-CM6A-LR
2023-02-28 16:51:33.600177 MIROC-ES2L
2023-02-28 16:58:03.880998 MPI-ESM1-2-LR
2023-02-28 17:04:22.318689 NorCPM1
2023-02-28 17:10:40.774661 GISS-E2-1-H
2023-02-28 17:15:56.607814 EC-Earth3
2023-02-28 17:20:46.541537 CNRM-CM6-1
2023-02-28 17:25:11.663730 UKESM1-0-LL


## Compute persistence on the observations

In [14]:
# Persistence of the observed regional SIC: for every region and calendar
# month, correlate the 1960 to (2022 - lag) series with the same series
# shifted forward by `lag` years, for lags of 1-20 years.
obs_data = xr.open_dataset(
    '/glade/work/cwpowell/low-frequency-variability/input_data/'\
    'Regional_SIC_detrended_lowpass_filter_HadISST1_1950_2022.nc'
)

# Regions, calendar months and lags (in years) for which r is computed
region_ids = [1,2,3,4,5,6,11]
month_ids = np.arange(1,13)
lag_years = np.arange(1,21)

all_regions = []
for region_ in region_ids:
    # each selection is made once at the loop level it depends on, instead
    # of being repeated for every lag
    region_data = obs_data['regional_SIC'].sel(region=region_)

    all_months = []
    for month_ in month_ids:
        # shared by every lag below (.sel returns new objects, so relabelling
        # the slices does not modify month_region)
        month_region = region_data.sel(month=month_)

        all_lags = []
        for lag_ in lag_years:
            #select the base time period
            sea_ice_init = month_region.sel(
                year=slice('1960',str(2022-lag_)))
            sea_ice_init['year'] = range(len(sea_ice_init['year']))
            #select a lagged time period but use same time coordinates, so
            #the two series are paired by position
            sea_ice_lagged = month_region.sel(
                year=slice(str(1960+lag_),'2022'))
            sea_ice_lagged['year'] = range(len(sea_ice_lagged['year']))
            all_lags.append(xr.corr(sea_ice_init, sea_ice_lagged))

        all_lags_xr = xr.concat(all_lags,dim='lag')
        all_lags_xr['lag'] = lag_years
        all_months.append(all_lags_xr)

    all_months_xr = xr.concat(all_months,dim='month')
    all_months_xr['month'] = month_ids
    all_regions.append(all_months_xr)

all_regions_xr = xr.concat(all_regions,dim='region')
all_regions_xr['region'] = region_ids

In [13]:
# Wrap the observational correlation coefficients (all_regions_xr) in a
# Dataset with descriptive global attributes and write it to netCDF.
obs_corr = xr.Dataset({'r_value':all_regions_xr})
obs_corr.attrs = {
    'Description': 'Pearson correlation coefficient between sea ice '\
        f'concentration in HadISST1 1960-2022, linearly detrended and 2-year '\
        'lowpass filter. The same data is lagged by 1-20 years, evaluated by '\
        'region as per the NSIDC MASIE regions.',
    # timezone-aware replacement for the deprecated datetime.utcnow();
    # strftime already returns a str, so no extra conversion is needed
    'Timestamp'  : datetime.datetime.now(datetime.timezone.utc).strftime(
        "%H:%M UTC %a %Y-%m-%d"),
    'Data source': 'HadISST1, doi:10.1029/2002JD002670.', 
    'Analysis'   : 'https://github.com/chrisrwp/low-frequency-variability/'\
        'blob/main/null_model/Null_model_persistence.ipynb'
}

obs_corr.to_netcdf(
    '/glade/work/cwpowell/low-frequency-variability/null_model/Pearson_'\
    'correlation_SIC_lagged_1_20_years_linear_detrended_lowpass_filt_HadISST1.nc'
)