# Reduce CVDP data into timeseries data for each model and observations

### Author - Chris Wyburn-Powell, see the latest version on [github](https://github.com/chrisrwp/low-frequency-variability/blob/main/input_data/Reduce_CVDP_datasets.ipynb)

**Input:**
- Raw CVDP data for all variables 1920-2014 (or 1970-2014) for all available CMIP6 GCMs (69) and historical realizations (807) and observational datasets from CERA20C_ERAI, ERA20C_ERAI, HadISST

**Method:**
- Extract CVDP variables for each season, averaged for each month within that season, or just selecting the correct season. In the case of annual data this one value is copied for each of the 4 seasons.
- The data from each variable and each month are detrended and standardized for the period 1920-2014 (or 1970-2014).
- Metadata is added to the variables and packaged as a single xarray.Dataset for each model or observational dataset containing all low frequency variables for all available members.

**Output:**
- Relevant variables from CVDP for each member or observational dataset for 1920-2014 (or 1970-2014) in monthly intervals, standardized. NetCDF files in format: `CVDP_standardized_linear_detrended_{start_year}_2014_historical_{model_name}.nc`

In [1]:
import xarray as xr
import matplotlib.pyplot as plt
import numpy as np
import datetime
import scipy.signal as signal
import re
import os

In [2]:
#observational products with data for 1920-2014 and the indices they provide
#  HadISST                   : AMO, IPO, NINO34, PDO
#  CERA20C_ERAI, ERA20C_ERAI : NAM, NPO, PNA, NPI
#AMOC is not available in any of the observational datasets
obs_dataset_names = [
    'CERA20C_ERAI',
    'ERA20C_ERAI',
    'HadISST',
]

#CVDP variable names grouped by the time resolution they are stored at
var_extract_monthly = [
    'amo_timeseries_mon',
    'ipo_timeseries_mon',
    'nino34',
    'pdo_timeseries_mon',
    'atlantic_meridional_mode',
    'atlantic_nino',
    'indian_ocean_dipole',
    'nino12',
    'nino3',
    'nino4',
]

#prefixes only, the season suffix (djf, mam, jja, son) is added on extraction
var_extract_seasonally = [
    'nam_timeseries_',
    'npo_timeseries_',
    'pna_timeseries_',
    'nao_timeseries_',
    'sam_timeseries_',
    'tas_global_avg_',
]

var_extract_annually = [
    'amoc_timeseries_ann',
    'npi_ndjfm',
]

#CVDP variable name (or prefix) -> short name used in the output datasets
var_dataset_names = {
    'amo_timeseries_mon': 'AMO',
    'ipo_timeseries_mon': 'IPO',
    'nino34': 'NINO34',
    'pdo_timeseries_mon': 'PDO',
    'atlantic_meridional_mode': 'AMM',
    'atlantic_nino': 'ATN',
    'indian_ocean_dipole': 'IOD',
    'nino12': 'NINO12',
    'nino3': 'NINO3',
    'nino4': 'NINO4',
    'amoc_timeseries_ann': 'AMOC',
    'npi_ndjfm': 'NPI',
    'nam_timeseries_': 'NAM',
    'npo_timeseries_': 'NPO',
    'pna_timeseries_': 'PNA',
    'nao_timeseries_': 'NAO',
    'sam_timeseries_': 'SAM',
    'tas_global_avg_': 'TAS',
}

#short name -> description stored as an attribute of each output variable
CVDP_var_descriptions = {
    'AMO': 'Atlantic Multi-decadal Oscillation',
    'IPO': 'Interdecadal Pacific Oscillation',
    'NINO34': 'Niño 3.4 index',
    'PDO': 'Pacific Decadal Oscillation',
    'AMM': 'Atlantic Meridional Mode',
    'ATN': 'Atlantic Nino',
    'IOD': 'Indian Ocean Dipole',
    'NINO12': 'Niño 1.2 index',
    'NINO3': 'Niño 3 index',
    'NINO4': 'Niño 4 index',
    'AMOC': 'Atlantic Meridional Overturning Circulation',
    'NPI': 'North Pacific Index',
    'NAM': 'Northern Annular Mode',
    'NPO': 'North Pacific Oscillation',
    'PNA': 'Pacific/North American Teleconnection Pattern',
    'NAO': 'North Atlantic Oscillation',
    'SAM': 'Southern Annular Mode',
    'TAS': 'Global Near-Surface Air Temperature',
}

In [3]:
#Build the list of model names from the CVDP file listing, with two manual
#adjustments:
#  - CESM2-LENS is added by hand as this GCM has a different naming
#    convention without variant ID
#  - ICON-ESM-LR is removed as this has a columnar format data and only AMOC
#    was able to be processed
CMIP6_CVDP_fnames = np.sort(os.listdir(
    '/glade/work/cwpowell/'
    'low-frequency-variability/raw_data/CMIP6_CVDP_historical/'))

model_names = []
for fname in CMIP6_CVDP_fnames:
    #the model name is the text in front of '_r...cvdp'; file names without
    #that pattern (the observational records) give no match and are skipped
    prefix_matches = re.findall('.*?(?=_r.*.cvdp)', fname)
    if prefix_matches:
        model_names.append(prefix_matches[0])

model_names = np.unique(model_names)
model_names = np.append(model_names, 'CESM2-LENS') #add CESM2-LENS
model_names = model_names[model_names != 'ICON-ESM-LR'] #remove ICON-ESM-LR

In [4]:
#mid-month dates as datetime64, one per month from 1920-01 to 2014-12
time_seqential = xr.open_dataarray(
    '/glade/work/cwpowell/'
    'low-frequency-variability/raw_data/datetime64_1920_2014_monthly.nc'
)

### Define a function for extracting the relevant variables from model CVDP data and create monthly data

In [None]:
#Sanity check: for every model, print the members whose CVDP file does not
#contain the monthly AMO timeseries
for model_name in model_names:
    # print('\n',model_name)
    missing = []
    present = []

    #variant label = text between the first '_' and '.cvdp' of every file
    #name that contains '<model_name>_'
    realization_list = np.sort([
        re.findall('(?<=_).*?(?=.cvdp)', fname)[0]
        for fname in CMIP6_CVDP_fnames if model_name+'_' in fname
    ])

    for mem_ in realization_list:
        member_path = '/glade/work/cwpowell/'\
            +'low-frequency-variability/raw_data/CMIP6_CVDP_historical/'\
            +'{}_{}'.format(model_name, mem_)\
            +'.cvdp_data.1920-2014.nc'

        #the context manager closes the file as soon as it has been
        #inspected, otherwise one handle per member stays open
        with xr.open_dataset(member_path,
                             decode_times=False) as test_CVDP_mem:
            has_amo = 'amo_timeseries_mon' in test_CVDP_mem.data_vars

        if has_amo:
            present.append(mem_)
        else:
            missing.append(mem_)

    if len(missing) > 0:
        print(model_name, len(missing), missing)

In [7]:
def extract_CVDP(model_name):
    '''
    Extract all CVDP variables deemed useful to the analysis, these are listed
    outside of this function in var_extract_monthly, var_extract_seasonally,
    var_extract_annually. Create a monthly value for each variable, even if
    the data is recorded seasonally or annually: seasonal values are copied
    to the three months of their season and annual values to all 12 months.
    Relies on CMIP6_CVDP_fnames, time_seqential and var_dataset_names already
    being defined.

    Parameters
    ----------
    model_name : str
        The model_name as it appears in the CVDP files e.g. 'CanESM5'

    Returns
    ----------
    xarray.Dataset
        The subset of the CVDP variables, renamed using var_dataset_names,
        for each month of 1920-2014 and with one entry along 'member' for
        each realization. Monthly or annual variables missing from a file
        are filled with NaN.

    Raises
    ----------
    ValueError
        If no file name in CMIP6_CVDP_fnames contains '<model_name>_'
    KeyError
        If a member file does not contain one of the seasonal variables
    '''

    def on_time_axis(values, time_axis):
        #wrap a 1D series in a DataArray and put it in chronological order
        return xr.DataArray(
            data = values,
            coords = {'time':time_axis},
            dims = ['time']
        ).sortby('time')

    #make a list of the realizations (variant labels) available for CVDP
    realization_list = np.sort([
        re.findall('(?<=_).*?(?=.cvdp)', fname)[0]
        for fname in CMIP6_CVDP_fnames if model_name+'_' in fname
    ])

    if len(realization_list) == 0:
        #xr.concat would raise the same exception type further down, but
        #without saying which model was the problem
        raise ValueError(
            'no CVDP files found for model {}'.format(model_name))

    #define datetime arrays with different orders
    #1920-01,1920-02...2014-11, 2014-12
    time_linear = np.arange(np.datetime64('1920-01'),
                            np.datetime64('2015-01'),
                            np.timedelta64(1, 'M')).astype(np.datetime64)

    #1920-01, 1921-01...2014-01, 1920-02, 1921-02....2014-02.......2014-12
    time_single_month = np.concatenate([
        np.arange(np.datetime64('1920-{}'.format(str(month_).zfill(2))),
                  np.datetime64('2015-{}'.format(str(month_).zfill(2))),
                  np.timedelta64(1, 'Y')
                 ).astype(np.datetime64)
        for month_ in np.arange(1,13)
    ])

    #season suffix of the CVDP variable used for each calendar month.
    #stricly 1 and 2 should be the following year, but there wouldn't be
    #enough data points for 1920-2014, but 1921-2015 instead which is
    #unhelpful for this analysis
    season_of_month = {12:'djf', 1:'djf', 2:'djf',
                       3:'mam', 4:'mam', 5:'mam',
                       6:'jja', 7:'jja', 8:'jja',
                       9:'son', 10:'son', 11:'son'}

    ######################### run the data extraction ##########################
    CVDP_model_data = []

    for mem_ in realization_list:
        member_path = '/glade/work/cwpowell/'\
            +'low-frequency-variability/raw_data/CMIP6_CVDP_historical/'\
            +'{}_{}'.format(model_name, mem_)\
            +'.cvdp_data.1920-2014.nc'

        CVDP_mem_dict = {}

        #everything needed is copied into numpy arrays inside the with block
        #so the file can be closed before moving on to the next member
        with xr.open_dataset(member_path, decode_times=False) as CVDP_mem:
            CVDP_mem['time'] = time_seqential

            for var_name in var_extract_monthly:
                try:
                    #save the monthly variables without alteration
                    monthly_values = np.ravel(CVDP_mem[var_name].values)
                except KeyError: #variable missing
                    # print(mem_, var_name)
                    monthly_values = np.full(len(time_linear), np.nan)

                CVDP_mem_dict[var_name] = on_time_axis(
                    monthly_values, time_linear)

            for var_name in var_extract_annually:
                try:
                    #the same annual series is repeated for all 12 months
                    annual_values = np.tile(
                        np.ravel(CVDP_mem[var_name].values), 12)
                except KeyError: #variable missing, usually AMOC
                    # print(mem_, var_name)
                    annual_values = np.full(len(time_single_month), np.nan)

                CVDP_mem_dict[var_name] = on_time_axis(
                    annual_values, time_single_month)

            for var_name in var_extract_seasonally:
                #one copy of the season's series for each of its months, in
                #the month-by-month order of time_single_month
                seasonal_values = np.concatenate([
                    np.ravel(
                        CVDP_mem[var_name+season_of_month[month_]].values)
                    for month_ in range(1,13)
                ])

                CVDP_mem_dict[var_name] = on_time_axis(
                    seasonal_values, time_single_month)

        CVDP_model_data.append(xr.Dataset(CVDP_mem_dict))

    xr_CVDP_model_data = xr.concat(CVDP_model_data, dim='member')

    #swap the CVDP variable names for the short names used in the analysis
    xr_CVDP_model_data = xr_CVDP_model_data.rename(
        {var_name:var_dataset_names[var_name]
         for var_name in xr_CVDP_model_data.data_vars})

    xr_CVDP_model_data['member'] = realization_list

    return(xr_CVDP_model_data)

## Compute seasonal CVDP data, linear detrend and standardize

In [6]:
def detrend_sandardize(data_, ax_n, standard=False):
    '''
    Detrend data along one axis with scipy.signal.detrend and optionally
    standardize the result along 'time'.

    Parameters
    ----------
    data_ : xarray.DataArray
        Data to detrend. Must have a 'time' dimension if standard is True
    ax_n : int
        Number of the axis along which the trend is removed
    standard : bool
        If True, additionally subtract the mean over 'time' and divide by the
        standard deviation over 'time'

    Returns
    ----------
        The detrended (and, if requested, standardized) data, with the same
        missing values as data_
    '''
    #missing values are replaced by 0 for the trend fit (so they take part in
    #it) and are restored afterwards by adding data_*0, which is NaN wherever
    #data_ is NaN
    detrended = (data_ * 0) + signal.detrend(data=data_.fillna(0), axis=ax_n)

    if standard: #standardize the data
        detrended = (detrended - detrended.mean('time')) \
            / detrended.std('time')

    #previously nothing was assigned to return when standard was False
    return(detrended)

In [None]:
#For every model: average the monthly data into seasons, linearly detrend and
#standardize each season over 1970-2014, add metadata and save to NetCDF
for model_name in model_names:
    print(datetime.datetime.now(), model_name)

    CVDP_extracted = extract_CVDP(model_name)
    month_of_time = CVDP_extracted['time.month']

    all_var_data = {}
    for key_ in CVDP_extracted.keys():

        all_season_data = []
        for season_months in [[12,1,2],[3,4,5],[6,7,8],[9,10,11]]:
            #select the variable data for each month of the season
            raw_months = [
                CVDP_extracted[key_].sel(time=month_of_time==month_)
                for month_ in season_months
            ]

            #take the monthly average for the season. The three months have
            #different time stamps, so they are first put on the time stamps
            #of the middle month; concatenating them as they are would pad
            #each month with NaN at the other months' time stamps and the
            #"average" would just return the middle month. Months are
            #paired by position, so December is grouped with January and
            #February of the same calendar year and all years are kept.
            season_time = raw_months[1]['time'].values
            raw_seaon_av = xr.concat(
                [month_data.assign_coords(time=season_time)
                 for month_data in raw_months],
                dim='season_month').mean('season_month')

            #linearly detrend and standardize the seasonal average over time
            all_season_data.append(
                detrend_sandardize(raw_seaon_av.sel(time=slice('1970','2014')),
                                   raw_seaon_av.get_axis_num('time'),
                                   standard=True)
            )

        all_var_data[key_] = xr.concat(
            (all_season_data),dim='time').sortby('time')

    all_var_data = xr.Dataset(all_var_data)

    all_var_data.attrs = {
        'Description': 'Linearly detrended and standardized variables from '\
            +'the CVDP (Climate Variability Diagnostics Package) for global '\
            +f'climate model {model_name}, seasonally for 1970-2014.',
        'Units'      : 'standardized values',
        #timezone-aware replacement for the deprecated datetime.utcnow(),
        #the formatted text is the same
        'Timestamp'  : datetime.datetime.now(datetime.timezone.utc).strftime(
            "%H:%M UTC %a %Y-%m-%d"),
        'Data source': 'CVDP doi: 10.1002/2014EO490002.',
        'Analysis'   : 'https://github.com/chrisrwp/low-frequency-variability/'\
            +'blob/main/input_data/Reduce_CVDP_datasets.ipynb'
    }

    for data_var in list(all_var_data.keys()):
        all_var_data[data_var].attrs = {
            'Description':CVDP_var_descriptions[data_var]}

    all_var_data.to_netcdf('/glade/work/cwpowell/low-frequency-variability/'\
        +'input_data/CVDP_standardized_linear_detrended_1970_2014_'\
        +f'historical_{model_name}.nc')

# Extract observational CVDP data, linear detrend and standardize

In [61]:
#for observations, use SST not TAS. The membership checks make this cell safe
#to run more than once: a bare remove() raises ValueError on the second run
if 'tas_global_avg_' in var_extract_seasonally:
    var_extract_seasonally.remove('tas_global_avg_')
if 'sst_global_avg_' not in var_extract_seasonally:
    var_extract_seasonally.append('sst_global_avg_')

#the SST average is stored under the same short name as TAS
var_dataset_names['sst_global_avg_'] = 'TAS'

In [62]:
def extract_CVDP_obs(obs_name):
    '''
    Extract all CVDP variables deemed useful to the analysis, these are listed
    outside of this function in var_extract_monthly, var_extract_seasonally,
    var_extract_annually. Create a monthly value for each variable, even if
    the data is recorded seasonally or annually: seasonal values are copied
    to the three months of their season and annual values to all 12 months.
    Relies on time_seqential and var_dataset_names already being defined.
    The name of every variable that is missing from the file is printed.

    Parameters
    ----------
    obs_name : str
        The observational dataset name as it appears in the CVDP files e.g.
        'HadISST'

    Returns
    ----------
    xarray.Dataset
        The subset of the CVDP variables, renamed using var_dataset_names,
        for each month of 1920-2014. Variables missing from the file are
        filled with NaN.
    '''

    def on_time_axis(values, time_axis):
        #wrap a 1D series in a DataArray and put it in chronological order
        return xr.DataArray(
            data = values,
            coords = {'time':time_axis},
            dims = ['time']
        ).sortby('time')

    #define datetime arrays with different orders
    #1920-01,1920-02...2014-11, 2014-12
    time_linear = np.arange(np.datetime64('1920-01'),
                            np.datetime64('2015-01'),
                            np.timedelta64(1, 'M')).astype(np.datetime64)

    #1920-01, 1921-01...2014-01, 1920-02, 1921-02....2014-02.......2014-12
    time_single_month = np.concatenate([
        np.arange(np.datetime64('1920-{}'.format(str(month_).zfill(2))),
                  np.datetime64('2015-{}'.format(str(month_).zfill(2))),
                  np.timedelta64(1, 'Y')
                 ).astype(np.datetime64)
        for month_ in np.arange(1,13)
    ])

    #season suffix of the CVDP variable used for each calendar month.
    #stricly 1 and 2 should be the following year, but there wouldn't be
    #enough data points for 1920-2014, but 1921-2015 instead which is
    #unhelpful for this analysis
    season_of_month = {12:'djf', 1:'djf', 2:'djf',
                       3:'mam', 4:'mam', 5:'mam',
                       6:'jja', 7:'jja', 8:'jja',
                       9:'son', 10:'son', 11:'son'}

    ######################### run the data extraction ##########################
    CVDP_obs_dict = {}

    obs_path = '/glade/work/cwpowell/'\
        +'low-frequency-variability/raw_data/CMIP6_CVDP_historical/'\
        +'{}.cvdp_data.1920-2014.nc'.format(obs_name)

    #everything needed is copied into numpy arrays inside the with block so
    #the file is closed again when the function returns
    with xr.open_dataset(obs_path, decode_times=False) as CVDP_obs:
        try:
            #the lookup raises KeyError for a file without 'time', in which
            #case the dataset name is reported and 'time' is left alone
            _ = CVDP_obs['time']
            CVDP_obs['time'] = time_seqential
        except KeyError:
            print(obs_name)

        for var_name in var_extract_monthly:
            try:
                #save the monthly variables without alteration
                monthly_values = np.ravel(CVDP_obs[var_name].values)
            except KeyError: #variable missing
                print(var_name)
                monthly_values = np.full(len(time_linear), np.nan)

            CVDP_obs_dict[var_name] = on_time_axis(
                monthly_values, time_linear)

        for var_name in var_extract_annually:
            try:
                #the same annual series is repeated for all 12 months
                annual_values = np.tile(
                    np.ravel(CVDP_obs[var_name].values), 12)
            except KeyError: #variable missing, usually AMOC
                print(var_name)
                annual_values = np.full(len(time_single_month), np.nan)

            CVDP_obs_dict[var_name] = on_time_axis(
                annual_values, time_single_month)

        for var_name in var_extract_seasonally:
            try:
                #one copy of the season's series for each of its months, in
                #the month-by-month order of time_single_month
                seasonal_values = np.concatenate([
                    np.ravel(
                        CVDP_obs[var_name+season_of_month[month_]].values)
                    for month_ in range(1,13)
                ])
            except KeyError: #variable missing
                print(var_name)
                seasonal_values = np.full(len(time_single_month), np.nan)

            CVDP_obs_dict[var_name] = on_time_axis(
                seasonal_values, time_single_month)

    CVDP_obs_xr = xr.Dataset(CVDP_obs_dict)

    #swap the CVDP variable names for the short names used in the analysis
    CVDP_obs_xr = CVDP_obs_xr.rename(
        {var_name:var_dataset_names[var_name]
         for var_name in CVDP_obs_xr.data_vars})

    return(CVDP_obs_xr)

In [None]:
#For every observational dataset: linearly detrend and standardize each
#calendar month separately, add metadata and save to NetCDF
for obs_name in obs_dataset_names:
    print(datetime.datetime.now(), obs_name)

    CVDP_extracted = extract_CVDP_obs(obs_name)
    month_of_time = CVDP_extracted['time.month']

    all_var_data = {}
    for key_ in CVDP_extracted.keys():

        all_month_data = []
        for month_ in np.arange(1,13):
            #select the variable data for the month
            unfiltered = CVDP_extracted[key_].sel(time=month_of_time==month_)

            #detrend and standardize
            all_month_data.append(detrend_sandardize(unfiltered, 0,
                                                     standard=True))

        all_var_data[key_] = xr.concat((all_month_data),
                                       dim='time').sortby('time')

    all_var_data = xr.Dataset(all_var_data)

    all_var_data.attrs = {
        'Description': 'Standardized and linearly detrended low '\
            +'frequency variables from CVDP (Climate Variability Diagnostics '\
            +'Package) for observational dataset {} for the '.format(obs_name)\
            +'period 1920-2014 for each month',
        'Units'      : 'standardized values',
        #timezone-aware replacement for the deprecated datetime.utcnow(),
        #the formatted text is the same
        'Timestamp'  : datetime.datetime.now(datetime.timezone.utc).strftime(
            "%H:%M UTC %a %Y-%m-%d"),
        'Data source': 'CVDP doi: 10.1002/2014EO490002.',
        #the link previously had typos (low-fequency, imput_data) and lacked
        #the blob/main/ part, so it did not point at the notebook
        'Analysis'   : 'https://github.com/chrisrwp/low-frequency-variability/'\
            +'blob/main/input_data/Reduce_CVDP_datasets.ipynb'
    }

    for data_var in list(all_var_data.keys()):
        all_var_data[data_var].attrs = {
            'Description':CVDP_var_descriptions[data_var]}

    all_var_data.to_netcdf('/glade/work/cwpowell/low-frequency-variability/'\
        +'input_data/CVDP_standardized_linear_detrended_1920_2014_'\
        +'{}_all_var.nc'.format(obs_name))