# Reduce CVDP data into purely timeseries data for each model

**Input:**
- Raw CVDP data for all variables 1920-2014 for selected CMIP6 large ensembles following historical forcing and observational datasets

**Method:**
- Extract CVDP variables for each month of the year. Monthly data are simply copied. For seasonal data (e.g. DJF) the seasonal value is used for each month of that season (December, January and February). For annual data each month of the year is derived from the annual dataset, so January and December use the same value.
- The data from each variable and each month are detrended, then passed through a 2-year lowpass Butterworth filter, and then standardized for each member.
- Metadata is added to the variables, which are packaged as a single xarray.Dataset for each model or observational dataset containing all low frequency variables for all available members.

**Output:**
- Relevant variables from CVDP for each member or observational dataset for 1920-2014 in monthly intervals, standardized and excluding high frequency data

In [1]:
import xarray as xr
import matplotlib.pyplot as plt
import numpy as np
import datetime
import scipy.signal as signal
import re
import os

In [2]:
#Observational datasets which have data for 1920-2014.
#HadISST has AMO, IPO, NINO34 and PDO; CERA20C_ERAI and ERA20C_ERAI have
#NAM, NPO, PNA and NPI. No dataset has AMOC.
obs_dataset_names = ['CERA20C_ERAI', 'ERA20C_ERAI', 'HadISST']

#CVDP variable names, grouped by the time resolution they are extracted at
var_extract_monthly = [
    'amo_timeseries_mon',
    'ipo_timeseries_mon',
    'nino34',
    'pdo_timeseries_mon',
]

#seasonal names are prefixes: 'djf', 'mam', 'jja' or 'son' is appended on use
var_extract_seasonally = [
    'nam_timeseries_',
    'npo_timeseries_',
    'pna_timeseries_',
]

var_extract_annually = [
    'amoc_timeseries_ann',
    'npi_ndjfm',
]

#CVDP variable name -> short name used in the output datasets
var_dataset_names = {
    'amo_timeseries_mon': 'AMO',
    'ipo_timeseries_mon': 'IPO',
    'nino34': 'NINO34',
    'pdo_timeseries_mon': 'PDO',
    'amoc_timeseries_ann': 'AMOC',
    'npi_ndjfm': 'NPI',
    'nam_timeseries_': 'NAM',
    'npo_timeseries_': 'NPO',
    'pna_timeseries_': 'PNA',
}

#short name -> description stored in each output variable's attributes
CVDP_var_descriptions = {
    'AMO': 'Atlantic Multi-decadal Oscillation',
    'IPO': 'Interdecadal Pacific Oscillation',
    'NINO34': 'Niño 3.4 index',
    'PDO': 'Pacific Decadal Oscillation',
    'AMOC': 'Atlantic Meridional Overturning Circulation',
    'NPI': 'North Pacific Index',
    'NAM': 'Northern Annular Mode',
    'NPO': 'North Pacific Oscillation',
    'PNA': 'Pacific/North American Teleconnection Pattern',
}

In [3]:
#sorted names of every file in the raw CVDP directory
CMIP6_CVDP_fnames = np.sort(os.listdir(
    '/glade/work/cwpowell/low-frequency-variability/raw_data/'
    'CMIP6_CVDP_historical/'))

#the model name is the text before the '_r...' part of the file name; file
#names without that pattern (the observational records) are skipped
model_names = []
for fname in CMIP6_CVDP_fnames:
    name_match = re.search('.*?(?=_r.*.cvdp)', fname)
    if name_match is not None:
        model_names.append(name_match.group(0))

#one entry per model, however many members it has
model_names = np.unique(model_names)

In [4]:
#datetime64 mid-month dates from 1920-01 to 2014-12, read from NetCDF
time_seqential = xr.open_dataarray(
    '/glade/work/cwpowell/low-frequency-variability/raw_data/'
    'datetime64_1920_2014_monthly.nc')

### Define a function for extracting the relevant variables from model CVDP data and create monthly data

In [5]:
def extract_CVDP(model_name):
    '''
    Extract the CVDP variables used in the analysis for every member of a
    model and give each of them one value per month.

    The variables are listed outside of this function in var_extract_monthly,
    var_extract_seasonally and var_extract_annually. Monthly variables are
    copied, annual variables are repeated for all 12 months of the year and
    seasonal variables are repeated for the months of their season. Relies on
    CMIP6_CVDP_fnames, var_dataset_names and time_seqential already being
    defined.

    Parameters
    ----------
    model_name : str
        The model_name as it appears in the CVDP files e.g. 'CanESM5'

    Returns
    ----------
        An xarray.Dataset with dimensions (member, time) holding the subset of
        CVDP variables, renamed with var_dataset_names, for each month of
        1920-2014. Monthly and annual variables missing from a member's file
        are filled with NaN.

    Raises
    ----------
    ValueError
        If no CVDP file is found for model_name
    KeyError
        If a seasonal variable is missing from a member's file
    '''
    data_dir = ('/glade/work/cwpowell/low-frequency-variability/raw_data/'
                'CMIP6_CVDP_historical/')

    #the realization (variant label) is the part of the file name between
    #'<model_name>_' and '.cvdp'; matching on the start of the file name stops
    #a model name that is the tail of another model's name from matching
    prefix = model_name + '_'
    realization_list = np.sort([
        fname[len(prefix):].split('.cvdp')[0]
        for fname in CMIP6_CVDP_fnames
        if fname.startswith(prefix) and '.cvdp' in fname
    ])

    if len(realization_list) == 0:
        raise ValueError(
            'No CVDP files found for model {}'.format(model_name))

    #define datetime arrays with different orders
    #1920-01,1920-02...2014-11, 2014-12
    time_linear = np.arange(np.datetime64('1920-01'),
                            np.datetime64('2015-01'),
                            np.timedelta64(1, 'M')).astype(np.datetime64)

    #1920-01, 1921-01...2014-01, 1920-02, 1921-02....2014-02.......2014-12
    time_single_month = np.concatenate(
        [time_linear[month_idx::12] for month_idx in range(12)])

    #season used for each calendar month, January first. Strictly January and
    #February belong to the DJF of the previous year, but shifting them would
    #give 1921-2015 rather than 1920-2014 which is unhelpful for this analysis
    season_by_month = ('djf', 'djf', 'mam', 'mam', 'mam', 'jja',
                       'jja', 'jja', 'son', 'son', 'son', 'djf')

    def _monthly_series(values, time_axis):
        #attach the time axis to the values and put them in date order
        return xr.DataArray(data=values, coords={'time': time_axis},
                            dims=['time']).sortby('time')

    ######################### run the data extraction ##########################
    CVDP_model_data = []

    for mem_ in realization_list:
        fpath = '{}{}_{}.cvdp_data.1920-2014.nc'.format(
            data_dir, model_name, mem_)

        #the context manager closes the file once its values have been read,
        #otherwise one file handle per member is left open
        with xr.open_dataset(fpath, decode_times=False) as CVDP_mem:
            #the extracted series below are rebuilt on their own time axes
            CVDP_mem['time'] = time_seqential

            CVDP_mem_dict = {}

            for var_name in var_extract_monthly:
                try:
                    #save the monthly variables without alteration
                    values = CVDP_mem[var_name].values.ravel()
                except KeyError: #variable missing
                    values = np.full(len(time_linear), np.nan)

                CVDP_mem_dict[var_name] = _monthly_series(values, time_linear)

            for var_name in var_extract_annually:
                try:
                    #every month of the year takes the annual value
                    values = np.tile(CVDP_mem[var_name].values.ravel(), 12)
                except KeyError: #variable missing, usually AMOC
                    values = np.full(len(time_single_month), np.nan)

                CVDP_mem_dict[var_name] = _monthly_series(values,
                                                          time_single_month)

            for var_name in var_extract_seasonally:
                #a missing seasonal variable is not caught here
                seasonal = {
                    season: CVDP_mem[var_name+season].values.ravel()
                    for season in ('djf', 'mam', 'jja', 'son')
                }
                values = np.concatenate(
                    [seasonal[season] for season in season_by_month])

                CVDP_mem_dict[var_name] = _monthly_series(values,
                                                          time_single_month)

        CVDP_model_data.append(xr.Dataset(CVDP_mem_dict))

    xr_CVDP_model_data = xr.concat(CVDP_model_data, dim='member')

    #swap the CVDP variable names for the short names used in the analysis
    xr_CVDP_model_data = xr_CVDP_model_data.rename(
        {var_name: var_dataset_names[var_name]
         for var_name in list(xr_CVDP_model_data.keys())})

    xr_CVDP_model_data['member'] = realization_list

    return xr_CVDP_model_data

### Define a function for detrending, filtering with a lowpass Butterworth filter, and standardizing

In [10]:
def filt_lowpass(data_, sample_freq, cutoff, order, ax_n, detrend=False,
                 standard=False):
    '''
    Filter a time series using a lowpass Butterworth filter, optionally
    detrending it first and standardizing it afterwards.
    Uses scipy.signal.butter and scipy.signal.filtfilt

    Parameters
    ----------
    data_ : n dimensional xarray dataarray or numpy array
        The data to filter
    sample_freq: float,
        The sampling frequency of the input data, typically sample_freq=1
        [year]. Kept for the callers but not used in the calculation, cutoff
        is already expressed relative to the sampling
    cutoff: float,
        The critical frequency handed to scipy.signal.butter, as a fraction
        of the nyquist frequency (itself half the sampling frequency)
        NOTE(review): cutoff=0.25 was documented here as a 2-year lowpass for
        sample_freq=1 (year), but 0.25 of nyquist is 0.125 cycles per sample,
        a period of 8 samples - confirm the intended cutoff
    order: int
        The order of the Butterworth filter, typically 4-6
    ax_n : int
        Which axis to do the filtering on (time). Detrending and
        standardizing are done along the same axis
    detrend: bool
        Whether to detrend the data with a linear trend
    standard: bool
        Whether to standardize the data after filtering

    Returns
    ----------
        Data of the same type and shape as data_ (an xarray dataarray keeps
        its coordinates)
    '''

    if detrend: #detrend the data first
        #NaNs are set to 0 for the trend fit only, multiplying the original
        #data by 0 puts them back in the same places afterwards
        values = np.asarray(data_)
        gap_free = np.where(np.isnan(values), 0, values)
        data_ = (data_ * 0) + signal.detrend(data=gap_free, axis=ax_n)

    b, a = signal.butter(order, cutoff, btype='lowpass') #low pass filter
    #apply the filter forward and backward along a given axis
    filtered = signal.filtfilt(b, a, data_, axis=ax_n)

    #data_ * 0 is a template with the type and coordinates of the input
    filtered_xr = (data_ * 0) + filtered

    if standard: #standardize the data along the filtered axis
        if hasattr(filtered_xr, 'dims'):
            #use the name of the filtered dimension rather than assuming it
            #is called 'time'
            time_dim = filtered_xr.dims[ax_n]
            filtered_xr = (filtered_xr - filtered_xr.mean(time_dim)) \
                          / filtered_xr.std(time_dim)
        else:
            #numpy input: skip NaNs as the xarray reductions do
            mean_ = np.nanmean(filtered_xr, axis=ax_n, keepdims=True)
            std_ = np.nanstd(filtered_xr, axis=ax_n, keepdims=True)
            filtered_xr = (filtered_xr - mean_) / std_

    return filtered_xr

# Extract CMIP6 model variables, lowpass filter, standardize and save to NetCDF

In [32]:
for model_name in model_names:
    print(datetime.datetime.now(), model_name)

    CVDP_extracted = extract_CVDP(model_name)

    #calendar month of every time step, used to pick out one month at a time
    month_of_step = CVDP_extracted['time.month']

    filtered_vars = {}
    for key_ in CVDP_extracted.keys():
        #each calendar month is detrended, filtered with the lowpass filter
        #and standardized as its own series along axis 1 (time)
        per_month = [
            filt_lowpass(
                CVDP_extracted[key_].sel(time=month_of_step == month_),
                1, 0.25, 5, 1, detrend=True, standard=True)
            for month_ in np.arange(1, 13)
        ]

        #put the 12 monthly series back together in date order
        filtered_vars[key_] = xr.concat(per_month, dim='time').sortby('time')

    all_var_data = xr.Dataset(filtered_vars)

    #metadata for the whole dataset
    all_var_data.attrs = {
        'Description': (
            'Standardized and 2 year lowpass filter of low '
            'frequency variables from CVDP (Climate Variability Diagnostics '
            f'Package) for model {model_name} for the period '
            '1920-2014 for each month'),
        'Units'      : 'standardized values',
        'Timestamp'  : datetime.datetime.utcnow().strftime(
            "%H:%M UTC %a %Y-%m-%d"),
        'Data source': 'CVDP doi: 10.1002/2014EO490002.',
        'Analysis'   : (
            'https://github.com/chrisrwp/low-fequency-variability/'
            'imput_data/Reduce_CVDP_datasets.ipynb'),
    }

    #each variable is described by the full name of its climate mode
    for data_var in list(all_var_data.keys()):
        all_var_data[data_var].attrs = {
            'Description': CVDP_var_descriptions[data_var]}

    all_var_data.to_netcdf(
        '/glade/work/cwpowell/low-frequency-variability/'
        'input_data/CVDP_standardized_1920_2014_historical_'
        f'{model_name}.nc')

# Extract observational CVDP data, lowpass filter, standardize and save to NetCDF

In [23]:
def extract_CVDP_obs(obs_name):
    '''
    Extract the CVDP variables used in the analysis for an observational
    dataset and give each of them one value per month.

    The variables are listed outside of this function in var_extract_monthly,
    var_extract_seasonally and var_extract_annually. Monthly variables are
    copied, annual variables are repeated for all 12 months of the year and
    seasonal variables are repeated for the months of their season. The name
    of every variable missing from the file is printed and that variable is
    filled with NaN. Relies on var_dataset_names and time_seqential already
    being defined.

    Parameters
    ----------
    obs_name : str
        The observational dataset name as it appears in the CVDP files e.g.
        'HadISST'

    Returns
    ----------
        An xarray.Dataset with dimension (time) holding the subset of CVDP
        variables, renamed with var_dataset_names, for each month of 1920-2014
    '''

    #define datetime arrays with different orders
    #1920-01,1920-02...2014-11, 2014-12
    time_linear = np.arange(np.datetime64('1920-01'),
                            np.datetime64('2015-01'),
                            np.timedelta64(1, 'M')).astype(np.datetime64)

    #1920-01, 1921-01...2014-01, 1920-02, 1921-02....2014-02.......2014-12
    time_single_month = np.concatenate(
        [time_linear[month_idx::12] for month_idx in range(12)])

    #season used for each calendar month, January first. Strictly January and
    #February belong to the DJF of the previous year, but shifting them would
    #give 1921-2015 rather than 1920-2014 which is unhelpful for this analysis
    season_by_month = ('djf', 'djf', 'mam', 'mam', 'mam', 'jja',
                       'jja', 'jja', 'son', 'son', 'son', 'djf')

    def _monthly_series(values, time_axis):
        #attach the time axis to the values and put them in date order
        return xr.DataArray(data=values, coords={'time': time_axis},
                            dims=['time']).sortby('time')

    ######################### run the data extraction ##########################
    CVDP_obs_dict = {}

    fpath = ('/glade/work/cwpowell/low-frequency-variability/raw_data/'
             'CMIP6_CVDP_historical/{}.cvdp_data.1920-2014.nc'.format(obs_name))

    #the context manager closes the file once its values have been read
    with xr.open_dataset(fpath, decode_times=False) as CVDP_obs:
        try:
            CVDP_obs['time'] #KeyError if the file has no 'time' variable
        except KeyError:
            print(obs_name)
        else:
            #the extracted series below are rebuilt on their own time axes
            CVDP_obs['time'] = time_seqential

        for var_name in var_extract_monthly:
            try:
                #save the monthly variables without alteration
                values = CVDP_obs[var_name].values.ravel()
            except KeyError: #variable missing
                print(var_name)
                values = np.full(len(time_linear), np.nan)

            CVDP_obs_dict[var_name] = _monthly_series(values, time_linear)

        for var_name in var_extract_annually:
            try:
                #every month of the year takes the annual value
                values = np.tile(CVDP_obs[var_name].values.ravel(), 12)
            except KeyError: #variable missing, usually AMOC
                print(var_name)
                values = np.full(len(time_single_month), np.nan)

            CVDP_obs_dict[var_name] = _monthly_series(values,
                                                      time_single_month)

        for var_name in var_extract_seasonally:
            try:
                seasonal = {
                    season: CVDP_obs[var_name+season].values.ravel()
                    for season in ('djf', 'mam', 'jja', 'son')
                }
            except KeyError: #variable missing
                print(var_name)
                values = np.full(len(time_single_month), np.nan)
            else:
                values = np.concatenate(
                    [seasonal[season] for season in season_by_month])

            CVDP_obs_dict[var_name] = _monthly_series(values,
                                                      time_single_month)

    CVDP_obs_xr = xr.Dataset(CVDP_obs_dict)

    #swap the CVDP variable names for the short names used in the analysis
    CVDP_obs_xr = CVDP_obs_xr.rename(
        {var_name: var_dataset_names[var_name]
         for var_name in list(CVDP_obs_xr.keys())})

    return CVDP_obs_xr

In [24]:
for obs_name in obs_dataset_names:
    print(datetime.datetime.now(), obs_name)

    CVDP_extracted = extract_CVDP_obs(obs_name)

    all_var_data = {}
    for key_ in CVDP_extracted.keys():

        all_month_data = []
        for month_ in np.arange(1,13):
            #select the variable data for the month
            unfiltered = CVDP_extracted[key_].sel(
                time=CVDP_extracted['time.month']==month_)

            #filter with a 2 year lowpass filter for each month and
            #standardize. Every month has to be kept: concatenating only the
            #result of the last iteration would leave December alone in the
            #output
            all_month_data.append(filt_lowpass(unfiltered, 1, 0.25, 5, 0,
                                               detrend=True, standard=True))

        #put the 12 monthly series back together in date order
        all_var_data[key_] = xr.concat((all_month_data),
                                       dim='time').sortby('time')

    all_var_data = xr.Dataset(all_var_data)

    all_var_data.attrs = {
        'Description': 'Standardized and 2 year lowpass filter of low '\
            +'frequency variables from CVDP (Climate Variability Diagnostics '\
            +'Package) for observational dataset {} for the '.format(obs_name)\
            +'period 1920-2014 for each month',
        'Units'      : 'standardized values',
        'Timestamp'  : str(datetime.datetime.utcnow().strftime(
            "%H:%M UTC %a %Y-%m-%d")),
        'Data source': 'CVDP doi: 10.1002/2014EO490002.',
        'Analysis'   : 'https://github.com/chrisrwp/low-fequency-variability/'\
            +'imput_data/Reduce_CVDP_datasets.ipynb'
    }

    #each variable is described by the full name of its climate mode
    for data_var in list(all_var_data.keys()):
        all_var_data[data_var].attrs = {
            'Description':CVDP_var_descriptions[data_var]}

    all_var_data.to_netcdf('/glade/work/cwpowell/low-frequency-variability/'\
        +'input_data/CVDP_standardized_1920_2014_{}.nc'.format(obs_name))

2022-07-14 11:03:44.378371 CERA20C_ERAI
CERA20C_ERAI
amo_timeseries_mon
ipo_timeseries_mon
nino34
pdo_timeseries_mon
amoc_timeseries_ann
2022-07-14 11:03:46.068678 ERA20C_ERAI
ERA20C_ERAI
amo_timeseries_mon
ipo_timeseries_mon
nino34
pdo_timeseries_mon
amoc_timeseries_ann
2022-07-14 11:03:47.629995 HadISST
amoc_timeseries_ann
npi_ndjfm
nam_timeseries_
npo_timeseries_
pna_timeseries_
