# Extract both SIC and CVDP data from the Pre-Industrial Control Runs

### Author - Chris Wyburn-Powell, see the latest version on [github](https://github.com/chrisrwp/low-frequency-variability/blob/main/input_data/Extract_SIC_CVDP_from_PI_Control.ipynb)

**Input:**
- Raw `siconc` or `siconca` files and CVDP data for all variables 1920-2014 (or 1970-2014) for all available CMIP6 GCMs (69) and historical realizations (807) and observational datasets from CERA20C_ERAI, ERA20C_ERAI, HadISST

**Method:**
- For the sea ice concentration data, compute regional area time series for each GCM
- For the CVDP data, extract only the time series of the necessary climate modes of variability
- Separate the SIC and CVDP time series into 95 year chunks to act like members from the historical period 1920-2014 and save these data to NetCDF
- Standardize the data and collect for easy processing in the neural network

**Output:**
- Return individual NetCDF files of the regional sea ice area, as well as consolidated files for all GCMs for the SIC: `Regional_SIC_lowpass_filter_PI_Control_MMLE_500_first_3_train.nc` and for the CVDP: `CVDP_standardized_PI_Control_MMLE_500_first_3_train.nc`

In [1]:
import xarray as xr
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.signal as signal
import datetime
import cftime
import os
import pickle
import glob
import re
import dask
print(datetime.datetime.now())

2023-05-10 14:12:02.373018


In [109]:
from dask_jobqueue import PBSCluster
from dask.distributed import Client

#keyword arguments handed to PBSCluster
pbs_cluster_options = {
    'cores': 1,
    'memory': '10GB',
    'queue': 'casper',
    'walltime': '00:07:00',
    'project': 'UCUB0084',
}
cluster = PBSCluster(**pbs_cluster_options)

#request 16 workers and connect a client to the cluster
cluster.scale(16)
client = Client(cluster)
client

Perhaps you already have a cluster running?
Hosting the HTTP server on port 39718 instead
  http_address["port"], self.http_server.port


0,1
Client  Scheduler: tcp://10.12.206.49:44346  Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/cwpowell/proxy/39718/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


## Compute the SIC extraction

In [12]:
#Directory holding the pre-industrial control sea ice concentration files for
#each GCM, keyed by '<source id>_<variant id>'. Each directory is globbed with
#a trailing '*' by the cells below. Entries that are commented out are not
#processed.
PI_paths = {
    'ACCESS-CM2_r1i1p1f1':'/glade/collections/cmip/CMIP6/CMIP/CSIRO-ARCCSS/ACCESS-CM2/piControl/r1i1p1f1/SImon/siconc/gn/v20191112/siconc/',
    'ACCESS-ESM1-5_r1i1p1f1':'/glade/collections/cmip/CMIP6/CMIP/CSIRO/ACCESS-ESM1-5/piControl/r1i1p1f1/SImon/siconc/gn/v20191214/siconc/',
    'BCC-CSM2-MR_r1i1p1f1':'/glade/collections/cmip/CMIP6/CMIP/BCC/BCC-CSM2-MR/piControl/r1i1p1f1/SImon/siconc/gn/v20200219/siconc/',
    'BCC-ESM1_r1i1p1f1':'/glade/collections/cmip/CMIP6/CMIP/BCC/BCC-ESM1/piControl/r1i1p1f1/SImon/siconc/gn/v20200219/siconc/',
    'CanESM5_r1i1p1f1':'/glade/collections/cmip/CMIP6/CMIP/CCCma/CanESM5/piControl/r1i1p1f1/SImon/siconc/gn/v20190429/siconc/',
    'CanESM5_r1i1p2f1':'/glade/collections/cmip/CMIP6/CMIP/CCCma/CanESM5/piControl/r1i1p2f1/SImon/siconc/gn/v20190429/siconc/',
    'CanESM5-CanOE_r1i1p2f1':'/glade/work/cwpowell/low-frequency-variability/raw_data/CMIP6_picontrol_siconc/CanESM5-CanOE/',
    # 'CAS-ESM2-0_r1i1p1f1':'/glade/work/cwpowell/low-frequency-variability/raw_data/CMIP6_picontrol_siconc/CAS-ESM2-0/',
    'CESM2_r1i1p1f1':'/glade/collections/cmip/CMIP6/CMIP/NCAR/CESM2/piControl/r1i1p1f1/SImon/siconc/gn/latest/',
    'CESM2-FV2_r1i1p1f1':'/glade/collections/cmip/CMIP6/CMIP/NCAR/CESM2-FV2/piControl/r1i1p1f1/SImon/siconc/gn/latest/',
    # 'CESM2-WACCM_r1i1p1f1':'/glade/collections/cmip/CMIP6/CMIP/NCAR/CESM2-WACCM/piControl/r1i1p1f1/SImon/siconc/gn/latest/',
    'CESM2-WACCM-FV2_r1i1p1f1':'/glade/collections/cmip/CMIP6/CMIP/NCAR/CESM2-WACCM-FV2/piControl/r1i1p1f1/SImon/siconc/gn/latest/',
    'CMCC-CM2-SR5_r1i1p1f1':'/glade/work/cwpowell/low-frequency-variability/raw_data/CMIP6_picontrol_siconc/CMCC-CM2-SR5/',
    'CMCC-ESM2_r1i1p1f1':'/glade/work/cwpowell/low-frequency-variability/raw_data/CMIP6_picontrol_siconc/CMCC-ESM2/',
    'CNRM-CM6-1_r1i1p1f2':'/glade/work/cwpowell/low-frequency-variability/raw_data/CMIP6_picontrol_siconc/CNRM-CM6-1/',
    'CNRM-ESM2-1_r1i1p1f2':'/glade/collections/cmip/CMIP6/CMIP/CNRM-CERFACS/CNRM-ESM2-1/piControl/r1i1p1f2/SImon/siconc/gn/v20181115/siconc/',
    'E3SM-1-0_r1i1p1f1':'/glade/work/cwpowell/low-frequency-variability/raw_data/CMIP6_picontrol_siconc/E3SM-1-0/',
    'EC-Earth3_r1i1p1f1':'/glade/collections/cmip/CMIP6/CMIP/EC-Earth-Consortium/EC-Earth3/piControl/r1i1p1f1/SImon/siconc/gn/v20200312/siconc/',
    'EC-Earth3-CC_r1i1p1f1':'/glade/work/cwpowell/low-frequency-variability/raw_data/CMIP6_picontrol_siconc/EC-Earth3-CC/',
    'EC-Earth3-Veg_r1i1p1f1':'/glade/collections/cmip/CMIP6/CMIP/EC-Earth-Consortium/EC-Earth3-Veg/piControl/r1i1p1f1/SImon/siconc/gn/v20200226/siconc/',
    'EC-Earth3-Veg-LR_r1i1p1f1':'/glade/work/cwpowell/low-frequency-variability/raw_data/CMIP6_picontrol_siconc/EC-Earth3-Veg-LR/',
    'GFDL-ESM4_r1i1p1f1':'/glade/collections/cmip/CMIP6/CMIP/NOAA-GFDL/GFDL-ESM4/piControl/r1i1p1f1/SImon/siconc/gn/v20180701/siconc/',
    'GFDL-CM4_r1i1p1f1':'/glade/collections/cmip/CMIP6/CMIP/NOAA-GFDL/GFDL-CM4/piControl/r1i1p1f1/SImon/siconc/gn/v20180701/siconc/',
    'GISS-E2-1-G_r1i1p1f1':'/glade/collections/cmip/CMIP6/CMIP/NASA-GISS/GISS-E2-1-G/piControl/r1i1p1f1/SImon/siconca/gn/v20180824/siconca/',
    'GISS-E2-1-H_r1i1p1f1':'/glade/work/cwpowell/low-frequency-variability/raw_data/CMIP6_picontrol_siconc/GISS-E2-1-H/',
    'HadGEM3-GC31-LL_r1i1p1f1':'/glade/work/cwpowell/low-frequency-variability/raw_data/CMIP6_picontrol_siconc/HadGEM3-GC31-LL/',
    'HadGEM3-GC31-MM_r1i1p1f1':'/glade/collections/cmip/CMIP6/CMIP/MOHC/HadGEM3-GC31-MM/piControl/r1i1p1f1/SImon/siconc/gn/v20191204/siconc/',
    'INM-CM4-8_r1i1p1f1':'/glade/collections/cmip/CMIP6/CMIP/INM/INM-CM4-8/piControl/r1i1p1f1/SImon/siconc/gr1/v20190605/siconc/',
    'INM-CM5-0_r1i1p1f1':'/glade/collections/cmip/CMIP6/CMIP/INM/INM-CM5-0/piControl/r1i1p1f1/SImon/siconc/gr1/v20190619/siconc/',
    'IPSL-CM6A-LR_r1i1p1f1':'/glade/collections/cmip/CMIP6/CMIP/IPSL/IPSL-CM6A-LR/piControl/r1i1p1f1/SImon/siconc/gn/v20200326/siconc/',
    'MIROC-ES2L_r1i1p1f2':'/glade/collections/cmip/CMIP6/CMIP/MIROC/MIROC-ES2L/piControl/r1i1p1f2/SImon/siconc/gn/v20190823/siconc/',
    'MIROC6_r1i1p1f1':'/glade/collections/cmip/CMIP6/CMIP/MIROC/MIROC6/piControl/r1i1p1f1/SImon/siconc/gn/v20181212/siconc/',
    'MPI-ESM-1-2-HAM_r1i1p1f1':'/glade/collections/cmip/CMIP6/CMIP/HAMMOZ-Consortium/MPI-ESM-1-2-HAM/piControl/r1i1p1f1/SImon/siconc/gn/v20190627/siconc/',
    'MPI-ESM1-2-HR_r1i1p1f1':'/glade/collections/cmip/CMIP6/CMIP/MPI-M/MPI-ESM1-2-HR/piControl/r1i1p1f1/SImon/siconc/gn/v20190710/siconc/',
    'MPI-ESM1-2-LR_r1i1p1f1':'/glade/collections/cmip/CMIP6/CMIP/MPI-M/MPI-ESM1-2-LR/piControl/r1i1p1f1/SImon/siconc/gn/v20190710/siconc/',
    'NorCPM1_r1i1p1f1':'/glade/work/cwpowell/low-frequency-variability/raw_data/CMIP6_picontrol_siconc/NorCPM1/r1i1p1f1/',
    # 'NorESM2-LM_r1i1p1f1':'/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2-LM/piControl/r1i1p1f1/SImon/siconc/gn/v20190920/siconc/',
    'NorESM2-MM_r1i1p1f1':'/glade/collections/cmip/CMIP6/CMIP/NCC/NorESM2-MM/piControl/r1i1p1f1/SImon/siconc/gn/v20191108/siconc/',
    # 'SAM0-UNICON_r1i1p1f1':'/glade/collections/cmip/CMIP6/CMIP/SNU/SAM0-UNICON/piControl/r1i1p1f1/SImon/siconc/gn/v20190910/siconc/',
    'UKESM1-0-LL_r1i1p1f2':'/glade/collections/cmip/CMIP6/CMIP/MOHC/UKESM1-0-LL/piControl/r1i1p1f2/SImon/siconc/gn/v20200427/siconc/',
}

In [16]:
#all three GISS support files live in the same masks directory
masie_masks_dir = '/glade/work/cwpowell/low-frequency-variability/raw_data/'\
    +'masie_masks/'

region_mask_GISS_E2_1_G = xr.open_dataset(
    masie_masks_dir+'masiemask_GISS-E2-1-G.nc')

#the cell areas are stored in this file under the variable name 'areacello'
areacella_GISS_E2_1_G = xr.open_dataset(
    masie_masks_dir+'areacello_GISS-E2-1-G_30N.nc')['areacello']

#load the land variable mrsos as a land mask as sftlf etc. not available
#NOTE(review): the mrsos file comes from GISS-E2-2-G - presumably on the same
#grid as GISS-E2-1-G, confirm
mrsos = xr.open_dataset(
    masie_masks_dir
    +'mrsos_Lmon_GISS-E2-2-G_historical_r1i1p1f1_gn_185001-187512.nc')
mrsos_first_step = mrsos['mrsos'].isel(time=0)

#keep the cell area only where the first mrsos time step is exactly zero
areacella_GISS_E2_1_G = areacella_GISS_E2_1_G.where(mrsos_first_step==0)

In [17]:
#directory of pickled look-up tables describing the CMIP6 grids
CMIP6_info_dir = '/glade/work/cwpowell/low-frequency-variability/raw_data/'\
    +'CMIP6_info/'

def _load_CMIP6_info(pickle_name):
    '''Return the unpickled object stored in CMIP6_info_dir/pickle_name.'''
    #pickle can execute code on load: only use on these locally made files
    with open(CMIP6_info_dir+pickle_name, 'rb') as handle:
        return pickle.load(handle)

lat_names = _load_CMIP6_info('CMIP6_areacello_lat_names.pickle')
areacello_paths = _load_CMIP6_info('CMIP6_areacello_paths.pickle')
x_y_names = _load_CMIP6_info('CMIP6_x_y_names.pickle')

In [18]:
#get start and end years for all GCMs
#file names are expected to end with a 'YYYYMM-YYYYMM.nc' date range; the
#pattern is anchored on that ending rather than on fixed character offsets
date_range_pattern = re.compile(r'(\d{4})\d{2}-(\d{4})\d{2}\.nc$')

start_end_yrs = {}
for model_name, model_dir in PI_paths.items():
    all_files_start = []
    all_files_end = []
    for file_ in glob.glob(model_dir+'*'):
        date_match = date_range_pattern.search(file_)
        if date_match is None:
            raise ValueError(
                f'Cannot read a YYYYMM-YYYYMM date range from {file_}')
        all_files_start.append(int(date_match.group(1)))
        all_files_end.append(int(date_match.group(2)))

    #an empty directory would otherwise fail inside min()/max() with an
    #unhelpful message
    if not all_files_start:
        raise FileNotFoundError(
            f'No files found for {model_name} in {model_dir}')

    #[first start year, last end year] over all files of this GCM
    start_end_yrs[model_name] = [min(all_files_start), max(all_files_end)]

In [107]:
 dates = xr.cftime_range(
    start=str(int(150+(100*file_ith_))).zfill(4), 
    periods=len(to_save['time']), 
    freq='MS', calendar='noleap')

to_save['time'] = dates

to_save.to_netcdf(
    '/glade/work/cwpowell/low-frequency-variability/raw_data/'\
    f'regional_SIC_PI_Control/{model_name}_PI_Control_regional_SIC_'\
    f'{str(int(150+(100*file_ith_)))}_{str(int((150+(100*file_ith_))+50))}.nc')

In [110]:
#output directory for the regional SIC files and input directory of the masks
regional_SIC_out_dir = '/glade/work/cwpowell/low-frequency-variability/'\
    'raw_data/regional_SIC_PI_Control/'
masie_masks_in_dir = '/glade/work/cwpowell/low-frequency-variability/'\
    'raw_data/masie_masks/'

for model_name in ['GFDL-CM4_r1i1p1f1']:#list(PI_paths.keys())[::-1]:
    print(datetime.datetime.now(), model_name)

    #nothing to do if the consolidated file for this GCM already exists; this
    #does not depend on the chunk so it is checked once per GCM
    if os.path.exists(
            f'{regional_SIC_out_dir}{model_name}_PI_Control_regional_SIC.nc'):
        continue

    #model_name without the trailing 9 character '_<variant id>'
    source_id = model_name[:-9]

    #the masks and cell areas are the same for every chunk: load them once
    #load the region masks
    region_mask = xr.open_dataset(
        masie_masks_in_dir+'masiemask_{}.nc'.format(source_id))

    #load the areacello file
    areacello_ = xr.open_dataset(
        masie_masks_in_dir+'areacello_{}_30N.nc'.format(source_id))

    if source_id in ['CESM2','CESM2-FV2','CESM2-WACCM','CESM2-WACCM-FV2']:
        #use the same dimension names as the sea ice files
        region_mask = region_mask.rename({'nlat':'nj', 'nlon':'ni'})
        region_mask = region_mask.drop_vars(['lat_2', 'lon_2'])
        areacello_ = areacello_.rename({'nlat':'nj', 'nlon':'ni'})

    areacello_with_nan = areacello_['areacello']

    #file_ith_ selects the part of the run to process, see regional_calc_dask
    for file_ith_ in [3.5,4,4.5]:

        with dask.config.set(**{'array.slicing.split_large_chunks': True}):
            single_GCM = regional_calc_dask(model_name, areacello_with_nan, 
                                            region_mask, file_ith=file_ith_)

        to_save = single_GCM.compute()

        #first year label of this chunk, used for the time axis and file name
        #(for a full run the axis would instead start at
        #start_end_yrs[model_name][0])
        chunk_start_yr = int(150+(100*file_ith_))

        dates = xr.cftime_range(
            start=str(chunk_start_yr).zfill(4), 
            periods=len(to_save['time']), 
            freq='MS', calendar='noleap')

        to_save['time'] = dates

        to_save.to_netcdf(
            f'{regional_SIC_out_dir}{model_name}_PI_Control_regional_SIC_'\
            f'{chunk_start_yr}_{chunk_start_yr+50}.nc')

2023-05-09 13:44:31.293908 GFDL-CM4_r1i1p1f1


In [105]:
def regional_calc_dask(model_name, areacello_with_nan, region_mask, 
                       file_ith=False):
    
    '''
    Calculate regional SIA, SIC from files of concentration (siconc or 
    siconca) from pre-industrial control runs. Regions based on NSIDC MASIE 
    regions. Also compute the pan-Arctic sea ice area.

    Uses the notebook-level objects PI_paths, x_y_names and (for 
    GISS-E2-1-G only) areacella_GISS_E2_1_G.

    Parameters
    ----------
    model_name: string
        The name of the Source ID and Variant ID e.g. CanESM5_r1i1p1f1
    areacello_with_nan: xarray dataarray
        areacello or areacella with latitudes below 30N set to np.nan
    region_mask: xarray dataset
        must contain the variable 'regions' on the same grid as the sea ice 
        data, with values 1-16 identifying the region of each cell
    file_ith: int or float
        For large PI Control datasets, select only a subsection of the full 
        run, based on the ordered list of files. This should be used for 
        HadGEM3-GC31-MM and GFDL-CM4.
        HadGEM3-GC31-MM: integer index of the first of two consecutive files.
        GFDL-CM4: floor(file_ith) is the index of the single file to open and 
        years 150+100*file_ith to 200+100*file_ith are selected from it.
        Ignored for all other models.

    Returns
    ----------
        xarray.Dataset with variables of:
        regional SIA, regional average SIC, pan-Arctic SIA
    '''  
    ########################### load the data files ############################
    if model_name == 'GISS-E2-1-G_r1i1p1f1':
        var_name = 'siconca'
    elif model_name == 'CESM2-LENS':
        var_name = 'aice'
    else:
        var_name = 'siconc'
    
    #load the SIC data as a list of dataarrays, one per file
    #split up the high resolution PI Controls
    if model_name == 'HadGEM3-GC31-MM_r1i1p1f1': 
        files_to_open = np.sort(
            glob.glob(PI_paths[model_name]+'*'))[file_ith:file_ith+2]
        SIC = [xr.open_dataset(file_, chunks={'time':250})[var_name]
               for file_ in files_to_open]
    elif model_name == 'GFDL-CM4_r1i1p1f1':
        #use the file_ith argument here, not the notebook global file_ith_, 
        #so the function does not depend on the state of the calling loop
        chunk_start_yr = int(150+(100*file_ith))
        SIC_single = xr.open_dataset(
            np.sort(glob.glob(PI_paths[model_name]+'*'))[
                int(np.floor(file_ith))], 
            chunks={'time':250})[var_name]
        #keep this as a one element list so that xr.concat below is always 
        #given a sequence of dataarrays
        SIC = [SIC_single.sel(time=slice(str(chunk_start_yr).zfill(4),
                                         str(chunk_start_yr+50).zfill(4)))]
    else:
        SIC = [xr.open_dataset(file_, chunks={'time':250})[var_name]
               for file_ in glob.glob(PI_paths[model_name]+'*')]
    
    SIC = xr.concat(SIC, dim='time').sortby('time')
    
    #keep concentration only where the cell area is defined (i.e. not below 
    #30N) and convert to fraction from %, except aice which is not divided.
    #An explicit notnull() mask is used: a float array as the condition 
    #treats NaN as True. Cells with NaN area drop out of the sums below 
    #either way because SIC * area is NaN there.
    if model_name == 'GISS-E2-1-G_r1i1p1f1':
        #only exception where siconc is not available
        SIC = SIC.where(areacella_GISS_E2_1_G.notnull(), drop=False)/100
    elif model_name == 'CESM2-LENS':
        SIC = SIC.where(areacello_with_nan.notnull(), drop=False)
    else:
        SIC = SIC.where(areacello_with_nan.notnull(), drop=False)/100
    
    ############### compute SIA, average SIC, and pan-Arctic SIA ###############
    #only do the calculation where there is sea ice
    SIC = SIC.where(SIC>0)
    
    #names of the two horizontal dimensions for this GCM
    #NOTE(review): model_name[:-9] assumes a 9 character '_<variant id>' 
    #ending, which 'CESM2-LENS' does not have - confirm before using it here
    x_name, y_name = x_y_names[model_name[:-9]][:2]
    
    pan_Arctic_SIA = (SIC * areacello_with_nan).sum(x_name).sum(y_name)
    
    #calculate regional data
    region_numbers = np.arange(1,17)
    SIA_regions = []
    SIC_regions_av = []

    for region_ in region_numbers:
        in_region = region_mask['regions']==region_
        
        #total cell area of the region
        area_region = areacello_with_nan.where(in_region).sum()
        
        SIA_region = (SIC.where(in_region) * areacello_with_nan).sum(
            x_name).sum(y_name)

        SIA_regions.append(SIA_region)
        #area weighted average concentration over the whole region
        SIC_regions_av.append(SIA_region / area_region)            
    
    ######################### concatenate all regions ##########################
    SIA_regions = xr.concat(SIA_regions, dim='region')
    SIA_regions['region'] = region_numbers
    SIC_regions_av = xr.concat(SIC_regions_av, dim='region')
    SIC_regions_av['region'] = region_numbers

    final_dataset = xr.Dataset(
            {'regional_SIA':SIA_regions, 'regional_SIC':SIC_regions_av,
             'Arctic_SIA':pan_Arctic_SIA, 
            }
        )
    
    return(final_dataset)

## Check files

In [113]:
#concat partial files
all_GFDL_CM4 = []
for start_ in np.arange(150,601,50):
    single_file = xr.open_dataset(
        '/glade/work/cwpowell/low-frequency-variability/raw_data/'\
        'regional_SIC_PI_Control/GFDL-CM4_r1i1p1f1_PI_Control_regional_SIC_'\
        f'{start_}_{start_+50}.nc')
    
    #an unused `dates` axis was removed here: it was never assigned to the 
    #data and depended on `to_save` left over from another cell
    #NOTE(review): the partial files were written with time labels starting 
    #at `start_` - confirm the labels are contiguous after concatenation as 
    #later cells sort by time
    
    #chunks starting on a whole century only contribute the years after 
    #their first labelled year
    if start_ % 100 == 0:
        single_file = single_file.sel(
            time=slice(str(int(1+start_)).zfill(4),
                       str(int(start_+50)).zfill(4)))
    
    all_GFDL_CM4.append(single_file)
    
all_GFDL_CM4 = xr.concat((all_GFDL_CM4),dim='time')
all_GFDL_CM4.to_netcdf(
    '/glade/work/cwpowell/low-frequency-variability/raw_data/regional_SIC_'\
    'PI_Control/GFDL-CM4_r1i1p1f1_PI_Control_regional_SIC.nc')

In [126]:
#concat partial files (40 years each, labelled by first and last year)
all_HadGEM3_GC31_MM = []
for start_ in np.arange(1850,2331,40):
    single_file = xr.open_dataset(
        '/glade/work/cwpowell/low-frequency-variability/raw_data/'\
        'regional_SIC_PI_Control/HadGEM3-GC31-MM_r1i1p1f1_PI_Control_regional_'\
        f'SIC_{start_}_{start_+39}.nc')
      
    all_HadGEM3_GC31_MM.append(single_file)
    
all_HadGEM3_GC31_MM = xr.concat((all_HadGEM3_GC31_MM),dim='time')

#the output name uses the hyphenated model name, the same as the partial 
#files above, so that the later cells which build file names from 
#'HadGEM3-GC31-MM_r1i1p1f1' find this file (it was previously written with 
#underscores in place of the hyphens)
all_HadGEM3_GC31_MM.to_netcdf(
    '/glade/work/cwpowell/low-frequency-variability/raw_data/regional_SIC_'\
    'PI_Control/HadGEM3-GC31-MM_r1i1p1f1_PI_Control_regional_SIC.nc')

### Split into 'members' of 95 years and then lowpass filter

In [1]:
#directory of the regional SIC files and the ending shared by the 
#consolidated (whole run) file of each GCM
regional_SIC_dir = '/glade/work/cwpowell/low-frequency-variability/'\
    'raw_data/regional_SIC_PI_Control/'
regional_SIC_ending = '_PI_Control_regional_SIC.nc'

#number of monthly time steps in one 95-year member
member_len = 95*12

#the filter coefficients are the same for every file and month: compute once
#NOTE(review): scipy normalizes the cutoff by the Nyquist frequency, so with 
#one value per year 0.25 is 0.125 cycles per year - confirm this matches the 
#'2-year lowpass' wording used in the metadata below
b, a = signal.butter(5, 0.25, btype='lowpass') #low pass filter

#only match consolidated files: partial chunk files in the same directory 
#have a different ending and would give a wrong model name
for file_ in np.sort(glob.glob(regional_SIC_dir+'*'+regional_SIC_ending)):
    
    #file name without the directory and the common ending
    model_name = os.path.basename(file_)[:-len(regional_SIC_ending)]
    print(datetime.datetime.now(), model_name)
    
    orig_file = xr.open_dataset(file_).sortby('time')
    
    #split into consecutive whole 95-year members, any remainder is unused
    all_members = []
    for mem_i in range(len(orig_file['time'])//member_len):
        
        temp_mem = orig_file.isel(
            time=slice(mem_i*member_len,(mem_i+1)*member_len))
        
        #give every member the same 1920-2014 monthly time axis
        temp_mem['time'] = np.arange('1920-01', '2015-01', 
                                     dtype='datetime64[M]')

        all_members.append(temp_mem)
    
    all_members_xr = xr.concat((all_members),dim='member')
    all_members_xr['member'] = np.arange(1,len(all_members)+1)
    
    #apply the lowpass filter for each month
    all_months = []
    for month_ in np.arange(1,13):
        monthly_data = all_members_xr['regional_SIC'].sel(
            time=all_members_xr['time.month']==month_)
        #remove the time average before filtering
        monthly_data = monthly_data - monthly_data.mean('time')
        
        #apply the filter forward and backward along the time axis, looked 
        #up by name instead of assuming its position
        filtered = signal.filtfilt(b, a, monthly_data, 
                                   axis=monthly_data.get_axis_num('time')) 
        #put the filtered values back on the coordinates of monthly_data
        filtered_xr = (monthly_data * 0) + filtered
        filtered_xr['time'] = np.arange(1920,2015)
        filtered_xr = filtered_xr.rename({'time':'year'})
        all_months.append(filtered_xr)
    
    all_months_xr = xr.concat((all_months), dim='month')
    all_months_xr['month'] = np.arange(1,13)
    
    #add attributes and save to NetCDF
    all_months_xr.attrs = {
        'Description': 'The pre-industrial control run for the model '\
            f'{model_name} for the model years between '\
            '{}and '.format(str(orig_file['time'].min().values)[:-8])\
            +'{}is divided '.format(str(orig_file['time'].max().values)[:-8])\
            +'up into 95-year divisions, called members whose time dimensions '\
            'are all 1920-2014 to match the years used in the historical '\
            'simulations. These members are then separated by month and '\
            'region, the average is removed and a 2-year lowpass filter is '\
            'applied. Regions as defined for NSIDC MASIE-NH Version 1, '\
            'doi:10.7265/N5GT5K3K.',
        'Timestamp'  : str(datetime.datetime.now(
            datetime.timezone.utc).strftime("%H:%M UTC %a %Y-%m-%d")),
        'Data source': 'CMIP6 pre-industrial model output for siconc from '\
            f'{model_name}.', 
        'Analysis'   : 'https://github.com/chrisrwp/'\
            'low-frequency-variability/tree/main/input_data/'\
            'Extract_SIC_CVDP_from_PI_Control.ipynb'
    }

    all_months_xr.to_netcdf('/glade/work/cwpowell/low-frequency-variability/'\
        +f'input_data/Regional_SIC_lowpass_filter_{model_name}_PI_Control.nc')

        

# Extract CVDP variables

In [2]:
#directory containing the CVDP output files for the pre-industrial controls
PI_CVDP_path = '/glade/work/cwpowell/low-frequency-variability/raw_data/'\
    +'CMIP6_CVDP_picontrol/'

#CVDP files to use, one per GCM. The text before the first '.' is the model 
#name ('<source id>_<variant id>') used to build the output file names, and 
#the position in this list sets the member numbering in the consolidated files
good_CVDP_list = [
    'ACCESS-CM2_r1i1p1f1.cvdp_data.950-1449.nc',
    'ACCESS-ESM1-5_r1i1p1f1.cvdp_data.101-1100.nc', 
    'BCC-CSM2-MR_r1i1p1f1.cvdp_data.1850-2449.nc',
    'BCC-ESM1_r1i1p1f1.cvdp_data.1850-2300.nc',
    'CanESM5_r1i1p1f1.cvdp_data.5201-6200.nc',
    'CanESM5_r1i1p2f1.cvdp_data.5550-6600.nc',
    'CanESM5-CanOE_r1i1p2f1.cvdp_data.5550-6050.nc',
    'CESM2-FV2_r1i1p1f1.cvdp_data.1-500.nc',
    'CESM2_r1i1p1f1.cvdp_data.1-1200.nc',
    'CESM2-WACCM-FV2_r1i1p1f1.cvdp_data.1-500.nc',
    'CMCC-CM2-SR5_r1i1p1f1.cvdp_data.1850-2349.nc',
    'CMCC-ESM2_r1i1p1f1.cvdp_data.1850-2349.nc',
    'CNRM-CM6-1_r1i1p1f2.cvdp_data.1850-2349.nc',
    'CNRM-ESM2-1_r1i1p1f2.cvdp_data.1850-2349.nc',
    'E3SM-1-0_r1i1p1f1.cvdp_data.1-500.nc',
    'EC-Earth3_r1i1p1f1.cvdp_data.2259-2759.nc',
    'EC-Earth3-CC_r1i1p1f1.cvdp_data.1850-2354.nc',
    'EC-Earth3-Veg_r1i1p1f1.cvdp_data.1850-2349.nc',
    'EC-Earth3-Veg-LR_r1i1p1f1.cvdp_data.2300-2800.nc',
    'GFDL-ESM4_r1i1p1f1.cvdp_data.1-500.nc',
    'GFDL-CM4_r1i1p1f1.cvdp_data.151-650.nc',
    'GISS-E2-1-G_r1i1p1f1.cvdp_data.4150-5000.nc',
    'GISS-E2-1-H_r1i1p1f1.cvdp_data.3180-3980.nc',
    'HadGEM3-GC31-LL_r1i1p1f1.cvdp_data.1850-2349.nc',
    'HadGEM3-GC31-MM_r1i1p1f1.cvdp_data.1850-2349.nc',
    'INM-CM4-8_r1i1p1f1.cvdp_data.1850-2380.nc',
    'INM-CM5-0_r1i1p1f1.cvdp_data.1996-3196.nc',
    'IPSL-CM6A-LR_r1i1p1f1.cvdp_data.1850-3849.nc', 
    'MIROC-ES2L_r1i1p1f2.cvdp_data.1850-2349.nc',
    'MIROC6_r1i1p1f1.cvdp_data.3200-3999.nc',
    'MPI-ESM-1-2-HAM_r1i1p1f1.cvdp_data.1850-2629.nc',
    'MPI-ESM1-2-HR_r1i1p1f1.cvdp_data.1850-2349.nc',
    'MPI-ESM1-2-LR_r1i1p1f1.cvdp_data.1850-2849.nc',
    'NorCPM1_r1i1p1f1.cvdp_data.1-500.nc',
    'NorESM2-MM_r1i1p1f1.cvdp_data.1200-1699.nc',
    'UKESM1-0-LL_r1i1p1f2.cvdp_data.1960-3839.nc', 
]

In [3]:
#CVDP variables stored as monthly time series
var_extract_monthly = [
    'amo_timeseries_mon',
    'ipo_timeseries_mon',
    'nino34',
    'pdo_timeseries_mon',
    'atlantic_nino',
]

#prefixes of CVDP variables stored once per season; the season name is 
#appended to give the full variable name
var_extract_seasonally = [
    'npo_timeseries_',
    'pna_timeseries_',
    'nao_timeseries_',
    'tas_global_avg_',
]

#every variable to read: the monthly names followed by each seasonal prefix 
#for djf, then mam, jja and son
var_extract_all = var_extract_monthly + [
    prefix_ + season_
    for season_ in ('djf', 'mam', 'jja', 'son')
    for prefix_ in var_extract_seasonally
]

#short names used for the variables in the output datasets
var_dataset_names = {
    'amo_timeseries_mon': 'AMO',
    'ipo_timeseries_mon': 'IPO',
    'nino34': 'NINO34',
    'pdo_timeseries_mon': 'PDO',
    'atlantic_nino': 'ATN',
    'npo_timeseries_': 'NPO',
    'pna_timeseries_': 'PNA',
    'nao_timeseries_': 'NAO',
    'tas_global_avg_': 'TAS',
}

#long names, saved as the 'Description' attribute of each output variable
CVDP_var_descriptions = {
    'AMO': 'Atlantic Multi-decadal Oscillation',
    'IPO': 'Interdecadal Pacific Oscillation',
    'NINO34': 'Niño 3.4 index',
    'PDO': 'Pacific Decadal Oscillation',
    'ATN': 'Atlantic Nino',
    'NPO': 'North Pacific Oscillation',
    'PNA': 'Pacific/North American Teleconnection Pattern',
    'NAO': 'North Atlantic Oscillation',
    'TAS': 'Global Near-Surface Air Temperature',
}

In [10]:
#long runs of which only part of the annual (TIME) axis is used
files_to_reduce = [
    'ACCESS-ESM1-5_r1i1p1f1.cvdp_data.101-1100.nc',
    'IPSL-CM6A-LR_r1i1p1f1.cvdp_data.1850-3849.nc',
    'UKESM1-0-LL_r1i1p1f2.cvdp_data.1960-3839.nc',
]

#first and last TIME label (inclusive) to keep for each of those files
TIME_ranges_to_keep = {
    files_to_reduce[0]: (101, 999),
    files_to_reduce[1]: (3050, 3849),
    files_to_reduce[2]: (1960, 3059),
}

#define datetime arrays with different orders, these are the same for every 
#file so are only made once
#1920-01,1920-02...2014-11, 2014-12
time_linear = np.arange(np.datetime64('1920-01'),
                        np.datetime64('2015-01'), 
                        np.timedelta64(1, 'M')).astype(np.datetime64)

#1920-01, 1921-01...2014-01, 1920-02, 1921-02....2014-02.......2014-12
time_single_month = np.array([]).astype(np.datetime64)

for month_ in np.arange(1,13):
    time_single_month = np.append(time_single_month, 
        np.arange(np.datetime64('1920-{}'.format(str(month_).zfill(2))), 
                  np.datetime64('2015-{}'.format(str(month_).zfill(2))), 
                  np.timedelta64(1, 'Y')
                 ).astype(np.datetime64)
             )    

#season whose value is used for each calendar month
#strictly months 1 and 2 should be the following year, but there wouldn't be 
#enough data points for 1920-2014, but 1921-2015 instead which is unhelpful 
#for this analysis
month_to_season = {12:'djf', 1:'djf', 2:'djf', 3:'mam', 4:'mam', 5:'mam',
                   6:'jja', 7:'jja', 8:'jja', 9:'son', 10:'son', 11:'son'}

#NOTE(review): the [2:] below together with the files_to_reduce filter means 
#ACCESS-ESM1-5 (index 1) is not processed by this cell - confirm this 
#selection is intended before re-running
for CVDP_file in good_CVDP_list[2:]:
    
    if CVDP_file not in files_to_reduce:
        continue
    
    model_name = CVDP_file.split('.')[0]
    print(datetime.datetime.now(), model_name)
    
    CVDP_raw = xr.open_dataset(PI_CVDP_path+CVDP_file, decode_times=False)
    #only select the useful variables
    CVDP_raw = CVDP_raw[var_extract_all]
    
    if CVDP_file in TIME_ranges_to_keep:
        first_TIME, last_TIME = TIME_ranges_to_keep[CVDP_file]
        CVDP_raw = CVDP_raw.sel(TIME=slice(first_TIME, last_TIME))
        #keep as many monthly steps as there are years left on TIME
        #NOTE(review): this selects monthly labels from 0 upwards whatever 
        #the first TIME label is - confirm that the monthly and annual data 
        #cover the same model years when TIME does not begin at the start of 
        #the file
        CVDP_raw = CVDP_raw.sel(time=slice(0,(len(CVDP_raw['TIME'])*12)-1))
    
    #separate into 95-year members, any remaining years are not used
    all_members = []
    for mem_i in range(len(CVDP_raw['TIME'])//95):
        temp_mem = CVDP_raw.isel(time=slice(mem_i*95*12,(mem_i+1)*(95*12)))
        temp_mem = temp_mem.isel(TIME=slice(mem_i*95,(mem_i+1)*95))
        
        #relabel every member as 1920-2014
        temp_mem['time'] = np.arange('1920-01', '2015-01', 
                                     dtype='datetime64[M]')
        temp_mem['TIME'] = np.arange('1920', '2015', 
                                     dtype='datetime64[Y]')

        all_members.append(temp_mem)
    
    all_members_xr = xr.concat((all_members),dim='member')
    all_members_xr['member'] = np.arange(1,len(all_members)+1)
    
    #now compute the CVDP extraction
    CVDP_model_data = []

    for mem_ in np.arange(1,len(all_members)+1):
        
        CVDP_mem = all_members_xr.sel(member=mem_)
        CVDP_mem_dict = {}

        for var_name in var_extract_monthly:
            try:
                #save the monthly variables without alteration
                CVDP_mem_dict[var_name] = xr.DataArray(
                    data = np.ravel(CVDP_mem[var_name].copy()),
                    coords = {'time':time_linear},
                    dims = ['time']
                ).sortby('time')
            
            except KeyError: #variable missing, fill with NaN instead
                CVDP_mem_dict[var_name] = xr.DataArray(
                    data = np.zeros(len(time_linear))*np.nan,
                    coords = {'time':time_linear},
                    dims = ['time']
                ).sortby('time')

        for var_name in var_extract_seasonally:
            #repeat the value of the season for each of its three months, 
            #ordered by month then year to match time_single_month
            temp_seasonally_mem = [
                CVDP_mem[var_name+month_to_season[month_]]
                for month_ in range(1,13)
            ]

            CVDP_mem_dict[var_name] = xr.DataArray(
                data = np.ravel(temp_seasonally_mem),
                coords = {'time':time_single_month},
                dims = ['time']
            ).sortby('time')

        CVDP_model_data.append(xr.Dataset(CVDP_mem_dict))

    CVDP_extracted = xr.concat((CVDP_model_data),dim='member')
    
    #use the short variable names in the output
    CVDP_extracted = CVDP_extracted.rename(
        {var_name:var_dataset_names[var_name] 
         for var_name in CVDP_extracted.data_vars})
        
    CVDP_extracted['member'] = np.arange(1,len(all_members)+1)
    
    ### now compute seasonal average and standardize before saving to NetCDF ###
    
    all_var_data = {}
    for key_ in CVDP_extracted.data_vars:

        all_season_data = []
        for season_months in [[12,1,2],[3,4,5],[6,7,8],[9,10,11]]:
            #select the variable data for each month of the season
            raw_months = [
                CVDP_extracted[key_].sel(
                    time=CVDP_extracted['time.month']==season_month)
                for season_month in season_months
            ]
            
            #Give all three months the time labels of the middle month of 
            #the season before combining them. With their own, different, 
            #labels xr.concat takes the union of the time values and fills 
            #the gaps with NaN, so that the mean over 'season_month' was 
            #just the middle month and not a 3-month average.
            mid_month_time = raw_months[1]['time'].values
            raw_months = [month_data.assign_coords(time=mid_month_time)
                          for month_data in raw_months]
            
            #take the monthly average for the season
            raw_season_av = xr.concat(
                raw_months, dim='season_month').mean('season_month')
            
            #standardize the seasonal average over time by removing the mean 
            #and dividing by the standard deviation (no detrending is done)
            all_season_data.append(
                (raw_season_av - raw_season_av.mean('time')) \
                / raw_season_av.std('time')
            )
            
        all_var_data[key_] = xr.concat(
            (all_season_data),dim='time').sortby('time')

    all_var_data = xr.Dataset(all_var_data)

    all_var_data.attrs = {
        'Description': 'Standardized variables from the CVDP (Climate '\
            +'Variability Diagnostics Package) for global climate model '\
            +f'{model_name} pre-industrial control. Seasonal data for '\
            +'available years, divided into 95-year partitions called members',
        'Units'      : 'standardized values',
        'Timestamp'  : str(datetime.datetime.now(
            datetime.timezone.utc).strftime("%H:%M UTC %a %Y-%m-%d")),
        'Data source': 'CVDP doi: 10.1002/2014EO490002.',
        'Analysis'   : 'https://github.com/chrisrwp/low-frequency-variability/'\
            +'blob/main/input_data/Extract_SIC_CVDP_from_PI_Control.ipynb'
    }

    for data_var in list(all_var_data.keys()):
        all_var_data[data_var].attrs = {
            'Description':CVDP_var_descriptions[data_var]}

    all_var_data.to_netcdf('/glade/work/cwpowell/low-frequency-variability/'\
        +f'input_data/CVDP_standardized_PI_Control_{model_name}.nc')
    
    

2023-05-10 14:21:33.283498 IPSL-CM6A-LR_r1i1p1f1
2023-05-10 14:21:44.559859 UKESM1-0-LL_r1i1p1f2


## Make SIC and CVDP consolidated files 

In [68]:
#gather all of the members together and save to NetCDF
#make members 1st,2nd,3rd for training, 4th for validation, 5th+ for testing
CVDP_CMIP6 = []
SIC_CMIP6 = []

#member numbers given to the first GCM, these are increased for each GCM
train_mem_i0 = 1000
train_mem_i1 = 2000
train_mem_i2 = 3000
valid_mem_i = 20000
test_mem_i  = 30000

for GCM in good_CVDP_list:
    model_name = GCM.split('.')[0]
    
    CVDP_data = xr.open_dataset(
        '/glade/work/cwpowell/low-frequency-variability/'\
        +f'input_data/CVDP_standardized_PI_Control_{model_name}.nc'
    )
    
    #3 training members and 1 validation member are always needed
    n_members = len(CVDP_data['member'])
    if n_members < 4:
        raise ValueError(
            f'{model_name} has {n_members} members but at least 4 are needed')
    
    #make a new member classification: the first four members are the 
    #training and validation members, all remaining are test members
    new_mem_list = np.arange(test_mem_i, test_mem_i+n_members-4)
    new_mem_list = np.insert(new_mem_list, 0, [train_mem_i0, train_mem_i1, 
                                               train_mem_i2,valid_mem_i])
    
    CVDP_data['member'] = new_mem_list
    CVDP_CMIP6.append(CVDP_data)
    
    #now do the same for the SIC data
    SIC_data = xr.open_dataset(
        '/glade/work/cwpowell/low-frequency-variability/'\
        +f'input_data/Regional_SIC_lowpass_filter_{model_name}_PI_Control.nc'
    )
    
    #the same numbering is only valid if both files have the same members
    if len(SIC_data['member']) != n_members:
        raise ValueError(
            f'{model_name} has {len(SIC_data["member"])} SIC members but '\
            f'{n_members} CVDP members')
    
    SIC_data['member'] = new_mem_list   
    #remove the variable 'type' where present, files without it are unchanged
    SIC_CMIP6.append(SIC_data.drop_vars('type', errors='ignore'))
    
    #now increase the initial value of the training, validation, and test member
    #element numbers
    train_mem_i0 += 1
    train_mem_i1 += 1
    train_mem_i2 += 1
    valid_mem_i += 1
    test_mem_i = test_mem_i + 1000

In [69]:
#save this CMIP6 data to NetCDF and include metadata

#metadata shared by the CVDP and SIC files, defined once so that the two
#descriptions cannot drift apart
output_dir = '/glade/work/cwpowell/low-frequency-variability/input_data/'

#test member numbers start at 30000 and step by 1000 per GCM, so the GCM number
#is the first 2 digits minus 30; the 4th member of each GCM is for validation
member_description = (
    'with at least 500 years of pre-industrial control run available. '
    'Which are then split into 95-year long so-called members. '
    'Seasonal data with year indexing 1920-2014. The members can be '
    'decoded as follows: 1000-3999 are the training members, 20000-29999 '
    'are the validation members, and 30000+ are the test members. The '
    'GCM is encoded as the last 2 digits for the training and validation '
    'members, and the first 2 digits minus 30 for the testing members. Note '
    'there are always 3 members from each GCM for training, 1 for validation '
    'and all remaining members are used for testing. The GCM numbering '
    'refers to the following: 00 ACCESS-CM2_r1i1p1f1, '
    '01 ACCESS-ESM1-5_r1i1p1f1, 02 BCC-CSM2-MR_r1i1p1f1, '
    '03 BCC-ESM1_r1i1p1f1, 04 CanESM5_r1i1p1f1, 05 CanESM5_r1i1p2f1, '
    '06 CanESM5-CanOE_r1i1p2f1, 07 CESM2-FV2_r1i1p1f1, 08 CESM2_r1i1p1f1, '
    '09 CESM2-WACCM-FV2_r1i1p1f1, 10 CMCC-CM2-SR5_r1i1p1f1, '
    '11 CMCC-ESM2_r1i1p1f1, 12 CNRM-CM6-1_r1i1p1f2, '
    '13 CNRM-ESM2-1_r1i1p1f2, 14 E3SM-1-0_r1i1p1f1, '
    '15 EC-Earth3_r1i1p1f1, 16 EC-Earth3-CC_r1i1p1f1, '
    '17 EC-Earth3-Veg_r1i1p1f1, 18 EC-Earth3-Veg-LR_r1i1p1f1, '
    '19 GFDL-ESM4_r1i1p1f1, 20 GFDL-CM4_r1i1p1f1, '
    '21 GISS-E2-1-G_r1i1p1f1, 22 GISS-E2-1-H_r1i1p1f1, '
    '23 HadGEM3-GC31-LL_r1i1p1f1, 24 HadGEM3-GC31-MM_r1i1p1f1, '
    '25 INM-CM4-8_r1i1p1f1, 26 INM-CM5-0_r1i1p1f1, '
    '27 IPSL-CM6A-LR_r1i1p1f1, 28 MIROC-ES2L_r1i1p1f2, '
    '29 MIROC6_r1i1p1f1, 30 MPI-ESM-1-2-HAM_r1i1p1f1, '
    '31 MPI-ESM1-2-HR_r1i1p1f1, 32 MPI-ESM1-2-LR_r1i1p1f1, '
    '33 NorCPM1_r1i1p1f1, 34 NorESM2-MM_r1i1p1f1, '
    '35 UKESM1-0-LL_r1i1p1f2.'
)

#link to this notebook, matching the link used for the per-model files
analysis_url = (
    'https://github.com/chrisrwp/low-frequency-variability/'
    'blob/main/input_data/Extract_SIC_CVDP_from_PI_Control.ipynb'
)

#one timezone-aware UTC timestamp used for both files
timestamp_utc = datetime.datetime.now(datetime.timezone.utc).strftime(
    "%H:%M UTC %a %Y-%m-%d")

#CVDP: join all of the GCMs along the member dimension, ordered by member
CVDP_CMIP6_xr = xr.concat((CVDP_CMIP6),dim='member').sortby('member')

CVDP_CMIP6_xr.attrs = {
    'Description' : 'Standardized variables from the CVDP (Climate '
        'Variability Diagnostics Package) for all CMIP6 global climate models '
        + member_description,
    'Units' :'standardized values',
    'Timestamp' : timestamp_utc,
    'Data source': 'CMIP6 pre-industrial control simulations, computed by '
        'CVDP: doi: 10.1002/2014EO490002.',
    'Analysis'   : analysis_url,
}

CVDP_CMIP6_xr.to_netcdf(
    output_dir + 'CVDP_standardized_PI_Control_MMLE_500_first_3_train.nc')

#SIC: same treatment, also removing 'depth' and renaming the variable to 'SIC'
SIC_CMIP6_xr = xr.concat((SIC_CMIP6),dim='member').sortby('member').drop(
    'depth')
SIC_CMIP6_xr = SIC_CMIP6_xr.rename({'regional_SIC':'SIC'})
SIC_CMIP6_xr.attrs = {
    'Description' : '2-year lowpass filter of regional average sea ice '
        'concentration (SIC) in % for all CMIP6 global climate models '
        + member_description,
    'Timestamp' : timestamp_utc,
    'Data source': 'CMIP6 pre-industrial control simulations',
    'Analysis'   : analysis_url,
}

SIC_CMIP6_xr.to_netcdf(
    output_dir
    + 'Regional_SIC_lowpass_filter_PI_Control_MMLE_500_first_3_train.nc')