# Bring together all CMIP6 data including paths on `/glade`, doi, modeling centers etc.

### Author - Chris Wyburn-Powell, see the latest version on [github](https://github.com/chrisrwp/low-frequency-variability/blob/main/input_data/CMIP6_glade_info.ipynb)

**Input:**
- The `/glade` file directory of CMIP6 model data
- A csv file of model DOIs obtained from WCRP

**Method:**
- Record the correct paths to each of the model data files
- Manually check the x, y dimension names and record them
- Locate the correct `areacello`/`areacella` files and record their location after verifying their match with the SIC data

**Output:**
- Record the modeling center names, members and DOIs corresponding to each model in this NetCDF file: `CMIP6_modeling_center_members_doi.nc`
- Save pickled Python dictionaries for the following data: `CMIP6_areacello_paths.pickle`, `CMIP6_areacello_lat_names.pickle`, `CMIP6_x_y_names.pickle`, `CMIP6_SImon_siconc_paths.pickle`

In [1]:
import xarray as xr
import pandas as pd
import numpy as np
import datetime
import os
import glob
import re
import pickle
print(datetime.datetime.now())

2023-01-23 14:53:43.959069


# Make a NetCDF file for useful info for CMIP6 models with variables:
- **`modeling_center`**
- **`members`**
- **`doi`**


In [2]:
#collect the model names from the CVDP file names - N.B. CESM2-LENS is added
#by hand as this GCM has a different naming convention without variant ID,
#and ICON-ESM-LR is removed as it has columnar format data and only AMOC was
#able to be processed
CMIP6_CVDP_fnames = np.sort(os.listdir(
    '/glade/work/cwpowell/low-frequency-variability/raw_data/'
    'CMIP6_CVDP_historical/'))

#the model name is the text in front of '_r<...>cvdp'; file names without
#that pattern (the observational records) give no match and are skipped
name_hits = [re.search('.*?(?=_r.*.cvdp)', fname)
             for fname in CMIP6_CVDP_fnames]
model_names = [hit.group(0) for hit in name_hits if hit is not None]

model_names = np.unique(model_names)
model_names = np.append(model_names, 'CESM2-LENS') #add CESM2-LENS
keep_model = model_names != 'ICON-ESM-LR' #remove ICON-ESM-LR
model_names = model_names[keep_model]

In [14]:
#pair every Source_ID (model name) with its Institution_ID (modeling center)
CMIP6_IDs = pd.read_csv(
    '/glade/work/cwpowell/low-frequency-variability/raw_data/CMIP6_info/'
    'CMIP6_source_IDs_institution_IDs.txt')

#view the first row of the table through xarray: the 'dim_1' coordinate
#carries the model names and the row values are the matching centers
first_row_xr = xr.DataArray(CMIP6_IDs[0:1])
ID_model_names = list(first_row_xr['dim_1'].values)
ID_model_centers = [str(center) for center in first_row_xr.values[0]]

ID_name_centers_xr = xr.DataArray(
    data=ID_model_centers,
    coords={'model': ID_model_names},
    dims='model',
)

In [22]:
#make a table of all variant IDs (members) for each model
CVDP_cmip6_hist_list = np.sort(np.array(os.listdir(
    '/glade/work/cwpowell/low-frequency-variability/raw_data/'\
    +'CMIP6_CVDP_historical')))

#number of member slots stored per model; slots that are not filled keep the
#placeholder string '0.0' which later cells filter out with != '0.0'
max_members = 100

#size the table from the model list itself rather than a hard-coded count so
#that it stays valid when models are added or removed
CVDP_mem_list = np.zeros((len(model_names), max_members)).astype('str')
for model_i, model_name in enumerate(model_names):
    #substring between '<model name>_' and 'cvdp'; the model name is escaped
    #so that it can only ever be matched literally
    mem_pattern = re.compile(
        '(?<={}_)(.*)(?=cvdp)'.format(re.escape(model_name)))

    temp_mem_list = []
    for fname in CVDP_cmip6_hist_list:
        found = mem_pattern.search(fname)
        if found:
            #drop the final character that sits directly in front of 'cvdp'
            temp_mem_list.append(found[0][:-1])

    if len(temp_mem_list) > max_members:
        raise ValueError(
            f'{model_name} has {len(temp_mem_list)} members but only '
            f'{max_members} slots are available, increase max_members')

    CVDP_mem_list[model_i][0:len(temp_mem_list)] = np.ravel(temp_mem_list)

CVDP_mem_list_xr = xr.DataArray(
    data=CVDP_mem_list,
    coords={'model':model_names, 'mem_i':np.arange(0, max_members)},
    dims=('model','mem_i')
)

In [24]:
#make a list of the dois for all the model names
#from https://doi.org/10.5194/esd-12-253-2021
doi_list = pd.read_csv(
    '/glade/work/cwpowell/low-frequency-variability/raw_data/CMIP6_info/'
    'CMIP6_doi.txt',
    header=None,
)

#column 0 is used as the model coordinate and column 1 as the DOI values
doi_models = doi_list[0].values
doi_values = doi_list[1].values
doi_list_xr = xr.DataArray(
    data=doi_values,
    coords={'model': doi_models},
    dims='model',
)

In [27]:
#combine modeling centers, members and DOIs into one dataset and save it
centers_for_models = ID_name_centers_xr.sel(model=model_names)

CMIP6_info = xr.Dataset({
    'modeling_center': centers_for_models,
    'members': CVDP_mem_list_xr,
    'doi': doi_list_xr,
})

CMIP6_info_out = (
    '/glade/work/cwpowell/low-frequency-variability/raw_data/'
    'CMIP6_info/CMIP6_modeling_center_members_doi.nc'
)
CMIP6_info.to_netcdf(CMIP6_info_out)

In [2]:
#reload the saved modeling center, member and DOI information
CMIP6_info_in = (
    '/glade/work/cwpowell/low-frequency-variability/raw_data/'
    'CMIP6_info/CMIP6_modeling_center_members_doi.nc'
)
CMIP6_info = xr.open_dataset(CMIP6_info_in)

# Make paths for `siconc` for `SImon` output
**The following realizations available from CVDP do not have available `siconc` or `siconca` files:** <br>
- AWI-CM-1-1-MR, x5, r<1-5>i1p1f1
- CNRM-CM6-1, x9, r<21-28,30>i1p1f2 
- CNRM-ESM2-1, x4, r<7-10>i1p1f2    
- GISS-E2-1-G, x3, r<9-10>i1p5f1,r10i1p2f1 
- GISS-E2-1-G-CC, x1, r1i1p1f1 
- GISS-E3-G, x1, r1i1p1f1
- IITM-ESM, x1, r1i1p1f1
- KACE-1-0-G, x3, r<1-3>i1p1f1
- MCM-UA-1-0, x2, r1i1p1f<1-2>
- TaiESM1, x1, r2i1p1f1
- UKESM1-0-LL, x2, r<14-15>i1p1f2 <br>

**CESM2-LENS is a different case:** <br>
50 members use different biomass burning forcing from CMIP6, do not use <br>
- CESM2-LENS, x50, 1011.001, 1031.002, 1051.003, 1071.004, 1091.005, 1111.006, 1131.007, 1151.008, 1171.009, 1191.010, 1231.0<11-20>, 1251.0<11-20>, 1281.0<11-20>, 1301.0<11-20>

In [7]:
#make a list of all paths for all models and variant labels for siconc
var_name = 'siconc'
siconc_paths = {}

#directory of the files downloaded from ESGF directly
esgf_dir = '/glade/work/cwpowell/low-frequency-variability/raw_data/'\
    +'CMIP6_siconc/'

for model_name in CMIP6_info['model'].values:
    members = CMIP6_info['members'].sel(model=model_name)
    #unused member slots hold the placeholder '0.0' and are dropped here
    mem_list = members.where(members != '0.0', drop=True).values

    center_name = CMIP6_info['modeling_center'].sel(model=model_name).values

    #grid label used in the directory structure for this model
    if model_name in ['E3SM-1-1', 'E3SM-1-0', 'E3SM-1-1-ECA',
                      'GISS-E2-1-H','GISS-E2-2-H']:
        g_i = 'gr'
    elif model_name in ['INM-CM4-8','INM-CM5-0','KIOST-ESM']:
        g_i = 'gr1'
    else:
        g_i = 'gn'

    #GISS-E2-1-G paths are built with siconca; a per-model name is used so
    #that var_name never has to be switched and then restored
    sic_var = 'siconca' if model_name in ['GISS-E2-1-G'] else var_name

    for mem_ in mem_list:
        key = model_name+'_'+mem_

        var_dir = f'/glade/collections/cmip/CMIP6/CMIP/{center_name}/'\
            +f'{model_name}/historical/{mem_}/SImon/{sic_var}/'
        grid_dir = var_dir+f'{g_i}/'

        try:
            version_dir = np.sort(os.listdir(grid_dir))
        except FileNotFoundError:
            #have file in /glade/work or does not exist, no version needed;
            #reset so a version from the previous member is never reused
            version_dir = []

        pattern = None                  #full glob pattern if no version used
        version_i = -1                  #-1 = latest version, 0 = oldest
        file_glob = f'{sic_var}/*.nc'   #location of files below the version

        if model_name in ['CESM2-WACCM-FV2', 'CESM2-WACCM',
                          'CESM2-FV2', 'CESM2',]:
            #files are directly inside the version directory
            file_glob = '*.nc'

        elif model_name in ['CESM2-LENS']:
            pattern = '/glade/campaign/cgd/cesm/CESM2-LE/timeseries/ice/'\
                +'proc/tseries/month_1/aice/b.e21.BHISTcmip6.f09_g17'\
                +f'.LE2-{mem_}*.nc'

        elif key == 'EC-Earth3_r3i1p1f1':
            #exception to the EC-Earth3 rule below: use the latest version
            pass

        elif key == 'EC-Earth3-Veg_r12i1p1f1':
            pattern = var_dir+f'gr/v20200925/{sic_var}/*.nc'

        #don't use the latest version
        #N.B. 'NorESM2-LM' used to be listed with a trailing space so it
        #never matched and silently used the latest version instead
        elif model_name in ['EC-Earth3','EC-Earth3-Veg','EC-Earth3-Veg-LR',
                            'NorESM2-LM','HadGEM3-GC31-LL']:
            version_i = 0

        #UKESM1-0-LL under NIMS-KMA for 3 realizations and MOHC for others
        elif model_name in ['UKESM1-0-LL'] and mem_ in ['r13i1p1f2',
            'r14i1p1f2','r15i1p1f2']:
            pattern = '/glade/collections/cmip/CMIP6/CMIP/NIMS-KMA/'\
                +f'UKESM1-0-LL/historical/{mem_}/SImon/siconc/gn/'\
                +'v20200611/siconc/*.nc'

        #use data downloaded from ESGF directly in /glade/work/cwpowell
        elif model_name in ['CanESM5-1','E3SM-2-0','GISS-E2-2-G']:
            pattern = esgf_dir+f'*{model_name}*{mem_}*.nc'

        elif model_name == 'GISS-E2-1-G' and mem_ in ['r10i1p5f1',
                                                      'r11i1p1f2', 'r9i1p5f1']:
            pattern = esgf_dir+f'*{model_name}*{mem_}*.nc'

        elif model_name in ['EC-Earth3-AerChem'] and mem_ == 'r3i1p1f1':
            pattern = esgf_dir+f'*{model_name}*{mem_}*.nc'

        elif model_name in ['ACCESS-CM2'] and mem_ in ['r6i1p1f1',
            'r7i1p1f1','r8i1p1f1','r9i1p1f1','r10i1p1f1']:
            pattern = esgf_dir+f'*{model_name}*{mem_}*.nc'

        if pattern is None:
            if len(version_dir) == 0:
                #no version directory to search: store an empty list, which
                #is also what globbing a missing directory returns
                siconc_paths[key] = []
                continue
            pattern = grid_dir+f'{version_dir[version_i]}/'+file_glob

        siconc_paths[key] = glob.glob(pattern)

#one year of this member is only available as a separate downloaded file
siconc_paths['EC-Earth3_r7i1p1f1'].append(
    '/glade/work/cwpowell/low-frequency-variability/raw_data/CMIP6_siconc/'\
    +'siconc_SImon_EC-Earth3_historical_r7i1p1f1_gn_193701-193712.nc')

In [12]:
#location of the pickled dictionary of siconc paths
siconc_pickle_path = (
    '/glade/work/cwpowell/low-frequency-variability/raw_data/CMIP6_info/'
    'CMIP6_SImon_siconc_paths.pickle'
)

#save (left commented out so the stored dictionary is not overwritten)
# with open(siconc_pickle_path, 'wb') as handle:
#     pickle.dump(siconc_paths, handle, protocol=pickle.HIGHEST_PROTOCOL)

#load
with open(siconc_pickle_path, 'rb') as handle:
    siconc_paths = pickle.load(handle)

In [17]:
#which thickness-related variable is used for each model: sivol, sithick,
#or neither of the two
sivol_model_list = [
    'ACCESS-CM2',
    'ACCESS-ESM1-5',
    'AWI-CM-1-1-MR',
    'AWI-ESM-1-1-LR',
    'BCC-CSM2-MR',
    'BCC-ESM1',
    'CAMS-CSM1-0',
    'CAS-ESM2-0',
    'CESM2',
    'CESM2-FV2',
    'CESM2-WACCM',
    'CESM2-WACCM-FV2',
    'CIESM',
    'CMCC-CM2-SR5',
    'CNRM-CM6-1',
    'CNRM-CM6-1-HR',
    'CNRM-ESM2-1',
    'EC-Earth3',
    'EC-Earth3-Veg',
    'EC-Earth3-Veg-LR',
    'FGOALS-f3-L',
    'FIO-ESM-2-0',
    'GFDL-CM4',
    'GFDL-ESM4',
    'GISS-E2-1-G',
    'GISS-E2-1-H',
    'HadGEM3-GC31-LL',
    'HadGEM3-GC31-MM',
    'IPSL-CM6A-LR',
    'MPI-ESM-1-2-HAM',
    'MPI-ESM1-2-HR',
    'MPI-ESM1-2-LR',
    'MRI-ESM2-0',
    'NorESM2-LM',
    'NorESM2-MM',
    'SAM0-UNICON',
    'TaiESM1',
    'UKESM1-0-LL',
]

sithick_model_list = [
    'CanESM5',
    'E3SM-1-0',
    'KACE-1-0-G',
    'MCM-UA-1-0',
    'MIROC-ES2L',
    'MIROC6',
    'NESM3',
    'NorCPM1',
]

neither_model_list = [
    'CanESM5-CanOE',
    'E3SM-1-1',
    'E3SM-1-1-ECA',
    'FGOALS-g3',
    'GISS-E2-1-G-CC',
    'IITM-ESM',
    'INM-CM4-8',
    'INM-CM5-0',
    'KIOST-ESM',
]

In [35]:
#make a list of all paths for all models and variant labels for sivol
var_name = 'sivol'
sivol_paths = {}

#NOTE(review): all_model_names is not defined in the cells shown here, it
#must already exist in the session before this cell is run - TODO confirm
for model_name in all_model_names:
    mem_not_found = []

    center_name = CMIP6_info['modeling_center'].sel(model=model_name).values

    members = CMIP6_info['members'].sel(model=model_name)
    #unused member slots hold the placeholder '0.0' and are dropped here
    mem_list = members.where(members != '0.0', drop=True).values

    #grid label used in the directory structure for this model
    if model_name in ['E3SM-1-1', 'E3SM-1-0', 'E3SM-1-1-ECA']:
        g_i = 'gr'
    elif model_name in ['INM-CM4-8','INM-CM5-0','KIOST-ESM']:
        g_i = 'gr1'
    else:
        g_i = 'gn'

    for mem_ in mem_list:
        key = model_name+'_'+mem_

        grid_dir = f'/glade/collections/cmip/CMIP6/CMIP/{center_name}/'\
            +f'{model_name}/historical/{mem_}/SImon/{var_name}/{g_i}/'

        #look for the sivol file paths on glade
        try:
            version_dir = np.sort(os.listdir(grid_dir))

        #sivol files were not found on glade
        except FileNotFoundError:
            #look in /glade/work/cwpowell/ for the files
            downloaded = glob.glob(
                '/glade/work/cwpowell/low-frequency-variability/raw_data/'\
                +f'CMIP6_sivol/sivol_SImon_{model_name}_historical_'\
                +f'{mem_}*.nc')

            if downloaded:
                sivol_paths[key] = downloaded
            else:
                #file not on CMIP6 collection in glade nor in
                #/glade/work/cwpowell/ so could not be located on ESGF
                mem_not_found.append(mem_)
                sivol_paths[key] = np.nan
            continue

        if model_name in ['CESM2-WACCM-FV2', 'CESM2-WACCM',
                          'CESM2-FV2', 'CESM2',]:
            #files are directly inside the version directory
            sivol_paths[key] = glob.glob(
                grid_dir+f'{version_dir[-1]}/*.nc')
        else:
            sivol_paths[key] = glob.glob(
                grid_dir+f'{version_dir[-1]}/{var_name}/*.nc')

        #now set members with incomplete data for 1850-2014 to np.nan
        #(this used to write to sithick_paths, leaving the sivol entry
        #untouched)
        if key in ['NorESM2-LM_r1i1p1f1']:
            sivol_paths[key] = np.nan

    # if len(mem_not_found) != 0:
    #     print(model_name, 'missing member total:', len(mem_not_found),
    #           mem_not_found)

In [87]:
#make a list of all paths for all models and variant labels for sithick
var_name = 'sithick'
sithick_paths = {}

#NOTE(review): all_model_names is not defined in the cells shown here, it
#must already exist in the session before this cell is run - TODO confirm
for model_name in all_model_names:
    mem_not_found = []

    center_name = CMIP6_info['modeling_center'].sel(model=model_name).values

    members = CMIP6_info['members'].sel(model=model_name)
    #unused member slots hold the placeholder '0.0' and are dropped here
    mem_list = members.where(members != '0.0', drop=True).values

    #grid label used in the directory structure for this model
    if model_name in ['E3SM-1-1', 'E3SM-1-0', 'E3SM-1-1-ECA']:
        g_i = 'gr'
    elif model_name in ['INM-CM4-8','INM-CM5-0','KIOST-ESM']:
        g_i = 'gr1'
    else:
        g_i = 'gn'

    for mem_ in mem_list:
        key = model_name+'_'+mem_

        grid_dir = f'/glade/collections/cmip/CMIP6/CMIP/{center_name}/'\
            +f'{model_name}/historical/{mem_}/SImon/{var_name}/{g_i}/'

        #look for the sithick file paths on glade
        try:
            version_dir = np.sort(os.listdir(grid_dir))

        #sithick files were not found on glade
        except FileNotFoundError:
            #look in /glade/work/cwpowell/ for the files
            downloaded = glob.glob(
                '/glade/work/cwpowell/low-frequency-variability/raw_data/'\
                +f'CMIP6_sithick/sithick_SImon_{model_name}_historical_'\
                +f'{mem_}*.nc')

            if downloaded:
                sithick_paths[key] = downloaded
            else:
                #file not on CMIP6 collection in glade nor in
                #/glade/work/cwpowell/ so could not be located on ESGF
                mem_not_found.append(mem_)
                sithick_paths[key] = np.nan
            continue

        if model_name in ['CESM2-WACCM-FV2', 'CESM2-WACCM',
                          'CESM2-FV2', 'CESM2',]:
            #files are directly inside the version directory
            sithick_paths[key] = glob.glob(
                grid_dir+f'{version_dir[-1]}/*.nc')

        elif model_name == 'NorCPM1' and mem_[1:3] in ['15','16','22','23',
                                                       '26','28','30']:
            #only select the first file 1850-2014, not ones beyond
            sithick_paths[key] = glob.glob(
                grid_dir+f'{version_dir[-1]}/{var_name}/*201412.nc')

        else:
            sithick_paths[key] = glob.glob(
                grid_dir+f'{version_dir[-1]}/{var_name}/*.nc')

        #now set members with incomplete data for 1850-2014 to np.nan
        if key in ['NorESM2-LM_r1i1p1f1']:
            sithick_paths[key] = np.nan


#now add in sithick files not on glade but available from esgf
home_path = '/glade/work/cwpowell/low-frequency-variability/raw_data/'\
    +'CMIP6_sithick/'

#time periods (as written in the file names) to add for each E3SM-1-0 member
e3sm_extra_periods = {
    'r1i1p1f1': ['186501-186912', '189001-189412', '190001-190412',
                 '193001-193412', '200501-200912'],
    'r2i1p1f1': ['186501-186912', '188501-188912', '192501-192912',
                 '196001-196412'],
    'r3i1p1f1': ['188001-188412', '191501-191912', '199501-199912',
                 '200501-200912', '201001-201412'],
    'r5i1p1f1': ['187001-187412', '195001-195412', '197001-197412',
                 '198001-198412', '199501-199912', '201001-201412'],
}

for e3sm_mem, e3sm_periods in e3sm_extra_periods.items():
    e3sm_files = sithick_paths['E3SM-1-0_'+e3sm_mem]

    for e3sm_period in e3sm_periods:
        extra_file = home_path+'sithick_SImon_E3SM-1-0_historical_'\
            +f'{e3sm_mem}_gr_{e3sm_period}.nc'

        #home_path is also the directory searched by the fallback above, so
        #skip any file that is already listed to avoid duplicates
        if extra_file not in e3sm_files:
            e3sm_files.append(extra_file)

In [89]:
#for every model and member keep the paths of the variable chosen for that
#model, and record which variable that is
model_path_dict = {}
model_var_dict = {}

paths_by_var = {'sivol': sivol_paths, 'sithick': sithick_paths}

for model_name in CMIP6_info['model'].values:

    members = CMIP6_info['members'].sel(model=model_name)
    mem_list = members.where(members != '0.0', drop=True).values

    #decide once per model which variable (if any) it is listed under
    if model_name in sivol_model_list:
        var_used = 'sivol'
    elif model_name in sithick_model_list:
        var_used = 'sithick'
    elif model_name in neither_model_list:
        var_used = 'neither'
    else:
        var_used = None

    for mem_ in mem_list:
        if var_used is None:
            #model is in none of the three lists
            print('missed', model_name)
            continue

        key = model_name+'_'+mem_
        if var_used == 'neither':
            model_path_dict[key] = np.nan
        else:
            model_path_dict[key] = paths_by_var[var_used][key]
        model_var_dict[model_name] = var_used
        

In [90]:
#save the sivol/sithick paths and the variable used for each model
thickness_pickles = [
    ('CMIP6_SImon_sivol_sithick_paths.pickle', model_path_dict),
    ('CMIP6_SImon_var_availibility.pickle', model_var_dict),
]

for pickle_name, pickle_data in thickness_pickles:
    pickle_path = '/glade/work/cwpowell/low-frequency-variability/'\
        +'raw_data/CMIP6_info/'+pickle_name
    with open(pickle_path, 'wb') as handle:
        pickle.dump(pickle_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

# `areacello` or `areacella` paths
**List of models without siconc(a), so no need for areacello/a:**
- AWI-CM-1-1-MR/LR (LR has siconc, but columnar data)
- GISS-E2-1-G-CC 
- GISS-E3-G
- IITM-ESM
- KACE-1-0-G
- MCM-UA-1-0 <br>

**Other things to note:**
- CMCC-CM2-HR4 (600, 1050, 1440) (1051, 1442), extra i,j coordinates, but same grid
- CMCC-CM2-SR5 (1980, 291, 360) (292, 362), extra i,j coordinates, but same grid
- CMCC-ESM2 (1980, 291, 360) (292, 362), extra i,j coordinates, but same grid
- NorESM2-LM (120, 384, 360) (385, 360), extra j coordinates but has same grid
- NorESM2-MM (120, 384, 360) (385, 360), extra j coordinates but has same grid

In [12]:
#loop through all model names and try and find areacello (or, failing that,
#areacella) that matches siconc shape - the two shapes are printed next to
#each other so that the match can be checked manually

areacello_paths = {}
for model_name in CMIP6_info['model'].values:
    #member used for the comparison: index 2 for CNRM-ESM2-1, otherwise 0
    if model_name in ['CNRM-ESM2-1']:
        mem_i_ = 2
    else:
        mem_i_ = 0

    test_mem = str(CMIP6_info['members'].sel(
        model=model_name).sel(mem_i=mem_i_).values)

    #reset for every model so that the shape of the previous model is never
    #printed when this model has no siconc file
    siconc_shape = 'no siconc'

    try:
        with xr.open_dataset(
                siconc_paths[model_name+'_'+test_mem][0]) as siconc_file:

            if model_name == 'CESM2-LENS':
                siconc_test = siconc_file.rename({'aice':'siconc'})
            else:
                siconc_test = siconc_file

            #use siconca where the file has no siconc variable
            sic_name = 'siconc' if 'siconc' in siconc_test else 'siconca'
            siconc_shape = np.shape(siconc_test[sic_name])

    except IndexError:
        #empty list of paths, still look for the cell area file below
        print(model_name, 'No siconc')

    except TypeError:
        #NOTE(review): this handler used to sit on the cell area lookup,
        #where nothing appears able to raise TypeError; it is assumed to be
        #meant for a path entry that is np.nan instead of a list - TODO
        #confirm
        areacello_paths[model_name] = np.nan #no siconc for this model
        print(model_name, 'no siconc for this model')
        continue

    #try areacello first and only fall back to areacella if it is missing
    for area_var in ['areacello', 'areacella']:
        area_files = glob.glob(
            f'/glade/work/cwpowell/Data/CMIP6_{area_var}/*{model_name}_*.nc')

        if not area_files:
            continue

        areacello_paths[model_name] = area_files[0]

        with xr.open_dataset(area_files[0]) as area_test:
            print(model_name, siconc_shape, np.shape(area_test[area_var]))
        break

    else:
        areacello_paths[model_name] = np.nan #no areacello or areacella
        print(model_name, 'no areacello or areacella')

In [166]:
#now define the names of the latitude, and x,y dimension names for the GCMs
lat_names = {
    'ACCESS-CM2': 'latitude',
    'ACCESS-ESM1-5': 'latitude',
    'AWI-ESM-1-1-LR': 'lat',
    'AWI-CM-1-1-MR': 'ncells',
    'BCC-CSM2-MR': 'latitude',
    'BCC-ESM1': 'latitude',
    'CAMS-CSM1-0': 'latitude',
    'CAS-ESM2-0': 'lat',
    'CESM2': 'lat',
    'CESM2-LENS': 'lat',
    'CESM2-FV2': 'lat',
    'CESM2-WACCM': 'lat',
    'CESM2-WACCM-FV2': 'lat',
    'CIESM': 'latitude',
    'CMCC-CM2-HR4': 'latitude',
    'CMCC-CM2-SR5': 'latitude',
    'CMCC-ESM2': 'latitude',
    'CNRM-CM6-1': 'lat',
    'CNRM-CM6-1-HR': 'lat',
    'CNRM-ESM2-1': 'lat',
    'CanESM5': 'latitude',
    'CanESM5-1': 'latitude',
    'CanESM5-CanOE': 'latitude',
    'E3SM-1-0': 'lat',
    'E3SM-1-1': 'lat',
    'E3SM-1-1-ECA': 'lat',
    'E3SM-2-0': 'lat',
    'EC-Earth3': 'latitude',
    'EC-Earth3-AerChem': 'latitude',
    'EC-Earth3-CC': 'latitude',
    'EC-Earth3-Veg': 'latitude',
    'EC-Earth3-Veg-LR': 'latitude',
    'FGOALS-f3-L': 'latitude',
    'FGOALS-g3': 'latitude',
    'FIO-ESM-2-0': 'latitude',
    'GFDL-CM4': 'lat',
    'GFDL-ESM4': 'lat',
    'GISS-E2-1-G': 'lat',
    'GISS-E2-1-G-CC': 'lat',
    'GISS-E2-1-H': 'lat',
    'GISS-E2-2-G': 'lat',
    'GISS-E2-2-H': 'lat',
    'GISS-E3-G': 'lat',
    'HadGEM3-GC31-LL': 'latitude',
    'HadGEM3-GC31-MM': 'latitude',
    'IITM-ESM': 'lat',
    'INM-CM4-8': 'lat',
    'INM-CM5-0': 'lat',
    'IPSL-CM5A2-INCA': 'nav_lat',
    'IPSL-CM6A-LR': 'nav_lat',
    'IPSL-CM6A-LR-INCA': 'nav_lat',
    'KACE-1-0-G': 'lat',
    'KIOST-ESM': 'lat',
    'MCM-UA-1-0': 'latitude',
    'MIROC-ES2L': 'latitude',
    'MIROC-ES2H': 'latitude',
    'MIROC6': 'latitude',
    'MPI-ESM-1-2-HAM': 'latitude',
    'MPI-ESM1-2-HR': 'latitude',
    'MPI-ESM1-2-LR': 'latitude',
    'MRI-ESM2-0': 'latitude',
    'NESM3': 'lat',
    'NorCPM1': 'latitude',
    'NorESM2-LM': 'latitude',
    'NorESM2-MM': 'latitude',
    'SAM0-UNICON': 'latitude',
    'TaiESM1': 'latitude',
    'UKESM1-0-LL': 'latitude',
    'UKESM1-1-LL': 'latitude',
}

#N.B. 'GISS-E2-1-H' used to be listed twice here (with the same value), the
#repeated entry has been removed
#NOTE(review): 'IITM-ESM' has an entry in lat_names but none here; its
#dimension names cannot be determined from this notebook - TODO confirm
#whether an entry is needed
x_y_names = {
    'ACCESS-CM2': ['j','i'],
    'ACCESS-ESM1-5': ['j','i'],
    'AWI-CM-1-1-MR': ['ncells'],
    'AWI-ESM-1-1-LR': ['ncells'],
    'BCC-CSM2-MR': ['lat','lon'],
    'BCC-ESM1': ['lat','lon'],
    'CAMS-CSM1-0': ['j','i'],
    'CAS-ESM2-0': ['j','i'],
    'CESM2': ['nj','ni'],
    'CESM2-LENS': ['nj','ni'],
    'CESM2-FV2': ['nj','ni'],
    'CESM2-WACCM': ['nj','ni'],
    'CESM2-WACCM-FV2': ['nj','ni'],
    'CIESM': ['nj','ni'],
    'CMCC-CM2-HR4': ['j','i'],
    'CMCC-CM2-SR5': ['j','i'],
    'CMCC-ESM2': ['j','i'],
    'CNRM-CM6-1': ['y','x'],
    'CNRM-CM6-1-HR': ['y','x'],
    'CNRM-ESM2-1': ['y','x'],
    'CanESM5': ['j','i'],
    'CanESM5-1': ['j','i'],
    'CanESM5-CanOE': ['j','i'],
    'E3SM-1-0': ['lat','lon'],
    'E3SM-1-1': ['lat','lon'],
    'E3SM-1-1-ECA': ['lat','lon'],
    'E3SM-2-0': ['lat','lon'],
    'EC-Earth3': ['j','i'],
    'EC-Earth3-AerChem': ['j','i'],
    'EC-Earth3-CC': ['j','i'],
    'EC-Earth3-Veg': ['j','i'],
    'EC-Earth3-Veg-LR': ['j','i'],
    'FGOALS-f3-L': ['j','i'],
    'FGOALS-g3': ['j','i'],
    'FIO-ESM-2-0': ['j','i'],
    'GFDL-CM4': ['y','x'],
    'GFDL-ESM4': ['y','x'],
    'GISS-E2-1-G': ['lat','lon'],
    'GISS-E2-1-G-CC': ['lat','lon'],
    'GISS-E2-1-H': ['lat','lon'],
    'GISS-E2-2-G': ['lat','lon'],
    'GISS-E2-2-H': ['lat','lon'],
    'GISS-E3-G': ['lat','lon'],
    'HadGEM3-GC31-LL': ['j','i'],
    'HadGEM3-GC31-MM': ['j','i'],
    'INM-CM4-8': ['lat','lon'],
    'INM-CM5-0': ['lat','lon'],
    'IPSL-CM5A2-INCA': ['y','x'],
    'IPSL-CM6A-LR': ['y','x'],
    'IPSL-CM6A-LR-INCA': ['y','x'],
    'KACE-1-0-G': ['lat','lon'],
    'KIOST-ESM': ['lat','lon'],
    'MCM-UA-1-0': ['latitude','longitude'],
    'MIROC-ES2L': ['y','x'],
    'MIROC-ES2H': ['y','x'],
    'MIROC6': ['y','x'],
    'MPI-ESM-1-2-HAM': ['j','i'],
    'MPI-ESM1-2-HR': ['j','i'],
    'MPI-ESM1-2-LR': ['j','i'],
    'MRI-ESM2-0': ['y','x'],
    'NESM3': ['nj','ni'],
    'NorCPM1': ['j','i'],
    'NorESM2-LM': ['j','i'],
    'NorESM2-MM': ['j','i'],
    'SAM0-UNICON': ['j','i'],
    'TaiESM1': ['j','i'],
    'UKESM1-0-LL': ['j','i'],
    'UKESM1-1-LL': ['j','i'],
}

In [167]:
#save all of the areacello paths and grid information
grid_pickles = [
    ('CMIP6_areacello_paths.pickle', areacello_paths),
    ('CMIP6_areacello_lat_names.pickle', lat_names),
    ('CMIP6_x_y_names.pickle', x_y_names),
]

for pickle_name, pickle_data in grid_pickles:
    pickle_path = '/glade/work/cwpowell/low-frequency-variability/'\
        +'raw_data/CMIP6_info/'+pickle_name
    with open(pickle_path, 'wb') as handle:
        pickle.dump(pickle_data, handle, protocol=pickle.HIGHEST_PROTOCOL)