### This notebook creates a list of files to download from each model for a given query of variables for the OMIP2 simulations. 
To do so, it uses the ESGF API. Will filter out models which have all of thegiven variables available. Will select one member per model. Will select the last OMIP cycle (last 61 years) for each simulation.

#### Install and import packages

In [9]:
pip install cmaps

Collecting cmaps
  Using cached cmaps-1.0.5-py3-none-any.whl
Installing collected packages: cmaps
Successfully installed cmaps-1.0.5
Note: you may need to restart the kernel to use updated packages.


In [10]:
pip install xmip

Collecting xmip
  Using cached xmip-0.6.1rc0-py3-none-any.whl (54 kB)
Collecting pint-xarray
  Using cached pint_xarray-0.3-py3-none-any.whl (32 kB)
Collecting xgcm<0.7.0
  Using cached xgcm-0.6.1-py3-none-any.whl (60 kB)
Installing collected packages: xgcm, pint-xarray, xmip
  Attempting uninstall: xgcm
    Found existing installation: xgcm 0.8.0
    Uninstalling xgcm-0.8.0:
      Successfully uninstalled xgcm-0.8.0
Successfully installed pint-xarray-0.3 xgcm-0.6.1 xmip-0.6.1rc0
Note: you may need to restart the kernel to use updated packages.


In [3]:
from matplotlib import pyplot as plt
import xmip
import numpy as np
import pandas as pd
import xarray as xr
import cftime
import cmaps
import cartopy.crs as ccrs
import matplotlib.path as mpath

xr.set_options(display_style='html')
%matplotlib inline
%config InlineBackend.figure_format = 'retina' 

### Search function using ESGF API

In [4]:
#!/usr/bin/env python
from __future__ import print_function
import requests
import xml.etree.ElementTree as ET
import numpy

# Author: Unknown
# I got the original version from a word document published by ESGF
# https://docs.google.com/document/d/1pxz1Kd3JHfFp8vR2JCVBfApbsHmbUQQstifhGNdc6U0/edit?usp=sharing

# API AT: https://github.com/ESGF/esgf.github.io/wiki/ESGF_Search_REST_API#results-pagination

def esgf_search(server="https://esgf-node.llnl.gov/esg-search/search",
                files_type="OPENDAP", local_node=True, project="CMIP6",
                verbose=False, format="application%2Fsolr%2Bjson",
                use_csrf=False, **search):
    client = requests.session()
    payload = search
    payload["project"] = project
    payload["type"]= "File"
    if local_node:
        payload["distrib"] = "false"
    if use_csrf:
        client.get(server)
        if 'csrftoken' in client.cookies:
            # Django 1.6 and up
            csrftoken = client.cookies['csrftoken']
        else:
            # older versions
            csrftoken = client.cookies['csrf']
        payload["csrfmiddlewaretoken"] = csrftoken

    payload["format"] = format

    offset = 0
    numFound = 10000
    all_files = []
    files_type = files_type.upper()
    while offset < numFound:
        payload["offset"] = offset
        url_keys = [] 
        for k in payload:
            url_keys += ["{}={}".format(k, payload[k])]

        url = "{}/?{}".format(server, "&".join(url_keys))
        if offset % 100 == 0:
            print(url)
        r = client.get(url)
        r.raise_for_status()
        resp = r.json()["response"]
        numFound = int(resp["numFound"])
        resp = resp["docs"]
        offset += len(resp)
        for d in resp:
            if verbose:
                for k in d:
                    print("{}: {}".format(k,d[k]))
            url = d["url"]
            for f in d["url"]:
                sp = f.split("|")
                if sp[-1] == files_type:
                    all_files.append(sp[0].split(".html")[0])
    return sorted(all_files)

### Perform the search to obtain a list of all available files for all possible models over all variables

In [5]:
variables = ['vmo','thetao','so','umo','siconc','zos','mlotst']
#variables = ['vmo']
fvariables = ['areacello','deptho']

result = []
for var in variables:
    result.append(esgf_search(activity_id='OMIP', variable_id=var,
                              experiment_id='omip2',frequency='mon',grid_label='gn'))
for var in fvariables:
    result.append(esgf_search(activity_id='OMIP',variable_id=var))

https://esgf-node.llnl.gov/esg-search/search/?activity_id=OMIP&variable_id=vmo&experiment_id=omip2&frequency=mon&grid_label=gn&project=CMIP6&type=File&distrib=false&format=application%2Fsolr%2Bjson&offset=0
https://esgf-node.llnl.gov/esg-search/search/?activity_id=OMIP&variable_id=vmo&experiment_id=omip2&frequency=mon&grid_label=gn&project=CMIP6&type=File&distrib=false&format=application%2Fsolr%2Bjson&offset=100
https://esgf-node.llnl.gov/esg-search/search/?activity_id=OMIP&variable_id=vmo&experiment_id=omip2&frequency=mon&grid_label=gn&project=CMIP6&type=File&distrib=false&format=application%2Fsolr%2Bjson&offset=200
https://esgf-node.llnl.gov/esg-search/search/?activity_id=OMIP&variable_id=vmo&experiment_id=omip2&frequency=mon&grid_label=gn&project=CMIP6&type=File&distrib=false&format=application%2Fsolr%2Bjson&offset=300
https://esgf-node.llnl.gov/esg-search/search/?activity_id=OMIP&variable_id=vmo&experiment_id=omip2&frequency=mon&grid_label=gn&project=CMIP6&type=File&distrib=false&f

### Filter models which have all variables available in one member, select the first member, select the last forcing cycle (last 61 years). Save file list for each model to a dictionary.

In [7]:
variables = ['vmo','thetao','so','umo','mlotst','zos']

all_variables = variables + fvariables

raw_results = [item for sublist in result for item in sublist] # concatenate lists from all variables
#print([r for r in raw_results if 'deptho' in r])
results = [r for r in raw_results if r.split('/')[-1].split('_')[0] in all_variables]
files = {} #final dictionary with each usable model as keys

models = set([r.split('_')[3] for r in results]) # models
print('All models that returned some search results:')
print(models)

print('---------------------------------------------------------------')
for model in models: # loop over model
    print()
    print(model)
    # get only the files from that model 
    tmp = [r for r in results if model in r] 
    # which variables does that model have?
    vvars = set([r.split('/')[-1].split('_')[0] for r in tmp])
    print('\tThe model has these variables available:')
    print('\t',vvars)
    
    if len(vvars) < len(all_variables):
        print('\t'+'Not enough variables - skipping model...')
        continue # throw out model if it doesn't have enough variables
        
    # selecting members
    members = list(sorted(set([r.split('_')[5] for r in tmp])))
    
    # make sure to sleect a member that has all the variables available
    good_members = []
    for member in members:
        tmp_member = [r for r in tmp if member in r]
        member_vars = set([r.split('/')[-1].split('_')[0] for r in tmp_member])
        if len(member_vars) == len(all_variables):
            good_members.append(member)
      
    if len(good_members) == 0:
        print('\t'+'No member has all the variables - skipping model...')
        print()
        continue
       
    # Create entry for that model
    files[model] = []
    
        
    print('\t'+'These members have all variables available:')
    print('\t',good_members)

    # select the first member that has all the variables available
    member = good_members[0]
    # cut data from that membe
    tmp_member = [r for r in tmp if member in r]
    
    # add grid variables
    for v in fvariables:
        # add just the first file from that file variable, as they should all be the same
        files[model].append([r for r in tmp_member if v in r][0])
        
    for v in variables: # loop over variables
        # cut out files from that variable
        tmp_var = [r for r in tmp_member if v+'_' in r]

        # select latest version and cut out files from that version
        version = list(sorted(set([r.split('/')[-2] for r in tmp_var])))[-1]
        tmp_version = [r for r in tmp_var if version in r]
        
        # figure out the year range in the files of that variables
        yy = [r.split('_')[-1].split('.')[0] for r in tmp_version]
        years = sorted(set(yy))
        
        # filter out double files from different nodes
        tmp_unique = []
        for y in years:
            tmp_unique.append([r for r in tmp_version if y in r][0])
            
        # figure out the last year, then select only files which contain 
        # data from the last 61 years (last OMIP forcing cycle)
        last_year = int(years[-1].split('-')[-1][0:4])
        print('\t'+v+':\tversion '+version+', Last cycle starts in '+str(last_year - 61+1))
        tmp_year = [tmp_unique[i] for i in range(len(tmp_unique)) if int(yy[i].split('-')[-1][0:4]) > last_year - 61]  
        
        [files[model].append(r) for r in tmp_year]
        print('\tAdded '+str(len(tmp_year))+' file(s)')
        
print('These '+str(len(files.keys()))+' models match the requirements:')
print(sorted(files.keys()))
print('Saving file list for each models in model_files.npy')
np.save('models.npy',files)

All models that returned some search results:
{'TaiESM1-TIMCOM2', 'CanESM5-CanOE', 'NorESM2-LM', 'GFDL-OM4p5B', 'MRI-ESM2-0', 'MIROC-ES2L', 'IPSL-CM6A-LR', 'TaiESM1-TIMCOM', 'FGOALS-f3-H', 'CanESM5', 'CMCC-ESM2', 'CMCC-CM2-SR5', 'FGOALS-f3-L', 'GFDL-CM4', 'ACCESS-OM2', 'ACCESS-OM2-025', 'MIROC6', 'CNRM-ESM2-1', 'CNRM-CM6-1', 'CESM2', 'EC-Earth3', 'CMCC-CM2-HR4', 'CNRM-CM6-1-HR'}
---------------------------------------------------------------

TaiESM1-TIMCOM2
	The model has these variables available:
	 {'deptho', 'mlotst', 'areacello', 'vmo', 'thetao', 'zos', 'umo', 'so'}
	These members have all variables available:
	 ['r1i1p1f1']
	vmo:	version v20211216, Last cycle starts in 306
	Added 1 file(s)
	thetao:	version v20211216, Last cycle starts in 306
	Added 1 file(s)
	so:	version v20211216, Last cycle starts in 306
	Added 1 file(s)
	umo:	version v20211216, Last cycle starts in 306
	Added 1 file(s)
	mlotst:	version v20211216, Last cycle starts in 306
	Added 1 file(s)
	zos:	version v2021121

In [9]:
# How to read in the file
import numpy as np

fl = np.load('models.npy',allow_pickle=True).item()
print(fl.keys())
fl['FGOALS-f3-L']

dict_keys(['TaiESM1-TIMCOM2', 'NorESM2-LM', 'MRI-ESM2-0', 'TaiESM1-TIMCOM', 'CMCC-CM2-SR5', 'FGOALS-f3-L', 'ACCESS-OM2', 'ACCESS-OM2-025', 'MIROC6', 'CNRM-CM6-1', 'EC-Earth3', 'CMCC-CM2-HR4', 'CNRM-CM6-1-HR'])


['http://esg.lasg.ac.cn/thredds/dodsC/esg_dataroot/CMIP6/OMIP/CAS/FGOALS-f3-L/omip1/r1i1p1f1/Ofx/areacello/gn/v20191118/areacello_Ofx_FGOALS-f3-L_omip1_r1i1p1f1_gn.nc',
 'http://esg.lasg.ac.cn/thredds/dodsC/esg_dataroot/CMIP6/OMIP/CAS/FGOALS-f3-L/omip1/r1i1p1f1/Ofx/deptho/gn/v20191118/deptho_Ofx_FGOALS-f3-L_omip1_r1i1p1f1_gn.nc',
 'http://esg.lasg.ac.cn/thredds/dodsC/esg_dataroot/CMIP6/OMIP/CAS/FGOALS-f3-L/omip2/r1i1p1f1/Omon/vmo/gn/v20191217/vmo_Omon_FGOALS-f3-L_omip2_r1i1p1f1_gn_195801-201812.nc',
 'http://esg.lasg.ac.cn/thredds/dodsC/esg_dataroot/CMIP6/OMIP/CAS/FGOALS-f3-L/omip2/r1i1p1f1/Omon/thetao/gn/v20191104/thetao_Omon_FGOALS-f3-L_omip2_r1i1p1f1_gn_195801-201812.nc',
 'http://esg.lasg.ac.cn/thredds/dodsC/esg_dataroot/CMIP6/OMIP/CAS/FGOALS-f3-L/omip2/r1i1p1f1/Omon/so/gn/v20191104/so_Omon_FGOALS-f3-L_omip2_r1i1p1f1_gn_195801-201812.nc',
 'http://esg.lasg.ac.cn/thredds/dodsC/esg_dataroot/CMIP6/OMIP/CAS/FGOALS-f3-L/omip2/r1i1p1f1/Omon/umo/gn/v20191119/umo_Omon_FGOALS-f3-L_omip2_r1i