In [1]:
# Load the intake module and bind `cat` to the `nci` catalogue exposed
# through `intake.cat`.
# Iterating a catalogue yields the names of its entries, so list(cat)
# shows the top-level collections that can be opened from here.
import intake
cat = intake.cat.nci
list(cat)

['era5', 'era5_land', 'ecmwf', 'esgf', 'cosima', 'erai']

In [2]:
# CMIP6 is included in `esgf`, which is itself a catalogue nested inside `nci`,
# so list() works on it the same way and returns the names of its sub-catalogues.
# `esgf` is kept in a variable because the next cell inspects it again.
esgf = cat['esgf']
list(esgf)

['cmip5',
 'cmip5_all',
 'cmip5_gr1p5',
 'cmip6',
 'cmip6_all',
 'cmip6_gr1p5',
 'cordex',
 'cordex_all']

In [3]:
# NB: esgf._entries gives a much more detailed description of each `esgf` entry
# (container, driver, description, arguments, ...). The repr is verbose.
# NOTE(review): the leading underscore marks `_entries` as a private attribute by
# Python convention, so it may change between intake versions - fine for
# interactive exploration, but avoid relying on it in scripts.
esgf._entries

{'cmip5': name: cmip5
 container: xarray
 plugin: ['esm_datastore']
 driver: ['esm_datastore']
 description: CMIP5 (Latest Versions)
 
 Datasets on Gadi, both publised and replicated. Only the latest available file versions are in the listing, see catalogue 'cmip5_all' for all available versions
 
 Catalogue columns match those used by ESGF search (esgf.nci.org.au). intake-esm dict keys are in the form '{esgf instance_id}.{variable}'. Columns 'model_id' and 'institution_id' mirror the non-'_id' columns, with minor formatting differences needed to create the 'instance_id'.
 
 Project: al33, rr3
 Maintained By: NCI
 Contact: help@nci.org.au
 References:
     - https://pcmdi.llnl.gov/mips/cmip5/
 
 direct_access: forbid
 user_parameters: []
 metadata: 
 args: 
   esmcol_obj: {{CATALOG_DIR}}/cmip5/catalogue_latest.json,
 'cmip5_all': name: cmip5_all
 container: xarray
 plugin: ['esm_datastore']
 driver: ['esm_datastore']
 description: CMIP5 (All Versions)
 
 Datasets on Gadi, both publised

In [4]:
# Three CMIP6 flavours are listed under `esgf`:
#   - cmip6_gr1p5: post-processed CMIP6 data
#   - cmip6_all:   all versions, not just the latest
#   - cmip6:       the original data only (the one we want here)
# The records of the chosen catalogue are available as a pandas dataframe
# through `.df`; display its first five rows.
cmip6 = cat['esgf']['cmip6']
cmip6.df.head(5)

Unnamed: 0,project,activity_id,institution_id,source_id,experiment_id,member_id,table_id,variable_id,grid_label,date_range,path,version
0,CMIP6,AerChemMIP,BCC,BCC-ESM1,histSST,r1i1p1f1,AERmon,o3,gn,185001-201412,/g/data/oi10/replicas/CMIP6/AerChemMIP/BCC/BCC...,v20190718
1,CMIP6,AerChemMIP,BCC,BCC-ESM1,ssp370,r1i1p1f1,Amon,fco2nat,gn,201501-205512,/g/data/oi10/replicas/CMIP6/AerChemMIP/BCC/BCC...,v20190624
2,CMIP6,AerChemMIP,BCC,BCC-ESM1,ssp370,r1i1p1f1,Amon,pr,gn,201501-205512,/g/data/oi10/replicas/CMIP6/AerChemMIP/BCC/BCC...,v20190624
3,CMIP6,AerChemMIP,BCC,BCC-ESM1,ssp370,r1i1p1f1,Amon,psl,gn,201501-205512,/g/data/oi10/replicas/CMIP6/AerChemMIP/BCC/BCC...,v20190624
4,CMIP6,AerChemMIP,BCC,BCC-ESM1,ssp370,r1i1p1f1,Amon,rlut,gn,201501-205512,/g/data/oi10/replicas/CMIP6/AerChemMIP/BCC/BCC...,v20190624


In [5]:
# `df.columns` lists the column names of the catalogue dataframe;
# these are the names that can be used to query the data
# (see the search() call further down).
cmip6.df.columns

Index(['project', 'activity_id', 'institution_id', 'source_id',
       'experiment_id', 'member_id', 'table_id', 'variable_id', 'grid_label',
       'date_range', 'path', 'version'],
      dtype='object')

In [6]:
# Other useful attributes of a catalogue:
# - `description`
# - `aggregation_info` how the data can be aggregated
# Only the last expression of a cell is displayed automatically, so the
# description is printed explicitly; as a bare expression in the middle of
# the cell its value would be silently discarded.
print(cmip6.description)
cmip6.aggregation_info

AggregationInfo(groupby_attrs=['project', 'activity_id', 'institution_id', 'source_id', 'experiment_id', 'member_id', 'table_id', 'variable_id', 'grid_label', 'version'], variable_column_name='variable_id', aggregations=[{'type': 'join_existing', 'attribute_name': 'date_range', 'options': {'dim': 'time'}}], agg_columns=['date_range'], aggregation_dict={'date_range': {'type': 'join_existing', 'options': {'dim': 'time'}}})

In [10]:
# The method unique() lists all the unique values for each column as a dictionary.
# As this would be quite big for CMIP6, we save it in a variable and access only
# the entry for the valid model names (source_id), which holds a 'count' and
# the list of 'values'.
values_dict = cmip6.unique()
values_dict['source_id']

{'count': 100,
 'values': ['ECMWF-IFS-LR',
  'IPSL-CM6A-LR',
  'FGOALS-g3',
  'BCC-CSM2-HR',
  'GISS-E2-2-G',
  'TaiESM1',
  'EC-Earth3-Veg-LR',
  'KACE-1-0-G',
  'MPI-ESM1-2-LR',
  'IPSL-CM5A2-INCA',
  'NICAM16-9S',
  'ACCESS-CM2',
  'GFDL-ESM2M',
  'IPSL-CM6A-ATM-HR',
  'INM-CM5-0',
  'GFDL-ESM4',
  'CMCC-CM2-VHR4',
  'E3SM-1-0',
  'ICON-ESM-LR',
  'CESM2',
  'BCC-ESM1',
  'MCM-UA-1-0',
  'MRI-AGCM3-2-S',
  'GFDL-OM4p5B',
  'TaiESM1-TIMCOM',
  'BCC-CSM2-MR',
  'MIROC6',
  'CAS-ESM2-0',
  'HiRAM-SIT-LR',
  'ACCESS-OM2',
  'E3SM-1-1',
  'GFDL-CM4',
  'CMCC-CM2-SR5',
  'MIROC-ES2H',
  'GFDL-AM4',
  'CAMS-CSM1-0',
  'SAM0-UNICON',
  'NICAM16-7S',
  'CNRM-CM6-1',
  'GISS-E2-1-G',
  'ECMWF-IFS-MR',
  'CanESM5-CanOE',
  'MPI-ESM1-2-XR',
  'UKESM1-0-LL',
  'FGOALS-f3-H',
  'GISS-E2-1-H',
  'CNRM-ESM2-1',
  'FIO-ESM-2-0',
  'NorESM2-MM',
  'AWI-ESM-1-1-LR',
  'NorCPM1',
  'HadGEM3-GC31-MM',
  'CESM2-WACCM',
  'GISS-E2-1-G-CC',
  'INM-CM5-H',
  'EC-Earth3-LR',
  'E3SM-1-1-ECA',
  'CIESM',
  'F

In [8]:
# Narrow the catalogue down to a subset with search(): each keyword names a
# catalogue column and its value is the constraint applied to that column.
subset = cmip6.search(
    activity_id='CMIP',
    experiment_id='historical',
    source_id='ACCESS-CM2',
    table_id='day',
    variable_id='tas',
)
subset

Unnamed: 0,unique
project,1
activity_id,1
institution_id,1
source_id,1
experiment_id,1
member_id,5
table_id,1
variable_id,1
grid_label,1
date_range,4


In [11]:
# to_dataset_dict() returns a dictionary listing all the datasets in our subset:
# each key is a string and each value an xarray.Dataset. The layout of the keys
# is reported in the message printed by the call.
# NOTE(review): this displays every dataset in the subset; list(dset_dict)
# would show just the keys.
dset_dict = subset.to_dataset_dict()
dset_dict


--> The keys in the returned dictionary of datasets are constructed as follows:
	'project.activity_id.institution_id.source_id.experiment_id.member_id.table_id.variable_id.grid_label.version'


{'CMIP6.CMIP.CSIRO-ARCCSS.ACCESS-CM2.historical.r3i1p1f1.day.tas.gn.v20200306': <xarray.Dataset>
 Dimensions:    (time: 60265, bnds: 2, lat: 144, lon: 192)
 Coordinates:
   * time       (time) datetime64[ns] 1850-01-01T12:00:00 ... 2014-12-31T12:00:00
   * lat        (lat) float64 -89.38 -88.12 -86.88 -85.62 ... 86.88 88.12 89.38
   * lon        (lon) float64 0.9375 2.812 4.688 6.562 ... 355.3 357.2 359.1
     height     float64 2.0
 Dimensions without coordinates: bnds
 Data variables:
     time_bnds  (time, bnds) datetime64[ns] dask.array<chunksize=(18262, 2), meta=np.ndarray>
     lat_bnds   (lat, bnds) float64 -90.0 -88.75 -88.75 ... 88.75 88.75 90.0
     lon_bnds   (lon, bnds) float64 0.0 1.875 1.875 3.75 ... 358.1 358.1 360.0
     tas        (time, lat, lon) float32 dask.array<chunksize=(18262, 144, 192), meta=np.ndarray>
 Attributes: (12/48)
     Conventions:             CF-1.7 CMIP-6.2
     initialization_index:    1
     branch_method:           standard
     further_info_url:

In [12]:
# Finally we can simply pick one dataset out of the dictionary using its key.
# The key is dot-separated, in the order reported by to_dataset_dict(); this
# one selects member r5i1p1f1.
# NOTE(review): the key ends with a hard-coded version (v20210607); presumably it
# has to be updated if the catalogue later lists a newer version - confirm.
ds = dset_dict['CMIP6.CMIP.CSIRO-ARCCSS.ACCESS-CM2.historical.r5i1p1f1.day.tas.gn.v20210607']
ds

Unnamed: 0,Array,Chunk
Bytes,0.92 MiB,285.34 kiB
Shape,"(60265, 2)","(18262, 2)"
Count,12 Tasks,4 Chunks
Type,datetime64[ns],numpy.ndarray
"Array Chunk Bytes 0.92 MiB 285.34 kiB Shape (60265, 2) (18262, 2) Count 12 Tasks 4 Chunks Type datetime64[ns] numpy.ndarray",2  60265,

Unnamed: 0,Array,Chunk
Bytes,0.92 MiB,285.34 kiB
Shape,"(60265, 2)","(18262, 2)"
Count,12 Tasks,4 Chunks
Type,datetime64[ns],numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,6.21 GiB,1.88 GiB
Shape,"(60265, 144, 192)","(18262, 144, 192)"
Count,12 Tasks,4 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 6.21 GiB 1.88 GiB Shape (60265, 144, 192) (18262, 144, 192) Count 12 Tasks 4 Chunks Type float32 numpy.ndarray",192  144  60265,

Unnamed: 0,Array,Chunk
Bytes,6.21 GiB,1.88 GiB
Shape,"(60265, 144, 192)","(18262, 144, 192)"
Count,12 Tasks,4 Chunks
Type,float32,numpy.ndarray
