# This script downloads CMIP6 data using the intake-esm library

Read more at the following link:

https://intake-esm.readthedocs.io/en/stable/tutorials/loading-cmip6-data.html

How to use intake to download GCM data


In [5]:
%load_ext autoreload
%autoreload 2

In [12]:
import re
from pathlib import Path
from typing import Iterable

import chunk_util
import intake
import pandas as pd
import xarray as xr

In [2]:
# Catalog describing all CMIP6 files that one can download from the intake-esm
# data store.
# A second catalog URL used to be assigned here and was immediately
# overwritten; it is kept for reference only (presumably an older mirror):
#   https://raw.githubusercontent.com/NCAR/intake-esm-datastore/master/catalogs/pangeo-cmip6.json
url = "https://storage.googleapis.com/cmip6/pangeo-cmip6.json"

# open the catalog
catalog = intake.open_esm_datastore(url)

In [3]:
def natural_sort(l: Iterable[str]) -> list[str]:
    """
    Sort names like r1i1p1f1, r1i2p1f1 in a natural (numeric) order.
    - r1: Realization (initial condition run),
    - i1: Initialization method,
    - p1: Physical parameters,
    - f1: External forcings.

    Numeric order means that r1i1p1f1 < r2i1p1f1 < r11i1p1f1.

    :param l: list of names to be sorted
    """

    convert = lambda text: int(text) if text.isdigit() else text.lower()
    alphanum_key = lambda key: [convert(c) for c in re.split("([0-9]+)", key)]
    return sorted(l, key=alphanum_key)

In [27]:
def download_files(data_source, sid: str, exp: str, var: str):
    """
    Download files from the CMIP6 data store

    :param data_source: intake esm data store
    :param sid: source_id
    :param exp: experiment_id
    :param var: variable_id
    """

    models = data_source.search(
        experiment_id=exp,
        table_id="Amon",
        variable_id=var,
        source_id=sid,
    )
    # then one might get several files with the same conditions
    # r1: Realization (initial condition run)
    # i1: Initialization method
    # p1: Physical parameters
    # f1: External forcings

    # if no files exist then print out error
    if len(models.df) == 0:
        print("*** No data found for", var, exp, sid)
        return False

    member_ids = natural_sort(models.df.member_id.values)

    # get the first one only then seach again
    first_member = data_source.search(
        experiment_id=exp,
        table_id="Amon",
        variable_id=var,
        source_id=sid,
        member_id=member_ids[0],
    )

    # if no files exist then print out error
    if len(first_member.df) == 0:
        print(
            "*** This is impossible, there must be data for",
            var,
            exp,
            sid,
            member_ids[0],
        )
        return False

    try:
        datasets: dict[str, xr.Dataset] = first_member.to_dataset_dict(
            xarray_open_kwargs={"consolidated": True}
        )

        odir = Path("Download") / sid / exp
        odir.mkdir(parents=True, exist_ok=True)

        for key, ds in datasets.items():
            ofile = odir / f"{var}_{key}_{member_ids[0]}.nc"

            # Compression
            encoding = {
                var_name: {
                    "zlib": True,
                    "complevel": 1,
                    "chunksizes": chunk_util.chunk_shape_nD(
                        data.shape, chunkSize=64 * 2**10
                    ),
                }
                for var_name, data in ds.data_vars.items()
            }

            ds.to_netcdf(
                ofile, format="NETCDF4_CLASSIC", engine="netcdf4", encoding=encoding
            )

    except Exception as e:
        print("*** Couldn't download", var, exp, sid, e)
        return False

    return True


def download_data():
    status = []
    source_ids = [
        "EC-Earth3",
        "MIROC6",
        "MRI-ESM2-0",
        "ACCESS-CM2",
        "IPSL-CM6A-LR",
        "MPI-ESM1-2-HR",
    ]
    experiments = ["historical", "ssp585", "ssp126", "ssp370", "ssp245"][:1]
    variables = ["tas", "ta", "ua", "va", "hur", "zg", "ts"][:1]
    for sid in source_ids:
        for exp in experiments:
            for var in variables:
                try:
                    success = download_files(catalog, sid, exp, var)
                except:
                    success = False
                status.append(
                    {
                        "source_id": sid,
                        "experiment": exp,
                        "variable": var,
                        "success": success,
                    }
                )

    return pd.DataFrame(status)


# Run the batch download and summarise the outcome.
download_status = download_data()
# Single quotes inside the f-string keep this line valid on Python < 3.12,
# where reusing the enclosing quote type is a SyntaxError.
print(f"Successfully download {download_status['success'].sum()} files")
print("The following files couldn't be downloaded")
# Last expression of the notebook cell: displays the rows that failed.
download_status[~download_status["success"]]


--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'



--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


Successfully download 6 files
The following files couldn't be downloaded


Unnamed: 0,source_id,experiment,variable,success
