In [1]:
from matplotlib import pyplot as plt
import xarray as xr
import numpy as np
import dask
import intake
import collections
import fsspec
import seaborn as sns
from xmip.preprocessing import combined_preprocessing

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'
plt.rcParams['figure.figsize'] = 12, 6

In [38]:
def match_lat_lon_names(ds):
    """Rename alternative latitude/longitude coordinate names to 'lat'/'lon'.

    For each target name, the known aliases are tried in order. An alias is
    renamed only when it is present in ``ds.coords`` and the target name is
    not already a coordinate, so at most one alias per target is renamed.

    Parameters
    ----------
    ds : object exposing ``.coords`` and ``.rename(mapping)``
        In this notebook, an xarray Dataset or DataArray.

    Returns
    -------
    The object with standardized coordinate names (the input itself when
    nothing needed renaming).
    """
    aliases = {
        'lat': ('y', 'latitude', 'nav_lat'),
        'lon': ('x', 'longitude', 'nav_lon'),
    }
    for target, candidates in aliases.items():
        for candidate in candidates:
            # skip when there is nothing to rename or the target already exists
            if candidate not in ds.coords or target in ds.coords:
                continue
            ds = ds.rename({candidate: target})
    return ds


def drop_all_bounds(ds):
    """Remove every coordinate whose name marks it as a cell-bounds variable.

    A coordinate is dropped when its name contains ``'_bounds'`` or
    ``'_bnds'``. Only ``ds.coords`` is inspected; data variables are left
    untouched.

    Parameters
    ----------
    ds : xarray.Dataset (or any object exposing ``.coords`` and ``.drop_vars``)

    Returns
    -------
    A new object without the bounds coordinates.
    """
    bound_markers = ('_bounds', '_bnds')
    drop_vars = [vname for vname in ds.coords
                 if any(marker in vname for marker in bound_markers)]
    # `Dataset.drop` is deprecated; `drop_vars` is the explicit replacement
    return ds.drop_vars(drop_vars)

def add_source_id_coord(ds):
    """Attach the ``source_id`` global attribute as a coordinate of the same name.

    The value is read with ``ds.attrs.get('source_id')``, so the coordinate
    is set to ``None`` when the attribute is missing.

    Returns
    -------
    The result of ``ds.assign_coords(source_id=...)``.
    """
    model_name = ds.attrs.get('source_id')
    return ds.assign_coords(source_id=model_name)

def annual_climatology(ds):
    """Return the annual-mean climatology of ``ds``.

    The data are first averaged per calendar month (grouping on
    ``'time.month'``), then those monthly means are averaged over the
    ``month`` dimension with equal weight. Attributes are kept at both steps.
    """
    monthly_means = ds.groupby('time.month').mean(dim='time', keep_attrs=True)
    return monthly_means.mean(dim='month', keep_attrs=True)

def monthly_climatology(ds):
    """Return the monthly climatology of ``ds``.

    Time steps are grouped on ``'time.month'`` and averaged over ``time``,
    keeping attributes; the result has a ``month`` dimension in place of
    ``time``.
    """
    by_month = ds.groupby('time.month')
    ds_mon = by_month.mean(dim='time', keep_attrs=True)
    return ds_mon

In [3]:
## Open the Pangeo CMIP6 catalog
# JSON catalog descriptor hosted on Google Cloud Storage
url='https://storage.googleapis.com/cmip6/pangeo-cmip6.json'
# `cmip6` is the datastore object queried by the search cells below
cmip6=intake.open_esm_datastore(url)

Limiting the analysis to just the first realization (variant label `r1i1p1f1`) from each modeling group significantly reduces the amount of data to pull.

### Precip

In [None]:
## Search the catalog for monthly historical precipitation

# Facet values that every selected catalog entry must match
query = {
    'activity_id': 'CMIP',
    'experiment_id': 'historical',
    'table_id': 'Amon',
    'variable_id': 'pr',
    'member_id': 'r1i1p1f1',
}

# Narrow the catalog down to the entries that satisfy the query
subset = cmip6.search(require_all_on=["source_id"], **query)

# For the full (verbose) listing, display `subset.df` instead.
# Compact listing: number of distinct experiments/variables/tables per model
(
    subset.df
    .groupby("source_id")[["experiment_id", "variable_id", "table_id"]]
    .nunique()
)

In [16]:
## Load every dataset matched by the search into a dictionary
# Options intended for opening the stores.
# NOTE(review): the dict is unpacked into individual keyword arguments; confirm the
# installed intake-esm version actually applies them when passed this way.
# `decode_times: False` in particular looks at odds with the later groupby on
# 'time.month' in the climatology helpers — TODO confirm.
zarr_kwargs={'consolidated': True,
             'decode_times': False}
subset_dict = subset.to_dataset_dict(**zarr_kwargs) 
# Uncomment to inspect the dictionary keys:
#list(subset_dict.keys())


--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


In [227]:
## Build a dictionary of monthly precipitation climatologies, keyed by 'institution.source'

# initialize
cmip6_pr=dict()
models_to_del=[]
# models with fewer latitude points than this (coarser than ~2° spacing) are discarded
nlat_threshold=90

for key, raw_ds in subset_dict.items():
    ## create shortened key name that only includes institution and source id
    # (assumes the 6-part key layout reported when the dictionary was built)
    _, inst_id, src_id, _, _, _ = key.split(sep='.')
    short_key=inst_id + '.' + src_id

    ## clean up dataset info
    # Standardize coordinate names on the dataset that is actually kept.
    # Previously the renaming was applied to a temporary `.pr[0]` selection that
    # was then overwritten, so it never reached the stored data.
    ds=match_lat_lon_names(raw_ds)
    # remember low-resolution models so they can be removed after the loop
    if np.size(ds.lat)<nlat_threshold:
        models_to_del.append(short_key)
    # add a source_id coordinate (must happen before arithmetic, which drops attrs)
    ds=add_source_id_coord(ds)

    ## process data
    # convert precip to mm/day (assumes the source data are in kg m-2 s-1 — TODO confirm)
    ds=ds*86400
    # Optionally trim the time range:
    #ds=ds.sel(time=slice('1948','2014'))
    # calculate monthly climatology
    ds=monthly_climatology(ds)

    ## update new dictionary with processed data
    cmip6_pr[short_key]=ds

# delete low res models; pop() tolerates keys listed twice or already removed
for model in models_to_del:
    cmip6_pr.pop(model, None)
# also delete this one because it is (presumably) not on a regular grid;
# pop() avoids a KeyError when the model is absent or was already removed above
cmip6_pr.pop('MPI-M.ICON-ESM-LR', None)

In [213]:
## Regrid every precipitation climatology onto one common grid (stored per model)

# initialize: maps 'institution.source' -> regridded dataset
cmip6_pr_same_grid=dict()

# Take the target grid from a single reference model
# (author's note: chosen as the lowest-resolution model — TODO confirm)
lats = cmip6_pr['NASA-GISS.GISS-E2-1-G'].lat.values
lons = cmip6_pr['NASA-GISS.GISS-E2-1-G'].lon.values

# Interpolate each model onto the reference latitudes/longitudes.
# Assumes all models use a 0:360 longitude convention, so longitudes are not
# converted first — TODO confirm.
for key,ds in cmip6_pr.items():
    ds_lr=ds.interp(lat=lats, lon=lons)
    # bounds coordinates are removed after interpolation
    ds_lr_clean=drop_all_bounds(ds_lr)
    cmip6_pr_same_grid[key]=ds_lr_clean

In [237]:
## Regrid every model and stack the results along a new 'source_id' dimension

# Collect the regridded datasets first and concatenate once at the end;
# concatenating inside the loop re-copies the growing dataset on every pass.
pr_regridded=[]

for key, ds in cmip6_pr.items():
    # interp to standard grid (`lats`/`lons` are the reference-model coordinates)
    ds_lr=ds.interp(lat=lats, lon=lons)
    ds_lr_clean=drop_all_bounds(ds_lr)
    # Collapse length-1 dimensions and discard their coordinates. This is meant to
    # remove member_id / dcpp_init_year (a single member is requested in the query;
    # presumably dcpp_init_year is length 1 too — TODO confirm). Calling drop_dims on
    # those dimensions instead would also delete every variable defined on them.
    ds_lr_clean=ds_lr_clean.squeeze(drop=True)
    pr_regridded.append(ds_lr_clean)

cmip6_pr_same_grid=xr.concat(pr_regridded, dim='source_id', coords='minimal')

# clean: remove any leftover variables defined on the bounds dimension;
# errors='ignore' keeps this a no-op when no such dimension remains
cmip6_pr_same_grid=cmip6_pr_same_grid.drop_dims('bnds', errors='ignore')

In [None]:
# Write the stacked, regridded precipitation climatology to a NetCDF file
# (relative path, so it lands in the current working directory)
cmip6_pr_same_grid.to_netcdf('cmip6.pr.climo.nc')

### SST

In [25]:
## Search the catalog for monthly historical sea-surface temperature

# Facet values that every selected catalog entry must match.
# grid_label='gr' restricts the search to output on a regridded grid, which
# avoids dealing with the models' native ocean grids.
query = {
    'activity_id': 'CMIP',
    'experiment_id': 'historical',
    'table_id': 'Omon',
    'variable_id': 'tos',
    'grid_label': 'gr',
    'member_id': 'r1i1p1f1',
}

# Narrow the catalog down to the entries that satisfy the query
subset = cmip6.search(require_all_on=["source_id"], **query)

# For the full (verbose) listing, display `subset.df` instead.
# Compact listing: number of distinct experiments/variables/tables per model
(
    subset.df
    .groupby("source_id")[["experiment_id", "variable_id", "table_id"]]
    .nunique()
)

Unnamed: 0_level_0,experiment_id,variable_id,table_id
source_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CESM2,1,1,1
CESM2-FV2,1,1,1
CESM2-WACCM,1,1,1
CESM2-WACCM-FV2,1,1,1
E3SM-1-0,1,1,1
E3SM-1-1,1,1,1
E3SM-1-1-ECA,1,1,1
GFDL-CM4,1,1,1
GFDL-ESM4,1,1,1
KACE-1-0-G,1,1,1


In [26]:
## Load every dataset matched by the search into a dictionary
# Options intended for opening the stores.
# NOTE(review): the dict is unpacked into individual keyword arguments; confirm the
# installed intake-esm version actually applies them when passed this way.
# `decode_times: False` in particular looks at odds with the later groupby on
# 'time.month' in the climatology helpers — TODO confirm.
zarr_kwargs={'consolidated': True,
             'decode_times': False}
subset_dict = subset.to_dataset_dict(**zarr_kwargs) 
# Uncomment to inspect the dictionary keys:
#list(subset_dict.keys())


--> The keys in the returned dictionary of datasets are constructed as follows:
	'activity_id.institution_id.source_id.experiment_id.table_id.grid_label'


In [27]:
## Build a dictionary of monthly SST climatologies, keyed by 'institution.source'

# initialize
cmip6_sst=dict()
models_to_del=[]
# models with fewer latitude points than this (coarser than ~2° spacing) are discarded
nlat_threshold=90

for key, raw_ds in subset_dict.items():
    ## create shortened key name that only includes institution and source id
    # (assumes the 6-part key layout reported when the dictionary was built)
    _, inst_id, src_id, _, _, _ = key.split(sep='.')
    short_key=inst_id + '.' + src_id

    ## clean up dataset info
    # Standardize coordinate names on the dataset that is actually kept.
    # Previously the renaming was applied to a temporary `.tos[0]` selection that
    # was then overwritten, so it never reached the stored data.
    ds=match_lat_lon_names(raw_ds)
    # remember low-resolution models so they can be removed after the loop
    if np.size(ds.lat)<nlat_threshold:
        models_to_del.append(short_key)
    # add a source_id coordinate
    ds=add_source_id_coord(ds)

    ## process data
    # Optionally trim the time range:
    #ds=ds.sel(time=slice('1948','2014'))
    # calculate monthly climatologies
    ds=monthly_climatology(ds)

    ## update new dictionary with processed data
    cmip6_sst[short_key]=ds

# delete low res models; pop() tolerates keys listed twice or already removed
for model in models_to_del:
    cmip6_sst.pop(model, None)

In [40]:
## Regrid every SST climatology onto one common grid (stored per model)

# initialize: maps 'institution.source' -> regridded dataset
cmip6_sst_same_grid=dict()

# Take the target grid from a single reference model
# (author's note: a standard 1°x1° ocean grid — TODO confirm)
lats = cmip6_sst['E3SM-Project.E3SM-1-0'].lat.values
lons = cmip6_sst['E3SM-Project.E3SM-1-0'].lon.values

# Interpolate each model onto the reference latitudes/longitudes.
# Assumes all models use a 0:360 longitude convention, so longitudes are not
# converted first — TODO confirm.
for key,ds in cmip6_sst.items():
    ds_lr=ds.interp(lat=lats, lon=lons)
    # bounds coordinates are removed after interpolation
    ds_lr_clean=drop_all_bounds(ds_lr)
    cmip6_sst_same_grid[key]=ds_lr_clean

In [44]:
## Regrid every model and stack the results along a new 'source_id' dimension

# Take the target grid from a single reference model
# (author's note: a standard 1°x1° ocean grid — TODO confirm)
lats = cmip6_sst['E3SM-Project.E3SM-1-0'].lat.values
lons = cmip6_sst['E3SM-Project.E3SM-1-0'].lon.values

# Collect the regridded datasets first and concatenate once at the end;
# concatenating inside the loop re-copies the growing dataset on every pass.
sst_regridded=[]

for key, ds in cmip6_sst.items():
    # interp to standard grid
    ds_lr=ds.interp(lat=lats, lon=lons)
    ds_lr_clean=drop_all_bounds(ds_lr)
    # collapse length-1 dimensions so every model has the same layout
    ds_lr_clean=ds_lr_clean.squeeze()
    sst_regridded.append(ds_lr_clean)

cmip6_sst_same_grid=xr.concat(sst_regridded, dim='source_id', coords='minimal')

In [None]:
# Write the stacked, regridded SST climatology to a NetCDF file
# (relative path, so it lands in the current working directory)
cmip6_sst_same_grid.to_netcdf('cmip6.sst.climo.nc')