In [1]:
# export PYTHONPATH="${PYTHONPATH}:/uio/kant/geo-geofag-u1/franzihe/Documents/Python/globalsnow/CloudSat_ERA5_CMIP6_analysis/utils/"



# Example with CMIP6 models (100 - 500 km)


# Table of Contents
<ul>
<li><a href="#introduction">1. Introduction</a></li>
<li><a href="#data_wrangling">2. Data Wrangling</a></li>
<li><a href="#exploratory">3. Exploratory Data Analysis</a></li>
<li><a href="#conclusion">4. Conclusion</a></li>
<li><a href="#references">5. References</a></li>
</ul>



# 1. Introduction <a id='introduction'></a>
Cloud feedbacks are a major contributor to the spread of climate sensitivity in global climate models (GCMs) [Zelinka et al. (2020)](https://doi-org.ezproxy.uio.no/10.1029/2019GL085782). Among the most poorly understood cloud feedbacks is the one associated with the cloud phase, which is expected to be modified with climate change [Bjordal et al. (2020)](https://doi-org.ezproxy.uio.no/10.1038/s41561-020-00649-1). Cloud phase bias, in addition, has significant implications for the simulation of radiative properties and glacier and ice sheet mass balances in climate models. 

In this context, this work aims to expand our knowledge on how the representation of the cloud phase affects snow formation in GCMs. Better understanding this aspect is necessary to develop climate models further and improve future climate predictions. 

* Retrieve CMIP6 data through [ESGF](https://esgf-node.llnl.gov/search/cmip6/)
* Convert hybrid sigma-pressure coordinates to the isobaric pressure levels of the European Centre for Medium-Range Weather Forecast Re-Analysis 5 (ERA5) with [GeoCAT-comp](https://geocat-comp.readthedocs.io/en/latest/index.html)
* Regrid the CMIP6 variables to the same horizontal resolution with [`xesmf`](https://xesmf.readthedocs.io/en/latest/)
* Calculate an ensemble mean of all used models
* Calculate and plot the seasonal mean of the ensemble mean

**Questions**
* How do the cloud phase and snowfall vary between 2006 and 2009?

> **_NOTE:_** We answer questions related to the comparison of CMIP models to ERA5 in another [Jupyter Notebook](../CMIP6_ERA5_CloudSat/plt_seasonal_mean.ipynb).

# 2. Data Wrangling <a id='data_wrangling'></a>

This study will compare surface snowfall, ice, and liquid water content from the Coupled Model Intercomparison Project Phase 6 ([CMIP6](https://esgf-node.llnl.gov/projects/cmip6/)) climate models to the European Centre for Medium-Range Weather Forecast Re-Analysis 5 ([ERA5](https://www.ecmwf.int/en/forecasts/datasets/reanalysis-datasets/era5)) data from **2006 to 2009**. We conduct statistical analysis at the annual and seasonal timescales to determine the biases in cloud phase and precipitation (liquid and solid) in the CMIP6 models and the potential connection between them. 

- Time period: 2006 to 2009
- horizontal resolution: depending on model
- time resolution: daily mean atmospheric data (CFday, day)
- Variables:
  
| shortname     |             Long name                   |      Units    |  levels |
| ------------- |:---------------------------------------:| -------------:|--------:|
|  prsn         |    Snowfall Flux                        | [kg m-2 s-1]  | surface |
| clw           |    Mass Fraction of Cloud Liquid Water  |  [kg kg-1]    |    ml   | 
|               |                                         | to calculate lwp use integral clw -dp/dg | |
| tas           |    Near-Surface Air Temperature         |   [K]         | surface |
| clivi         |    Ice Water Path                       | [kg m-2]      |         |
| lwp           |    Liquid Water Path                    | [kg m-2]      |         |

- CMIP6 models:

| Institution                                            |     Model name    | Reference                                                     |
| ------------------------------------------------------ |:-----------------:|--------------------------------------------------------------:|
| [MIROC]() | MIROC6           | [Tatebe et al. (2019)]() |
| [NCAR]()  | CESM2            | [Danabasoglu et al. (2020)]()  |
| [CCCma]() | CanESM5          | [Swart et al. (2019)]()     |
| [AWI]()   | AWI-ESM-1-1-LR   | []() |
| []()      | MPI-ESM1-2-LR    | []() |
| [MOHC]()  | UKESM1-0-LL      | []() |
| [MOHC]()  | HadGEM3-GC31-LL  | [Roberts et al. (2019)]() |
| [CNRM-CERFACS]() | CNRM-CM6-1 | [Voldoire et al. (2019)]() |
| [CNRM-CERFACS]() | CNRM-ESM2-1| [Seferian et al. (2019)]() |
| [IPSL]() | IPSL-CM6A-LR | [Boucher et al. (2020)]() |
| [IPSL]() | IPSL-CM5A2-INCA | []()|

## Organize my data

- Define a prefix for my project (you may need to adjust it for your own usage on your infrastructure).
    - input folder where all the data used as input to my Jupyter Notebook is stored (and eventually shared)
    - output folder where all the results to keep are stored
    - tool folder where all the tools (utility modules) are stored

`/input/cmip6_hist/daily_means`.


In [2]:
import os
import pathlib
import sys
import socket
hostname = socket.gethostname()

# `Path(hostname)` is a relative, single-component path, so its parent is "."
# and `.absolute()` resolves to the current working directory, i.e. the folder
# this notebook is executed from.
abs_path = str(pathlib.Path(hostname).parent.absolute())
# Cut the last two path components off the working directory (the trailing "/"
# is kept); the result is the directory two levels above the notebook folder.
WORKDIR = abs_path[:- (len(abs_path.split('/')[-2] + abs_path.split('/')[-1])+1)]


# Machine-specific root of the data folders.
if "mimi" in hostname:
    print(hostname)
    DATA_DIR = "/scratch/franzihe/"
elif "glefsekaldt" in hostname: 
    DATA_DIR = "/home/franzihe/Data/"
else:
    # Without this branch DATA_DIR would stay undefined and the next statement
    # would fail with an uninformative NameError on any other machine.
    raise RuntimeError(
        f"No data directory configured for host '{hostname}'. "
        "Add a branch that sets DATA_DIR for this machine."
    )

INPUT_DATA_DIR = os.path.join(DATA_DIR, 'input')
OUTPUT_DATA_DIR = os.path.join(DATA_DIR, 'output')
UTILS_DIR = os.path.join(WORKDIR, 'utils/')

# Make the project helper modules in UTILS_DIR importable.
sys.path.append(UTILS_DIR)


mimi.uio.no


## Import python packages
- `Python` environment requirements: file [requirements_globalsnow.txt](../../requirements_globalsnow.txt) 
- load `python` packages from [imports.py](../../utils/imports.py)
- load `functions` from [functions.py](../../utils/functions.py)


In [3]:
# suppress warnings
import warnings
warnings.filterwarnings('ignore') # don't output warnings

# import packages
from imports import (xr, intake, cftime, xe, glob, np, cm, pd, fct,ccrs, cy, plt, da, gc, datetime, LogNorm, distributed,)# dask)
xr.set_options(display_style="html")

<xarray.core.options.set_options at 0x7fddcc379f60>

In [4]:
# reload imports
%load_ext autoreload
%autoreload 2

In [5]:
# Create a client for distributed computing with dask
# client = dask.distributed.Client()

## Open CMIP6 variables
Get the data required for the analysis. Beforehand, we downloaded the daily averaged data on single levels and model levels via [ESGF](https://esgf-node.llnl.gov/search/cmip6/).

In [6]:
# Root folder of the downloaded CMIP6 historical data.
cmip_in = os.path.join(INPUT_DATA_DIR, 'cmip6_hist/')

# Create the two working folders independently of each other. With a single
# try/except around two os.mkdir calls, an already existing 'common_grid'
# folder would prevent 'single_model' from ever being created, and any other
# OSError (e.g. missing permissions) would be silently ignored.
for subfolder in ('common_grid', 'single_model'):
    os.makedirs(os.path.join(cmip_in, subfolder), exist_ok=True)



In [7]:
# CMIP6 variable short names handled by this notebook.
# `areacella` is the time-invariant cell area; `search_data` opens it
# separately from the daily variables.
variable_id = [
    'clw',
    'cli',
    'clivi',
    'tas',
    'prsn',
    'pr',
    'pfull',
    'phalf',
    'areacella',
]

At the moment we have downloaded the end of the historical simulations for CMIP6 models. We define start and end year to ensure to only extract the 4-year period between 2006 and 2009.

$\rightarrow$ Define a start and end year

We will load all available models into one dictionary, which includes an xarray dataset with `xarray.open_mfdataset(file)` and select the time range [by name](https://xarray.pydata.org/en/stable/user-guide/indexing.html).

In [8]:
# source_id: models to process.
# NOTE(review): the "area" tag presumably marks models that ship an areacella
# file (the run log reports it missing for the two untagged models) - confirm.
list_models = [
    'MIROC6',            # area
    # 'CESM2',           # area; excluded: this model has snowfall only over land
    'CanESM5',           # area
    'AWI-ESM-1-1-LR',    # area
    'MPI-ESM1-2-LR',     # area
    'UKESM1-0-LL',
    'HadGEM3-GC31-LL',
    'CNRM-CM6-1',        # area
    'CNRM-ESM2-1',       # area
    'IPSL-CM6A-LR',      # area
    'IPSL-CM5A2-INCA',   # area
]

# experiment(s) to read; only the first entry is used when building file names
experiment_id = ['historical']

# time resolution (CMIP6 table); only the first entry is used as well
t_res = ['day']

## Search corresponding data
Get the data required for the analysis. Define variables, models, experiment, and time resolution as defined in <a href="#data_wrangling">2. Data Wrangling</a>.

In [9]:
def open_ds_var_id(files, year_range, var_id, model):
    """Open all files of one variable and keep only the requested years.

    Parameters
    ----------
    files : list
        Paths passed on to ``xr.open_mfdataset``.
    year_range : iterable of int
        Years to keep; matched against ``time.dt.year`` with ``isin``.
    var_id : str
        Short name of the variable the files are expected to contain.
    model : str
        Model name, only used in the printed messages.

    Returns
    -------
    xarray.Dataset
        The time-subset, squeezed dataset. An empty dataset is returned (and a
        message printed) when ``files`` is empty or when ``var_id`` is not one
        of the data variables of the opened files.
    """
    if len(files) == 0:
        print(f'no files found for {var_id} in {model}')
        return xr.Dataset()

    # Variables that are left out when opening the files of a given variable.
    skipped_on_open = {
        'pr': ['prsn'],
        'ta': ['tas', 'height'],
        'phalf': ['b', 'orog', 'b_bnds', 'ps', 'ap', 'ap_bnds'],
    }
    open_options = {}
    if var_id in skipped_on_open:
        open_options['drop_variables'] = skipped_on_open[var_id]
    opened = xr.open_mfdataset(files, **open_options)

    if var_id == 'phalf':
        # give the half-level axis its own name so that it cannot be confused
        # with the `lev` axis of the other variables
        opened = opened.rename({'lev': 'half_lev', 'lev_bnds': 'half_lev_bnds'})

    # select only years needed for analysis
    wanted_times = opened['time'].dt.year.isin(year_range)
    subset = opened.sel(time=wanted_times).squeeze()

    if var_id in list(subset.keys()):
        return subset

    print(f'{var_id} do not exists in {model}')
    return xr.Dataset()



We can call dask.delayed on our functions to make them lazy. Rather than compute their results immediately, they record what we want to compute as a task into a graph that we’ll run later on parallel hardware.

In [10]:

    
def search_data(cmip_in, t_res,model,experiment_id,variable_id, year_range, variant_label=None):
    """Collect all variables of one model for the given years in one dataset.

    Parameters
    ----------
    cmip_in : str
        Root folder of the CMIP6 data; files are searched in
        ``<cmip_in>/single_model/<model>/``.
    t_res : list of str
        Time resolution; only ``t_res[0]`` is used in the file pattern.
    model : str
        Model (source_id) name.
    experiment_id : list of str
        Experiment; only ``experiment_id[0]`` is used in the file pattern.
    variable_id : list of str
        Variable short names. ``'areacella'`` is opened separately from an
        ``areacella_fx_*`` file and is therefore skipped in the main search,
        wherever it appears in the list.
    year_range : iterable of int
        Years to keep (forwarded to ``open_ds_var_id``).
    variant_label : str, optional
        Ensemble member; only applied to the file pattern of 'CNRM-CM6-1'.

    Returns
    -------
    xarray.Dataset
        Combined dataset with longitudes shifted to [-180, 180), sorted by
        ``lon`` and ``time`` and without the ``height`` coordinate.
    """
    # areacella is a fixed field and is merged in further below
    daily_vars = [var for var in variable_id if var != 'areacella']

    # Only for CNRM-CM6-1 is the search restricted to one ensemble member.
    member = f"_{variant_label}" if model == 'CNRM-CM6-1' else ""

    # Open each variable on its own; missing variables give empty datasets.
    opened = {}
    for var in daily_vars:
        pattern = os.path.join(
            cmip_in, f"single_model/{model}/{var}_*{t_res[0]}_{model}_{experiment_id[0]}{member}*"
        )
        opened[var] = open_ds_var_id(glob(pattern), year_range, var, model)

    # Combine datasets by coordinates, leaving out the empty placeholders
    dset = xr.combine_by_coords(
        [ds for ds in opened.values() if len(ds.dims) != 0],
        combine_attrs='drop_conflicts',
    )

    if 'pfull' in opened and len(opened['pfull']) != 0:
        # sort to start at the top of the atmosphere; each vertical axis is
        # only reversed if it is actually present (phalf may be missing even
        # though pfull was found)
        for vertical_dim in ('lev', 'half_lev'):
            if vertical_dim in dset.dims:
                dset = dset.reindex({vertical_dim: dset[vertical_dim][::-1]})

    area_files = glob(f"{cmip_in}/single_model/{model}/areacella_fx_*{model}_{experiment_id[0]}*.nc")
    if area_files:
        ds_areacella = xr.open_dataset(
            area_files[0],
            drop_variables=["lat_bnds", "lon_bnds"],
        )
        dset = xr.merge([dset, ds_areacella])
    else:
        print(f'areacella does not exist in {model}')

    # shift longitude to be from -180 to 180
    dset = dset.assign_coords(lon=(((dset['lon'] + 180) % 360) - 180)).sortby('lon').sortby('time')
    # `Dataset.drop` is deprecated; `drop_vars(..., errors='ignore')` also
    # copes with datasets that carry no `height` coordinate.
    dset = dset.drop_vars('height', errors='ignore')

    return dset


In [11]:
# Load one example year for every model into a dictionary keyed by model name.
year = 2006
dset = {}
for model in list_models:
    # only CNRM-CM6-1 is read for one explicit ensemble member
    member = 'r1i1p1f2' if model == 'CNRM-CM6-1' else None
    dset[model] = search_data(
        cmip_in, t_res, model, experiment_id, variable_id,
        range(year, year+1), variant_label=member,
    )
                
# dset[model]#.attrs['references']

no files found for clivi in MIROC6
no files found for pfull in MIROC6
no files found for phalf in MIROC6
no files found for pfull in CanESM5
no files found for phalf in CanESM5
no files found for cli in AWI-ESM-1-1-LR
no files found for pfull in AWI-ESM-1-1-LR
no files found for phalf in AWI-ESM-1-1-LR
no files found for cli in MPI-ESM1-2-LR
no files found for pfull in MPI-ESM1-2-LR
no files found for phalf in MPI-ESM1-2-LR
no files found for cli in UKESM1-0-LL
areacella does not exist in UKESM1-0-LL
no files found for cli in HadGEM3-GC31-LL
areacella does not exist in HadGEM3-GC31-LL
no files found for cli in CNRM-CM6-1
no files found for pfull in CNRM-CM6-1
no files found for phalf in CNRM-CM6-1
no files found for cli in CNRM-ESM2-1
no files found for pfull in CNRM-ESM2-1
no files found for phalf in CNRM-ESM2-1
no files found for cli in IPSL-CM6A-LR
no files found for pfull in IPSL-CM6A-LR
no files found for phalf in IPSL-CM6A-LR
no files found for cli in IPSL-CM5A2-INCA
no files fou

## Assign attributes to the variables
 
We will assign the attributes to the variables as in ERA5 to make CMIP6 and ERA5 variables comparable.
 
* [`pr`](http://clipc-services.ceda.ac.uk/dreq/u/62f26742cf240c1b5169a5cd511196b6.html) and [`prsn`](http://clipc-services.ceda.ac.uk/dreq/u/051919eddec810e292c883205c944ceb.html) in **kg m-2 s-1** $\rightarrow$ Multiply by **3600** to get **kg m-2 h-1** (equivalent to **mm h-1**) $\rightarrow$ Multiply by **24** to get **mm day-1**. Note that the function below applies only the hourly conversion (factor 3600).
 

In [12]:
def assign_att(dset):
    """Convert `prsn` and `pr` from per-second to per-hour values and relabel them.

    Both variables, when present, are multiplied by 3600 and given a fresh set
    of attributes (the multiplication result does not keep the old ones being
    relied upon here). All other variables are left untouched.

    Parameters
    ----------
    dset : mapping of variable name -> data (an ``xarray.Dataset`` in this notebook)

    Returns
    -------
    The same ``dset`` object, modified in place.
    """
    now = datetime.utcnow()
    stamp = now.strftime("%d/%m/%Y %H:%M:%S")
    history = "{}Z altered by F. Hellmuth: Converted units from 'kg m-2 s-1' to 'kg m-2 h-1'.".format(stamp)

    # (standard_name, comment) per variable; all other attributes are shared
    described = {
        'prsn': ('snowfall_flux',
                 'At surface; includes precipitation of all forms of water in the solid phase'),
        'pr': ('precipitation_flux',
               'includes both liquid and solid phases'),
    }

    # collect the names first so the container is not modified while iterating
    targets = [name for name in dset.keys() if name in described]
    for var_id in targets:
        standard_name, comment = described[var_id]
        dset[var_id] = dset[var_id]*3600
        dset[var_id] = dset[var_id].assign_attrs({
            'standard_name': standard_name,
            'comment': comment,
            'units': 'kg m-2 h-1',
            'original_units': 'kg m-2 s-1',
            'history': history,
            'cell_methods': 'area: time: mean',
            'cell_measures': 'area: areacella',
        })

    return dset

In [13]:
# for model in list_models:
#     dset[model] = assign_att(dset[model])


## Interpolate from CMIP6 hybrid sigma-pressure levels to isobaric pressure levels

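The vertical coordinate of the CMIP6 models is given on hybrid sigma-pressure levels. Hence the pressure at each model level in the `xarray` datasets is calculated with the formula:
$$ P(i,j,k) = hyam(k) \, p0 + hybm(k) \, ps(i,j)$$
where `hyam` and `hybm` are the hybrid coefficients (named `ap` or `a`, and `b`, in the model files), `p0` is the reference pressure and `ps` the surface pressure.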

In [14]:
def interp_hybrid_plev(dset, model):
    """Compute the pressure of the hybrid sigma-pressure levels of a model.

    Despite its name the function does not interpolate: it harmonises a few
    variable/dimension names and, when the hybrid coefficients are available,
    adds the level pressure ``plev`` and the layer-interface pressure
    ``plev_bnds`` (``ap * p0 + b * ps``, or ``ap + b * ps`` when the dataset has
    no ``p0``).

    Parameters
    ----------
    dset : xarray.Dataset
        Dataset of one model; ``plev``/``plev_bnds`` are added to it in place.
    model : str
        Model name; 'IPSL-CM5A2-INCA' gets its ``lev`` dimension renamed to
        ``plev``.

    Returns
    -------
    xarray.Dataset
        Dataset with a fixed dimension order and without the hybrid
        coefficients, surface pressure, ``lev_bnds`` and ``orog``.
    """
    # Rename datasets with a different naming convention
    if 'a' in dset and 'a_bnds' in dset:
        dset = dset.rename({'a':'ap', 'a_bnds': 'ap_bnds'})
    if 'nbnd' in dset.dims:
        dset = dset.rename({'nbnd':'bnds'})
    if 'presnivs' in dset.dims:
        dset = dset.rename({'presnivs':'plev'})

    if model == 'IPSL-CM5A2-INCA' and 'lev' in dset.dims:
        dset = dset.rename({'lev':'plev'})

    if 'ap' in dset and 'ps' in dset:
        has_p0 = 'p0' in dset

        def _pressure(a_name, b_name):
            # `ap` is scaled with the reference pressure only if one is given
            a_term = dset[a_name]*dset['p0'] if has_p0 else dset[a_name]
            return a_term + dset[b_name]*dset['ps']

        # The water-content variable decides whether level pressures are
        # needed. `clw` is preferred; `cli` is used as a fallback so that a
        # dataset without `clw` no longer fails with a KeyError. (The former
        # loop over both variables never reached `cli`: the coefficients were
        # already dropped after the first pass.)
        content = next((name for name in ('clw', 'cli') if name in dset), None)
        if (content is not None
                and 'lev' in dset[content].coords
                and 'lev' in dset['ap'].coords
                and 'lev' in dset['b'].coords):
            # pressure at the model levels
            dset['plev'] = _pressure('ap', 'b')

        # pressure at the layer interfaces
        dset['plev_bnds'] = _pressure('ap_bnds', 'b_bnds')

    # Bring all variables into the same dimension order; dimensions a model
    # does not have are skipped.
    dset = dset.transpose('time', 'lat', 'lon', 'plev', 'lev', 'bnds','axis_nbounds', 'half_lev', 'klevp1', missing_dims="ignore" )

    # Remove unnecessary variables
    dset = dset.drop_vars(['ap', 'b', 'ps', 'p0', 'ap_bnds', 'b_bnds', 'lev_bnds', 'orog'], errors='ignore')

    return dset

In [15]:
# for model in list_models:
#     dset[model] = interp_hybrid_plev(dset[model], model)

## Calculate liquid water path from content

Once the pressure levels are calculated the daily average LWP (IWP) is calculated for each CMIP6 model.
\begin{equation}
        LWP = \rho_{air} \cdot \Delta clw \cdot \Delta Z 
\end{equation}

with hydrostatic equation

\begin{equation}
         \frac{\Delta p}{\Delta Z}  = -\rho_{air} \cdot g  
\end{equation}

\begin{equation}
         \leftrightarrow LWP = - \frac{\rho_{air}}{\rho_{air} g} \cdot \Delta clw \Delta p
\end{equation}

with $\Delta clw = clw(NLEV-k)$ and $\Delta p = p(NLEV-k + 1/2) - p(NLEV-k - 1/2)$ follows for the total liquid water path in the column:

\begin{equation}
         LWP = \sum_{k=0}^{NLEV+1} LWP(k) = -\frac{1}{g} \sum_{k=0}^{NLEV+1} clw(NLEV-k) \cdot [p(NLEV-k + 1/2) - p(NLEV-k - 1/2)]
\end{equation}



In [16]:
def _column_water_path(dset, dp, g, now, content, path, long_name, standard_name, title):
    """Sum ``-dp / g * dset[content]`` over `lev`, store it as ``dset[path]`` and drop ``content``."""
    layers = - dp / g * dset[content]
    column = np.sum(layers, axis=layers.get_axis_num('lev'), keepdims=True).squeeze()
    # The result is labelled with the coordinates and dimensions of `prsn`,
    # which therefore has to be present in the dataset.
    dset[path] = xr.DataArray(column, coords=dset['prsn'].coords, dims=dset['prsn'].dims)

    # start from the attributes of the source variable, then overwrite
    dset[path] = dset[path].assign_attrs(dset[content].attrs)
    dset[path] = dset[path].assign_attrs({
            'long_name': long_name,
            'units': 'kg m-2',
            'mipTable': '',
            'out_name': path,
            'standard_name': standard_name,
            'title': title,
            'variable_id': path,
            'original_units': 'kg/kg',
            'history': "{}Z altered by F. Hellmuth: Interpolate data from hybrid-sigma levels to isobaric levels with P=a*p0 + b*psfc. Calculate {} with hydrostatic equation.".format(now.strftime("%d/%m/%Y %H:%M:%S"), path)
        })

    return dset.drop_vars([content])


def calc_water_path(dset,model, g=9.81, now=None):
    """Replace the water contents `clw` (and `cli`) by their column integrals.

    The layer thickness ``dp`` is taken from ``plev_bnds`` when the dataset
    contains ``plev``, otherwise from ``phalf``. If neither is available the
    dataset is returned unchanged.

    Parameters
    ----------
    dset : xarray.Dataset
        Dataset of one model; it needs `clw` and `prsn` whenever a layer
        thickness can be computed.
    model : str
        Model name; the two IPSL models get special handling of their
        `klevp1` dimension.
    g : float, optional
        Divisor applied to ``dp`` (default 9.81).
    now : datetime, optional
        Time stamp written to the ``history`` attribute. Defaults to the UTC
        time of the call. (A ``datetime.utcnow()`` default in the signature
        would be evaluated only once, when the function is defined.)

    Returns
    -------
    xarray.Dataset
        Dataset with `lwp` instead of `clw` and, if `cli` was present, `clivi`
        instead of `cli`.
    """
    if now is None:
        now = datetime.utcnow()

    if 'plev' in dset:
        dp = (dset['plev_bnds'].diff(dim='bnds')).squeeze()
        if model in ('IPSL-CM6A-LR', 'IPSL-CM5A2-INCA'):
            # relabel the interface axis with the `plev` values of clw and
            # rename both axes to `lev` so that dp and clw line up
            dp = dp.sel(klevp1=slice(0,len(dset['klevp1'])-1)).assign_coords({'klevp1': dset['clw'].plev.data})
            dp = dp.rename({'klevp1':'lev'})
            dset = dset.rename({'plev':'lev'})
    elif 'phalf' in dset:
        dp = (dset['phalf'].diff(dim='half_lev')).squeeze()
        dp = dp.assign_coords({'half_lev': dset['clw'].lev.data})
        dp = dp.rename({'half_lev':'lev'})
    else:
        # no pressure information: nothing can be integrated
        return dset

    dset = _column_water_path(
        dset, dp, g, now,
        content='clw', path='lwp',
        long_name='Daily average Liquid Water Path',
        standard_name='atmosphere_mass_content_of_cloud_liquid_water',
        title='Liquid Water Path',
    )

    if 'cli' in dset:
        # NOTE(review): the CF standard name table appears to use
        # 'atmosphere_mass_content_of_cloud_ice' - confirm before changing.
        dset = _column_water_path(
            dset, dp, g, now,
            content='cli', path='clivi',
            long_name='Daily average Ice Water Path',
            standard_name='atmosphere_mass_content_of_cloud_ice_water',
            title='Ice Water Path',
        )

    return dset

In [17]:
# for model in list_models:
#     dset[model] = calc_water_path(dse[model],model, g=9.81, now = datetime.utcnow())

## Regrid CMIP6 data to IPSL-CM6A-LR and IPSL-CM5A2-INCA grid <a id='regrid_hz'></a>

We want to conduct statistical analysis at the annual and seasonal timescales to determine the biases in cloud phase and precipitation (liquid and solid) in the CMIP6 models. At the moment we have all historical data from the CMIP6 models. For this, we will have to extract the 4-year period between 2006 and 2009.

$\rightarrow$ Define a start and end year.

The CMIP6 high resolution models have approximately a nominal resolution of 250km. But not all have identical grid spacing. Hence we will make use of the python package `xesmf` and the documentation on [decreasing resolution](https://xesmf.readthedocs.io/en/latest/notebooks/Compare_algorithms.html#Decreasing-resolution), [Limitations and warnings](https://xesmf.readthedocs.io/en/latest/notebooks/Masking.html?highlight=conservative#Limitations-and-warnings). 

IPSL-CM6A-LR will be the reference grid for models with 250km resolution.

$\rightarrow$ Define IPSL-CM6A-LR as the reference grid `ds_out`.

Create a new Python dictionary (`dset_gridded`) with the regridded CMIP6 `xarray` datasets between 2006 and 2009. Save each regridded model to a `netcdf` file, locally. 

> **_NOTE:_** This step may take several minutes!

In [18]:
def regrid_and_save(dset, ds_out, grid_model, model, cmip_in, year):
    """Regrid one model onto a reference grid and write the polar bands to disk.

    Parameters
    ----------
    dset : xarray.Dataset
        Data of `model` on its native grid; ``dset.attrs['variant_label']`` is
        used in the file names.
    ds_out : xarray.Dataset
        Target grid; its `areacella` is used when the regridded data has none.
    grid_model : str
        Name of the model that provides the target grid (file name only).
    model : str
        Name of the model being regridded.
    cmip_in : str
        Root folder; files go to ``<cmip_in>/common_grid/<model>/``.
    year : int
        Year of the data (file name only).

    Notes
    -----
    The experiment name is read from the notebook-level ``experiment_id``.
    Two files are written per variable: latitudes 40..90 and -90..-40.
    """
    # `errors='ignore'` also covers datasets that carry only one (or none) of
    # the two bounds variables; dropping a missing name would raise otherwise.
    dset = dset.drop_vars(['lat_bnds', 'lon_bnds'], errors='ignore')

    regridder = xe.Regridder(ds_in=dset, ds_out=ds_out, method="conservative")
    dset_regrid = regridder(dset)
    if 'areacella' not in dset_regrid.data_vars:
        dset_regrid['areacella'] = ds_out['areacella']

    # full list: ['clivi', 'lwp', 'pr', 'prsn', 'tas', 'areacella']
    var_ids = ['areacella']

    # make sure the per-model output folder exists before writing
    out_dir = f'{cmip_in}/common_grid/{model}'
    os.makedirs(out_dir, exist_ok=True)

    # file-name tag and latitude range of the two bands that are stored
    lat_bands = (('40_90', slice(40, 90)), ('-40_-90', slice(-90, -40)))

    for var_id in var_ids:
        if var_id not in dset_regrid.data_vars:
            print(f'{var_id} does not exist in {model}')
            continue

        print(f'Writing regrid files: var_id: {var_id}, year: {year}, model: {model}')
        variant_label = dset.attrs['variant_label']
        for lat_tag, lat_range in lat_bands:
            file_grid = f'{out_dir}/{var_id}_{grid_model}_{model}_{lat_tag}_{experiment_id[0]}_{variant_label}_{year}0101-{year}1231.nc'
            dset_regrid[var_id].sel(lat=lat_range).to_netcdf(file_grid)


In [19]:

# # Create a memory object with a cache directory
# cache_dir = '/scratch/franzihe/cache_dir/'
# memory = Memory(location=cache_dir)

# # Decorate the search_data function with the cache memory
# @memory.cache
# def search_data_cached(t_res, model, experiment_id, variable_id, year_range):
#     return search_data(t_res, model, experiment_id, variable_id, year_range)

def process_year(year, list_models,cmip_in, t_res, experiment_id,variable_id):
    """Run the complete processing chain for every model and one year.

    For each model the data is read (`search_data`), converted
    (`assign_att`, `interp_hybrid_plev`, `calc_water_path`), written on its
    native grid for the latitude bands 40..90 and -90..-40, and finally
    regridded onto the grids of IPSL-CM6A-LR and IPSL-CM5A2-INCA
    (`regrid_and_save`).

    Parameters
    ----------
    year : int
        Year to process.
    list_models : list of str
        Models to process.
    cmip_in : str
        Root folder of the CMIP6 data.
    t_res, experiment_id, variable_id : list of str
        Forwarded to `search_data`; ``experiment_id[0]`` is also used in the
        file names.
    """
    grid_models = ('IPSL-CM6A-LR', 'IPSL-CM5A2-INCA')
    # Models that are not regridded onto the IPSL-CM6A-LR grid.
    # NOTE(review): the reason is not visible here - presumably their native
    # grid is coarser than that target; confirm.
    not_on_cm6a_grid = ('CanESM5', 'IPSL-CM5A2-INCA')

    # The target grid only depends on `grid_model`, so it is opened once per
    # call (on first use) instead of once per model.
    target_grids = {}

    def _target_grid(grid_model):
        if grid_model not in target_grids:
            ds_out = xr.open_dataset(glob(f"{cmip_in}/single_model/{grid_model}/areacella_*{grid_model}_{experiment_id[0]}*.nc")[0],)
            # Shift longitude to be from -180 to 180
            target_grids[grid_model] = ds_out.assign_coords(lon=(((ds_out['lon'] + 180) % 360) - 180)).sortby('lon')
        return target_grids[grid_model]

    # file-name tag and latitude range of the two bands that are stored
    lat_bands = (('40_90', slice(40, 90)), ('-40_-90', slice(-90, -40)))

    for model in list_models:
        # only CNRM-CM6-1 is read for one explicit ensemble member
        member = 'r1i1p1f2' if model == 'CNRM-CM6-1' else None
        dset = search_data(cmip_in, t_res, model, experiment_id, variable_id, range(year, year+1), variant_label=member)
        dset = assign_att(dset)
        dset = interp_hybrid_plev(dset, model)
        dset = calc_water_path(dset, model, g=9.81, now=datetime.utcnow())

        # full list: ['clivi', 'lwp', 'pr', 'prsn', 'tas', 'areacella']
        for var_id in ['areacella',]:
            if var_id not in dset.data_vars:
                print(f'{var_id} does not exist in {model}')
                continue

            print(f'Writing files: var_id: {var_id}, year: {year}, model: {model}')
            variant_label = dset.attrs['variant_label']
            for lat_tag, lat_range in lat_bands:
                band_file = f'{cmip_in}/single_model/{model}/{var_id}_{model}_{lat_tag}_{experiment_id[0]}_{variant_label}_{year}0101-{year}1231.nc'
                dset[var_id].sel(lat=lat_range).to_netcdf(band_file)

        for grid_model in grid_models:
            ds_out = _target_grid(grid_model)
            if grid_model == 'IPSL-CM6A-LR' and model in not_on_cm6a_grid:
                continue
            regrid_and_save(dset, ds_out, grid_model, model, cmip_in, year)




In [20]:
  
# Process the four analysis years (2006-2009) one after another.
for year in range(2006, 2010):
    process_year(year, list_models, cmip_in, t_res, experiment_id, variable_id)

no files found for clivi in MIROC6
no files found for pfull in MIROC6
no files found for phalf in MIROC6
Writing files: var_id: areacella, year: 2006, model: MIROC6
Writing regrid files: var_id: areacella, year: 2006, model: MIROC6
Writing regrid files: var_id: areacella, year: 2006, model: MIROC6
no files found for pfull in CanESM5
no files found for phalf in CanESM5
Writing files: var_id: areacella, year: 2006, model: CanESM5
Writing regrid files: var_id: areacella, year: 2006, model: CanESM5
no files found for cli in AWI-ESM-1-1-LR
no files found for pfull in AWI-ESM-1-1-LR
no files found for phalf in AWI-ESM-1-1-LR
Writing files: var_id: areacella, year: 2006, model: AWI-ESM-1-1-LR
Writing regrid files: var_id: areacella, year: 2006, model: AWI-ESM-1-1-LR
Writing regrid files: var_id: areacella, year: 2006, model: AWI-ESM-1-1-LR
no files found for cli in MPI-ESM1-2-LR
no files found for pfull in MPI-ESM1-2-LR
no files found for phalf in MPI-ESM1-2-LR
Writing files: var_id: areacell

In [73]:
# for year in [2006, 2007, 2008, 2009]:
#     for model in list_models:
#         for var_id in ['clivi', 'lwp', 'pr', 'prsn', 'tas', 'areacella']:
#             for grid_model in ['IPSL-CM6A-LR', 'IPSL-CM5A2-INCA']:
#                 file_list = sorted(glob(f'{cmip_in}/common_grid/{model}/{var_id}_{grid_model}_{model}_40_90*_{year}0101-{year+1}1231.nc'))
#                 if len(file_list) > 0:
#                     old_name = file_list[0]
#                     variant_label = old_name.split('_')[-2]
#                     print(old_name)
#                     new_name = f'{cmip_in}/common_grid/{model}/{var_id}_{grid_model}_{model}_40_90_{experiment_id[0]}_{variant_label}_{year}0101-{year}1231.nc'
#                     print(new_name)
#                     os.rename(old_name, new_name)
                    
#                 file_list = sorted(glob(f'{cmip_in}/common_grid/{model}/{var_id}_{grid_model}_{model}_-40_-90*_{year}0101-{year+1}1231.nc'))
#                 if len(file_list) > 0:
#                     old_name = file_list[0]
#                     variant_label = old_name.split('_')[-2]
#                     # print(old_name)
#                     new_name = f'{cmip_in}/common_grid/{model}/{var_id}_{grid_model}_{model}_-40_-90_{experiment_id[0]}_{variant_label}_{year}0101-{year}1231.nc'
#                     # print(new_name)
#                     os.rename(old_name, new_name)
        


In [81]:
# for year in [2006, 2007, 2008, 2009]:
#     for model in list_models:
#         for var_id in ['clivi', 'lwp', 'pr', 'prsn', 'tas', 'areacella']:
#             file_list = sorted(glob(f'{cmip_in}/single_model/{model}/{var_id}_{model}_40_90*_{year}0101-{year+1}1231.nc'))
#             if len(file_list) > 0:
#                 old_name = file_list[0]
#                 print(old_name)
#                 variant_label = old_name.split('_')[-2]
#                 new_name = f'{cmip_in}/single_model/{model}/{var_id}_{model}_40_90_{experiment_id[0]}_{variant_label}_{year}0101-{year}1231.nc'
#                 print(new_name)
#                 os.rename(old_name, new_name)
            
#             file_list = sorted(glob(f'{cmip_in}/single_model/{model}/{var_id}_{model}_-40_-90*_{year}0101-{year+1}1231.nc'))
#             if len(file_list) > 0:
#                 old_name = file_list[0]
#                 print(old_name)
#                 variant_label = old_name.split('_')[-2]
#                 new_name = f'{cmip_in}/single_model/{model}/{var_id}_{model}_-40_-90_{experiment_id[0]}_{variant_label}_{year}0101-{year}1231.nc'
#                 print(new_name)
#                 os.rename(old_name, new_name)


<img src="https://drive.google.com/uc?id=1zb0LHvipx8JOXLLrCxzYToJM7eNK4eaw"  height="100" />
<img src="https://reliance.rohub.org/static/media/Reliance-logo.433dc2e9.png"  height="100" />

<img src="https://www.uio.no/vrtx/decorating/resources/dist/src2/images/footer/uio-logo-en.svg"  height="100" />
<img src="https://erc.europa.eu/sites/default/files/logo_0.png"  height="100" />
