In [1]:
# export PYTHONPATH="${PYTHONPATH}:/uio/kant/geo-geofag-u1/franzihe/Documents/Python/globalsnow/CloudSat_ERA5_CMIP6_analysis/utils/"

# Analysis of CMIP6, ERA5, and CloudSat


# Table of Contents
<ul>
<li><a href="#introduction">1. Introduction</a></li>
<li><a href="#data_wrangling">2. Data Wrangling</a></li>
<li><a href="#exploratory">3. Exploratory Data Analysis</a></li>
<li><a href="#conclusion">4. Conclusion</a></li>
<li><a href="#references">5. References</a></li>
</ul>

# 1. Introduction <a id='introduction'></a>


**Questions**
* How is the cloud phase and snowfall 


> **_NOTE:_** .

# 2. Data Wrangling <a id='data_wrangling'></a>


## Organize my data

- Define a prefix for my project (you may need to adjust it for your own usage on your infrastructure).
    - input folder where all the data used as input to my Jupyter Notebook is stored (and eventually shared)
    - output folder where all the results to keep are stored
    - tool folder where all the tools

The ERA5 0.25deg data is located in the folder `\scratch\franzihe\`, CloudSat at ...



In [2]:
import os
import pathlib
import sys
import socket
hostname = socket.gethostname()

abs_path = str(pathlib.Path(hostname).parent.absolute())
WORKDIR = abs_path[:- (len(abs_path.split('/')[-2] + abs_path.split('/')[-1])+1)]


if "mimi" in hostname:
    print(hostname)
    DATA_DIR = "/mn/vann/franzihe/"
    # FIG_DIR = "/uio/kant/geo-geofag-u1/franzihe/Documents/Figures/ERA5/"
    FIG_DIR = "/uio/kant/geo-geofag-u1/franzihe/Documents/Python/globalsnow/CloudSat_ERA5_CMIP6_analysis/Figures/CS_ERA5_CMIP6/"
elif "glefsekaldt" in hostname: 
    DATA_DIR = "/home/franzihe/Data/"
    FIG_DIR = "/home/franzihe/Documents/Figures/ERA5/"

INPUT_DATA_DIR = os.path.join(DATA_DIR, 'input')
OUTPUT_DATA_DIR = os.path.join(DATA_DIR, 'output')
UTILS_DIR = os.path.join(WORKDIR, 'utils')

sys.path.append(UTILS_DIR)
# make figure directory
try:
    os.mkdir(FIG_DIR)
except OSError:
    pass

mimi.uio.no


## Import python packages
- `Python` environment requirements: file [requirements_globalsnow.txt](../../requirements_globalsnow.txt) 
- load `python` packages from [imports.py](../../utils/imports.py)
- load `functions` from [functions.py](../../utils/functions.py)


In [3]:
# supress warnings
import warnings
warnings.filterwarnings('ignore') # don't output warnings

# import packages
from imports import(xr, intake, ccrs, cy, plt, glob, cm, fct, np, da, LogNorm, pd)
xr.set_options(display_style='html')

<xarray.core.options.set_options at 0x7f2c854db730>

In [4]:
# reload imports
%load_ext autoreload
%autoreload 2

## Open variables
Get the data requried for the analysis. 



In [5]:
era_in = os.path.join(INPUT_DATA_DIR, 'ERA5')
cmip_in = os.path.join(INPUT_DATA_DIR, 'cmip6_hist')
dat_out = os.path.join(OUTPUT_DATA_DIR, 'CS_ERA5_CMIP6')

# make output data directory
try:
    os.mkdir(dat_out)
except OSError:
    pass

In [6]:
variable_id = ['tas', 'prsn', 'pr', 'lwp', 'clivi', 'areacella']

We will load all available models into one dictonary, which includes an xarray dataset with `xarray.open_mfdataset(file)` and select the time range [by name](https://xarray.pydata.org/en/stable/user-guide/indexing.html).

In [7]:
list_models = [
               'ERA5',
               'MIROC6', 
               'CanESM5', 
               'AWI-ESM-1-1-LR', 
               'MPI-ESM1-2-LR', 
               'UKESM1-0-LL', 
               'HadGEM3-GC31-LL',
               'CNRM-CM6-1',
               'CNRM-ESM2-1',
               'IPSL-CM6A-LR',
               'IPSL-CM5A2-INCA'
            ]



In [8]:
era_dict = {}

era_files = [sorted(glob(f'{era_in}/daily_means/40NS/daily_mean_40*ERA5*.nc'))] + \
        sorted(glob(f'{era_in}/common_grid/40NS/*ERA5_daily_mean*_IPSL-CM6A-LR*.nc')) + \
        sorted(glob(f'{era_in}/common_grid/40NS/*ERA5_daily_mean*_IPSL-CM5A2-INCA*.nc'))


for file in era_files:
    if 'IPSL-CM6A-LR' in file:
        res = 'era_250'
    elif 'IPSL-CM5A2-INCA' in file:
        res = 'era_500'
    else:
        res = 'era_30'
    # res = '250' if 'IPSL-CM6A-LR' in file else '500'
    era_dict[res] = xr.open_mfdataset(file)

# Remove leap day for ERA files
for res in era_dict.keys():
    era_dict[res] = era_dict[res].sel(time=~((era_dict[res].time.dt.month == 2) & (era_dict[res].time.dt.day == 29)))



In [12]:
era_dict['era_500'].time

In [None]:
cmip_dict = {}

for model in list_models[1:]:
    cmip_file_in = sorted(glob(f'{cmip_in}/single_model/{model}/*_{model}_*40*.nc'))
    cmip_dict[model] = xr.open_mfdataset(cmip_file_in, decode_times =True, use_cftime=True).rename_vars({'clivi': 'iwp'})
    
    cmip_dict[model]['twp'] = cmip_dict[model]['lwp'] + cmip_dict[model]['iwp']
    cmip_dict[model] = fct.to_ERA5_date(cmip_dict[model], model)


In [None]:
cmip_250 = {}
cmip_500 = {}
for model in list_models[1:]:
    # Read CMIP files
    cmip_files_250 = sorted(glob(f'{cmip_in}/common_grid/{model}/*_IPSL-CM6A-LR_{model}*40*.nc'))
    cmip_files_500 = sorted(glob(f'{cmip_in}/common_grid/{model}/*_IPSL-CM5A2-INCA_{model}*40*.nc'))
    if len(cmip_files_250) != 0:
        cmip_250[model] = xr.open_mfdataset(cmip_files_250, decode_times =True, use_cftime=True).rename_vars({'clivi': 'iwp'})
    cmip_500[model] = xr.open_mfdataset(cmip_files_500, decode_times =True, use_cftime=True).rename_vars({'clivi': 'iwp'})


# Calculate 'twp' variable and convert calendar
for cmip in [cmip_250, cmip_500]:
    for model in cmip.keys():
        cmip[model]['twp'] = cmip[model]['lwp'] + cmip[model]['iwp']
        cmip[model] = fct.to_ERA5_date(cmip[model], model)

_coord = list(cmip_250.keys())
_ds = list(cmip_250.values())
cmip_dict['cmip_250'] = xr.concat(objs=_ds, dim=_coord, coords='all').rename({'concat_dim':'model'})

_coord = list(cmip_500.keys())
_ds = list(cmip_500.values())
cmip_dict['cmip_500'] = xr.concat(objs=_ds, dim=_coord, coords='all').rename({'concat_dim':'model'})

## Statistics
For variables:
- Snowfall [prsn]
- Total column cloud liquid, supercooled liqid, and rain water [twp]
- Total column cloud ice, snow water [iwp]
- 2m-Temperature [tas]

1. Find where liquid water path is $\ge$ 5 g m-2 
2. Find where snowfall is $\ge$ 0.01mm h-1
3. Find where 2m-temperature $\le$ 0 $^o$ C 

In [None]:
# th_lcc = 0.005
# th_2t = 273.15
# th_frac_days = 0.1
# th_tp = 0.01
# th_sf = 0.01


In [None]:
def calc_days_season(ds):
    days_season = xr.DataArray(data = [xr.concat([ds.sel(time=fct.is_season(ds['time.month'], 1, 2)), 
                                                  ds.sel(time=fct.is_season(ds['time.month'],12,12))], dim='time').sizes['time'],
                                       ds.sel(time=fct.is_season(ds['time.month'], 6, 8)).sizes['time'],
                                       ds.sel(time=fct.is_season(ds['time.month'], 3, 5)).sizes['time'],
                                       ds.sel(time=fct.is_season(ds['time.month'], 9, 11)).sizes['time'],], 
                                dims={'season'}, 
                                coords={'season':['DJF', 'JJA', 'MAM', 'SON']})
    
    _days = []
    for month in np.arange(1,13):
        _days.append(ds.sel(time=fct.is_season(ds['time.month'], month, month)).sizes['time'])
        # print(month, )
    days_month = xr.DataArray(data= np.array(_days),
                            dims={'month'}, 
                            coords={'month':np.arange(1,13)} )
    
    return(days_season, days_month)

In [None]:
def find_lcc_sf(ds, lwp_threshold):
    days_season, days_month = calc_days_season(ds)
    
    # find where 2m-temperature <= 0C or <= threshold
    # This should automatically assume that it is already only snow, but it could include supercooled 
    # rain in the case of total precipitation
    th_2t = 273.15
    ds_2t = ds.where(ds['tas'] <= th_2t, other=np.nan)
    
    # 1. find where liquid water >= 0.005 kgm-2 or >= threshold
    th_lcc = 0.001*lwp_threshold
    ds_lcc_2t = ds_2t.where(ds_2t['lwp']>=th_lcc, other=np.nan)
    ds_lcc = ds.where(ds['lwp']>=th_lcc, other=np.nan)
    

    # amount of freezing rain
    ds_lcc_2t['prfr'] = (ds_lcc_2t['pr'] - ds_lcc_2t['prsn'])
    ds_lcc_2t['prfr'].attrs = {'units': 'kg m-2 h-1', 'long_name': 'Mean freezing rain rate'}

    # # if we want a precip or snowfall threshold apply here
    # # find where total precipitation >0 kgm-2h-1 threshold in these liquid containg clouds
    # # th_tp = 0.01
    # # ds_lcc_2t = ds_lcc_2t.where(ds['pr']>=th_tp, other=np.nan) 
    # # 2.1 find where snowfall >= 0.24 mmday-1 or >= threshold in these liquid containing clouds, but not temperature threshold
    # # multiply by 24 to make it comparable to McIllhattan et al. As they use 0.01mmh-1 as lower threshold
    # # applying snowfall days, based on threshold (th_sf). Gives days where snowfall above th_sf and counts days in season and 
    # # devides by season days
    # th_sf = 0.01
    # ds_lcc_2t_sf = ds_lcc_2t.where(ds['prsn']>=th_sf, other=np.nan) 
    # # th_days = (ds_lcc_2t_sf['twp'].groupby('time.season').count(dim='time',keep_attrs=False))/days_season


    # create dataset to use for calculating the precipitation efficency. For the precipitation efficency we want to remove th_frac 
    # days where liquid water content and temperature requirements are met. 
    # assign percent of snowfall days, required in a pixle, which should be included in the statistics
    th_frac = 0.1
    th_days_lcc_2t = (ds_lcc_2t['lwp'].groupby('time.season').count(dim='time',keep_attrs=False))/days_season
    th_days_lcc    = (ds_lcc['lwp'].groupby('time.season').count(dim='time',keep_attrs=False))/days_season

    ds_lcc_2t_season = ds_lcc_2t.groupby('time.season').mean('time', skipna=True, keep_attrs=True)
    ds_lcc_2t_season = ds_lcc_2t_season.where(th_days_lcc_2t>=th_frac, other=np.nan)

    ds_lcc_season = ds_lcc.groupby('time.season').mean('time', skipna=True, keep_attrs=True)
    ds_lcc_season = ds_lcc_season.where(th_days_lcc >= th_frac, other=np.nan)
    
    # Now create daily dataset based on seasonal supercooled liquid containing cloud days above th_sf, and th_frac
    _mam = ((ds_lcc_2t.sel(time=fct.is_season(ds_lcc_2t['time.month'], 3, 5))).where(th_days_lcc_2t.sel(season='MAM') >=th_frac, other=np.nan)).drop('season')
    _jja = ((ds_lcc_2t.sel(time=fct.is_season(ds_lcc_2t['time.month'], 6, 8))).where(th_days_lcc_2t.sel(season='JJA') >=th_frac, other=np.nan)).drop('season')
    _son = ((ds_lcc_2t.sel(time=fct.is_season(ds_lcc_2t['time.month'], 9, 11))).where(th_days_lcc_2t.sel(season='SON') >=th_frac, other=np.nan)).drop('season')
    _djf = ((xr.concat([ds_lcc_2t.sel(time=fct.is_season(ds_lcc_2t['time.month'], 1, 2)), 
                    ds_lcc_2t.sel(time=fct.is_season(ds_lcc_2t['time.month'],12,12))], dim='time')).where(th_days_lcc_2t.sel(season='DJF') >=th_frac, other=np.nan)).drop('season')
    ds_lcc_2t_days = xr.merge(objects=[_djf, _jja, _mam, _son])

    _mam = ((ds_lcc.sel(time=fct.is_season(ds_lcc['time.month'], 3, 5))).where(th_days_lcc_2t.sel(season='MAM') >=th_frac, other=np.nan)).drop('season')
    _jja = ((ds_lcc.sel(time=fct.is_season(ds_lcc['time.month'], 6, 8))).where(th_days_lcc_2t.sel(season='JJA') >=th_frac, other=np.nan)).drop('season')
    _son = ((ds_lcc.sel(time=fct.is_season(ds_lcc['time.month'], 9, 11))).where(th_days_lcc_2t.sel(season='SON') >=th_frac, other=np.nan)).drop('season')
    _djf = ((xr.concat([ds_lcc.sel(time=fct.is_season(ds_lcc['time.month'], 1, 2)), 
                    ds_lcc.sel(time=fct.is_season(ds_lcc['time.month'],12,12))], dim='time')).where(th_days_lcc_2t.sel(season='DJF') >=th_frac, other=np.nan)).drop('season')
    ds_lcc_days = xr.merge(objects=[_djf, _jja, _mam, _son])



    # for all the other statistics we want to remove th_frac days where liquid content, temperature, and snowfall requirements are met
    # which also means we have to apply the threshold for the total precipitation
    # find where total precipitation >= 0.01 kg m-2 h-1 in LCCs with T2<0C
    # th_tp = 0.01
    # ds_lcc_2t_sf = ds_lcc_2t_days.where(ds_lcc_2t_days['pr'] >=th_tp, other=np.nan)
    # find where snowfall >= 0.01 kg m-2 h-1 or >= threshold in these liquid containing clouds. 
    th_sf = 0.01
    # ds_lcc_2t_sf = ds_lcc_2t_sf.where(ds_lcc_2t_sf['prsn'] >= th_sf, other=np.nan)
    # ds_lcc_2t_sf = ds_lcc_2t_days.where(ds_lcc_2t_days['prsn'] >= th_sf, other=np.nan)
    ds_lcc_2t_sf = ds_lcc_2t.where(ds_lcc_2t['prsn'] >= th_sf, other=np.nan)
    # applying snowfall days, based on threshold (th_sf). Gives days where snowfall above th_sf and counts days in season and devides 
    # by season days
    # th_days_sf = (ds_lcc_2t_sf['twp'].groupby('time.season').count(dim='time', keep_attrs=False))/days_season
    ds_lcc_2t_sf_season = ds_lcc_2t_sf.groupby('time.season').mean('time', skipna=True, keep_attrs=True)
    # ds_lcc_2t_sf_season = ds_lcc_2t_season.where(th_days_sf>=th_frac, other=np.nan)   # not needed for statistic
    ds_lcc_2t_sf_season
    
    ds_lcc_sf = ds_lcc.where(ds_lcc['prsn'] >= th_sf, other=np.nan)
    ds_lcc_sf_season = ds_lcc_sf.groupby('time.season').mean('time', skipna=True,keep_attrs=True)
    
    ds_sf = ds.where(ds['prsn'] >= th_sf, other=np.nan)
    
    # Now create daily dataset based on seasonal supercooled liquid containing cloud days above th_sf, and th_frac
    # _mam = ((ds_lcc_2t_sf.sel(time=fct.is_season(ds_lcc_2t_sf['time.month'], 3, 5))).where(th_days_sf.sel(season='MAM') >=th_frac)).drop('season')
    # _jja = ((ds_lcc_2t_sf.sel(time=fct.is_season(ds_lcc_2t_sf['time.month'], 6, 8))).where(th_days_sf.sel(season='JJA') >=th_frac)).drop('season')
    # _son = ((ds_lcc_2t_sf.sel(time=fct.is_season(ds_lcc_2t_sf['time.month'], 9, 11))).where(th_days_sf.sel(season='SON') >=th_frac)).drop('season')
    # _djf = ((xr.concat([ds_lcc_2t_sf.sel(time=fct.is_season(ds_lcc_2t_sf['time.month'], 1, 2)), 
    #                 ds_lcc_2t_sf.sel(time=fct.is_season(ds_lcc_2t_sf['time.month'],12,12))], dim='time')).where(th_days_sf.sel(season='DJF') >=th_frac)).drop('season')

    # ds_lcc_2t_sf_days = xr.merge(objects=[_djf, _jja, _mam, _son])
    
    
    # ds_lcc, ds_lcc_2t_days, #ds_lcc_2t_sf_days
    return(ds_2t, ds_sf,
           ds_lcc_2t, ds_lcc_2t_days, ds_lcc_2t_sf,
           ds_lcc,    ds_lcc_days, ds_lcc_sf, 
           days_season, days_month)

In [None]:
def statistic_to_netcdf(dict_ds, statistic, days_season, days_month):
    for model in dict_ds.keys():
        dict_ds[model]['days_season'] = days_season[model]
        dict_ds[model]['days_month']  = days_month[model]
        
        if 'areacella' not in list(dict_ds[model].keys()):
            weights = fct.area_grid(dict_ds[model]['lat'].data, dict_ds[model]['lon'].data)
            weights = weights.fillna(0)
            dict_ds[model]['areacella'] = weights
        if 'areacella' in list(dict_ds[model].keys()):
            weights = dict_ds[model]['areacella'].fillna(0)
            dict_ds[model]['areacella'] = weights
            
        if 'time' in dict_ds[model]['areacella'].coords:
            dict_ds[model]['areacella'] = dict_ds[model]['areacella'].isel(time=0).squeeze()    
        # try:
        #     dict_ds[model]['areacella'] = dict_ds[model]['areacella'].isel(time=0).squeeze()
        # except ValueError:
        #     print('...')
            
        starty = dict_ds[model].indexes['time'].year.unique()[0]
        endy = dict_ds[model].indexes['time'].year.unique()[-1]
        out_dir = f'{dat_out}/{statistic}'
        try:
            os.mkdir(out_dir)
        except OSError:
            pass
        
        dict_ds_NH = dict_ds[model].sel(lat=slice(45,90))
        dict_ds_SH = dict_ds[model].sel(lat=slice(-90,-45))
        
        ds_out = xr.concat([dict_ds_SH, dict_ds_NH], dim='lat')
        # if 'model' in dict_ds[model].dims:
        #     ds_out = ds_out[['time', 'lat', 'lon', 'season', 'month', 'model']]
        # else:
        #     ds_out = ds_out[['time', 'lat', 'lon', 'season', 'month']]

        
        file_out = f'{out_dir}/{model}_{statistic}_{starty}0101-{endy}1231.nc'
        print(f'writing file ... {file_out}')
        if 'model' in dict_ds[model].dims:
            (ds_out.transpose('time', 'lat', 'lon', 'season', 'month', 'model')).to_netcdf(file_out)
        else:
        # ds_out.to_netcdf(file_out)
            (ds_out.transpose('time', 'lat', 'lon', 'season', 'month')).to_netcdf(file_out)
        
        

In [None]:
cmip_dict.keys()

dict_keys(['MIROC6', 'CanESM5', 'AWI-ESM-1-1-LR', 'MPI-ESM1-2-LR', 'UKESM1-0-LL', 'HadGEM3-GC31-LL', 'CNRM-CM6-1', 'CNRM-ESM2-1', 'IPSL-CM6A-LR', 'IPSL-CM5A2-INCA', 'cmip_250', 'cmip_500'])

In [16]:
for threshold in [3, 5, 10, 15, 20]:
    # print(threshold*0.001)
    ds_2t      = {}
    ds_sf      = {}
    ds_lcc_2t   = {}
    ds_lcc_2t_days  = {} 
    ds_lcc_2t_sf = {}

    ds_lcc = {}
    ds_lcc_days = {}
    ds_lcc_sf = {}
    days_season = {}
    days_month  = {}
    print('Calculate statistics in ERA5 ...')
    # for model in era_dict.keys():
    model = 'era_500'
    ds_2t[model], ds_sf[model], ds_lcc_2t[model], ds_lcc_2t_days[model], ds_lcc_2t_sf[model], \
            ds_lcc[model], ds_lcc_days[model], ds_lcc_sf[model], \
                days_season[model], days_month[model] = find_lcc_sf(era_dict[model], threshold)
                
    print('Calculate statistics in CMIP6 ...')
    # for model in cmip_dict.keys():       
    model = 'cmip_500' 
    ds_2t[model], ds_sf[model], ds_lcc_2t[model], ds_lcc_2t_days[model], ds_lcc_2t_sf[model], \
            ds_lcc[model], ds_lcc_days[model], ds_lcc_sf[model], \
                    days_season[model], days_month[model] = find_lcc_sf(cmip_dict[model], threshold)
                    
    statistic_to_netcdf(era_dict, 'orig', days_season, days_month)
    statistic_to_netcdf(cmip_dict, 'orig', days_season, days_month)
    statistic_to_netcdf(ds_2t, '2t', days_season, days_month)
    statistic_to_netcdf(ds_sf, 'sf', days_season, days_month)
    statistic_to_netcdf(ds_lcc_2t, f'{threshold}_lcc_2t', days_season, days_month )
    statistic_to_netcdf(ds_lcc_2t_days, f'{threshold}_lcc_2t_days', days_season, days_month)
    statistic_to_netcdf(ds_lcc_2t_sf, f'{threshold}_lcc_2t_sf', days_season, days_month)
    statistic_to_netcdf(ds_lcc, f'{threshold}_lcc', days_season, days_month)
    statistic_to_netcdf(ds_lcc_days, f'{threshold}_lcc_days', days_season, days_month)
    statistic_to_netcdf(ds_lcc_sf, f'{threshold}_lcc_sf', days_season, days_month)


Calculate statistics in ERA5 ...


Calculate statistics in CMIP6 ...
writing file ... /mn/vann/franzihe/output/CS_ERA5_CMIP6/3_lcc_2t/era_500_3_lcc_2t_20070101-20101231.nc
writing file ... /mn/vann/franzihe/output/CS_ERA5_CMIP6/3_lcc_2t/cmip_500_3_lcc_2t_20070101-20101231.nc
writing file ... /mn/vann/franzihe/output/CS_ERA5_CMIP6/3_lcc_2t_days/era_500_3_lcc_2t_days_20070101-20101231.nc
writing file ... /mn/vann/franzihe/output/CS_ERA5_CMIP6/3_lcc_2t_days/cmip_500_3_lcc_2t_days_20070101-20101231.nc
writing file ... /mn/vann/franzihe/output/CS_ERA5_CMIP6/3_lcc_2t_sf/era_500_3_lcc_2t_sf_20070101-20101231.nc
writing file ... /mn/vann/franzihe/output/CS_ERA5_CMIP6/3_lcc_2t_sf/cmip_500_3_lcc_2t_sf_20070101-20101231.nc
writing file ... /mn/vann/franzihe/output/CS_ERA5_CMIP6/3_lcc/era_500_3_lcc_20070101-20101231.nc
writing file ... /mn/vann/franzihe/output/CS_ERA5_CMIP6/3_lcc/cmip_500_3_lcc_20070101-20101231.nc
writing file ... /mn/vann/franzihe/output/CS_ERA5_CMIP6/3_lcc_days/era_500_3_lcc_days_20070101-20101231.nc
writing fil