# Analysis of CMIP6, ERA5, and CloudSat


# Table of Contents
<ul>
<li><a href="#introduction">1. Introduction</a></li>
<li><a href="#data_wrangling">2. Data Wrangling</a></li>
<li><a href="#exploratory">3. Exploratory Data Analysis</a></li>
<li><a href="#conclusion">4. Conclusion</a></li>
<li><a href="#references">5. References</a></li>
</ul>

# 1. Introduction <a id='introduction'></a>


**Questions**
* How are cloud phase and snowfall represented in CMIP6 and ERA5 compared with CloudSat? (TODO: confirm the wording of this question)


> **_NOTE:_** .

# 2. Data Wrangling <a id='data_wrangling'></a>


## Organize my data

- Define a prefix for my project (you may need to adjust it for your own usage on your infrastructure).
    - input folder where all the data used as input to my Jupyter Notebook is stored (and eventually shared)
    - output folder where all the results to keep are stored
    - tool folder where all the tools (the shared `utils` modules) are stored

The ERA5 0.25deg data is located in the folder `\scratch\franzihe\`, CloudSat at ...



In [1]:
# LWP threshold that selects which thresholded input folders/files are read and
# that is embedded in the figure directory and output file names
# (units are not stated in this notebook — TODO confirm).
lwp_threshold = 20

In [2]:
import os
import pathlib
import sys
import socket
# The host name selects the machine-specific data and figure roots below.
hostname = socket.gethostname()

# Project root: two directory levels above the current working directory,
# kept with a trailing path separator.
abs_path = str(pathlib.Path.cwd())
WORKDIR = os.path.join(os.path.dirname(os.path.dirname(abs_path)), '')


if "mimi" in hostname:
    print(hostname)
    DATA_DIR = "/mn/vann/franzihe/"
    # FIG_DIR = "/uio/kant/geo-geofag-u1/franzihe/Documents/Figures/ERA5/"
    FIG_DIR = f"/uio/kant/geo-geofag-u1/franzihe/Documents/Python/globalsnow/CloudSat_ERA5_CMIP6_analysis/Figures/CS_ERA5_CMIP6_hourly_{lwp_threshold}/"
elif "glefsekaldt" in hostname: 
    DATA_DIR = "/home/franzihe/Data/"
    FIG_DIR = "/home/franzihe/Documents/Figures/ERA5/"
else:
    # Fail here with a clear message instead of a NameError further down.
    raise RuntimeError(
        f"Unknown host '{hostname}': add DATA_DIR and FIG_DIR for this machine."
    )

INPUT_DATA_DIR = os.path.join(DATA_DIR, 'input')
OUTPUT_DATA_DIR = os.path.join(DATA_DIR, 'output')
UTILS_DIR = os.path.join(WORKDIR, 'utils')
FIG_DIR_mci = os.path.join(FIG_DIR, 'McIlhattan/')

# Make the project's utility modules importable; do not add the entry twice
# when the cell is re-run.
if UTILS_DIR not in sys.path:
    sys.path.append(UTILS_DIR)

# Make the figure directories. Missing parent directories are created and
# already existing directories are accepted; any other failure (for example
# missing write permission) is raised instead of being silently ignored.
for fig_dir in (FIG_DIR, FIG_DIR_mci):
    os.makedirs(fig_dir, exist_ok=True)

mimi.uio.no


## Import python packages
- `Python` environment requirements: file [requirements_globalsnow.txt](../../requirements_globalsnow.txt) 
- load `python` packages from [imports.py](../../utils/imports.py)
- load `functions` from [functions.py](../../utils/functions.py)


In [3]:
# suppress warnings
# NOTE: filterwarnings('ignore') without a category silences every warning,
# including deprecation notices.
import warnings
warnings.filterwarnings('ignore') # don't output warnings

# import packages
# All names come from the project-local `imports` module (presumably found
# through the UTILS_DIR entry appended to sys.path — confirm).
from imports import(xr, ccrs, cy, plt, glob, cm, fct, np, pd, add_cyclic_point)
# from matplotlib.lines import Line2D
# from matplotlib.patches import Patch
# from sklearn.metrics import r2_score


# Use the HTML representation when xarray objects are displayed.
xr.set_options(display_style='html')

<xarray.core.options.set_options at 0x7f034012f220>

In [4]:
# Reload imports: enable IPython's autoreload extension so that edits to
# imported modules are picked up without restarting the kernel
# (mode 2 reloads all modules before each execution).
%load_ext autoreload
%autoreload 2

## Open variables
Get the data required for the analysis. 



In [5]:
# Directory below OUTPUT_DATA_DIR from which the NetCDF file patterns of the
# loading cell are built; the bare name afterwards displays the resolved path.
dat_in = os.path.join(OUTPUT_DATA_DIR, 'CS_ERA5_CMIP6')
dat_in
# make output data directory
# (disabled; note that `dat_out` is not defined anywhere in the visible cells)
# try:
#     os.mkdir(dat_out)
# except OSError:
#     pass

'/mn/vann/franzihe/output/CS_ERA5_CMIP6'

In [6]:
# variable_id = ['tas', 'prsn', 'pr', 'lwp', 'clivi', 'areacella']

In [7]:
# Products to process. Only the three "_500" entries are active.
# Switched off (kept for reference):
#   'cloudsat_250', 'era_30', 'era_250', 'cmip_250',
#   'MIROC6', 'CanESM5', 'AWI-ESM-1-1-LR', 'MPI-ESM1-2-LR', 'UKESM1-0-LL',
#   'HadGEM3-GC31-LL', 'CNRM-CM6-1', 'CNRM-ESM2-1', 'IPSL-CM6A-LR',
#   'IPSL-CM5A2-INCA'
list_models = ['cloudsat_500', 'era_500', 'cmip_500']



In [8]:
# Dataset kinds handled by this notebook; each name is also used as a
# sub-directory / file-name part when the NetCDF files are located.
variables = ['orig', '2t', 'lcc', 'lcc_2t', 'lcc_sf', 'lcc_2t_days', 'lcc_2t_sf']

# Three independent, initially empty stores of the form {kind: {model: dataset}}
# that will hold the Xarray datasets.
ds, ds_mci, ds_hourly = (dict((kind, {}) for kind in variables) for _ in range(3))

In [9]:
# for model in list_models:
    
#     for var in variables:
#         if model == 'cloudsat_250' or model == 'cloudsat_500' or var == 'orig' or var == '2t':
#             file_pattern = f'{dat_in}/{var}/{model}_{var}*.nc'
#         else:
#             file_pattern = f'{dat_in}/{lwp_threshold}_{var}/{model}_{lwp_threshold}_{var}*.nc'
#         # print(file_pattern)
#         files = sorted(glob(file_pattern))
#         # print(files)
#         for file in files:
#             _ds = xr.open_mfdataset(file)
#             # [var][model]
#             ds[var][model] = xr.Dataset()
#             # ds_mci[var][model] = xr.Dataset()
#             # make the data cyclic going from -180 to 180
#             for var_id in _ds.keys():
#                 data = _ds[var_id]
                
#                 if 'lon' in _ds[var_id].dims and (data['lon'][0] != data['lon'][-1]*(-1)):
#                     lon = _ds.coords['lon']
#                     lon_idx = data.dims.index('lon')
#                     wrap_data, wrap_lon = add_cyclic_point(data, coord=lon, axis=lon_idx)
                    
#                     if len(wrap_data.shape) == 2:
#                         ds[var][model][var_id] = xr.DataArray(data = wrap_data, coords=dict(lat=data['lat'],
#                                                                                             lon=np.append(data['lon'].values, data['lon'][0].values*(-1))))
                    
#                     if len(wrap_data.shape) == 3:
#                         if 'time' in data.dims:
#                             ds[var][model][var_id] = xr.DataArray(data = wrap_data, coords=dict(time=data['time'],
#                                                                                                 lat=data['lat'],
#                                                                                                 lon=np.append(data['lon'].values, data['lon'][0].values*(-1))))
#                         elif 'model' in data.dims:
#                             ds[var][model][var_id] = xr.DataArray(data = wrap_data, coords=dict(lat=data['lat'],
#                                                                                                 lon=np.append(data['lon'].values, data['lon'][0].values*(-1)),
#                                                                                                 model=data['model']), 
#                                                                   )
#                     if len(wrap_data.shape) == 4:
#                         ds[var][model][var_id] = xr.DataArray(data = wrap_data, coords=dict(time=data['time'],
#                                                                                             lat=data['lat'],
#                                                                                             lon=np.append(data['lon'].values, data['lon'][0].values*(-1)),
#                                                                                             model=data['model']))
                        
#                 else:
#                     ds[var][model][var_id] = data
                    
#                 ds[var][model][var_id].attrs = data.attrs
                
#             # ds_mci[var][model] = xr.concat([ds[var][model].sel(lat=slice(-90,-66.91)),
#             #                                 ds[var][model].sel(lat=slice(66.91,90))], dim ='lat')
#             # ds_mci[var][model] = ds[var][model].where(np.logical_and((ds[var][model].lat<=66.91),(ds[var][model].lat>=66.91)), other=np.nan )
            
# # Access the datasets using  ds[var][model]
# # For example:
# # lcc_2t_days_dataset = ds['lcc_2t_days']['era_30']




In [10]:
def _hourly_file_pattern(model, var):
    """Return the glob pattern of the NetCDF files for ``model`` and ``var``.

    ERA products are read from the ``<dat_in>_hourly`` tree, every other
    product from ``dat_in``. CloudSat products and the 'orig'/'2t' kinds use
    the plain ``<var>`` folder; all remaining combinations use the folder and
    file names that carry the ``lwp_threshold`` prefix.
    """
    is_cloudsat = model in ('cloudsat_250', 'cloudsat_500')
    is_era = model in ('era_30', 'era_250', 'era_500')
    base_dir = f'{dat_in}_hourly' if is_era else dat_in

    if is_cloudsat or var in ('orig', '2t'):
        return f'{base_dir}/{var}/{model}_{var}*.nc'
    return f'{base_dir}/{lwp_threshold}_{var}/{model}_{lwp_threshold}_{var}*.nc'


def _add_cyclic_longitude(data):
    """Return ``data`` with a cyclic point appended along ``lon``.

    The array is returned unchanged when it has no ``lon`` dimension or when
    its first and last longitudes are already mirror images of each other
    (``lon[0] == -lon[-1]``). Otherwise ``add_cyclic_point`` is applied along
    the ``lon`` axis and a new DataArray is built that keeps the original
    dimension order, the coordinates of all other dimensions and the
    attributes of ``data``.
    """
    if 'lon' not in data.dims:
        return data

    lon = data['lon']
    if float(lon[0]) == -float(lon[-1]):
        return data

    lon_axis = data.dims.index('lon')
    wrap_data, wrap_lon = add_cyclic_point(data, coord=lon, axis=lon_axis)

    # Take the dimension names from the data itself, so any layout that
    # contains ``lon`` is handled rather than only a few hard-coded ones.
    coords = {dim: data[dim] for dim in data.dims if dim != 'lon'}
    # Use the longitude returned together with the wrapped data. Negating the
    # first longitude gives the cyclic point only for grids starting at -180.
    coords['lon'] = np.asarray(wrap_lon)

    return xr.DataArray(data=wrap_data, dims=data.dims, coords=coords, attrs=data.attrs)


for model in list_models:
    for var in variables:
        files = sorted(glob(_hourly_file_pattern(model, var)))
        # NOTE(review): every matching file replaces the dataset stored by the
        # previous one, so only the last file (in sorted order) is kept —
        # confirm that at most one file per pattern is expected.
        for file in files:
            _ds_hourly = xr.open_mfdataset(file)
            wrapped = xr.Dataset()
            # make the data cyclic going from -180 to 180
            for var_id in _ds_hourly.keys():
                wrapped[var_id] = _add_cyclic_longitude(_ds_hourly[var_id])
            ds_hourly[var][model] = wrapped

# Access the datasets using  ds_hourly[var][model]
# For example:
# lcc_2t_days_dataset = ds_hourly['lcc_2t_days']['era_30']




In [11]:
# Quick check: list the models for which an 'orig' dataset was stored.
ds_hourly['orig'].keys()

dict_keys(['cloudsat_500', 'era_500', 'cmip_500'])

In [12]:
# for var in ds.keys():
#     for model in ['cloudsat_250', 'cloudsat_500', 'cmip_250', 'cmip_500']:
#         # ds_hourly[var][model] = xr.Dataset()
#         try: 
#             ds_hourly[var][model] = ds[var][model]
#         except KeyError:
#             # print(var, model)
#             continue

In [13]:
def _percent_style(cb_label, vmax=100):
    """Return the settings dictionary shared by the percentage fields.

    Only the label (and, for one entry, the numeric type of ``vmax``) differs
    between the entries; the levels and the difference ranges are identical.
    Every call creates fresh level arrays.
    """
    return {
        'cb_label': cb_label,
        'levels': np.arange(0, 105., 5.),
        'vmin': 0,
        'vmax': vmax,
        'diff_levels': np.arange(-100, 110, 10),
        'diff_vmin': -100,
        'diff_vmax': 100,
    }


# Label and level settings per ratio name (presumably consumed by the plotting
# code — confirm). The keys also select which ratios are filtered and exported
# further down in the notebook.
dict_label = {
    'FLCC': _percent_style('FLCC (%)', vmax=100.),
    'FsLCC': _percent_style('FsLCC (%)'),
    'FLCC-FsLCC': _percent_style('FLCC (%), FsLCC (%)'),
}
# Disabled entries, kept for reference:
#     'lcc_wo_snow': {'cb_label':'FsLCC (%)', 'levels':np.arange(0,110,10), 'vmin': 0, 'vmax':100, 'diff_levels':np.arange(-30,35,5), 'diff_vmin':-30, 'diff_vmax':30},
#     'lcc_w_snow':  {'cb_label':'FoS in sLCCs (%)', 'levels':np.arange(0,110,10), 'vmin': 0, 'vmax':100, 'diff_levels':np.arange(-60,65,5), 'diff_vmin':-60, 'diff_vmax':60},
#     'FoP': _percent_style('FoP in LCCs (%)'),
#     'FoS': _percent_style('FoS in sLCCs (%)'),
#     'sf_eff': {'cb_label':'SE in sLCCs (h$^{-1}$)', 'levels':np.arange(0,5.5,.5), 'vmin':0, 'vmax': 5, 'diff_levels':np.arange(-1.2,1.4,.2), 'diff_vmin':-1.2, 'diff_vmax':1.2},
#     'pr_eff': {'cb_label':'PE in sLCCs (h$^{-1}$)', 'levels':np.arange(0,550.,50.), 'vmin':0, 'vmax':500, 'diff_levels':np.arange(-120,140,20), 'diff_vmin':-120, 'diff_vmax':120},


In [14]:
# Print, for every dataset kind in ``ds``, the models stored for it.
# NOTE(review): the cell that would fill ``ds`` is commented out in this
# notebook, so empty key views are printed; ``ds_hourly`` holds the loaded data.
for kind, per_model in ds.items():
    print(kind, per_model.keys())

orig dict_keys([])
2t dict_keys([])
lcc dict_keys([])
lcc_2t dict_keys([])
lcc_sf dict_keys([])
lcc_2t_days dict_keys([])
lcc_2t_sf dict_keys([])


In [15]:
# Build the ratios for all entries of list_models from the hourly-based
# datasets with the project helper `fct.get_ratios_dict`.
# NOTE(review): the result is later indexed by model name and each entry is
# used like an xarray Dataset — presumably a dict {model: Dataset}; confirm in
# utils/functions.py.
# ratios = fct.get_ratios_dict(list_models, ds,seasons='normal')
ratios_hourly = fct.get_ratios_dict(list_models, ds_hourly,seasons='normal')



In [16]:
# Apply `fct.get_only_valid_values` to every variable of the CloudSat ratios
# whose name contains one of the ratio names configured in `dict_label`.
# The loop variable is deliberately NOT called `variables`: that name holds the
# list of dataset kinds used by the loading cell, and rebinding it to a string
# would break a re-run of that cell.
# The variable names are copied into a list first so that the iteration does
# not depend on the object that is being replaced inside the loop.
for ratio_name in list(ratios_hourly['cloudsat_500'].variables):
    for var_name in dict_label.keys():
        if var_name in ratio_name:
            # ratios = fct.get_only_valid_values(ratios, '500', ratio_name)
            ratios_hourly = fct.get_only_valid_values(ratios_hourly, '500', ratio_name)
    

In [17]:
# # # Calculate weighted averages

# for model in ratios.keys():
#     weights = ds['orig'][model]['areacella']
#     for vars in ratios[model].keys():
#         ratios[model][vars+'_mean'], ratios[model][vars+'_std'], ratios[model][vars+'_stats'] = fct.weighted_average(ratios[model][vars], weights)
        


In [18]:
# For every model, add '<name>_mean', '<name>_std' and '<name>_stats' entries
# computed by `fct.weighted_average` from each existing ratio variable, using
# the model's 'areacella' field from the 'orig' dataset as weights.
for model in ratios_hourly.keys():
    weights_hourly = ds_hourly['orig'][model]['areacella']
    # Copy the variable names before the loop: new variables are added to the
    # same dataset inside it. The name `ratio_name` also avoids shadowing the
    # builtin `vars`.
    for ratio_name in list(ratios_hourly[model].keys()):
        (ratios_hourly[model][ratio_name+'_mean'],
         ratios_hourly[model][ratio_name+'_std'],
         ratios_hourly[model][ratio_name+'_stats']) = fct.weighted_average(ratios_hourly[model][ratio_name], weights_hourly)
     

In [19]:
# Give the CloudSat and ERA5 ratios a scalar ``model`` coordinate and join them
# with the CMIP ratios along ``model``; the result is stored under key '500'.
# NOTE(review): 'cmip_500' is passed on without a label — presumably it already
# carries a ``model`` coordinate; confirm.
_cloudsat_500 = ratios_hourly['cloudsat_500'].assign_coords(coords={'model':'CloudSat'})
_era_500 = ratios_hourly['era_500'].assign_coords(coords={'model':'ERA5'})
_cmip_500 = ratios_hourly['cmip_500']
ratios_hourly['500'] = xr.concat([_cloudsat_500, _era_500, _cmip_500], dim="model")

In [20]:
# Directory that receives the NetCDF files written by the export cell.
file_dir_hourly = os.path.join(OUTPUT_DATA_DIR, 'CS_ERA5_CMIP6_hourly/ratios_500/')
# Create it together with any missing parent directory. An existing directory
# is accepted, while real failures (e.g. missing write permission) are raised
# here instead of surfacing later as a confusing write error.
os.makedirs(file_dir_hourly, exist_ok=True)

In [21]:
# Suffixes of the variables that are exported for every ratio name, in the
# order in which they are selected. ('year_cs_mean' used to be listed twice;
# it is selected once here.)
_export_suffixes = (
    'season', 'season_cs_mean',
    'month_mean', 'year_mean', 'month_cs_mean', 'year_cs_mean',
    'month_years_mean', 'year_years_mean', 'season_mean',
    'month', 'year',
)

# Write one NetCDF file per ratio name configured in `dict_label`; the file
# name carries the ratio name and the LWP threshold.
for var_name in dict_label.keys():
    file_name = f'{var_name}_LWP{lwp_threshold}_2007_2010.nc'
    export_names = [f'{var_name}_{suffix}' for suffix in _export_suffixes]
    ratios_hourly['500'][export_names].to_netcdf(path=os.path.join(file_dir_hourly, file_name), format="NETCDF4")
    

In [22]:
# Display the LWP threshold that was used for this run.
lwp_threshold

20