# Calculate PSL and save as a file
CESM2-LE ONLY

* **Description**: Reads in and creates seasonal and ensemble means and vertically interpolates
* **Input data**: CESM2-LE output in timeseries format from intake-esm
* **Output data**: Netcdf file with output
* **Creator**: Alice DuVivier
* **Date**: March 2022

In [1]:
import xarray as xr
import numpy as np
from datetime import timedelta
import glob

import pop_tools

import matplotlib.pyplot as plt
import matplotlib.path as mpath
from matplotlib.gridspec import GridSpec

import geocat.datafiles as gdf
import geocat.viz.util as gvutil
from geocat.viz import cmaps as gvcmaps
import geocat.comp as gcomp

import cartopy.crs as ccrs
import cartopy.feature as cfeature
from scipy.stats import linregress,pearsonr, t

import dask
import intake
from distributed import Client
from ncar_jobqueue import NCARCluster

  from distributed.utils import tmpfile


In [2]:
# spin up dask cluster

import dask

# Use dask jobqueue
from dask_jobqueue import PBSCluster

# Import a client
from dask.distributed import Client

# Setup your PBSCluster
cluster = PBSCluster(
    cores=36, # The number of cores you want
    memory='300 GB', # Amount of memory
    processes=9, # How many processes
    queue='casper', # The type of queue to utilize (/glade/u/apps/dav/opt/usr/bin/execcasper)
    local_directory='$TMPDIR', # Use your local directory
    resource_spec='select=1:ncpus=36:mem=300GB', # Specify resources
    project='P93300665', # Input your project ID here
    walltime='06:00:00', # Amount of wall time
    interface='ib0', # Interface to use
)
# Scale up
cluster.scale(jobs=8)

# Change your url to the dask dashboard so you can see it
dask.config.set({'distributed.dashboard.link':'https://jupyterhub.hpc.ucar.edu/stable/user/{USER}/proxy/{port}/status'})

# Setup your client
client = Client(cluster)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 40391 instead
  f"Port {expected} is already in use.\n"


In [3]:
client

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.PBSCluster
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/duvivier/proxy/40391/status,

0,1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/duvivier/proxy/40391/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://10.12.206.42:44378,Workers: 0
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/duvivier/proxy/40391/status,Total threads: 0
Started: Just now,Total memory: 0 B


## Manually set variables

In [4]:
# list the variables to load
var_in_1 = 'PSL'

## Load the CESM-LE data 

We will use [`intake-esm`](https://intake-esm.readthedocs.io/en/latest/), which is a data catalog tool.
It enables querying a database for the files we want, then loading those directly as an `xarray.Dataset`.

First step is to set the "collection" for the CESM-LE, which depends on a json file conforming to the [ESM Catalog Specification](https://github.com/NCAR/esm-collection-spec).

In [5]:
catalog_file = '/glade/collections/cmip/catalog/intake-esm-datastore/catalogs/glade-cesm2-le.json'

cat = intake.open_esm_datastore(catalog_file)

  self._df, self.catalog_file = _fetch_catalog(self.esmcol_data, esmcol_obj, csv_kwargs)


In [6]:
forcing = 'cmip6'  # do not want smbb data
expt = 'ssp370'
comp = 'atm'
freq = 'month_1'

subset_1 = cat.search(variable=var_in_1, forcing_variant=forcing, experiment=expt, component=comp, frequency=freq )

In [7]:
subset_1.df.head()

Unnamed: 0,component,stream,case,member_id,variable,start_time,end_time,time_range,long_name,units,vertical_levels,frequency,path,experiment,forcing_variant,cesm_member_id,control_branch_year,cmip_experiment_id
0,atm,cam.h0,b.e21.BSSP370cmip6.f09_g17.LE2-1001.001,r1i1001p1f1,PSL,2015-01,2024-12,201501-202412,Sea level pressure,Pa,1.0,month_1,/glade/campaign/cgd/cesm/CESM2-LE/timeseries/a...,ssp370,cmip6,1001.001,1001,CESM2_ssp370_r1i1001p1f1
1,atm,cam.h0,b.e21.BSSP370cmip6.f09_g17.LE2-1001.001,r1i1001p1f1,PSL,2025-01,2034-12,202501-203412,Sea level pressure,Pa,1.0,month_1,/glade/campaign/cgd/cesm/CESM2-LE/timeseries/a...,ssp370,cmip6,1001.001,1001,CESM2_ssp370_r1i1001p1f1
2,atm,cam.h0,b.e21.BSSP370cmip6.f09_g17.LE2-1001.001,r1i1001p1f1,PSL,2035-01,2044-12,203501-204412,Sea level pressure,Pa,1.0,month_1,/glade/campaign/cgd/cesm/CESM2-LE/timeseries/a...,ssp370,cmip6,1001.001,1001,CESM2_ssp370_r1i1001p1f1
3,atm,cam.h0,b.e21.BSSP370cmip6.f09_g17.LE2-1001.001,r1i1001p1f1,PSL,2045-01,2054-12,204501-205412,Sea level pressure,Pa,1.0,month_1,/glade/campaign/cgd/cesm/CESM2-LE/timeseries/a...,ssp370,cmip6,1001.001,1001,CESM2_ssp370_r1i1001p1f1
4,atm,cam.h0,b.e21.BSSP370cmip6.f09_g17.LE2-1001.001,r1i1001p1f1,PSL,2055-01,2064-12,205501-206412,Sea level pressure,Pa,1.0,month_1,/glade/campaign/cgd/cesm/CESM2-LE/timeseries/a...,ssp370,cmip6,1001.001,1001,CESM2_ssp370_r1i1001p1f1


In [8]:
# check that we only have cmip6, not smbb, data
member_id = list(subset_1.df.experiment.unique())
print(member_id)

['ssp370']


In [9]:
%%time
with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    dsets_1 = subset_1.to_dataset_dict(cdf_kwargs={'chunks': {'time':50}, 'decode_times': True})
#    dsets_1 = subset_1.to_dataset_dict(cdf_kwargs={'chunks': {'time':240}, 'decode_times': True})


--> The keys in the returned dictionary of datasets are constructed as follows:
	'component.experiment.stream.forcing_variant.variable'


CPU times: user 24.3 s, sys: 2.52 s, total: 26.8 s
Wall time: 18min 20s


In [10]:
# load in the future datasets
futures_1 = []
for key in sorted(dsets_1.keys()):
    futures_1.append(dsets_1[key])
    print(key)

atm.ssp370.cam.h0.cmip6.PSL


In [11]:
future_ds_1 = xr.concat(futures_1, dim='member_id')

In [12]:
future_ds_1.time

In [13]:
# Shift months by one to be center of time period.
# Take average of the time bounds to get middle of month
# will lose some attributes with time, so may need to put this back in later...
future_ds_1['time'] = future_ds_1.time_bnds.load().mean(dim='nbnd').sel(member_id='r1i1281p1f1')

In [14]:
# get just NH slice
future_ds_1_masked = future_ds_1.isel(lat=slice(164,192))

In [15]:
# grab variables of interest
PSL_le = future_ds_1_masked[var_in_1]

In [16]:
PSL_le.persist()

Unnamed: 0,Array,Chunk
Bytes,1.55 GiB,1.54 MiB
Shape,"(50, 1032, 28, 288)","(1, 50, 28, 288)"
Count,1300 Tasks,1300 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.55 GiB 1.54 MiB Shape (50, 1032, 28, 288) (1, 50, 28, 288) Count 1300 Tasks 1300 Chunks Type float32 numpy.ndarray",50  1  288  28  1032,

Unnamed: 0,Array,Chunk
Bytes,1.55 GiB,1.54 MiB
Shape,"(50, 1032, 28, 288)","(1, 50, 28, 288)"
Count,1300 Tasks,1300 Chunks
Type,float32,numpy.ndarray


## Calculate Seasonal Means

In [17]:
season_names = ['OND','JFM', 'AMJ', 'JAS']

In [18]:
# find total years
xarr_le = PSL_le.coords['time.year'][(PSL_le.coords['time.month']==1)]

In [19]:
# Loop through seasons - le

# make numpy array to fill and specify dimensions we want
seas_array_le_1 = np.zeros([len(season_names),len(xarr_le),len(PSL_le.member_id),len(PSL_le.lat),len(PSL_le.lon)])

for s_count, ss in enumerate(season_names):
#for s_count, ss in enumerate(season_names[0:1]):
    print(ss)
    ### Z PLEV
    # get temporary array of just these month by season
    if ss == 'JFM':
        temp1 = PSL_le.isel(time=PSL_le.time.dt.month.isin([1,2,3]))
    if ss == 'AMJ':
        temp1 = PSL_le.isel(time=PSL_le.time.dt.month.isin([4,5,6]))
    if ss == 'JAS':
        temp1 = PSL_le.isel(time=PSL_le.time.dt.month.isin([7,8,9]))
    if ss == 'OND':
        temp1 = PSL_le.isel(time=PSL_le.time.dt.month.isin([10,11,12]))
    # now loop through years to get the seasonal average by year for each ensemble member
    for y_count, yy in enumerate(xarr_le):
    #for y_count, yy in enumerate(xarr_le[0:1]):
        # select only the indexes for this year
        temp1a = temp1.isel(time=temp1.time.dt.year.isin([yy])).mean(dim='time')
        seas_array_le_1[s_count,y_count,:,:,:] = temp1a   


OND
JFM
AMJ
JAS


In [20]:
print(seas_array_le_1.shape)

(4, 86, 50, 28, 288)


In [21]:
# convert the numpy array to a xarray for easier plotting
PSL_seas_le = xr.DataArray(seas_array_le_1,dims=('season','time','member_id','lat','lon'))

In [22]:
# set coordinate arrays
PSL_seas_le['season'] = season_names
PSL_seas_le['time'] = xarr_le
PSL_seas_le['member_id'] = PSL_le['member_id']
PSL_seas_le['lat'] = PSL_le['lat'].values
PSL_seas_le['lon'] = PSL_le['lon'].values


## Write out files

In [23]:
# quick and dirty way to save a file!

# save rufmod expt, rename the variable so it makes sense
#fout = 'rufmod_vertical_seas_ens_mean_WS'
#
#WS_seas_ens_mean_rufmod.to_dataset(name='vert_ws').to_netcdf(fout+'.nc')

### PSL

In [24]:
#set info to write out
out_tag = 'PSL'
units = 'Pa'
longname = 'sea level pressure'

fout = 'CESM2-LE_seas_'+out_tag

In [26]:
ds_to_save = PSL_seas_le

In [27]:
# check how big this will be to write out in GB
ds_to_save.nbytes/(1024**3)

1.0334014892578125

In [28]:
# assign some attributes
refdata = {'Author': 'Alice DuVivier', 'units':units, 'longname':longname}

ds_to_save.attrs = refdata

In [29]:
# check data
ds_to_save


In [30]:
ds_to_save.to_netcdf(fout+'.nc')  # how to save file