# Ocean thermal forcing -- Verjans refactored with xarray
Clean ocean TF workflow for deployment on CCR.

10 Oct 2024 | EHU
- 15 Oct: try with CESM rather than IPSL, for now.  The [IPSL tripolar grid](https://cmc.ipsl.fr/international-projects/cmip5/ipsl-contribution-to-cmip5-faq/) is a complication to deal with in the next revision.
- 16 Oct: CESM2 is successful (in 2000-2014 and 1950-1999 examples)! Next try implementing `xr.mfdataset`, or else a for loop, to include all historical data together.  MFdataset may complicate the write-out on GHub but could be good for CCR.
- 24 Oct: Found that use of an unconventional fill value may have been messing with processing.  Removed fill value of 1.1e20 from the masking command `tf_out = fftf.where(gld_ds.thetao<1e10)` because xarray handles NaNs by default.

In [45]:
import os
import sys
import glob
import copy
import csv
import numpy as np
import netCDF4 as nc
import xarray as xr
import dask
from datetime import datetime

from codeFunctions import freezingPoint

In [46]:
### Settings for this run
saveBoxGreenlandNC = True
cwd                = os.getcwd()+'/'

SelModel = 'CESM2'

DirThetaoNC = f'/home/theghub/ehultee/projects/cmipdata/files/'
DirSoNC     = f'/home/theghub/ehultee/projects/cmipdata/files/'
DirSaveNC   = f'{cwd}../data/'

### Select experiment ###
To2015hist                 = True
To2100histssp585           = False
To2100histssp126           = False

if(To2015hist):
    Experiments = ['historical']
    DatesCut    = [2015]
elif(To2100histssp585): 
    Experiments = ['historical','ssp585']
    DatesCut    = [2015,2100]
elif(To2100histssp126): 
    Experiments = ['historical','ssp126']
    DatesCut    = [2015,2100]
nExp          = len(Experiments)
depthUnitConv = 1.0 #initialize depth unit converter

### Limits of Greenland domain ###
limN           = 86.0 ## degrees N latitude
limS           = 57.0 ## degrees N latitude
limE           = 4.0 ## degrees E latitude
limW           = 274.0 ## degrees E latitude
## CHECK: confirm that output shows up within this W-E box and not its E-W complement
limDp          = 1200.0
depthSubSample = 1



In [47]:
### 
nExp          = len(Experiments)
depthUnitConv = 1.0 #initialize depth unit converter

if(SelModel=='MIROCES2L'):
    dim2d              = True
    if(To2015hist):
        ls_members     = [f'r{id}' for id in range(1,30+1)]
    elif(To2100histssp585 or To2100histssp126):
        ls_members     = [f'r{id}' for id in range(1,10+1)]
    namelat            = 'latitude'
    namelon            = 'longitude'
    namez              = 'lev'
    datesendhist       = np.array(['201412'])
    if(To2100histssp585):
        datesendssp585     = np.array(['210012'])
    if(To2100histssp126):
        datesendssp126     = np.array(['210012'])
        
if(SelModel=='IPSL-CM6A-LR'):
    dim2d              = True
    if(To2015hist):
        ls_members     = [f'r{id}' for id in range(1,32+1)]
        ls_members.remove('r2') #no r2 member for IPSLCM6A
    elif(To2100histssp585 or To2100histssp126):
        ls_members     = ['r1'] #,'r3','r4','r6','r14']
    namelat            = 'nav_lat'
    namelon            = 'nav_lon'
    namez              = 'olevel'
    datesRef           = [1850.0,2015.0,2040.0] 
    datesendhist       = np.array(['194912','201412'])
    if(To2100histssp585):
        datesendssp585     = np.array(['210012'])
    if(To2100histssp126):
        datesendssp126     = np.array(['210012'])

else:
    print(f'Error script not implemented yet for {SelModel}')

# nMemb           = len(ls_members)

Error script not implemented yet for CESM2


### List the files to be read 

In [48]:
ThetaFiles_test = []
SoFiles_test = []
for expt in Experiments:
    fpath1 = DirThetaoNC+'thetao_Omon_{}_{}_'.format(SelModel, expt)
    print(fpath1)
    fpath2 = DirSoNC+'so_Omon_{}_{}_'.format(SelModel, expt)
    th_temp = glob.glob(f'{fpath1}*.nc')
    s_temp = glob.glob(f'{fpath2}*.nc')
    ThetaFiles_test += th_temp ##concat the glob lists
    SoFiles_test += s_temp

/home/theghub/ehultee/projects/cmipdata/files/thetao_Omon_CESM2_historical_


Confirm that the list is not empty.  If it is, something has gone wrong in the directory access or in the generation of names.

In [49]:
ThetaFiles_test

['/home/theghub/ehultee/projects/cmipdata/files/thetao_Omon_CESM2_historical_r11i1p1f1_gr_195001-199912.nc',
 '/home/theghub/ehultee/projects/cmipdata/files/thetao_Omon_CESM2_historical_r11i1p1f1_gr_185001-189912.nc',
 '/home/theghub/ehultee/projects/cmipdata/files/thetao_Omon_CESM2_historical_r11i1p1f1_gr_190001-194912.nc',
 '/home/theghub/ehultee/projects/cmipdata/files/thetao_Omon_CESM2_historical_r11i1p1f1_gr_200001-201412.nc']

### Load using a `with`-statement, to release memory as much as possible

Specify the paths of the `thetao` and `so` variables -- ensure they come from the same GCM (`SelModel`) and time period.  Use a `with` statement to read in, trim, and close the parent datasets.  This should leave us with the trimmed datasets `gld_ds` and `gld_so` to work with below.

In [50]:
path0 = [f for f in ThetaFiles_test if '2000' in f][0] ## thetao
path1 = [f for f in SoFiles_test if '2000' in f][0] ## salinity

## load in and trim thetao
with xr.open_dataset(path0, chunks={'lev':10}) as ds:
    ## trim to Greenland bounding box
    include_lat = (ds.lat>=limS) & (ds.lat <=limN)
    include_lon = np.logical_or(((ds.lon%360)<=limE),((ds.lon %360) >=limW)) 
    ## modulo 360 to account for lon going -180 to 180 or 0-360
    
    with dask.config.set(**{'array.slicing.split_large_chunks': True}): ## mitigate performance problem with slicing
        gld_ds = ds.where(include_lat & include_lon, drop=True)

## load and trim so
with xr.open_dataset(path1, chunks={'lev':10}) as ds1:
    ## trim to Greenland bounding box
    include_lat = (ds1.lat>=limS) & (ds1.lat <=limN)
    include_lon = np.logical_or(((ds1.lon%360)<=limE),((ds1.lon %360) >=limW))
    
    with dask.config.set(**{'array.slicing.split_large_chunks': True}): ## mitigate performance problem with slicing
        gld_so = ds1.where(include_lat & include_lon, drop=True) ## trim to Gld
    


  decode_timedelta=decode_timedelta,
  decode_timedelta=decode_timedelta,


In [51]:
gld_ds

Unnamed: 0,Array,Chunk
Bytes,7.17 MiB,7.17 MiB
Shape,"(180, 2, 29, 90)","(180, 2, 29, 90)"
Count,6 Tasks,1 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 7.17 MiB 7.17 MiB Shape (180, 2, 29, 90) (180, 2, 29, 90) Count 6 Tasks 1 Chunks Type object numpy.ndarray",180  1  90  29  2,

Unnamed: 0,Array,Chunk
Bytes,7.17 MiB,7.17 MiB
Shape,"(180, 2, 29, 90)","(180, 2, 29, 90)"
Count,6 Tasks,1 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.31 MiB,407.81 kiB
Shape,"(33, 2, 29, 90)","(10, 2, 29, 90)"
Count,14 Tasks,4 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 1.31 MiB 407.81 kiB Shape (33, 2, 29, 90) (10, 2, 29, 90) Count 14 Tasks 4 Chunks Type float64 numpy.ndarray",33  1  90  29  2,

Unnamed: 0,Array,Chunk
Bytes,1.31 MiB,407.81 kiB
Shape,"(33, 2, 29, 90)","(10, 2, 29, 90)"
Count,14 Tasks,4 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,40.78 kiB,40.78 kiB
Shape,"(29, 2, 90)","(29, 2, 90)"
Count,7 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 40.78 kiB 40.78 kiB Shape (29, 2, 90) (29, 2, 90) Count 7 Tasks 1 Chunks Type float64 numpy.ndarray",90  2  29,

Unnamed: 0,Array,Chunk
Bytes,40.78 kiB,40.78 kiB
Shape,"(29, 2, 90)","(29, 2, 90)"
Count,7 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,40.78 kiB,40.78 kiB
Shape,"(90, 2, 29)","(90, 2, 29)"
Count,7 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 40.78 kiB 40.78 kiB Shape (90, 2, 29) (90, 2, 29) Count 7 Tasks 1 Chunks Type float64 numpy.ndarray",29  2  90,

Unnamed: 0,Array,Chunk
Bytes,40.78 kiB,40.78 kiB
Shape,"(90, 2, 29)","(90, 2, 29)"
Count,7 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,59.14 MiB,5.97 MiB
Shape,"(180, 33, 29, 90)","(180, 10, 29, 30)"
Count,48 Tasks,12 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 59.14 MiB 5.97 MiB Shape (180, 33, 29, 90) (180, 10, 29, 30) Count 48 Tasks 12 Chunks Type float32 numpy.ndarray",180  1  90  29  33,

Unnamed: 0,Array,Chunk
Bytes,59.14 MiB,5.97 MiB
Shape,"(180, 33, 29, 90)","(180, 10, 29, 30)"
Count,48 Tasks,12 Chunks
Type,float32,numpy.ndarray


### Compute the ocean thermal forcing

In [52]:
fp = xr.apply_ufunc(freezingPoint, gld_so.so, gld_so.lev, dask='parallelized',
                   dask_gufunc_kwargs={'allow_rechunk':True})
fftf = gld_ds.thetao - fp

In [53]:
## mask and apply a fill value
tf_out = fftf.where(gld_ds.thetao<1e10) ## apply Vincent's fill value of 1.1e20
## actually, just let xarray do its native processing with NaNs.

In [54]:
tf_out.assign_attrs(standard_name='TF',
                    long_name='Ocean thermal forcing',
                    # fillvalue=1.1e20,
                    latbounds=[limS, limN],
                    lonbounds=[limW,limE])

Unnamed: 0,Array,Chunk
Bytes,118.28 MiB,11.95 MiB
Shape,"(180, 33, 29, 90)","(180, 10, 29, 30)"
Count,170 Tasks,12 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 118.28 MiB 11.95 MiB Shape (180, 33, 29, 90) (180, 10, 29, 30) Count 170 Tasks 12 Chunks Type float64 numpy.ndarray",180  1  90  29  33,

Unnamed: 0,Array,Chunk
Bytes,118.28 MiB,11.95 MiB
Shape,"(180, 33, 29, 90)","(180, 10, 29, 30)"
Count,170 Tasks,12 Chunks
Type,float64,numpy.ndarray


In [55]:
now = datetime.now()
ds_temp = tf_out.to_dataset(name='TF')
# ds_temp.TF.attrs = tf_out.attrs
ds_out = ds_temp.assign_attrs(title='Ocean thermal forcing for {}'.format(SelModel),
                             summary='TF computed following Verjans code, in a bounding' + 
                              ' box around Greenland, for ISMIP7 Greenland forcing',
                             institution='NASA Goddard Space Flight Center',
                             creation_date=now.strftime('%Y-%m-%d %H:%M:%S'))

ds_out

Unnamed: 0,Array,Chunk
Bytes,118.28 MiB,11.95 MiB
Shape,"(180, 33, 29, 90)","(180, 10, 29, 30)"
Count,170 Tasks,12 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 118.28 MiB 11.95 MiB Shape (180, 33, 29, 90) (180, 10, 29, 30) Count 170 Tasks 12 Chunks Type float64 numpy.ndarray",180  1  90  29  33,

Unnamed: 0,Array,Chunk
Bytes,118.28 MiB,11.95 MiB
Shape,"(180, 33, 29, 90)","(180, 10, 29, 30)"
Count,170 Tasks,12 Chunks
Type,float64,numpy.ndarray


In [56]:
ds_out.info()

xarray.Dataset {
dimensions:
	time = 180 ;
	lev = 33 ;
	lat = 29 ;
	lon = 90 ;

variables:
	object time(time) ;
		time:axis = T ;
		time:bounds = time_bnds ;
		time:standard_name = time ;
		time:title = time ;
		time:type = double ;
	float64 lev(lev) ;
		lev:axis = Z ;
		lev:bounds = lev_bnds ;
		lev:long_name = ocean model level ;
		lev:positive = down ;
		lev:standard_name = olevel ;
		lev:units = m ;
	float64 lat(lat) ;
		lat:axis = Y ;
		lat:bounds = lat_bnds ;
		lat:long_name = latitude ;
		lat:standard_name = latitude ;
		lat:units = degrees_north ;
	float64 lon(lon) ;
		lon:axis = X ;
		lon:bounds = lon_bnds ;
		lon:long_name = longitude ;
		lon:standard_name = longitude ;
		lon:units = degrees_east ;
	float64 TF(time, lev, lat, lon) ;

// global attributes:
	:title = Ocean thermal forcing for CESM2 ;
	:summary = TF computed following Verjans code, in a bounding box around Greenland, for ISMIP7 Greenland forcing ;
	:institution = NASA Goddard Space Flight Center ;
	:creation_dat

### Write NetCDF out
Attempt a write-out of this file.  The regular grid version for matplotlib is estimated at 400 MB - should be possible.

Note we can't use Vincent's `DatesCut` here, so try using the year tag from the input files instead.

In [57]:
out_path = '/home/theghub/ehultee/data/'
year_tag = path0.strip('.nc').split('_')[-1] ## take the year tag from the GCM input (only one of the two input DS, but we have tried to make them match!)
out_fn = out_path + 'tf-{}-{}-v4_no_intermed_compute.nc'.format(SelModel, year_tag)

from dask.diagnostics import ProgressBar

with ProgressBar():
    ds_out.to_netcdf(path=out_fn)

[########################################] | 100% Completed | 10.6s


### Remove later: check the output

In [None]:
import cartopy  # Map projections libary
import cartopy.crs as ccrs  # Projections list

In [58]:
out_path = '/home/theghub/ehultee/data/'
f_in = out_path + 'tf-{}-200001-201412-v4_no_intermed_compute.nc'.format(SelModel)

ds_new = xr.open_dataset(f_in)

In [59]:
ds_new

In [60]:
tf_tavg = ds_new.TF.mean(dim='time') 
## TODO: remember to re-run the above and name the output variable!
tf_tavg

In [61]:
tf_tavg.sel(lev=0.0).mean(skipna=True)

In [None]:
import matplotlib.pyplot as plt
ax = plt.axes(projection=ccrs.Robinson())
tf_tavg.sel(lev=0.0).plot(ax=ax, transform=ccrs.PlateCarree(), x='lon', y='lat') ## specify x and y coordinates
ax.coastlines(); ax.gridlines();

In [None]:
tf_tavg.sel(lev=0.0).plot()