# Summer SST Data Processing from CESM2-LE derived output
- Process data for Southern Ocean summer mean SST.
- Notebook by Alice DuVivier (NCAR), Kristen Krumhardt (NCAR)
- Note: this notebook needs to load the utils.py file
- October 2024

In [1]:
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
import os
from glob import glob
import xarray as xr
import numpy as np
import esmlab
import pop_tools 
import dask
from distributed import Client
from ncar_jobqueue import NCARCluster
import utils
from datetime import datetime
import matplotlib.pyplot as plt
import cartopy
import cmocean
import cartopy.crs as ccrs

## Spin up DASK cluster

In [2]:
# Create our NCAR Cluster - which uses PBSCluster under the hood
cluster = NCARCluster(walltime='2:00:00', cores=1, processes=1, memory='100 GB', 
                      resource_spec='select=1:ncpus=1:mem=100GB',interface='ext')

# Spin up 32 workers
cluster.scale(32)

# Assign the cluster to our Client
client = Client(cluster)

In [3]:
client
# cluster.close()

0,1
Connection method: Cluster object,Cluster type: dask_jobqueue.PBSCluster
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/duvivier/PBS/proxy/8787/status,

0,1
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/duvivier/PBS/proxy/8787/status,Workers: 0
Total threads: 0,Total memory: 0 B

0,1
Comm: tcp://128.117.208.112:41345,Workers: 0
Dashboard: https://jupyterhub.hpc.ucar.edu/stable/user/duvivier/PBS/proxy/8787/status,Total threads: 0
Started: Just now,Total memory: 0 B


## Load the data

In [4]:
# load grid information
ds_grid = pop_tools.get_grid('POP_gx1v7')
lons = ds_grid.TLONG
lats = ds_grid.TLAT
area = ds_grid.TAREA
area_m = ds_grid.TAREA * 1e-4
lons_norm = utils.normal_lons(lons)

In [5]:
# Select variable of interest and locate the files
varname = 'SST'
path = '/glade/campaign/cgd/cesm/CESM2-LE/ocn/proc/tseries/month_1/' + varname

In [6]:
# list all 50 cmip6 standard forcing ensemble numbers
ens_mems = ['1001.001', '1021.002', '1041.003', '1061.004', '1081.005', '1101.006', '1121.007', '1141.008', '1161.009', '1181.010',
            '1231.001', '1231.002', '1231.003', '1231.004', '1231.005', '1231.006', '1231.007', '1231.008', '1231.009', '1231.010',
            '1251.001', '1251.002', '1251.003', '1251.004', '1251.005', '1251.006', '1251.007', '1251.008', '1251.009', '1251.010',
            '1281.001', '1281.002', '1281.003', '1281.004', '1281.005', '1281.006', '1281.007', '1281.008', '1281.009', '1281.010',
            '1301.001', '1301.002', '1301.003', '1301.004', '1301.005', '1301.006', '1301.007', '1301.008', '1301.009', '1301.010']

### Load historical data

In [7]:
%%time

print('loading historical data')
ds_hist = xr.Dataset()

for m in ens_mems:
    print(m)

    case = 'b.e21.BHISTcmip6.f09_g17.LE2-' + m

    files = sorted(glob(f'{path}/{case}.pop.h.{varname}.??????-??????.nc'))       
    ds_tmp = xr.open_mfdataset(files, data_vars="minimal", coords='minimal', compat="override", parallel=True, 
                               concat_dim="time", combine='nested', decode_times=True)
    # take an average over the time bounds to get the right time dimension
    ds_tmp["time"] = ds_tmp.time_bound.compute().mean(dim="d2")

    # keep only some variables
    keep_vars=['time_bound','TLAT','TLONG','time'] + [varname]
    ds_tmp = ds_tmp.drop([v for v in ds_tmp.variables if v not in keep_vars])
    
    ds_hist = xr.concat([ds_hist,ds_tmp], dim='member_id')


loading historical data
1001.001
1021.002
1041.003
1061.004
1081.005
1101.006
1121.007
1141.008
1161.009
1181.010
1231.001
1231.002
1231.003
1231.004
1231.005
1231.006
1231.007
1231.008
1231.009
1231.010
1251.001
1251.002
1251.003
1251.004
1251.005
1251.006
1251.007
1251.008
1251.009
1251.010
1281.001
1281.002
1281.003
1281.004
1281.005
1281.006
1281.007
1281.008
1281.009
1281.010
1301.001
1301.002
1301.003
1301.004
1301.005
1301.006
1301.007
1301.008
1301.009
1301.010
CPU times: user 18.1 s, sys: 1.1 s, total: 19.2 s
Wall time: 1min 46s


### Load future data

In [8]:
%%time

print('loading future data')
ds_ssp = xr.Dataset()

for m in ens_mems:
    print(m)

    case = 'b.e21.BSSP370cmip6.f09_g17.LE2-' + m

    files = sorted(glob(f'{path}/{case}.pop.h.{varname}.??????-??????.nc'))       
    ds_tmp = xr.open_mfdataset(files, data_vars="minimal", coords='minimal', compat="override", parallel=True, 
                               concat_dim="time", combine='nested', decode_times=True)
    # take an average over the time bounds to get the right time dimension
    ds_tmp["time"] = ds_tmp.time_bound.compute().mean(dim="d2")

    # keep only some variables
    keep_vars=['time_bound','TLAT','TLONG','time'] + [varname]
    ds_tmp = ds_tmp.drop([v for v in ds_tmp.variables if v not in keep_vars])
    
    ds_ssp = xr.concat([ds_ssp,ds_tmp], dim='member_id')


loading future data
1001.001
1021.002
1041.003
1061.004
1081.005
1101.006
1121.007
1141.008
1161.009
1181.010
1231.001
1231.002
1231.003
1231.004
1231.005
1231.006
1231.007
1231.008
1231.009
1231.010
1251.001
1251.002
1251.003
1251.004
1251.005
1251.006
1251.007
1251.008
1251.009
1251.010
1281.001
1281.002
1281.003
1281.004
1281.005
1281.006
1281.007
1281.008
1281.009
1281.010
1301.001
1301.002
1301.003
1301.004
1301.005
1301.006
1301.007
1301.008
1301.009
1301.010
CPU times: user 12 s, sys: 543 ms, total: 12.5 s
Wall time: 51.2 s


## Concatenate historical and future datasets

In [9]:
ds = xr.concat((ds_hist, ds_ssp),dim='time')

In [10]:
ds = ds.isel(z_t=0)

In [11]:
ds

Unnamed: 0,Array,Chunk
Bytes,68.94 GiB,56.25 MiB
Shape,"(50, 3012, 384, 320)","(1, 120, 384, 320)"
Count,40924 Tasks,1300 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 68.94 GiB 56.25 MiB Shape (50, 3012, 384, 320) (1, 120, 384, 320) Count 40924 Tasks 1300 Chunks Type float32 numpy.ndarray",50  1  320  384  3012,

Unnamed: 0,Array,Chunk
Bytes,68.94 GiB,56.25 MiB
Shape,"(50, 3012, 384, 320)","(1, 120, 384, 320)"
Count,40924 Tasks,1300 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.30 MiB,1.88 kiB
Shape,"(50, 3012, 2)","(1, 120, 2)"
Count,39624 Tasks,1300 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 2.30 MiB 1.88 kiB Shape (50, 3012, 2) (1, 120, 2) Count 39624 Tasks 1300 Chunks Type object numpy.ndarray",2  3012  50,

Unnamed: 0,Array,Chunk
Bytes,2.30 MiB,1.88 kiB
Shape,"(50, 3012, 2)","(1, 120, 2)"
Count,39624 Tasks,1300 Chunks
Type,object,numpy.ndarray


### Subset data as needed

#### Keep just 1950 and afterward

In [12]:
## Enter the years you want to keep:
yy_st = "1950"
yy_ed = "2100"

ds_keep = ds.where(ds.time.dt.year >= 1950, drop = True)

#### Keep just southern ocean

In [13]:
## Enter the latitudes you want to keep:
lat_min = -80
lat_max = -60

In [14]:
# crop data to the latitudes we want, use given lat/lon, not specific indices
ds_keep = ds_keep.where(((ds_grid['TLAT'] <= lat_max) & (ds_grid['TLAT'] >= lat_min)), drop=True)

In [15]:
ds_keep

Unnamed: 0,Array,Chunk
Bytes,3.89 GiB,5.27 MiB
Shape,"(50, 1812, 36, 320)","(1, 120, 36, 320)"
Count,44158 Tasks,800 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 3.89 GiB 5.27 MiB Shape (50, 1812, 36, 320) (1, 120, 36, 320) Count 44158 Tasks 800 Chunks Type float32 numpy.ndarray",50  1  320  36  1812,

Unnamed: 0,Array,Chunk
Bytes,3.89 GiB,5.27 MiB
Shape,"(50, 1812, 36, 320)","(1, 120, 36, 320)"
Count,44158 Tasks,800 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,15.55 GiB,21.09 MiB
Shape,"(50, 1812, 2, 36, 320)","(1, 120, 2, 36, 320)"
Count,42858 Tasks,800 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 15.55 GiB 21.09 MiB Shape (50, 1812, 2, 36, 320) (1, 120, 2, 36, 320) Count 42858 Tasks 800 Chunks Type object numpy.ndarray",1812  50  320  36  2,

Unnamed: 0,Array,Chunk
Bytes,15.55 GiB,21.09 MiB
Shape,"(50, 1812, 2, 36, 320)","(1, 120, 2, 36, 320)"
Count,42858 Tasks,800 Chunks
Type,object,numpy.ndarray


## Calculate summer (ONDJFM) means

In [16]:
#starting with Jan, list of days in each month
days_per_month = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]

In [17]:
# Dataset will become shorter because of the Jan.1 boundary for some seasons
# for 1950-2100, 150 years long (1800 time steps instead of 1812). Last year will be 2099
years = np.arange(int(yy_st),int(yy_ed),1)

tlen = 150 #annual timesteps 
xlen = len(ds_keep.nlon)
ylen = len(ds_keep.nlat)
elen = len(ds_keep.member_id)
print(tlen,xlen,ylen,elen)

150 320 36 50


In [29]:
%%time

ds_Oct_Mar = xr.Dataset()
ds_Mar = xr.Dataset()

Oct_Mar_array = np.zeros([tlen,elen,ylen,xlen])
Mar_array = np.zeros([tlen,elen,ylen,xlen])

for year in np.arange(0,tlen,1):

    # get index for months in the first year
    ind_st = year*12 + 9
    ind_ed = ind_st +6
    # get data for these times
    tmp = ds_keep.isel(time=slice(ind_st,ind_ed))
    tmp = tmp.SST
    # keep the data
    Oct_Mar_array[year,:,:,:] = tmp.mean(dim='time')
    # get data for March only
    tmp = ds_keep.isel(time=ind_ed-1)
    # keep the data
    Mar_array[year,:,:,:] = tmp.SST

# convert to xarray
ds_Oct_Mar = xr.DataArray(Oct_Mar_array,dims=('time','member_id','nlat','nlon'))

ds_Oct_Mar[varname] = ds_Oct_Mar
ds_Oct_Mar[varname].attrs['units'] = 'degC'
ds_Oct_Mar['time'] = years
ds_Oct_Mar

# March
ds_Mar = xr.DataArray(Mar_array,dims=('time','member_id','nlat','nlon'))

ds_Mar[varname] = ds_Mar
ds_Mar[varname].attrs['units'] = 'degC'
ds_Mar['time'] = years
ds_Mar

CPU times: user 2min 27s, sys: 7.36 s, total: 2min 34s
Wall time: 9min 22s


## Put together into one dataset

- Netcdf with variable dimension: (member_id x year x nlat x nlon)
- Include TAREA, andd TLAT and TLONG as coordinates

In [30]:
# load grid data to include in output file
dir_in = '/glade/u/home/duvivier/masks/'
fin = 'ocn_grid_gx1v7.nc'
ds_grid = xr.open_mfdataset(dir_in+fin,decode_times=False)
ds_grid['nlat'] = ds_grid.nlat
ds_grid['nlon'] = ds_grid.nlon

# crop data to the latitudes we want, use given lat/lon, not specific indices
ds_grid = ds_grid.where(((ds_grid['TLAT'] <= lat_max) & (ds_grid['TLAT'] >= lat_min)), drop=True)

In [32]:
ds_out['SST_ONDJFM'] = ds_Oct_Mar
ds_out['SST_MAR'] = ds_Mar
ds_out['TAREA'] = ds_grid.TAREA

# change the attributes
ds_out.attrs['author'] = 'Alice DuVivier'
ds_out.attrs['date_processed'] = datetime.now().strftime('%Y-%m-%d')
ds_out.attrs['contents'] = f'summer mean and March sea surface temperature in the Southern Ocean from 1950 to 2099 from the CESM2 Large Ensemble.'

ds_out

Unnamed: 0,Array,Chunk
Bytes,90.00 kiB,90.00 kiB
Shape,"(36, 320)","(36, 320)"
Count,4 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 90.00 kiB 90.00 kiB Shape (36, 320) (36, 320) Count 4 Tasks 1 Chunks Type float64 numpy.ndarray",320  36,

Unnamed: 0,Array,Chunk
Bytes,90.00 kiB,90.00 kiB
Shape,"(36, 320)","(36, 320)"
Count,4 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,90.00 kiB,90.00 kiB
Shape,"(36, 320)","(36, 320)"
Count,4 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 90.00 kiB 90.00 kiB Shape (36, 320) (36, 320) Count 4 Tasks 1 Chunks Type float64 numpy.ndarray",320  36,

Unnamed: 0,Array,Chunk
Bytes,90.00 kiB,90.00 kiB
Shape,"(36, 320)","(36, 320)"
Count,4 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,90.00 kiB,90.00 kiB
Shape,"(36, 320)","(36, 320)"
Count,4 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 90.00 kiB 90.00 kiB Shape (36, 320) (36, 320) Count 4 Tasks 1 Chunks Type float64 numpy.ndarray",320  36,

Unnamed: 0,Array,Chunk
Bytes,90.00 kiB,90.00 kiB
Shape,"(36, 320)","(36, 320)"
Count,4 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,90.00 kiB,90.00 kiB
Shape,"(36, 320)","(36, 320)"
Count,4 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 90.00 kiB 90.00 kiB Shape (36, 320) (36, 320) Count 4 Tasks 1 Chunks Type float64 numpy.ndarray",320  36,

Unnamed: 0,Array,Chunk
Bytes,90.00 kiB,90.00 kiB
Shape,"(36, 320)","(36, 320)"
Count,4 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,90.00 kiB,90.00 kiB
Shape,"(36, 320)","(36, 320)"
Count,12 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 90.00 kiB 90.00 kiB Shape (36, 320) (36, 320) Count 12 Tasks 1 Chunks Type float64 numpy.ndarray",320  36,

Unnamed: 0,Array,Chunk
Bytes,90.00 kiB,90.00 kiB
Shape,"(36, 320)","(36, 320)"
Count,12 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [34]:
# drop the ULAT/ULONG variables
ds_out = ds_out.drop('ULAT')
ds_out = ds_out.drop('ULONG')

In [35]:
ds_out

Unnamed: 0,Array,Chunk
Bytes,90.00 kiB,90.00 kiB
Shape,"(36, 320)","(36, 320)"
Count,4 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 90.00 kiB 90.00 kiB Shape (36, 320) (36, 320) Count 4 Tasks 1 Chunks Type float64 numpy.ndarray",320  36,

Unnamed: 0,Array,Chunk
Bytes,90.00 kiB,90.00 kiB
Shape,"(36, 320)","(36, 320)"
Count,4 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,90.00 kiB,90.00 kiB
Shape,"(36, 320)","(36, 320)"
Count,4 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 90.00 kiB 90.00 kiB Shape (36, 320) (36, 320) Count 4 Tasks 1 Chunks Type float64 numpy.ndarray",320  36,

Unnamed: 0,Array,Chunk
Bytes,90.00 kiB,90.00 kiB
Shape,"(36, 320)","(36, 320)"
Count,4 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,90.00 kiB,90.00 kiB
Shape,"(36, 320)","(36, 320)"
Count,12 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 90.00 kiB 90.00 kiB Shape (36, 320) (36, 320) Count 12 Tasks 1 Chunks Type float64 numpy.ndarray",320  36,

Unnamed: 0,Array,Chunk
Bytes,90.00 kiB,90.00 kiB
Shape,"(36, 320)","(36, 320)"
Count,12 Tasks,1 Chunks
Type,float64,numpy.ndarray


In [36]:
# Print the dimensions
print("Dimensions:")
for dim in ds_out.dims:
    print(f"\t{dim}: {ds_out[dim].values.shape}")

# Print the coordinates
print("Coordinates:")
for coord in ds_out.coords:
    print(f"\t{coord}:")
    print(f"\t\t{ds_out.coords[coord].values}")
    
# Print the attributes
print("Attributes:")
for attr in ds_out.attrs:
    print(f"\t{attr}: {ds_out.attrs[attr]}")
    

Dimensions:
	time: (150,)
	member_id: (50,)
	nlat: (36,)
	nlon: (320,)
Coordinates:
	time:
		[1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963
 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977
 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991
 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005
 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019
 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033
 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047
 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061
 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075
 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089
 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099]
	SST_MAR:
		[[[[        nan         nan         nan ...         nan         nan
            nan]
   [        nan         nan         nan ...     

## Export and Save

In [38]:
# calculate the size of the dataset in GB
size_gb = ds_out.nbytes / (1024**3)
print(f"The dataset is approximately {size_gb:.2f} GB.")


The dataset is approximately 0.64 GB.


In [39]:
variable = 'SST'

path_out = '/glade/campaign/cgd/ppc/duvivier/cesm2_antarctic_polynya/mpa_analysis/DATA/ecoindex_data/sea_ice/'
file_out = 'CESM2-LE-summer-mean-'+variable+'.nc'
fout = path_out + file_out

In [40]:
# Export the dataset to NetCDF with all attributes and coordinates
ds_out.to_netcdf(fout)

In [41]:
cluster.close()

distributed.client - ERROR - Failed to reconnect to scheduler after 30.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
concurrent.futures._base.CancelledError
