# Sea Ice Zone Data Processing from CESM2-LE
- Process data for Southern Ocean SIZ each year and get climatological extent
- Notebook by Alice DuVivier (NCAR)
- Note: this notebook needs to load the utils.py file
- June 2024

## SIZ is the maximum area covered by sea ice in each year

- Satellite observations show maximum sea ice extent in September

In [1]:
import utils
import xarray as xr
import numpy as np
import dask
from distributed import Client
from ncar_jobqueue import NCARCluster
from glob import glob
import intake
import importlib
import pop_tools
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import cartopy
import cartopy.crs as ccrs

  from distributed.utils import tmpfile


In [None]:
# NOTE(review): `dir_in` is reassigned in the "Load CESM2-LE ocean grid" section before
# anything visible reads it — confirm whether this SSMI path is still needed.
dir_in = '/glade/campaign/cesm/development/pcwg/ssmi/CDR/'

In [None]:
# NOTE(review): `file_in` is not read anywhere in the visible notebook — confirm whether
# this SSMI file is still needed.
file_in = 'ssmi_cdr_monthly_data_gx1v5_197901-202012.nc'

## Spin up DASK cluster

In [None]:
# Build the NCAR cluster (PBSCluster under the hood): each job gets 1 core, 1 process, 100 GB
cluster = NCARCluster(
    walltime='2:00:00',
    cores=1,
    processes=1,
    memory='100 GB',
    resource_spec='select=1:ncpus=1:mem=100GB',
    interface='ext',
)

# Request 32 workers
cluster.scale(32)

# Attach a client to the cluster
client = Client(cluster)

In [None]:
# display the client summary as the cell output
client
# cluster.close()

## Define some functions we'll need to load data

In [None]:
# metrics this notebook can process, each mapped to the variables it requires
metrics_dict = {"SIZ": ['aice']}


def get_metrics_list():
    """Return the names of all metrics defined in ``metrics_dict`` as a list."""
    return [*metrics_dict]

In [None]:
# look up the variable name(s) a metric needs
def get_metric_variable(metric):
    """Return the entry stored in ``metrics_dict`` for ``metric``.

    A string entry (single variable) is returned unchanged; any other entry
    (several variables) is returned as a copy made with ``[:]``.
    Raises ``KeyError`` if ``metric`` is not a key of ``metrics_dict``.
    """
    required = metrics_dict[metric]
    # single variable stored as a bare string
    if isinstance(required, str):
        return required
    # several variables: hand back a copy rather than the stored object
    return required[:]

In [None]:
# function to copy the time bounds into a 'time_bound' variable
def preprocess(ds):
    """Store ``ds.time_bounds`` under the key ``'time_bound'`` and return ``ds``.

    The input object is modified in place and the same object is returned.
    """
    ds['time_bound'] = ds.time_bounds
    return ds

In [None]:
# function for loading datasets
def load_datasets(varnames, experiment, lat_min, lat_max):
    """Load sea ice variables for one experiment and crop them to a latitude band.

    Relies on the notebook-level ``catalog`` (opened with
    ``intake.open_esm_datastore``) being defined before the call.

    Parameters
    ----------
    varnames : str or iterable of str
        Variable name(s) to load. A single string is treated as one variable.
    experiment : str
        Experiment passed to the catalog search (the notebook uses
        'historical' and 'ssp370').
    lat_min, lat_max : number
        Bounds compared against ``TLAT``; both ends are inclusive.

    Returns
    -------
    xarray.Dataset
        The requested variables merged with ``time_bounds``, ``TLAT``,
        ``TLON``, ``tarea`` and ``time``, cropped to the latitude band.

    Raises
    ------
    ValueError
        If ``varnames`` is empty.
    """
    # a bare string would otherwise be iterated character by character
    if isinstance(varnames, str):
        varnames = [varnames]
    varnames = list(varnames)
    if not varnames:
        raise ValueError("varnames must contain at least one variable name")

    # variables kept alongside each requested one
    grid_vars = ['time_bounds', 'TLAT', 'TLON', 'tarea', 'time']

    ds_list = []
    for varname in varnames:
        subset = catalog.search(component='ice',
                                variable=varname,
                                experiment=experiment,
                                forcing_variant='cmip6',
                               )
        with dask.config.set(**{'array.slicing.split_large_chunks': True}):
            dsets = subset.to_dataset_dict()
        ds = dsets[f'ice.{experiment}.cice.h.cmip6.{varname}']

        # compute time mean (midpoint of each time bound) to get correct months
        ds['time'] = ds.time_bounds.compute().mean(dim="d2")
        # keep only some variables
        keep_vars = grid_vars + [varname]
        ds = ds.drop_vars([v for v in ds.variables if v not in keep_vars])
        ds_list.append(ds)

    # merge once, after every variable has been loaded
    ds = xr.merge(ds_list, compat="override")

    # crop data to the latitudes we want, use given lat/lon, not specific indices
    in_band = (ds['TLAT'] <= lat_max) & (ds['TLAT'] >= lat_min)
    return ds.where(in_band, drop=True)

## Select the metric you are interested in

In [None]:
import utils

In [None]:
## Print out potential metrics to investigate (the keys of metrics_dict):
metrics_list = get_metrics_list()
print(metrics_list)

In [None]:
## Enter the metric of interest (must be a key of metrics_dict):
metric = 'SIZ'

In [None]:
## Enter the years you want to keep:
# kept as strings for the label-based time slice; converted with int() to build the list of years
# NOTE(review): np.arange(int(yy_st), int(yy_ed)) excludes yy_ed, so the annual arrays stop at yy_ed - 1
yy_st = "1950"
yy_ed = "2100"

In [None]:
## Enter the latitudes you want to keep:
# compared against TLAT with >= / <=, so both bounds are inclusive (presumably degrees north — confirm)
lat_min = -80
lat_max = -60

## Load CESM2-LE data

In [None]:
# for the metric supplied, define the list of required variable names
varnames = get_metric_variable(metric) 
# display the result as the cell output
varnames

### Load the Data Catalog

In [None]:
# open the intake-esm datastore described by this JSON file; load_datasets() searches it
catalog = intake.open_esm_datastore(
    '/glade/collections/cmip/catalog/intake-esm-datastore/catalogs/glade-cesm2-le.json'
)

### Load historical data

In [None]:
%%time
# load the 'historical' experiment cropped to lat_min..lat_max, then display it
ds_hist = load_datasets(varnames,'historical',lat_min,lat_max)
ds_hist

In [None]:
# check that this data looks reasonable: map the first and last time step of one member

mem_5_hist = ds_hist.aice.isel(member_id=5)

fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(10, 4))

# first time step
mem_5_hist.isel(time=0).plot(ax=ax1)
ax1.set_title(mem_5_hist.isel(time=0).time.values)

# last time step: index -1 so the panel matches the "first and last" figure title
mem_5_hist.isel(time=-1).plot(ax=ax2)
ax2.set_title(mem_5_hist.isel(time=-1).time.values)

fig.suptitle("first and last ice concentration for hist dataset")

plt.tight_layout()
plt.show()

### Load future data

In [None]:
%%time
# load the 'ssp370' experiment cropped to lat_min..lat_max, then display it
ds_ssp = load_datasets(varnames, 'ssp370',lat_min,lat_max)
ds_ssp

In [None]:
# check that this data looks reasonable: map the first and last time step of one member

mem_5_ssp = ds_ssp.aice.isel(member_id=5)

fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(10, 4))

# first time step
mem_5_ssp.isel(time=0).plot(ax=ax1)
ax1.set_title(mem_5_ssp.isel(time=0).time.values)

# last time step
mem_5_ssp.isel(time=-1).plot(ax=ax2)
ax2.set_title(mem_5_ssp.isel(time=-1).time.values)

# the plotted field is aice (ice concentration), so the title says so
fig.suptitle("first and last ice concentration for future dataset")

plt.tight_layout()
plt.show()

## Concatenate historical and future datasets and crop to just the times we need


In [None]:
# join the historical and future datasets end to end along the time dimension
ds = xr.concat((ds_hist, ds_ssp),dim='time')
# display the combined time coordinate
ds.time

In [None]:
# keep just years chosen above (label-based slice on the time coordinate)
ds = ds.sel(time=slice(yy_st, yy_ed))
#ds = ds.isel(time=slice(1200, 3012)) # Cut timeseries to 1950 - 2100: 1812 time steps
ds.time

In [None]:
# check that this data looks reasonable: map the first and last time step of one member

mem_5 = ds.aice.isel(member_id=5)

fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(10, 4))

# first time step
mem_5.isel(time=0).plot(ax=ax1)
ax1.set_title(mem_5.isel(time=0).time.values)

# last time step
mem_5.isel(time=-1).plot(ax=ax2)
ax2.set_title(mem_5.isel(time=-1).time.values)

# the plotted field is aice (ice concentration), so the title says so
fig.suptitle("first and last ice concentration for concatenated dataset")

plt.tight_layout()
plt.show()

# Compute Annual Maximum SIE
* Maximum extent is either in September or October

In [None]:
# get some sizes
# np.arange excludes its end value, so the years run from yy_st to yy_ed - 1
years = np.arange(int(yy_st),int(yy_ed),1)

# annual timesteps: derived from the year list so both stay in step if yy_st / yy_ed change
tlen = len(years)
xlen = len(ds.ni)
ylen = len(ds.nj)
elen = len(ds.member_id)
print(tlen,xlen,ylen,elen)

variable = 'aice'

### Get the September and October sea ice

In [None]:
%%time

var = variable

ds_Sep = xr.Dataset()
Sep = np.zeros([tlen,elen,ylen,xlen])

ds_Oct = xr.Dataset()
Oct = np.zeros([tlen,elen,ylen,xlen])

ds_Sep_Oct = xr.Dataset()
Sep_Oct = np.zeros([tlen,elen,ylen,xlen])

for year in np.arange(0,150,1):

    # September
    #get time index of september for this year
    st_mon_sep = year*12 + 8
    #get that month only
    Sep[year,:,:,:] = ds[var].isel(time=st_mon_sep)

    # October
    #get time index of september for this year
    st_mon_oct = year*12 + 9
    #get that month only
    Oct[year,:,:,:] = ds[var].isel(time=st_mon_oct)

    # Sept and Oct mean
    #get those months only
    temp = ds[var].isel(time=slice(st_mon_sep,st_mon_oct+1))
    Sep_Oct[year,:,:,:] = temp.mean(dim='time')

# turn into xarrays
Sep = xr.DataArray(Sep,dims=('time','member_id','nlat','nlon'))
ds_Sep[var] = Sep
ds_Sep[var].attrs['units'] = ''
ds_Sep['time'] = years

Oct = xr.DataArray(Oct,dims=('time','member_id','nlat','nlon'))
ds_Oct[var] = Oct
ds_Oct[var].attrs['units'] = ''
ds_Oct['time'] = years

Sep_Oct = xr.DataArray(Sep_Oct,dims=('time','member_id','nlat','nlon'))
ds_Sep_Oct[var] = Sep_Oct
ds_Sep_Oct[var].attrs['units'] = ''
ds_Sep_Oct['time'] = years

### Calculate the SIA each year

In [None]:
# keep only regions with >15% ice cover (cells at or below 0.15 become NaN)
# NOTE(review): the SIZ mask built later in the notebook compares with >= 0.15 — confirm which is intended
ds_Sep_mask = ds_Sep.where(ds_Sep.aice > 0.15)
ds_Oct_mask = ds_Oct.where(ds_Oct.aice > 0.15)
ds_Sep_Oct_mask = ds_Sep_Oct.where(ds_Sep_Oct.aice > 0.15)

In [None]:
# label the time axis of each masked dataset with the calendar years
ds_Sep_mask['time'] = years
ds_Oct_mask['time'] = years
ds_Sep_Oct_mask['time'] = years

In [None]:
# multiply by tarea
# take tarea from the first time step and rename its dimensions to match the nlat/nlon arrays built above
tarea = ds.tarea.isel(time=0)
tarea = tarea.rename({'nj':'nlat','ni': 'nlon'})

ds_Sep_mask = ds_Sep_mask*tarea
ds_Oct_mask = ds_Oct_mask*tarea
ds_Sep_Oct_mask = ds_Sep_Oct_mask*tarea

In [None]:
# sum over all lat/lon points, leaving one value per year and ensemble member
ds_Sep_sum = ds_Sep_mask.aice.sum(dim=['nlat','nlon'])
ds_Oct_sum = ds_Oct_mask.aice.sum(dim=['nlat','nlon'])
ds_Sep_Oct_sum = ds_Sep_Oct_mask.aice.sum(dim=['nlat','nlon'])

In [None]:
# get ensemble mean and standard deviations (statistics taken across member_id, one value per year)
ds_Sep_avg = ds_Sep_sum.mean(dim='member_id')
ds_Sep_std = ds_Sep_sum.std(dim='member_id')
ds_Oct_avg = ds_Oct_sum.mean(dim='member_id')
ds_Oct_std = ds_Oct_sum.std(dim='member_id')
ds_Sep_Oct_avg = ds_Sep_Oct_sum.mean(dim='member_id')
ds_Sep_Oct_std = ds_Sep_Oct_sum.std(dim='member_id')

In [None]:
# quick look: plot the three ensemble-mean time series in one cell
ds_Sep_avg.plot()
ds_Oct_avg.plot()
ds_Sep_Oct_avg.plot()

In [None]:
# Make nicer plot: all three ensemble means, with +/- one standard deviation drawn only for the Sep/Oct mean

# create figure
fig = plt.figure(figsize=(20,10))

# Make subplot - note it's nrow x ncol x index (starting upper left)
ax = fig.add_subplot(1,1,1) 

# plot the Sep values
ax.plot(years,ds_Sep_avg,label="September",color='blue',linestyle='solid',linewidth=2)
#ax.plot(years,ds_Sep_avg+ds_Sep_std,color='blue',linestyle='dashed',linewidth=1)
#ax.plot(years,ds_Sep_avg-ds_Sep_std,color='blue',linestyle='dashed',linewidth=1)

# plot the Oct values
ax.plot(years,ds_Oct_avg,label="October",color='red',linestyle='solid',linewidth=2)
#ax.plot(years,ds_Oct_avg+ds_Oct_std,color='red',linestyle='dashed',linewidth=1)
#ax.plot(years,ds_Oct_avg-ds_Oct_std,color='red',linestyle='dashed',linewidth=1)

# plot the mean Sep/Oct values (dashed lines are mean +/- one standard deviation)
ax.plot(years,ds_Sep_Oct_avg,label="SepOct Mean",color='black',linestyle='solid',linewidth=2)
ax.plot(years,ds_Sep_Oct_avg+ds_Sep_Oct_std,color='black',linestyle='dashed',linewidth=1)
ax.plot(years,ds_Sep_Oct_avg-ds_Sep_Oct_std,color='black',linestyle='dashed',linewidth=1)

# finish up plot
plt.title('Maximum Southern Hemisphere SIA',fontsize=20)
plt.xlabel('year',fontsize=20)
plt.xticks(fontsize=20, rotation=45)
plt.xlim([1950,2100])
plt.yticks(fontsize=20)
plt.legend(ncol=1,fontsize=20)


In [None]:
# Make nicer plot: September and October ensemble means, each with +/- one standard deviation

# create figure
fig = plt.figure(figsize=(20,10))

# Make subplot - note it's nrow x ncol x index (starting upper left)
ax = fig.add_subplot(1,1,1) 

# plot the Sep values (dashed lines are mean +/- one standard deviation)
ax.plot(years,ds_Sep_avg,label="September",color='blue',linestyle='solid',linewidth=2)
ax.plot(years,ds_Sep_avg+ds_Sep_std,color='blue',linestyle='dashed',linewidth=1)
ax.plot(years,ds_Sep_avg-ds_Sep_std,color='blue',linestyle='dashed',linewidth=1)

# plot the Oct values (dashed lines are mean +/- one standard deviation)
ax.plot(years,ds_Oct_avg,label="October",color='red',linestyle='solid',linewidth=2)
ax.plot(years,ds_Oct_avg+ds_Oct_std,color='red',linestyle='dashed',linewidth=1)
ax.plot(years,ds_Oct_avg-ds_Oct_std,color='red',linestyle='dashed',linewidth=1)

# finish up plot
plt.title('Maximum Southern Hemisphere SIA',fontsize=20)
plt.xlabel('year',fontsize=20)
plt.xticks(fontsize=20, rotation=45)
plt.xlim([1950,2100])
plt.yticks(fontsize=20)
plt.legend(ncol=1,fontsize=20)


- Satellite observations show maximum sea ice extent in September, but past work showed CESM2 maximum in October (DuVivier et al. 2020, https://doi.org/10.1029/2019JC015934).
- The above figures show that in the CESM2-LE the September sea ice area is greater than the October sea ice area. Time indices have been checked to verify we're pulling the correct months.
- This result is contrary to the DuVivier 2020 paper, but does match observations better.
- Therefore, for calculation of yearly SIZ we will use CESM2-LE September sea ice concentrations.

## Calculate SIZ

### Load CESM2-LE ocean grid

In [None]:
# directory path for the ocean grid and region mask files read below
# (this reuses, and overwrites, the name dir_in set at the top of the notebook)
dir_in = '/glade/campaign/cgd/ppc/duvivier/masks/'

In [None]:
# load ocean grid (times are left undecoded)
fin = 'ocn_grid.nc'
ds_grid = xr.open_mfdataset(dir_in+fin,decode_times=False)
# store nlat/nlon as explicit variables so other datasets can be given the same labels
# (presumably these are plain index values when the file carries none — confirm)
ds_grid['nlat'] = ds_grid.nlat
ds_grid['nlon'] = ds_grid.nlon
ds_grid

In [None]:
# load sea ice masks
fin = 'REGION_MASK_gx1v7.nc'
ds_mask = xr.open_mfdataset(dir_in+fin,decode_times=False)
# keep the first time step only, then rename lat/lon so the dimension names match the grid
ds_mask = ds_mask.isel(time=0)
ds_mask = ds_mask.rename({'lat':'nlat','lon': 'nlon'})
# give the mask the same nlat/nlon labels as the grid
ds_mask['nlat'] = ds_grid.nlat
ds_mask['nlon'] = ds_grid.nlon
ds_mask

In [None]:
# crop data to the latitudes we want, use given lat/lon, not specific indices
# Build the latitude test once from the uncropped grid, so both datasets are cropped
# with the same full-size condition rather than one derived from the already-cropped grid.
in_lat_band = (ds_grid['TLAT'] <= lat_max) & (ds_grid['TLAT'] >= lat_min)
ds_grid = ds_grid.where(in_lat_band, drop=True)
ds_mask = ds_mask.where(in_lat_band, drop=True)

### Make SIZ masks

In [None]:
# set some coordinates for the sea ice so we can do the masking
# (ds_Sep takes the cropped mask's nlat/nlon labels)
ds_Sep['nlat'] = ds_mask.nlat
ds_Sep['nlon'] = ds_mask.nlon

In [None]:
# create a mask using the 0-1 array from sh_mask and where SIC is above 15%
# (sh_mask is kept where September aice >= 0.15; every other cell is set to 0)
# NOTE(review): the SIA masking earlier in the notebook uses > 0.15 rather than >= 0.15 — confirm which is intended
SIZ_mask = ds_mask.sh_mask.where(ds_Sep.aice >= 0.15,0)

# omit land points
# NOTE(review): this keeps only cells where REGION_MASK equals 1 (all others become NaN),
# which may exclude more than land — confirm
SIZ_mask = SIZ_mask.where(ds_grid.REGION_MASK == 1)

SIZ_mask

In [None]:
# quick look at the mask for the first year and first ensemble member
SIZ_mask.isel(time=0,member_id=0).plot()

## Put together into one dataset

- NetCDF file with variable dimensions (member_id x year x nlat x nlon). Include TLAT and TLONG as coordinates.

In [None]:
# collect the September concentration and the SIZ mask in one output dataset
ds_out = xr.Dataset()

ds_out['SIC_SEP'] = ds_Sep[variable]
ds_out['SIZ_SEP'] = SIZ_mask

# change the attributes
ds_out.attrs['author'] = 'Alice DuVivier'
ds_out.attrs['date_processed'] = datetime.now().strftime('%Y-%m-%d')
# take the year range from the stored time axis instead of hard-coding it,
# so the description stays correct if the selected years change
yr_first = int(ds_out['time'].values[0])
yr_last = int(ds_out['time'].values[-1])
ds_out.attrs['contents'] = f'September {variable} in the Southern Ocean from {yr_first} to {yr_last}. SIZ is defined as maximum sea ice extent in a year.'

ds_out

In [None]:
# Print the dimensions (name and the shape of the values along that dimension)
print("Dimensions:")
for dim in ds_out.dims:
    print(f"\t{dim}: {ds_out[dim].values.shape}")

# Print the coordinates (name followed by the full array of values)
print("Coordinates:")
for coord in ds_out.coords:
    print(f"\t{coord}:")
    print(f"\t\t{ds_out.coords[coord].values}")
    
# Print the attributes (global attributes set on ds_out)
print("Attributes:")
for attr in ds_out.attrs:
    print(f"\t{attr}: {ds_out.attrs[attr]}")
    

## Export and Save

In [None]:
# calculate the size of the dataset in GB (nbytes divided by 1024**3)
size_gb = ds_out.nbytes / (1024**3)
print(f"The dataset is approximately {size_gb:.2f} GB.")


In [None]:
# output directory and file name; fout is the full path used for writing and for the read-back check
path_out = '/glade/campaign/cgd/ppc/duvivier/cesm2_antarctic_polynya/mpa_analysis/DATA/ecoindex_data/seaice/'
file_out = 'CESM2-LE-SIZ_SIC.nc'
fout = path_out + file_out

In [None]:
# Export the dataset to NetCDF with all attributes and coordinates
# (called with the path only, so to_netcdf uses its default settings)
ds_out.to_netcdf(fout)

In [None]:
# shut down the dask cluster now that the output file has been written
cluster.close()

## Test if it worked

In [None]:
import netCDF4
# open the file just to list its variables; the context manager closes the
# file handle again so it does not stay open for the rest of the session
with netCDF4.Dataset(fout) as nc:
    print(nc.variables)

In [None]:
# re-open the file that was just written, letting xarray decode the time axis
# NOTE(review): `path` is not used afterwards in the visible notebook
path =  path_out
ds_in =  xr.open_dataset(path_out+file_out, decode_times=True)

In [None]:
# print a summary of the re-opened file
ds_in.info()

In [None]:
# check the saved SIZ mask for one ensemble member: first and last year side by side
mem_5_ints = ds_in.SIZ_SEP.isel(member_id=5)

fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(10, 4))

# first year
mem_5_ints.isel(time=0).plot(ax=ax1)

# last year
mem_5_ints.isel(time=-1).plot(ax=ax2)

fig.suptitle("first and last SIZ")

plt.tight_layout()
plt.show()