# Self-Organizing Maps (SOMs) Notebook
## Data extraction for composites

**Notebook by Maria J. Molina (NCAR) and Alice DuVivier (NCAR).**

This Notebook reads in data from the CESM2-LE for a user-specified variable. It then subsets the data to the region around Antarctica so that composites can be created from it.

In [None]:
# Needed imports

from minisom import MiniSom, asymptotic_decay
import xarray as xr
import cftime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import product
import cartopy
import cartopy.crs as ccrs
from cartopy.util import add_cyclic_point
from datetime import timedelta
from itertools import product

import intake
from distributed import Client
from ncar_jobqueue import NCARCluster

In [None]:
import dask
dask.__version__
# Did a conda update dask on cheyenne to get to 2021.09

In [None]:
# Start a dask cluster through the NCAR job queue and attach a client to it.
# NOTE(review): cores=4 is requested here while resource_spec asks for ncpus=2 —
# confirm which value is intended.

cluster = NCARCluster(
    memory='100 GB',
    walltime='1:00:00',
    cores=4,
    processes=2,
    resource_spec='select=1:ncpus=2:mem=100GB',
)
cluster.scale(40)  # number of workers requested
#cluster.adapt(1,80) # min and max
client = Client(cluster)

`cluster = NCARCluster(memory="100GB", walltime='8:00:00', cores=4, processes=2, resource_spec='select=1:ncpus=2:mem=100GB')`

Each worker has 100GB of memory; this is assigned by `resource_spec`.

In [None]:
client

In [None]:
#dask.config.set({"array.slicing.split_large_chunks": True})

#Set dask.config.set({"array.slicing.split_large_chunks": False}) to allow the large chunk and silence the warning.

#### HERE - MAX SAID TRY THIS TOO (BELOW)
# with dask.config.set... (DO RIGHT BEFORE ACTUALLY READING DATA WITH DATASET DICT)
#Set dask.config.set({"array.slicing.split_large_chunks": True}) to avoid creating the large chunk in the first place.

## Section 1: Load and get correct training data

In [None]:
# set some info for the CESM2-LE data
# set: the variable to extract and the forcing variant to search the catalog for
var_in = 'hi_d'
# keep only the 'cmip6' forcing variant; we do not want smbb data
forcing = 'cmip6'

### Load in the data

In [None]:
# path to the intake-esm catalog (JSON) for the CESM2-LE on glade
catalog_file = '/glade/collections/cmip/catalog/intake-esm-datastore/catalogs/glade-cesm2-le.json'

# open the catalog so it can be searched in the following cells
cat = intake.open_esm_datastore(catalog_file)

In [None]:
# narrow the catalog to the chosen variable and forcing variant
subset = cat.search(variable=var_in, forcing_variant=forcing)

In [None]:
#subset
subset.df.head()

In [None]:
# Select half (25) of the CESM2-LE members: every other member from both the
# macro and micro starts of the large ensemble.
# The file naming (YYYY.####, e.g. 1001.001) does not match member_id directly,
# but the ensemble number (###, e.g. 001) does match the r? part of member_id,
# so the r? tag is what we search on.

# all member ids present in the current catalog subset
member_ids = subset.df['member_id'].unique()

# r? tags of the members to KEEP
keep_list = ['r1i', 'r3i', 'r5i', 'r7i', 'r9i']


In [None]:
# Collect every member_id that contains one of the keep_list tags.
# Results are ordered by tag first, then by the order of member_ids.
member_keep = [
    member_id
    for member in keep_list
    for member_id in member_ids
    if member in member_id
]

In [None]:
#check that we're keeping the right ones
member_keep

In [None]:
# now reduce subset based on just the members to keep
subset = subset.search(member_id=member_keep)

In [None]:
%%time
# actually load the data we selected into a dictionary of datasets
# split_large_chunks=True is set only for this read, to avoid creating large
# chunks in the first place (rather than just silencing the warning).
# cdf_kwargs: chunk along time in blocks of 240 and decode the time coordinate.
with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    dsets = subset.to_dataset_dict(cdf_kwargs={'chunks': {'time':240}, 'decode_times': True})

#dsets

In [None]:
# print names of the dataset keys, which refer to each of the ensembles loaded
dsets.keys()

In [None]:
# Look at just one dataset key to see what it looks like. 
# Note that for 1001 there is one member_id, but for 1231 there are 5 member_ids
# these refer to the individual ensemble members!

dsets['ice.historical.cice.h1.cmip6.'+var_in]

In [None]:
# Sort the loaded datasets into historical and future (ssp370) groups,
# printing each key that is kept. Keys matching neither are ignored.

historicals = []
futures = []

for key in sorted(dsets):
    if 'historical' in key:
        bucket = historicals
    elif 'ssp370' in key:
        bucket = futures
    else:
        continue
    bucket.append(dsets[key])
    print(key)

In [None]:
# Now put these into an array by member_id
historical_ds = xr.concat(historicals, dim='member_id')
future_ds = xr.concat(futures, dim='member_id')

In [None]:
# note that the historical and future xarray datasets have the same coordinates and dimensions *except* time, 
# so we need to concatenate over time
ds = xr.concat([historical_ds,future_ds],dim='time')

In [None]:
# we need to shift time by 1 day because of weird CESM conventions
# NOTE(review): presumably each daily time stamp marks the end of its averaging
# period, so subtracting one day puts the stamp inside the day it describes — confirm
ds = ds.assign_coords(time=ds.coords["time"]-timedelta(days=1))

In [None]:
ds

## Section 2: Drop the lats that we don't need

In [None]:
# set some limits for Antarctica in general
lat_max = -60
lat_min = -80

In [None]:
# keep only grid cells with lat_min < TLAT < lat_max (strict bounds), masking the rest;
# drop=True also removes coordinate labels where the condition is False everywhere
ds_subset = ds.where(((ds['TLAT']<lat_max) & (ds['TLAT']>lat_min)), drop=True)

In [None]:
ds_subset = ds_subset[var_in]

In [None]:
ds_subset

In [None]:
# check that we have all of Antarctica here
ds_subset.sel(member_id='r1i1281p1f1').isel(time=1000).plot()

In [None]:
#%%time
## actually load the data so it doesn't get too big later and makes DASK angry
#ds_subset.load()

## Section 3: Subset the times

In [None]:
ds_subset.time

In [None]:
# keep just the years 1980 through 2080
# (label-based slicing includes both end points, so 1980 and 2080 themselves are kept)
yy_st = "1980"
yy_ed = "2080"
ds_subset = ds_subset.sel(time=slice(yy_st, yy_ed))

In [None]:
ds_subset.time.dt.month

In [None]:
# keep just times in SH late winter: July, August and September (months 7-9)
# NOTE(review): only months 7-9 are selected here, not the full April-September
# season — confirm this is the intended definition of winter
ds_subset_winter = ds_subset.isel(time=ds_subset.time.dt.month.isin([7,8,9]))

In [None]:
ds_subset_winter

In [None]:
%%time
# actually load the data so it doesn't get too big later and makes DASK angry
#ds_subset_winter.persist()
ds_subset_winter.load()

## Section 4: Save data for making composites

In [None]:
ds_subset_winter.shape

In [None]:
# Flatten member_id and time into a single stacked dimension called 'new'
subset_for_composites = ds_subset_winter.stack(new=("member_id","time"))

In [None]:
subset_for_composites

In [None]:
# assign to numpy array object
subsetarray = subset_for_composites.values

In [None]:
subsetarray.shape

In [None]:
subset_for_composites.TLAT.values

## Section 5: Save data as a netcdf

In [None]:
fout = 'antarctic_data_for_som_composites_'+var_in

In [None]:
# set some info for output
longname = subset_for_composites.long_name
print(longname)
units = subset_for_composites.units
print(units)

In [None]:
# Package the stacked array and its coordinates into a Dataset for output.
# The flattened (member_id, time) axis is written out as 'training_times',
# with the matching time and member_id values attached as coordinates on it.
ds_to_save = xr.Dataset(
    data_vars={'data': (['nj', 'ni', 'training_times'], subsetarray)},
    coords={
        'time': (['training_times'], subset_for_composites.time.values),
        'member_id': (['training_times'], subset_for_composites.member_id.values),
        'TLON': (['nj', 'ni'], subset_for_composites.TLON.values),
        'TLAT': (['nj', 'ni'], subset_for_composites.TLAT.values),
        'nj': (['nj'], subset_for_composites.nj.values),
        'ni': (['ni'], subset_for_composites.ni.values),
    },
    attrs={
        'Author': 'Alice DuVivier',
        'units': units,
        'longname': longname,
    },
)

In [None]:
ds_to_save

In [None]:
ds_to_save.to_netcdf(fout+'.nc')  # write the dataset to '<fout>.nc', relative to the current working directory