# Self-Organizing Maps (SOMs) Notebook
## Multivariate Data extraction step - Step 1

**Notebook by Maria J. Molina (NCAR) and Alice DuVivier (NCAR).**

This notebook reads in the data subset for a particular region and variable that was created in step 1. It then loops through a series of SOM hyperparameters to train a number of SOMs and determine the best map size and other settings for answering the science questions of interest.

In [1]:
# Needed imports

from minisom import MiniSom, asymptotic_decay
import xarray as xr
import cftime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import product
import cartopy
import cartopy.crs as ccrs
from cartopy.util import add_cyclic_point
from datetime import timedelta
from itertools import product

### Set user-specified information

In [2]:
# User-specified settings: the two variables to combine and the location of
# their already-extracted training data.
var_in_1 = 'aice_d'
var_in_2 = 'hi_d'
# Output name: the two variable names joined by an underscore
var_in = '_'.join((var_in_1, var_in_2))
# Sector to process; it is also part of the training directory name
sector_short = 'Ross'
data_path = f'/glade/p/cgd/ppc/duvivier/cesm2_antarctic_polynya/SOM_analysis/training/{sector_short}_v5/'

## Section 1: Load and get correct training data

### Load in the data

In [3]:
# Read the per-variable training files saved by the earlier notebook
# (soms_antarctica-gettingdata.ipynb); both names share the same prefix and
# differ only in the variable name.
file_prefix = f'{data_path}training_data_region_{sector_short}_'
subset_1 = xr.open_dataset(f'{file_prefix}{var_in_1}.nc')
subset_2 = xr.open_dataset(f'{file_prefix}{var_in_2}.nc')

# Extract the training data of each dataset as a numpy array
subsetarray_1, subsetarray_2 = (ds['train_data'].values for ds in (subset_1, subset_2))

In [4]:
# Sanity check for each input variable: print its name, the array shape
# (should match the dims from the getting-data notebook) and whether the array
# contains any NaN (should print False, i.e. no NaN values in the training data).
for label, values in ((var_in_1, subsetarray_1), (var_in_2, subsetarray_2)):
    print(label, values.shape, np.isnan(values).any(), sep='\n')

aice_d
(232300, 189)
False
hi_d
(232300, 189)
False


In [5]:
# Combine both variables into a single training array with
# dims = ntraining x (2*npts): columns [0, npts) hold the first variable and
# columns [npts, 2*npts) hold the second.
# Check the shapes explicitly so that a mismatched input raises a clear error
# instead of being silently broadcast into the combined array.
if subsetarray_1.ndim != 2 or subsetarray_1.shape != subsetarray_2.shape:
    raise ValueError(
        'training arrays must both be 2-D with the same shape, got '
        f'{subsetarray_1.shape} and {subsetarray_2.shape}'
    )

# Join the two arrays side by side along the points axis. The cast keeps the
# float64 dtype of a default np.zeros array; copy=False avoids a second copy
# when the concatenated result is already float64.
subsetarray = np.concatenate([subsetarray_1, subsetarray_2], axis=1).astype(np.float64, copy=False)


In [6]:
# Sanity check for the combined array: print the combined variable name, the
# array shape (the second dimension should be 2*npts) and whether any NaN is
# present (should print False, i.e. no NaN values in the training data).
print(var_in, subsetarray.shape, np.isnan(subsetarray).any(), sep='\n')

aice_d_hi_d
(232300, 378)
False


### Save data as a netcdf

In [11]:
# Base name (without extension) of the combined training data file
fout = f'training_data_region_{sector_short}_{var_in}'

In [14]:
# Wrap the combined array in an xarray Dataset. The time and member_id values
# are taken from the first input dataset and attached along 'training_times'.
train_vars = {'train_data': (['training_times', 'points'], subsetarray)}
train_coords = {
    'time': (['training_times'], subset_1.time.values),
    'member_id': (['training_times'], subset_1.member_id.values),
}
ds_to_save = xr.Dataset(
    data_vars=train_vars,
    coords=train_coords,
    attrs={'Author': 'Alice DuVivier'},
)

In [15]:
# Display the assembled dataset for a visual check before it is written to disk
ds_to_save

In [16]:
# Save the combined training data as '<fout>.nc'. fout has no directory part,
# so the file is written relative to the current working directory.
ds_to_save.to_netcdf(f'{fout}.nc')