# Self-Organizing Maps (SOMs) Notebook
## Training step - Step 2

**Notebook by Maria J. Molina (NCAR) and Alice DuVivier (NCAR).**

This notebook reads in the training data that was subset for a particular region and variable in step 1. It then loops through a series of SOM hyperparameters, training one SOM for each combination, to help determine the grid size and training settings best suited to answering the science questions of interest.

In [1]:
# Needed imports

from minisom import MiniSom, asymptotic_decay
import xarray as xr
import cftime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import product
import cartopy
import cartopy.crs as ccrs
from cartopy.util import add_cyclic_point
from datetime import timedelta
from itertools import product

### Set user-specified information

In [2]:
# Variable to test and the sector it was extracted for
var_in = 'aice_d'
sector_short = 'Ross'
# Directory holding the already-extracted training data for this sector
data_path = f'/glade/p/cgd/ppc/duvivier/cesm2_antarctic_polynya/SOM_analysis/training/{sector_short}_v5/'

## Section 1: Load and get correct training data

### Load in the data

In [3]:
# Training data file saved by the earlier notebook (soms_antarctica-gettingdata.ipynb)
training_file = data_path + '_'.join(['training_data_region', sector_short, var_in]) + '.nc'
subset = xr.open_dataset(training_file)

# pull the training variable out of the dataset as a numpy array
subsetarray = subset['train_data'].values

In [4]:
# Sanity checks before training.
# The shape should match the dims from the getting data notebook.
print(subsetarray.shape)
# NaN check: prints False when the training array contains no NaN values
contains_nan = np.isnan(subsetarray).any()
print(contains_nan)

(232300, 189)
False


## Section 2: SOM training

### Set SOM Hyperparameters we'll test

In [5]:
# Candidate SOM grid sizes. Rows and columns are paired values: the first row
# count goes with the first column count, and so on.
som_grid_rows = [3, 4, 5]  # (y-axis)
som_grid_cols = [3, 4, 5]  # (x-axis)

# For each SOM grid we test all possible combinations of the parameters below.
# The spread of the neighborhood function (sigma) is not fixed here: it is set
# later and depends on the SOM shape.
###sigma            = [1.0, 0.5, 0.25, 0.1]

# Initial learning rate. At iteration t we have
#   learning_rate(t) = learning_rate / (1 + t/T), where T is num_iteration/2
learning_rate = [0.005, 0.01, 0.04, 0.05, 0.5]

# How many training iterations to go through
num_iteration = [10000, 50000, 100000, 250000, 500000, 1000000]

In [6]:
# define the normalizing data function
def normalize_data(data):
    """
    Standardize data to z-scores prior to training.

    A single mean and a single standard deviation are computed over all
    elements of the array (NaN values are ignored when computing them), and
    every element is shifted and scaled by those two numbers.
    """
    center = np.nanmean(data)
    spread = np.nanstd(data)
    return (data - center) / spread

In [7]:
# Remaining settings required to build and train each SOM.

# Total number of points to train on per timestep (second dimension of subsetarray)
input_length = subsetarray.shape[1]

# Function that reduces learning_rate and sigma at each iteration
decay_function = asymptotic_decay
# Function that weights the neighborhood of a position in the map
neighborhood_function = 'gaussian'
# Topology of the map; possible values: 'rectangular', 'hexagonal'
topology = 'rectangular'
# Distance used to activate the map; possible values:
# 'euclidean', 'cosine', 'manhattan', 'chebyshev'
activation_distance = 'euclidean'
# Random seed to use for reproducibility. Using 1.
random_seed = 1

# Options handed to the training call
random_order = True
verbose = False  # True

In [8]:
# Loop through the different SOM grid sizes, train each grid separately over
# every hyperparameter combination, and write out one CSV file of quantization
# errors per grid.

# Normalize the training data once, before any loop: the z-score depends only
# on subsetarray, not on the hyperparameters being tested.
# NOTE(review): this assumes MiniSom only reads `data` during initialization
# and training and never modifies it in place - confirm.
data = normalize_data(subsetarray)

# column names of the per-grid results table
csv_columns = ["n_row", "n_col", "sigma", "lr", "n_iter", "q_error"]

for som_row, som_col in zip(som_grid_rows, som_grid_cols):
    print(f'Testing grid: {som_row}x{som_col}')

    # set csv filename based on the sector, variable and grid size
    fout = f'test_soms_qerror_{sector_short}_{var_in}_{som_row}x{som_col}.csv'

    # set sigma values based on this som shape: 0.5, then 1, 2, ..., som_col-1
    sigma = list(np.hstack([0.5, list(range(1, som_col, 1))]))

    # every (sigma, learning rate, iterations) combination for this grid
    experiments = list(product(sigma, learning_rate, num_iteration))

    # one row of results per experiment, in the order of csv_columns
    rows = []

    # now loop through the training parameters for this specific grid
    for num_exp, (sig, lr, n_iter) in enumerate(experiments, start=1):
        # print out which experiment for this grid we are on
        print(f'Testing {num_exp} of {len(experiments)}')

        # initialize the SOM; keyword arguments keep each setting tied to the
        # parameter it is meant for
        som = MiniSom(som_row, som_col, input_length,
                      sigma=sig,
                      learning_rate=lr,
                      decay_function=decay_function,
                      neighborhood_function=neighborhood_function,
                      topology=topology,
                      activation_distance=activation_distance,
                      random_seed=random_seed)

        # Initializes the weights to span the first two principal components
        # (could also try random init: som.random_weights_init(data))
        som.pca_weights_init(data)

        # train the SOM!
        som.train(data, n_iter, random_order=random_order, verbose=verbose)

        # record the settings and resulting quantization error for this run
        rows.append([som_row, som_col, sig, lr, n_iter, som.quantization_error(data)])

    # Build the results table with an explicit float dtype instead of writing
    # float rows into a pre-allocated integer frame, which relies on silent
    # dtype upcasting (deprecated in pandas >= 2.1). All columns are float,
    # which is what that upcast produced.
    our_csv = pd.DataFrame(rows, columns=csv_columns, dtype=float)

    # Write the csv file for this grid after testing all the combinations
    print(f'Writing out CSV file with all qerror for som grid {som_row}x{som_col}')
    our_csv.to_csv(fout)
    

Testing grid: 3x3
Testing 1 of 90
Testing 2 of 90
Testing 3 of 90
Testing 4 of 90
Testing 5 of 90
Testing 6 of 90
Testing 7 of 90
Testing 8 of 90
Testing 9 of 90
Testing 10 of 90
Testing 11 of 90
Testing 12 of 90
Testing 13 of 90
Testing 14 of 90
Testing 15 of 90
Testing 16 of 90
Testing 17 of 90
Testing 18 of 90
Testing 19 of 90
Testing 20 of 90
Testing 21 of 90
Testing 22 of 90
Testing 23 of 90
Testing 24 of 90
Testing 25 of 90
Testing 26 of 90
Testing 27 of 90
Testing 28 of 90
Testing 29 of 90
Testing 30 of 90
Testing 31 of 90
Testing 32 of 90
Testing 33 of 90
Testing 34 of 90
Testing 35 of 90
Testing 36 of 90
Testing 37 of 90
Testing 38 of 90
Testing 39 of 90
Testing 40 of 90
Testing 41 of 90
Testing 42 of 90
Testing 43 of 90
Testing 44 of 90
Testing 45 of 90
Testing 46 of 90
Testing 47 of 90
Testing 48 of 90
Testing 49 of 90
Testing 50 of 90
Testing 51 of 90
Testing 52 of 90
Testing 53 of 90
Testing 54 of 90
Testing 55 of 90
Testing 56 of 90
Testing 57 of 90
Testing 58 of 90
Testi