In [12]:
import numpy as np
import pandas as pdcoding
import gzip
import panel as pn
import holoviews as hv
from holoviews import opts
from bokeh.plotting import show
import math
from SOMToolBox_Parse import SOMToolBox_Parse
from pysomvis import PySOMVis

# Implementation

### Get input data and class data

In [8]:
# Forward slashes are accepted by Python's file APIs on Windows as well as on
# Linux/macOS, so the notebook is not tied to a single operating system.
baseFolder = "datasets/chainlink"
# Input vectors; idata['arr'] is what the histogram functions iterate over.
idata = SOMToolBox_Parse(f"{baseFolder}/chainlink.vec").read_weight_file()
# Class file that accompanies the input vectors.
classes = SOMToolBox_Parse(f"{baseFolder}/chainlink.cls").read_weight_file()

### Get a comparison histogram between one main SOM and one or more comparison SOMs

In [9]:
def get_comparison_histo(_m, _n, _weights_main, _idata, _weights_compare):
    """Average best-matching-unit shift between a main SOM and comparison SOMs.

    Every input vector is mapped to its best-matching unit (BMU) on the main
    SOM and on each comparison SOM. For each unit of the main SOM, the
    Euclidean grid distance between the two BMU positions is averaged over all
    vectors mapped to that unit, and the result is then averaged over all
    comparison SOMs.

    Parameters
    ----------
    _m, _n : int
        Number of rows and columns of the unit grid. A flat unit index ``p``
        is decoded as row ``p // _n`` and column ``p % _n``.
    _weights_main : array-like, shape (units, dim)
        Weight vectors of the main SOM, one row per unit.
    _idata : array-like, shape (samples, dim)
        Input vectors. It is iterated once per SOM, so it must be re-iterable.
    _weights_compare : sequence of mappings
        One mapping per comparison SOM; the weight matrix is read from key
        ``'arr'``.
        NOTE(review): BMU indices of the comparison SOMs are decoded with the
        main SOM's ``_n``, i.e. this assumes all SOMs share the same grid
        width - confirm against the callers.

    Returns
    -------
    numpy.ndarray, shape (_m, _n)
        Mean BMU shift per unit of the main SOM. Units that received no input
        vector are 0. If ``_weights_compare`` is empty, a zero matrix is
        returned instead of the NaN matrix a 0/0 division would produce.
    """
    def _bmu_indices(weights):
        # Flat index of the best-matching unit for every input vector. The
        # square root is omitted: it is monotonic, so the argmin is unchanged.
        return np.array(
            [np.argmin(np.sum(np.power(weights - vector, 2), axis=1)) for vector in _idata],
            dtype=int,
        )

    _s = len(_weights_compare)
    if _s == 0:
        # Nothing to compare against: report "no shift" rather than dividing by zero.
        return np.zeros((_m, _n))

    # The BMUs on the main SOM do not depend on the comparison SOM, so they
    # are computed once instead of once per comparison SOM.
    main_rows, main_cols = np.divmod(_bmu_indices(_weights_main), _n)

    # Number of input vectors mapped to each unit of the main SOM.
    hits = np.zeros((_m, _n))
    np.add.at(hits, (main_rows, main_cols), 1)
    occupied = hits > 0

    per_som = np.zeros((_s, _m, _n))
    for k, w in enumerate(_weights_compare):
        cmp_rows, cmp_cols = np.divmod(_bmu_indices(w['arr']), _n)
        shifts = np.sqrt((cmp_cols - main_cols) ** 2 + (cmp_rows - main_rows) ** 2)

        # np.add.at accumulates correctly when several vectors share a unit
        # (plain fancy-index "+=" would count each unit only once).
        np.add.at(per_som[k], (main_rows, main_cols), shifts)
        # Turn the per-unit sums into means; units without hits stay 0.
        per_som[k][occupied] /= hits[occupied]

    return per_som.sum(axis=0) / _s

### Get a hit histogram of a SOM

In [37]:
#HitHistogram
def get_histo(_m, _n, _weights, _idata):
    """Count, per SOM unit, how many input vectors it is the best match for.

    Parameters
    ----------
    _m, _n : int
        Number of rows and columns of the unit grid.
    _weights : array-like, shape (units, dim)
        Weight vectors of the SOM, one row per unit.
    _idata : iterable of array-like
        Input vectors to map onto the SOM.

    Returns
    -------
    numpy.ndarray, shape (_m, _n)
        Hit counts (as floats); flat unit index ``p`` ends up at row
        ``p // _n``, column ``p % _n``.
    """
    # Best-matching unit (smallest Euclidean distance) for each input vector.
    winners = [
        np.argmin(np.sqrt(np.sum((_weights - sample) ** 2, axis=1)))
        for sample in _idata
    ]

    hit_counts = np.zeros(_m * _n)
    for unit in winners:
        hit_counts[unit] += 1

    return hit_counts.reshape(_m, _n)

### Show comparison histogram and hit histogram of main and comparison SOMs

In [57]:
def show_comparison_histo(main, compare, show_main=True, show_compare=True):
    """Display hit histograms and the comparison histogram for a set of SOMs.

    Weight files are read from the ``compare`` sub-folder of the module-level
    ``baseFolder``; the input vectors come from the module-level ``idata``.

    Parameters
    ----------
    main : str
        File name of the main SOM's weight file.
    compare : iterable of str
        File names of the comparison SOMs' weight files.
    show_main : bool, default True
        Also display the hit histogram of the main SOM.
    show_compare : bool, default True
        Also display the hit histograms of the comparison SOMs.

    Returns
    -------
    None
        The figures are displayed via ``bokeh.plotting.show``.
    """
    # Load the plotting backend before any element is styled or rendered.
    # Previously this happened only after the hit histograms had already been
    # rendered, so the first figures depended on a backend being loaded elsewhere.
    hv.extension('bokeh')

    def _heatmap(values, label):
        # Wrap a 2-D array in an axis-less HoloViews image with the 'jet' colormap.
        return hv.Image(values).opts(xaxis=None, yaxis=None).relabel(label).opts(cmap='jet')

    def _display(images):
        # Lay the images out side by side and show them as a Bokeh figure.
        show(hv.render(hv.Layout(images), backend='bokeh'))

    # Turn file names into paths; forward slashes work on every platform.
    compare_dir = f"{baseFolder}/compare"
    weights_main = SOMToolBox_Parse(f"{compare_dir}/{main}").read_weight_file()
    weights_compare = [
        SOMToolBox_Parse(f"{compare_dir}/{name}").read_weight_file()
        for name in compare
    ]

    # Show Main SOM
    if show_main:
        hits = get_histo(weights_main['ydim'], weights_main['xdim'], weights_main['arr'], idata['arr'])
        _display([_heatmap(hits, 'Main SOM Hit Hist')])

    # Show Comparison SOMs (skipped when there are none, to avoid an empty layout)
    if show_compare and weights_compare:
        _display([
            _heatmap(get_histo(w['ydim'], w['xdim'], w['arr'], idata['arr']), 'Comparison SOM Hit Hist')
            for w in weights_compare
        ])

    # Show Comparison between Main and Comparison SOMs
    comparison = get_comparison_histo(
        weights_main['ydim'], weights_main['xdim'], weights_main['arr'], idata['arr'], weights_compare
    )
    _display([_heatmap(comparison, 'Comparison')])

# Comparisons

All of our SOMs were trained on the chainlink and clustering dataset with the Java-based SOMToolbox. The small SOM has 10x10 units, and the large SOM has 60x100 units.

Where not mentioned otherwise, training used the following parameters:

`learnRate=0.3, sigma=7, randomSeed=7`

### Comparing a SOM with itself

When a SOM is compared with itself, there shouldn't be any differences. The comparison histogram is therefore 0 in all units. Its visualization has the same color in all units.

In [59]:
# Sanity check: the main SOM is compared against the very same weight file.
show_comparison_histo(main="chainlink_1000.wgt.gz", compare=["chainlink_1000.wgt.gz"])

### Comparing a SOM with different random seeds

In this experiment, Main and Comparison SOMs are trained with exactly the same parameters. The SOMs only differ by the random seed that was used during training. The Main SOM has seed 7, the others have seeds 1-6. We can see the non-deterministic nature of SOM-training here. The Comparison histogram shows large differences in almost all relevant units.

In [62]:
# Small SOM: main map versus the six maps trained with other random seeds
show_comparison_histo(
    main="chainlink_1000.wgt.gz",
    compare=[
        "chainlink_1000_1.wgt.gz",
        "chainlink_1000_2.wgt.gz",
        "chainlink_1000_3.wgt.gz",
        "chainlink_1000_4.wgt.gz",
        "chainlink_1000_5.wgt.gz",
        "chainlink_1000_6.wgt.gz",
    ],
)

In [63]:
# Large SOM: main map versus the six maps trained with other random seeds
show_comparison_histo(
    main="chainlink_L_100000.wgt.gz",
    compare=[
        "chainlink_L_100000_1.wgt.gz",
        "chainlink_L_100000_2.wgt.gz",
        "chainlink_L_100000_3.wgt.gz",
        "chainlink_L_100000_4.wgt.gz",
        "chainlink_L_100000_5.wgt.gz",
        "chainlink_L_100000_6.wgt.gz",
    ],
)

### Comparing SOMs from neighbouring training iterations

The main small SOM is trained in 1000 iterations and compared to the SOMs after 1001, 1005 and 1100 iterations.
The main large SOM is trained in 100,000 iterations and compared to the SOMs after 100,100 and 110,000 iterations.
All SOMs are trained with the same seed (7) to ensure deterministic training. 
The comparison shows that there are only small differences between the SOMs, because a SOM does not change much over a small number of additional iterations, and it also does not change much when training is close to convergence.

In [74]:
# Small SOM: 1000-iteration map versus maps after slightly more iterations
show_comparison_histo(
    main="chainlink_1000.wgt.gz",
    compare=[
        "chainlink_1001.wgt.gz",
        "chainlink_1005.wgt.gz",
        "chainlink_1100.wgt.gz",
    ],
)

In [75]:
# Large SOM: the 100,000-iteration map is the reference. Using the
# 100,100-iteration file as main would compare that map with itself and
# contribute an all-zero histogram to the average.
show_comparison_histo(
    main="chainlink_L_100000.wgt.gz",
    compare=[
        "chainlink_L_100100.wgt.gz",
        "chainlink_L_110000.wgt.gz",
    ],
)

### Comparing SOMs from far apart training iterations

The main small SOM is trained in 1000 iterations and compared to the SOMs after 3000 and 10000 iterations.
The main large SOM is trained in 100,000 iterations and compared to the SOMs after 1000 and 10,000 iterations.
All SOMs are trained with the same seed (7) to ensure deterministic training. 
Unsurprisingly, the comparison shows that there are large differences between SOMs from far-apart training iterations.

In [72]:
# Small SOM: 1000-iteration map versus maps after many more iterations
show_comparison_histo(
    main="chainlink_1000.wgt.gz",
    compare=[
        "chainlink_3000.wgt.gz",
        "chainlink_10000.wgt.gz",
    ],
)

In [73]:
# Large SOM: 100,000-iteration map versus maps from much earlier in training
show_comparison_histo(
    main="chainlink_L_100000.wgt.gz",
    compare=[
        "chainlink_L_1000.wgt.gz",
        "chainlink_L_10000.wgt.gz",
    ],
)