In [10]:
import json
import os
import numpy as np
from utils import attention
import torch

# Validate cumulative attention

In [2]:
A1 = np.array([
    [
        [0.8, 0.0, 0.2, 0.0],
        [1.0, 0.0, 0.0, 0.0],
        [0.5, 0.5, 0.0, 0.0],
        [0.0, 0.0, 0.9, 0.1]
    ],
    [
        [1.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 1.0, 0.0],
        [1.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.1, 0.9]
    ]
])
A1.shape

(2, 4, 4)

In [3]:
A2 = np.array([
    [
        [0.1, 0.9, 0.0, 0.0],
        [0.0, 0.0, 1.0, 0.0],
        [1.0, 0.0, 0.0, 0.0],
        [0.0, 1.0, 0.0, 0.0]
    ],
    [
        [1.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 1.0],
        [0.0, 0.0, 1.0, 0.0],
        [0.0, 0.5, 0.0, 0.5]
    ]
])
A2.shape

(2, 4, 4)

In [5]:
A3 = np.array([
    [
        [0.0, 0.0, 1.0, 0.0],
        [1.0, 0.0, 0.0, 0.0],
        [0.5, 0.5, 0.0, 0.0],
        [0.0, 0.0, 0.9, 0.1]
    ], 
    [
        [0.0, 0.0, 1.0, 0.0],
        [1.0, 0.0, 0.0, 0.0],
        [0.5, 0.5, 0.0, 0.0],
        [0.0, 0.0, 0.9, 0.1]
    ]
])
A3.shape

(2, 4, 4)

In [8]:
# Shape: layers, batch, heads, o_features, i_features
As = np.array([A1, A2, A3])[:, None]

assert np.allclose(As.sum(axis=-1), 1), "Wrong attention"

As.shape

(3, 1, 2, 4, 4)

In [14]:
single_attns, cum_attns = attention.compute_std_attentions(torch.Tensor(As), "other")
cum_attns.detach().numpy()

array([[[0.5375    , 0.0625    , 0.275     , 0.125     ]],

       [[0.57375   , 0.0625    , 0.26999998, 0.09375   ]],

       [[0.71812505, 0.075     , 0.17250001, 0.034375  ]]], dtype=float32)

# How many epochs to run the models?

In [7]:
datasets = ["adult", "anneal", "australian", "jasmine", "kr-vs-kp", "ldpa", "sylvine", "volkert"]
metrics = ["balanced_accuracy", "log_loss"]
trials = 5

In [8]:
for d in datasets:
    for m in metrics:
        trials_epochs = []
        for t in range(trials):
            hist_file =  os.path.join("checkpoint", d, "best", m, f"T{t}", "model", "history.json")
            if os.path.exists(hist_file):
                with open(hist_file, "r") as f:
                    history = json.load(f)
                    
                trials_epochs.append(len(history))
                
        print(f"Dataset: {d}   Metrics: {m}   Epochs:{trials_epochs}   Mean:{np.mean(trials_epochs)}  Std:{np.std(trials_epochs)}")
                    
                

Dataset: adult   Metrics: balanced_accuracy   Epochs:[19, 20, 19, 20, 19]   Mean:19.4  Std:0.4898979485566356
Dataset: adult   Metrics: log_loss   Epochs:[38, 33, 31, 34, 31]   Mean:33.4  Std:2.5768197453450252
Dataset: anneal   Metrics: balanced_accuracy   Epochs:[62, 67, 61, 71, 48]   Mean:61.8  Std:7.782030583337488
Dataset: anneal   Metrics: log_loss   Epochs:[61, 51, 88, 84, 99]   Mean:76.6  Std:17.805617091243988
Dataset: australian   Metrics: balanced_accuracy   Epochs:[145, 144, 148, 106, 139]   Mean:136.4  Std:15.47384890710776
Dataset: australian   Metrics: log_loss   Epochs:[22, 25, 21, 22, 23]   Mean:22.6  Std:1.3564659966250536
Dataset: jasmine   Metrics: balanced_accuracy   Epochs:[30, 32, 28, 27, 34]   Mean:30.2  Std:2.5612496949731396
Dataset: jasmine   Metrics: log_loss   Epochs:[34, 33, 34, 37, 35]   Mean:34.6  Std:1.3564659966250536
Dataset: kr-vs-kp   Metrics: balanced_accuracy   Epochs:[59, 70, 126, 69, 49]   Mean:74.6  Std:26.80746164783231
Dataset: kr-vs-kp   Met

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  ret = _var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
