In [111]:
import sys
import os

# Root of the thesis repository. The original machine-specific location is kept
# as the default; set the THESIS_TABTRANS_PATH environment variable to run the
# notebook from a different checkout without editing this cell.
project_path = os.environ.get("THESIS_TABTRANS_PATH", "/home/diego/Git/thesis-tabtrans")

# Make the project's `utils` package importable from this notebook. The
# membership test keeps sys.path free of duplicate entries when the cell is
# executed more than once in the same kernel.
if project_path not in sys.path:
    sys.path.append(project_path)

from utils import data, tabtrans_file, plots,attention
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
import pickle
import plotly.express as px


from sklearn import (
                    linear_model, 
                    pipeline, 
                    neighbors, 
                    base, 
                    model_selection, 
                    tree, 
                    feature_selection, 
                    neural_network,
                    cluster
                )

In [112]:
# Identifier of the dataset handed to the project's data helpers
# (presumably an OpenML dataset id, given the openml log line this cell emits — confirm).
df_id = 1484
# import_data returns the train/test split together with index arrays and
# dataset-level counts; n_numerical is used later to split each row into its
# numerical and categorical parts.
X_train, X_test, y_train, y_test, train_indices, val_indices, n_instances, n_labels, n_numerical, n_categories = data.import_data(df_id)
# Number of input columns. NOTE: this value is overwritten in the
# hyperparameter cell below with X_train.shape[1]+1.
n_features = X_train.shape[1]

name_df = data.get_dataset_name(df_id)

# Root folder under which everything related to this dataset is stored
path_of_datset = f'{project_path}/Final_models_4/{name_df}'

# Folder with the hyperparameter-selection results
path_to_hyperparameters = f'{path_of_datset}/tabtrans/hyperparameter_selection'

#define the path to final_tabtrans
path_to_final_tabtrans = f'{path_of_datset}/tabtrans/final_tabtrans_cv'

# Sub-folder selector inside the hyperparameter directory
# (presumably the training-sample size — confirm).
sample = 100
path_of_size = f'{path_to_hyperparameters}/{sample}'
path_of_results = f'{path_of_size}/results.csv'


INFO:openml.datasets.dataset:pickle write lsvt


In [113]:
# Read the hyperparameters stored in the results file of the chosen sample size.
hyperparameters = data.import_hyperparameters(path_of_results, cv=True)

# Every architecture/training setting is needed as a plain int; the values are
# looked up and cast in the same order in which they are unpacked.
n_layers, n_heads, embedding_size, batch_size, epochs = (
    int(hyperparameters[key])
    for key in ("n_layers", "n_heads", "embedding_size", "batch_size", "max_epochs_mean")
)

# NOTE(review): the extra position presumably accounts for the CLS token used
# by the "cls" aggregator below — confirm.
n_features = 1 + X_train.shape[1]
aggregator = "cls"

In [114]:
#import the model
# NOTE(review): this literal repeats the project root, dataset name and sample
# size that earlier cells already hold in `path_to_final_tabtrans`, `name_df`
# and `sample`; it could presumably be derived from them — confirm that
# name_df is exactly "lsvt" before replacing it.
path_to_model = "/home/diego/Git/thesis-tabtrans/Final_models_4/lsvt/tabtrans/final_tabtrans_cv/100/final_model_lsvt_100.pkl"

# loading model
# SECURITY: pickle.load can execute arbitrary code contained in the file, so
# only load model files that were produced by this project.
with open(path_to_model, 'rb') as f:
    model = pickle.load(f)

In [115]:
#ATTENTION CUBES
#
# Enabling and extracting the attention cubes.
#
# To enable the recovery of the attention cubes, the only requirement is to
# set need_weights=True on the PyTorch module. When the cubes are requested,
# the module output becomes:
#
#     - predictions: the predictions for the given instances
#     - layer outputs: the output of each encoder layer
#     - weights: the attention cube of each encoder
#
# In skorch, the trained PyTorch module is stored in the attribute .module_.
#
# When using skorch, the only way to recover multiple outputs is through
# the forward/forward_iter methods.

model.module_.need_weights = True
cumulative_attns = []

# Only the features are needed to extract attention, so the labels are not
# iterated here.
for X_inst in X_train:
    # X_inst[None, ...] adds a leading axis of length 1: every forward pass
    # receives a batch that contains exactly one instance. The first
    # n_numerical columns are the numerical features, the rest the categorical ones.
    pred, layer_outputs, attn = model.forward(X={
        "x_numerical": X_inst[None, :n_numerical].astype(np.float32),
        "x_categorical": X_inst[None, n_numerical:].astype(np.int32)
        })

    # The attention cube dimensions are:
    #
    #   (num. layers, batch size, num. heads, num. features, num. features)
    #
    # The batch size is 1 because a single instance is fed per forward call.
    assert attn.shape == (n_layers, 1, n_heads, n_features, n_features)

    # utils.attention.compute_std_attentions(attention, aggregator) returns:
    #   - the individual (non cumulative) attention of each layer,
    #     shape (num. layers, batch size, num. features)
    #   - the cumulative attention at each layer,
    #     shape (num. layers, batch size, num. features)
    #
    # The last layer of the cumulative attention represents the cumulative
    # attention over all Transformer encoders.
    ind_attn, cum_attn = attention.compute_std_attentions(attn, aggregator)

    assert ind_attn.shape == (n_layers, 1, n_features)
    assert cum_attn.shape == (n_layers, 1, n_features)

    # Keep the last-layer cumulative attention of the only instance in the batch.
    cumulative_attns.append(cum_attn[-1, 0])

# One row per training instance, one column per feature position.
cumulative_attns = np.array(cumulative_attns)

In [None]:
from sklearn.cluster import KMeans

# Group the per-instance cumulative attention vectors into 4 clusters.
# fit() hands back the fitted estimator, so construction and fitting are
# chained; the fixed random_state keeps the assignment reproducible.
kmeans = KMeans(n_clusters=4, random_state=11).fit(cumulative_attns)

# Cluster id assigned to each row of cumulative_attns
cluster_labels = kmeans.labels_

In [130]:
# Step 1: Get the sorted indices based on cluster labels
# (positions that order the instances by ascending cluster id, so the rows of
# each cluster become contiguous)
sorted_indices = np.argsort(cluster_labels)

# Step 2: Use sorted indices to reorder cumulative_attns
# (fancy indexing returns a new array; cumulative_attns itself is left untouched)
sorted_cumulative_attns = cumulative_attns[sorted_indices]


In [132]:
# Cluster id of every row, in the same order as sorted_cumulative_attns
sorted_cluster_labels = cluster_labels[sorted_indices]

# Position i is recorded whenever row i and row i + 1 carry different cluster
# ids, i.e. row i is the last one of its cluster. The final cluster has no
# successor, so its last row is never listed.
last_cluster_indices = np.flatnonzero(
    sorted_cluster_labels[:-1] != sorted_cluster_labels[1:]
).tolist()
    

In [147]:
# Shape is (rows, feature positions).
# NOTE(review): the recorded output shows 103 rows although X_train has 100;
# presumably the NaN-separator cell below had already been executed when this
# cell ran — confirm.
print(sorted_cumulative_attns.shape)

(103, 311)


In [146]:
# Shape of a single attention row
size = cumulative_attns[0].shape

# Row of NaNs placed between clusters as a separator
nan_row = np.full(size, np.nan)

# last_cluster_indices holds the last row of every cluster except the final
# one; a NaN row is placed right after each of them.
#
# The array is rebuilt from cumulative_attns[sorted_indices] on every run, so
# re-executing this cell always yields the same result instead of adding
# further separator rows to an array that already contains them.
# np.insert interprets a sequence of positions relative to the input array,
# which makes a single call equivalent to inserting the rows one at a time
# from the highest position down.
insert_positions = [i + 1 for i in last_cluster_indices]
sorted_cumulative_attns = np.insert(
    cumulative_attns[sorted_indices], insert_positions, nan_row, axis=0
)

In [152]:
"""
Each row represents a test instance, while each column
represents a feature.
"""
import plotly.express as px

fig = px.imshow(sorted_cumulative_attns, color_continuous_scale='Inferno')
fig.show()

In [55]:
def get_attention_summary(attn):
    """Summarise how the mean attention is distributed over the columns.

    Parameters
    ----------
    attn : numpy array
        Attention values; axis 0 is averaged away
        (callers in this notebook pass one row per instance).

    Returns
    -------
    dict
        All values are plain Python lists:

        - "mean_attention": mean over axis 0, in the original column order
        - "sorted_attention": the same means in descending order
        - "cumulative_attention": running total of the sorted means with a
          leading 0, so it has one more entry than "sorted_attention"
        - "slopes": difference between each sorted mean and the previous one
    """
    mean_per_column = attn.mean(axis=0)

    # argsort is ascending; reversing it puts the largest mean first.
    descending_order = np.argsort(mean_per_column)[::-1]
    ranked = mean_per_column[descending_order]

    # Prepend 0 so the curve starts at the origin.
    running_total = np.insert(ranked.cumsum(), 0, 0)

    return {
        "mean_attention": mean_per_column.tolist(),
        "sorted_attention": ranked.tolist(),
        "cumulative_attention": running_total.tolist(),
        "slopes": (ranked[1:] - ranked[:-1]).tolist(),
    }

def get_attn_clusters_info(
    attn,
    labels,
    n_clusters=4,
    random_state=None
    ):
    """Cluster the attention vectors with KMeans and describe every cluster.

    Parameters
    ----------
    attn : numpy array
        Attention vectors, one row per instance.
    labels : numpy array
        Class label of every row of ``attn``.
    n_clusters : int
        Number of KMeans clusters.
    random_state : int or None
        Seed passed to KMeans. When None (the default) the notebook-level
        ``SEED`` is used, which must be defined before the call.

    Returns
    -------
    dict
        - "n_clusters": the requested number of clusters
        - "cluster_labels": cluster id of every row after sorting
        - "data_required_sort": the permutation that sorts the rows by
          cluster id and, inside a cluster, by class label
        - "clusters": one dict per cluster with "label", "start_index",
          "end_index" (positions in the sorted order, end exclusive),
          "attention_summary", "mean_entropy" and "classification_labels"
          (the share of the cluster taken by each class present in it)
    """
    seed = SEED if random_state is None else random_state

    #Clustering
    cluster_algo = cluster.KMeans(n_clusters=n_clusters, random_state=seed)
    cluster_labels = cluster_algo.fit_predict(attn)

    # np.lexsort treats the LAST key as the primary one: rows are ordered by
    # cluster id first and by class label inside each cluster.
    indices = np.lexsort((labels, cluster_labels))
    t_attn = attn[indices]
    t_labels = labels[indices]
    t_cluster_labels = cluster_labels[indices]

    # The cluster ids are sorted, so the first occurrence of each id is the
    # start of that cluster; the total row count closes the last cluster.
    cluster_uniques, clusters_indices = np.unique(t_cluster_labels, return_index=True)
    clusters_indices = np.append(clusters_indices, cluster_labels.shape[0])

    cluster_option_info = {
        "n_clusters": n_clusters,
        "cluster_labels": t_cluster_labels.tolist(),
        "data_required_sort": indices.tolist(),
        "clusters": []
    }

    # Describe each cluster
    for c_l, c_l_start, c_l_end in zip(cluster_uniques, clusters_indices[:-1], clusters_indices[1:]):
        cluster_attn = t_attn[c_l_start:c_l_end]

        cluster_info = {
            "label": int(c_l),
            "start_index": int(c_l_start),
            "end_index": int(c_l_end),
        }

        cluster_info["attention_summary"] = get_attention_summary(cluster_attn)

        # Mean cluster entropy: -sum(p * log(p)) per row, averaged over rows.
        # An attention value of exactly 0 would give 0 * log(0) = NaN and
        # poison the whole mean; its contribution is 0 by convention, so the
        # logarithm is taken of 1 at those positions (0 * log(1) = 0).
        log_attn = np.log(np.where(cluster_attn == 0, 1.0, cluster_attn))
        cluster_entropy = -(cluster_attn * log_attn).sum(axis=-1).mean()
        # Plain float, like every other value stored in the summary.
        cluster_info["mean_entropy"] = float(cluster_entropy)

        # Class composition of the cluster
        c_labels = t_labels[c_l_start:c_l_end]
        cluster_info["classification_labels"] = []
        for e_c in np.unique(c_labels):
            fraction_in_cluster = np.count_nonzero(c_labels == e_c) / c_labels.shape[0]
            cluster_info["classification_labels"].append({
                "label": int(e_c),
                "cluster_proportion": fraction_in_cluster
            })

        cluster_option_info["clusters"].append(cluster_info)

    return cluster_option_info

In [57]:
# Inputs of the cluster analysis: the per-instance cumulative attention and the
# labels of the same training instances.
attn = cumulative_attns
labels = y_train
# get_attn_clusters_info reads SEED from the notebook namespace for the KMeans
# random_state, so it has to be defined before the call.
SEED = 11
cluster_info = get_attn_clusters_info(attn,labels,n_clusters=4)

Exception ignored on calling ctypes callback function: <function _ThreadpoolInfo._find_modules_with_dl_iterate_phdr.<locals>.match_module_callback at 0x7d7fcdf7b2e0>
Traceback (most recent call last):
  File "/home/diego/anaconda3/envs/tabtrans/lib/python3.11/site-packages/threadpoolctl.py", line 400, in match_module_callback
    self._make_module_from_path(filepath)
  File "/home/diego/anaconda3/envs/tabtrans/lib/python3.11/site-packages/threadpoolctl.py", line 515, in _make_module_from_path
    module = module_class(filepath, prefix, user_api, internal_api)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/diego/anaconda3/envs/tabtrans/lib/python3.11/site-packages/threadpoolctl.py", line 606, in __init__
    self.version = self.get_version()
                   ^^^^^^^^^^^^^^^^^^
  File "/home/diego/anaconda3/envs/tabtrans/lib/python3.11/site-packages/threadpoolctl.py", line 646, in get_version
    config = get_config().split()
             ^^^^^^^^^^^^

In [64]:
# Inspect the top-level entries returned by get_attn_clusters_info
cluster_info.keys()

dict_keys(['n_clusters', 'cluster_labels', 'data_required_sort', 'clusters'])

In [65]:
# (number of training instances, number of input columns)
X_train.shape

(100, 310)

In [73]:
# Row permutation produced by get_attn_clusters_info (by cluster id, then by
# class label inside each cluster)
data_sort = cluster_info["data_required_sort"]

# Sort the array using the indices
# so that sorted_X / sorted_y line up with cluster_info["cluster_labels"] and
# with the start/end indices stored for every cluster
sorted_X = X_train[data_sort]
sorted_y = y_train[data_sort]