In [1]:
import sys
import os

# Absolute path of the project root (machine-specific; change it when running elsewhere)
project_path = "/home/diego-ngz/Git/thesis-tabtrans"

# Put the project root on the module search path; presumably this is what lets the
# project-local `utils` import below resolve -- TODO confirm
sys.path.append(project_path)

from utils import data, tabtrans_file, plots,attention, training, attention_file
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
import pickle
import plotly.express as px
from sklearn.cluster import KMeans
from skorch.callbacks import Checkpoint, TrainEndCheckpoint, EarlyStopping, LoadInitState, EpochScoring, Checkpoint
import skorch
import torch.nn as nn
import torch



from sklearn import (
                    linear_model, 
                    pipeline, 
                    neighbors, 
                    base, 
                    model_selection, 
                    tree, 
                    feature_selection, 
                    neural_network,
                    cluster
                )

/home/diego-ngz/Git/thesis-tabtrans


In [2]:
# Dataset selection and the directory layout derived from it.
# df_id is the identifier handed to data.import_data / data.get_dataset_name
# (the captured log line below mentions OpenML "credit-g").
df_id = 31
X_train, X_test, y_train, y_test, train_indices, val_indices, n_instances, n_labels, n_numerical, n_categories = data.import_data(df_id)
# Number of columns of X_train (a later cell reassigns n_features to this value + 1)
n_features = X_train.shape[1]

name_df = data.get_dataset_name(df_id)

# Root folder holding everything stored for this dataset
path_of_datset = f'{project_path}/Final_models_4/{name_df}'

# Folder with the hyperparameter-selection results
path_to_hyperparameters = f'{path_of_datset}/tabtrans/hyperparameter_selection'

# Folder with the final cross-validated TabTransformer models
path_to_final_tabtrans = f'{path_of_datset}/tabtrans/final_tabtrans_cv'

# `sample` selects the sub-folder of the hyperparameter search to read from
# (presumably the training-sample size/percentage -- TODO confirm)
sample = 100
path_of_size = f'{path_to_hyperparameters}/{sample}'
path_of_results = f'{path_of_size}/results.csv'


INFO:openml.datasets.dataset:pickle write credit-g


In [3]:
# Read the selected hyperparameters from results.csv and cast each one to int.
hyperparameters = data.import_hyperparameters(path_of_results, cv = True)
n_layers = int(hyperparameters["n_layers"])
n_heads = int(hyperparameters["n_heads"])
embedding_size = int(hyperparameters["embedding_size"])
batch_size = int(hyperparameters["batch_size"])
epochs = int(hyperparameters["max_epochs_mean"])
# One more than the number of input columns; presumably the extra position is the
# CLS token used by the "cls" aggregator -- TODO confirm against the module code
n_features = X_train.shape[1]+1
aggregator = "cls"

In [4]:
# Rebuild the network with the same architecture that was used for training and
# load its trained weights from a checkpoint. No training happens in this cell.

# Fixed architecture settings
ff_pw_size = 30  # size of the MLP inside each transformer encoder layer (value from the paper)
attn_dropout = 0.3  # paper value
ff_dropout = 0.1  # paper value
aggregator = "cls"
aggregator_parameters = None
decoder_hidden_units = [128,64]  # paper value
decoder_activation_fn = nn.ReLU()
need_weights = False  # the attention cubes are only returned once this is set to True
numerical_passthrough = False

# PyTorch module
module = training.build_module(
    n_categories, # List of number of categories
    n_numerical, # Number of numerical features
    n_heads, # Number of heads per layer
    ff_pw_size, # Size of the MLP inside each transformer encoder layer
    n_layers, # Number of transformer encoder layers
    n_labels, # Number of output neurons
    embedding_size,
    attn_dropout,
    ff_dropout,
    aggregator, # The aggregator for output vectors before decoder
    rnn_aggregator_parameters=aggregator_parameters,
    decoder_hidden_units=decoder_hidden_units,
    decoder_activation_fn=decoder_activation_fn,
    need_weights=need_weights,
    numerical_passthrough=numerical_passthrough
)

# skorch wrapper; it has to describe the same model that produced the checkpoint.
model = skorch.NeuralNetClassifier(
        module = module,
        criterion=torch.nn.CrossEntropyLoss,
        optimizer=torch.optim.AdamW,
        # Use the GPU when one is available and fall back to the CPU otherwise,
        # so the notebook also runs on machines without CUDA.
        device="cuda" if torch.cuda.is_available() else "cpu",
        batch_size = batch_size,
        train_split = None,
        max_epochs = 10,
        optimizer__lr=1e-4,
        optimizer__weight_decay=1e-4,
    )

# Initialize the model so that parameters can be loaded into it
model.initialize()

# Path to train_end_params.pt, built from the configured dataset folder and sample
# instead of repeating the absolute path. "epoch_12" is the checkpoint folder that
# was hard-coded originally.
checkpoint_path = f"{path_to_final_tabtrans}/{sample}/checkpoints/epoch_12/train_end_params.pt"

# Load the saved parameters
model.load_params(f_params = checkpoint_path)




In [5]:
# Result of the project helper attention_file.attention_matrix for the training set;
# presumably an attention matrix aggregated over instances -- TODO confirm in utils/attention_file
matrix = attention_file.attention_matrix(model, X_train, y_train, n_numerical, n_layers, n_heads, n_features)

In [None]:
# ATTENTION CUBES
# Collects, for every training instance, the cumulative attention vector of the
# last encoder layer and stacks the vectors into the 2-D array `cumulative_attns`.

"""
Eneabling and extracting the attention cubes.

To eneable the attention cubes recovering, the only requirement is to 
set the PyTorch module need_weights=True. When the cubes are required the
new output will be:

    - predictions: The predictionsfor the given instances
    - layer outputs: The output of each encoder layer
    - weights: The attention cube of each encoder

In skorch, the trained PyTorch module is saved in the variable .module_.

When using skorch, the only way to recover multiple outputs is by
using the forward/forward_iter method.
"""

# From here on the module returns three outputs instead of predictions only
model.module_.need_weights = True
cumulative_attns = []

# One forward pass per training instance (y_inst is not used inside the loop)
for X_inst, y_inst in zip(X_train, y_train):
    # Indexing with None adds a leading axis of length 1, so each call is a batch
    # holding a single instance; that is why the batch dimension asserted below is 1.
    # The first n_numerical columns go in as float32, the remaining ones as int32.
    pred, layer_outputs, attn = model.forward(X={
        "x_numerical": X_inst[None, :n_numerical].astype(np.float32),
        "x_categorical": X_inst[None, n_numerical:].astype(np.int32)
        })
        
    """
    The attention cubes dimensions are:
    
    (num. layers, batch size, num. heads, num. features, num. features)
    #Why does the batch size is 1?
    """
    assert attn.shape == (n_layers, 1, n_heads, n_features, n_features) 
    
    """
    To compute the cumulative attention we provide a function in:
    
        utils.attention.compute_std_attentions(attention, aggregator)
        
    The function returns:
        The inidivual attention (non cumulative) of each layer. Shape:  (num layers, batch size, num. features)
        The cumulative attention at each layer. Shape: (num layers, batch size, num. features)
        
    The last layerof the cumulative attention represents the cumulative attention over all
    Transformer Encoders.
    """
    
    ind_attn, cum_attn = attention.compute_std_attentions(attn, aggregator)
    
    assert ind_attn.shape == (n_layers, 1, n_features)
    assert cum_attn.shape == (n_layers, 1, n_features)
    
    # Keep the last layer's cumulative attention of the single instance in the batch
    cumulative_attns.append( cum_attn[-1, 0])
    
# Stack the per-instance vectors: one row per instance, n_features columns
cumulative_attns = np.array(cumulative_attns)

In [None]:
# Sanity check: one row per processed training instance, n_features columns
cumulative_attns.shape

In [None]:
# Group the instances by the similarity of their cumulative attention vectors.
# KMeans with 4 clusters; random_state is fixed so the centroid initialisation is repeatable
kmeans = KMeans(n_clusters=4, random_state=11)

# Fit the model to the data (rows = instances, columns = attention per feature)
kmeans.fit(cumulative_attns)

# Cluster assignment of every row of cumulative_attns, in the same row order
cluster_labels = kmeans.labels_

In [None]:
# Show the cluster assigned to each instance
cluster_labels

In [None]:
# Step 1: indices that sort cluster_labels in ascending order, i.e. the row order
# that places all instances of cluster 0 first, then cluster 1, and so on
sorted_indices = np.argsort(cluster_labels)
sorted_indices

In [None]:
# Step 2: apply the cluster ordering to the attention rows and to the class labels.
# Both are indexed with the same sorted_indices, so row k of one still corresponds
# to entry k of the other.
sorted_cumulative_attns = cumulative_attns[sorted_indices]
sorted_y_train = y_train[sorted_indices]


In [None]:
# Reorder the cluster labels with the same indices, so that instances, class labels
# and cluster labels all share one row order (ascending by cluster)
sorted_cluster_labels = cluster_labels[sorted_indices]
sorted_cluster_labels

In [None]:
# Positions (in the cluster-sorted order) of the last instance of every cluster
# except the final one: a position is kept when the element right after it
# belongs to a different cluster.
last_cluster_indices = [
    position
    for position, (current_cluster, following_cluster) in enumerate(
        zip(sorted_cluster_labels[:-1], sorted_cluster_labels[1:])
    )
    if current_cluster != following_cluster
]
    

In [None]:
# Number of columns of the sorted attention array (length of one attention vector)
print(sorted_cumulative_attns.shape[1])

In [None]:
# Insert an all-NaN separator row after the last instance of every cluster
# (except the final cluster) so that clusters show up as separate bands in the heatmap.
#
# Both padded arrays are rebuilt from the unpadded inputs (`cumulative_attns`,
# `y_train`, `sorted_indices`) every time this cell runs. Re-running the cell
# therefore yields the same result instead of stacking additional NaN rows
# onto arrays that were already padded.
len_cumm_vector = cumulative_attns.shape[1]

# A single separator row: one NaN per attention column
nan_row = np.full(len_cumm_vector, np.nan)

# Row positions, expressed in the unpadded arrays, before which a separator is
# inserted. np.insert interprets a sequence of positions relative to the input
# array, so no reverse iteration is needed to keep the positions valid.
separator_positions = np.asarray(last_cluster_indices, dtype=int) + 1

sorted_cumulative_attns = np.insert(cumulative_attns[sorted_indices], separator_positions, nan_row, axis=0)
# The labels are cast to float first because NaN cannot be stored in an integer array
sorted_y_train = np.insert(y_train[sorted_indices].astype(float), separator_positions, np.nan, axis=0)

In [None]:
# Exclusive end row of every cluster inside the NaN-padded `sorted_cumulative_attns`.
#
# last_cluster_indices[k] is the position of the last instance of cluster k before
# padding. In the padded array k separator rows precede it, so the separator that
# follows cluster k sits at last_cluster_indices[k] + (k + 1).
upper_bound_index = [
    last_index + shift
    for shift, last_index in enumerate(last_cluster_indices, start=1)
]

# The final cluster has no separator after it, so its exclusive end is the array
# length. The previous value, len(...) - 1, dropped the last instance, because the
# cell that splits the clusters slices with [start_idx:end_idx].
upper_bound_index.append(len(sorted_cumulative_attns))

print(upper_bound_index)

In [None]:
# Within every cluster, group the instances by class label (0 first, then 1) and
# put a NaN row between the two groups; clusters stay separated by one NaN row.
#
# The per-cluster slices use their own names (`members_attns`, `members_y`), so the
# KMeans assignments stored in `cluster_labels` are not overwritten and the
# earlier sorting cells can still be re-run.

# Pieces that are concatenated at the end
final_sorted_cumulative_attns = []
final_sorted_y_train = []

# Copy of the bounds that is shifted whenever a label separator is inserted;
# the original list is left untouched
new_upper_bound_index = upper_bound_index.copy()

n_columns = sorted_cumulative_attns.shape[1]

# Process each cluster individually
start_idx = 0
for i, end_idx in enumerate(upper_bound_index):

    # Rows of this cluster; end_idx is used as an exclusive slice end
    members_attns = sorted_cumulative_attns[start_idx:end_idx]
    members_y = sorted_y_train[start_idx:end_idx]

    # Split by class label (only the labels 0 and 1 are picked up)
    is_zero = members_y == 0
    is_one = members_y == 1
    data_zeros, data_ones = members_attns[is_zero], members_attns[is_one]
    labels_zeros, labels_ones = members_y[is_zero], members_y[is_one]

    if len(data_zeros) > 0 and len(data_ones) > 0:
        # Both labels present: one NaN row between the 0-group and the 1-group
        combined_cluster = np.vstack([data_zeros, np.full((1, n_columns), np.nan), data_ones])
        combined_labels = np.concatenate([labels_zeros, [np.nan], labels_ones])

        # The extra row pushes the bound of this cluster and of all later clusters down by one
        for j in range(i, len(new_upper_bound_index)):
            new_upper_bound_index[j] += 1

    else:
        # Only one label present: nothing to separate
        combined_cluster = np.vstack([data_zeros, data_ones])
        combined_labels = np.concatenate([labels_zeros, labels_ones])

    final_sorted_cumulative_attns.append(combined_cluster)
    final_sorted_y_train.append(combined_labels)

    # NaN row that separates this cluster from the next one
    final_sorted_cumulative_attns.append(np.full((1, n_columns), np.nan))
    final_sorted_y_train.append([np.nan])

    # Skip the separator row that follows this cluster in the padded input
    start_idx = end_idx + 1

# Concatenate everything; the separator appended after the last cluster is dropped
final_sorted_cumulative_attns = np.vstack(final_sorted_cumulative_attns[:-1])
final_sorted_y_train = np.concatenate(final_sorted_y_train[:-1])


In [None]:
# Inclusive [first_row, last_row] interval of every cluster in the final array.
#
# Every entry of new_upper_bound_index is handled the same way: the cluster's last
# row is the entry minus 1. The former special case `i == number_of_clusters` could
# never be true inside range(number_of_clusters) and has been removed.

# One bound per cluster, so the count follows the data instead of repeating a literal
number_of_clusters = len(new_upper_bound_index)

clusters = {}

initial_index = 0

for i in range(number_of_clusters):
    end_index = new_upper_bound_index[i] - 1

    clusters[f"cluster_{i}"] = [initial_index, end_index]

    # The next cluster starts two rows further down: one past this cluster's
    # last row is the NaN separator, the row after that is the next first row
    initial_index = end_index + 2

In [None]:
# Inspect the cluster intervals next to the bounds they were derived from
print(clusters)
print("--------------------")
print(new_upper_bound_index)

In [None]:
# For every cluster, the row intervals covered by each class label.
# Result: {cluster name: [{'label': ..., 'start': first row, 'end': last row}, ...]}
# NaN entries (separator rows) never belong to an interval.
label_intervals = {}

for cluster, (start, end) in clusters.items():
    intervals = []  # intervals found inside this cluster

    current_label = None  # label of the interval being built, None when there is none yet
    label_start = None    # first row of the interval being built
    label_end = None      # last non-NaN row seen for the interval being built

    for idx in range(start, end + 1):
        value = final_sorted_y_train[idx]
        if np.isnan(value):  # separator row
            continue

        if current_label is not None and value != current_label:
            # The label changed: close the interval at the last row that carried
            # the old label. Using idx - 1 here would point at the NaN separator
            # between the two label groups.
            intervals.append({
                'label': current_label,
                'start': label_start,
                'end': label_end
            })
            current_label = None

        if current_label is None:
            # Open a new interval at this row
            current_label = value
            label_start = idx

        label_end = idx

    # Close the interval that is still open when the cluster ends
    if current_label is not None:
        intervals.append({
            'label': current_label,
            'start': label_start,
            'end': label_end
        })

    label_intervals[cluster] = intervals


In [None]:
# Show the label intervals of all clusters
label_intervals

In [None]:
# Debug output: the intervals stored for the first cluster, then each entry with its type
print(label_intervals["cluster_0"])
for x in label_intervals["cluster_0"]:
    print(x)
    print(type(x)) 

In [None]:
# Heatmap of the cumulative attention (rows = instances grouped by cluster and
# class label, NaN rows = blank separators) with a bracket per cluster on the
# left and a bracket per class label on the right.
fig = px.imshow(final_sorted_cumulative_attns, color_continuous_scale='Inferno')

# Title centred, no y tick labels, no y grid, white background
fig.update_layout(title = f"{name_df}",title_x=0.5, yaxis_showticklabels=False, yaxis=dict(showgrid=False),plot_bgcolor='white')

# Width of the image = number of columns of the plotted array
image_width = final_sorted_cumulative_attns.shape[1]

# x position of the right-hand brackets, a few columns past the image edge
right_x_position = image_width + 5


def _draw_segment(x0, x1, y0, y1):
    """Add one thin black straight line from (x0, y0) to (x1, y1) to `fig`."""
    fig.add_shape(type="line", x0=x0, x1=x1, y0=y0, y1=y1, line=dict(color="black", width=1))


def _write_text(x, y, text):
    """Add a black size-16 text annotation without an arrow to `fig`."""
    fig.add_annotation(x=x, y=y, text=text, showarrow=False, font=dict(size=16, color='black'))


for i in range(number_of_clusters):
    cluster_first, cluster_last = clusters[f"cluster_{i}"]
    cluster_mid = (cluster_first + cluster_last) / 2

    _write_text(-13, cluster_mid, f"C{i}")

    # LEFT SIDE: bracket shaped like " [ " that spans the whole cluster
    _draw_segment(-5, -5, cluster_first, cluster_last)  # vertical part
    _draw_segment(-5, -4, cluster_first, cluster_first)  # upper horizontal part
    _draw_segment(-5, -4, cluster_last, cluster_last)    # lower horizontal part
    _draw_segment(-6, -5, cluster_mid, cluster_mid)      # middle tick pointing at the text

    # RIGHT SIDE: one mirrored bracket per class-label interval of this cluster,
    # each entry looking like {'label': 0.0, 'start': 0, 'end': 1}
    for span in label_intervals[f"cluster_{i}"]:
        span_first = span['start']
        span_last = span['end']
        span_mid = (span_first + span_last) / 2

        _write_text(right_x_position + 10, span_mid, f"L{int(span['label'])}")
        _draw_segment(right_x_position, right_x_position, span_first, span_last)      # vertical part
        _draw_segment(right_x_position, right_x_position - 1, span_first, span_first)  # upper horizontal part
        _draw_segment(right_x_position, right_x_position - 1, span_last, span_last)    # lower horizontal part
        _draw_segment(right_x_position + 1, right_x_position, span_mid, span_mid)      # middle tick


fig.show()