# Modeling Setup

In [1]:
# First, set the essential hyper-parameters for modeling and select the dataset you would like to model; then run each code cell sequentially.
# Note: this code requires a GPU to execute (e.g. T4 or L4, with high-mem enabled)

# Hyper-parameters for this run (edit the values, not the names):
model_type = 'base_sub_no_mask' # valid inputs: 'base_no_mask' (adapt a single DTree) or 'base_sub_no_mask' (adapt a DTree with several sub-trees)
tree_depth = 10 # depth of the symbolic DTree to adapt
masking = False # whether to perform dropout during logical regularizer computation; defaults to False
bayes_opt = True # whether to perform Bayesian optimization of the hyper-params

# Select the dataset to model: leave exactly one group below uncommented and comment out the others.
# dataset = 'Higgs' # valid inputs: Higgs, Census, Credit, Insurance, Cover
# non_ordinal_cat = False # set True if there exists non-ordinal Categorical feature(s) in above dataset
# cat_as_num = False # set True to treat the non-ordinal Categorical feature(s) in above dataset as numericals; default to False

# dataset = 'Census'
# non_ordinal_cat = True
# cat_as_num = False

# dataset = 'Credit'
# non_ordinal_cat = False
# cat_as_num = False

# dataset = 'Insurance'
# non_ordinal_cat = True
# cat_as_num = False

# Currently active selection:
dataset = 'Cover'
non_ordinal_cat = True
cat_as_num = False


In [None]:
!pip install category_encoders
!pip install optuna

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim

import pandas as pd
import numpy as np
import math

import sklearn
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import preprocessing
from category_encoders import TargetEncoder
from sklearn.tree import export_text
import optuna
import pickle
from sklearn.preprocessing import QuantileTransformer
from imblearn.over_sampling import RandomOverSampler

import copy
from itertools import chain
import random
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.tree import _tree

torch.set_printoptions(sci_mode = False)

In [4]:
class EarlyStopper:
    """Track the best validation loss and signal when training should stop.

    patience:  number of non-improving checks tolerated before stopping.
    min_delta: tolerance band above the best loss; a loss inside
               [best, best + min_delta) neither resets nor advances the counter.
    """

    def __init__(self, patience = 1, min_delta = 0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.min_validation_loss = np.inf

    def early_stop(self, validation_loss):
        """Record one validation loss; return True once patience is exhausted."""
        best_so_far = self.min_validation_loss

        # A strictly better loss becomes the new reference and clears the counter.
        if validation_loss < best_so_far:
            self.min_validation_loss = validation_loss
            self.counter = 0
            return False

        # Only a loss at or beyond the tolerance band counts as a bad check.
        # (Written as a positive test so that NaN, which compares False, is ignored.)
        is_bad_check = validation_loss >= (best_so_far + self.min_delta)
        if not is_bad_check:
            return False

        self.counter += 1
        if self.counter >= self.patience:
            return True
        return False


In [5]:
def get_rules(tree, feature_names, class_names):
    """Collect the root-to-leaf decision paths of a fitted tree as text conditions.

    tree:          object exposing a `tree_` attribute with `feature`, `threshold`,
                   `children_left` and `children_right` arrays
                   (presumably a fitted sklearn decision tree -- confirm against callers).
    feature_names: indexable mapping a feature index to its display name.
    class_names:   accepted for interface compatibility; not used here.

    Returns a list with one entry per leaf, ordered left-to-right; each entry is the
    list of "<name> <= <thr>" / "<name> > <thr>" strings leading to that leaf,
    with thresholds rounded to 3 decimals.
    """
    fitted = tree.tree_
    leaf_marker = _tree.TREE_UNDEFINED

    leaf_paths = []
    # Explicit depth-first stack of (node id, conditions accumulated so far).
    pending = [(0, [])]

    while pending:
        node_id, conditions = pending.pop()
        split_feature = fitted.feature[node_id]

        if split_feature == leaf_marker:
            # Leaf reached: the accumulated conditions form one complete rule.
            leaf_paths.append(conditions)
            continue

        label = feature_names[split_feature]
        cutoff = np.round(fitted.threshold[node_id], 3)

        # Push the right child first so the left subtree is expanded first.
        pending.append((fitted.children_right[node_id], conditions + [f"{label} > {cutoff}"]))
        pending.append((fitted.children_left[node_id], conditions + [f"{label} <= {cutoff}"]))

    return leaf_paths


In [6]:
# Neural Symbolic Decision Tree (denoted as NSDT) is the model developed according to our proposed FNLR paradigm:

In [7]:
class MLP(nn.Module):
    """Feed-forward network used in two roles, selected by `MLP_type`.

    MLP_type == 'module':
        Two hidden layers plus a sigmoid-activated scalar output. The input width is
        `embedding_dimension * 2` (two embeddings concatenated).
    any other MLP_type (the file uses 'target'):
        A single linear layer mapping `len(rules)` inputs to one raw score.
        `rules` is a module-level global that must exist before construction;
        the remaining constructor arguments are accepted but ignored in this role.

    Parameters
    ----------
    MLP_type : str
    embedding_dimension : int
    hidden_init : str
        'xavier' or 'he' re-initialise the linear weights; any other value keeps
        PyTorch's default initialisation.
    hidden_nodes_layer1, hidden_nodes_layer2 : int
        Widths of the two hidden layers.
    activation : str
        One of 'leakyrelu', 'relu', 'elu', 'gelu', 'tanh'.

    Raises
    ------
    ValueError
        If `MLP_type == 'module'` and `activation` is not a supported name.
        (Previously this left `self.act` undefined and only failed inside `forward`.)
    """

    # Supported activation names -> constructors.
    _ACTIVATIONS = {
        'leakyrelu': nn.LeakyReLU,
        'relu': nn.ReLU,
        'elu': nn.ELU,
        'gelu': nn.GELU,
        'tanh': nn.Tanh,
    }

    # Supported explicit weight initialisers; other names fall back to the PyTorch default.
    _INITIALIZERS = {
        'xavier': nn.init.xavier_uniform_,
        'he': nn.init.kaiming_uniform_,
    }

    def __init__(self, MLP_type, embedding_dimension, hidden_init, hidden_nodes_layer1, hidden_nodes_layer2, activation):
        super().__init__()
        self.MLP_type = MLP_type

        if self.MLP_type == 'module':
            # Fail fast on a misspelled activation instead of crashing later in forward().
            if activation not in self._ACTIVATIONS:
                raise ValueError(
                    f"unsupported activation {activation!r}; expected one of {sorted(self._ACTIVATIONS)}"
                )
            self.act = self._ACTIVATIONS[activation]()
            self.sig = nn.Sigmoid()

            self.l1 = nn.Linear(embedding_dimension * 2, hidden_nodes_layer1)
            self.l2 = nn.Linear(hidden_nodes_layer1, hidden_nodes_layer2)
            self.output = nn.Linear(hidden_nodes_layer2, 1)

            init_fn = self._INITIALIZERS.get(hidden_init)
            if init_fn is not None:
                # Same layer order as before so seeded runs stay reproducible.
                for layer in (self.l1, self.l2, self.output):
                    init_fn(layer.weight)

        else:
            self.output = nn.Linear(len(rules), 1) # rules is global still
            nn.init.xavier_uniform_(self.output.weight)

    def forward(self, inputs):
        """Return sigmoid scores for the 'module' role, raw linear scores otherwise."""
        if self.MLP_type == 'module':
            hidden = self.act(self.l1(inputs))
            hidden = self.act(self.l2(hidden))
            return self.sig(self.output(hidden))
        return self.output(inputs)



class NSDT(nn.Module):
    """Neural Symbolic Decision Tree with "le", "ge" and "be" node modules.

    Every tree node is scored by a small MLP applied to the concatenation of a feature
    embedding and a context-value embedding. Node scores are gathered per rule through
    the global `rule_look_up_indexes`, multiplied along the rule (dim 2), and the
    per-rule products are projected to a single score by `target_projection`.

    Relies on module-level globals that must be defined before use:
    `be_node_counter` (size of the context-value embedding table), `rules`,
    `tree_depth` and `rule_look_up_indexes`.

    Parameters
    ----------
    masking : bool
        When True, node outputs are randomly dropped inside `forward` (see there).
    total_levels : int
        Number of rows of the feature embedding table.
    embedding_dimension, hidden_init, hidden_nodes_layer1, hidden_nodes_layer2, activation
        Forwarded unchanged to every `MLP`.
    """

    def __init__(self, masking, total_levels, embedding_dimension, hidden_init, hidden_nodes_layer1, hidden_nodes_layer2, activation):
        super().__init__()
        self.total_levels = total_levels
        self.embedding_dimension = embedding_dimension
        self.hidden_init = hidden_init
        self.hidden_nodes_layer1 = hidden_nodes_layer1
        self.hidden_nodes_layer2 = hidden_nodes_layer2
        self.activation = activation
        self.NSDT_masking = masking

        mlp_args = (self.embedding_dimension, self.hidden_init, self.hidden_nodes_layer1, self.hidden_nodes_layer2, self.activation)

        # Construction order is kept (ge, le, be, target, embeddings) so seeded runs stay reproducible.
        self.ge_module = MLP('module', *mlp_args)
        self.le_module = MLP('module', *mlp_args)
        self.be_module = MLP('module', *mlp_args)
        self.target_projection = MLP('target', *mlp_args) # also passed in, just not used

        self.num_feature_embeddings = nn.Embedding(self.total_levels, self.embedding_dimension)
        self.context_value_embeddings = nn.Embedding(be_node_counter, self.embedding_dimension) # be_node_counter global variable still

    def _node_outputs(self, module, feature_index_list, context_value_index_list, device):
        """Score one node group with `module`; returns a (batch, nodes) tensor.

        feature_index_list is a (nodes, batch) index tensor; context_value_index_list
        must convert to an index tensor of the same shape.
        """
        feature_embed = self.num_feature_embeddings(feature_index_list).to(device) # nodes x batch x emb
        lookup_tensor = torch.tensor(context_value_index_list, dtype = torch.long).to(device)
        context_embed = self.context_value_embeddings(lookup_tensor).to(device) # nodes x batch x emb

        # requires_grad_(True) is a no-op while autograd is already tracking; kept from the original.
        module_input = torch.cat( [feature_embed, context_embed], dim = 2 ).requires_grad_(True)
        module_output = module(module_input).requires_grad_(True) # nodes x batch x 1

        # Squeeze ONLY the trailing size-1 axis: a dimension-less squeeze would also drop
        # the batch (or node) axis when its size is 1 and make the permute below fail.
        return module_output.squeeze(2).permute(1, 0).requires_grad_(True)

    def forward(self, batch_le_feature_index_list, batch_le_context_value_index_list,
                batch_ge_feature_index_list, batch_ge_context_value_index_list,
                batch_be_feature_index_list, batch_be_context_value_index_list, batch_size):
        """Return one raw score per batch row, shape (batch, 1).

        With masking enabled, each gathered node output is zeroed with probability 0.05
        and every zero entry is then replaced by one scalar drawn from `random.random()`.
        """
        # Follow the device the model lives on (equals 'cuda' when the model was moved there).
        device = self.num_feature_embeddings.weight.device

        le_nodes_output = self._node_outputs(self.le_module, batch_le_feature_index_list, batch_le_context_value_index_list, device)
        ge_nodes_output = self._node_outputs(self.ge_module, batch_ge_feature_index_list, batch_ge_context_value_index_list, device)
        be_nodes_output = self._node_outputs(self.be_module, batch_be_feature_index_list, batch_be_context_value_index_list, device)

        # Append an all-ones column after the le/ge/be node outputs (multiplicative identity for the product).
        padding_tensor = torch.ones( (batch_size, 1) ).to(device)
        nodes_output = torch.cat( [le_nodes_output, ge_nodes_output, be_nodes_output, padding_tensor], dim = 1 ).requires_grad_(True)

        if (self.NSDT_masking == True):
            nodes_dropout = torch.tensor( np.random.choice( [0, 1], size = (len(rules), tree_depth), p=[0.05, 0.95] ) ).to(device)
            nodes_output = nodes_output[:, rule_look_up_indexes] * nodes_dropout
            nodes_output = torch.where( nodes_output != 0, nodes_output, random.random() ).requires_grad_(True)

            rule_outputs = torch.prod(nodes_output, dim = 2).requires_grad_(True)

        else:
            rule_outputs = torch.prod(nodes_output[:, rule_look_up_indexes], dim = 2).requires_grad_(True)

        return self.target_projection(rule_outputs)



class NSDT_num_feature_only(nn.Module):
    """Neural Symbolic Decision Tree variant whose forward pass uses only "le" and "ge" nodes.

    Every tree node is scored by a small MLP applied to the concatenation of a feature
    embedding and a context-value embedding. Node scores are gathered per rule through
    the global `rule_look_up_indexes`, multiplied along the rule (dim 2), and the
    per-rule products are projected to a single score by `target_projection`.

    `be_module` is still constructed (as in the original) although `forward` never calls it.

    Relies on module-level globals that must be defined before use:
    `be_node_counter` (size of the context-value embedding table), `rules`,
    `tree_depth` and `rule_look_up_indexes`.

    Parameters
    ----------
    masking : bool
        When True, node outputs are randomly dropped inside `forward` (see there).
    total_levels : int
        Number of rows of the feature embedding table.
    embedding_dimension, hidden_init, hidden_nodes_layer1, hidden_nodes_layer2, activation
        Forwarded unchanged to every `MLP`.
    """

    def __init__(self, masking, total_levels, embedding_dimension, hidden_init, hidden_nodes_layer1, hidden_nodes_layer2, activation):
        super().__init__()
        self.total_levels = total_levels
        self.embedding_dimension = embedding_dimension
        self.hidden_init = hidden_init
        self.hidden_nodes_layer1 = hidden_nodes_layer1
        self.hidden_nodes_layer2 = hidden_nodes_layer2
        self.activation = activation
        self.NSDT_masking = masking

        mlp_args = (self.embedding_dimension, self.hidden_init, self.hidden_nodes_layer1, self.hidden_nodes_layer2, self.activation)

        # Construction order is kept (ge, le, be, target, embeddings) so seeded runs stay reproducible.
        self.ge_module = MLP('module', *mlp_args)
        self.le_module = MLP('module', *mlp_args)
        self.be_module = MLP('module', *mlp_args)
        self.target_projection = MLP('target', *mlp_args) # also passed in, just not used

        self.num_feature_embeddings = nn.Embedding(self.total_levels, self.embedding_dimension)
        self.context_value_embeddings = nn.Embedding(be_node_counter, self.embedding_dimension) # be_node_counter global variable still

    def _node_outputs(self, module, feature_index_list, context_value_index_list, device):
        """Score one node group with `module`; returns a (batch, nodes) tensor.

        feature_index_list is a (nodes, batch) index tensor; context_value_index_list
        must convert to an index tensor of the same shape.
        """
        feature_embed = self.num_feature_embeddings(feature_index_list).to(device) # nodes x batch x emb
        lookup_tensor = torch.tensor(context_value_index_list, dtype = torch.long).to(device)
        context_embed = self.context_value_embeddings(lookup_tensor).to(device) # nodes x batch x emb

        # requires_grad_(True) is a no-op while autograd is already tracking; kept from the original.
        module_input = torch.cat( [feature_embed, context_embed], dim = 2 ).requires_grad_(True)
        module_output = module(module_input).requires_grad_(True) # nodes x batch x 1

        # Squeeze ONLY the trailing size-1 axis: a dimension-less squeeze would also drop
        # the batch (or node) axis when its size is 1 and make the permute below fail.
        return module_output.squeeze(2).permute(1, 0).requires_grad_(True)

    def forward(self, batch_le_feature_index_list, batch_le_context_value_index_list,
                batch_ge_feature_index_list, batch_ge_context_value_index_list, batch_size):
        """Return one raw score per batch row, shape (batch, 1).

        With masking enabled, each gathered node output is zeroed with probability 0.3
        and every zero entry is then replaced by one scalar drawn from `random.random()`.
        """
        # Follow the device the model lives on (equals 'cuda' when the model was moved there).
        device = self.num_feature_embeddings.weight.device

        le_nodes_output = self._node_outputs(self.le_module, batch_le_feature_index_list, batch_le_context_value_index_list, device)
        ge_nodes_output = self._node_outputs(self.ge_module, batch_ge_feature_index_list, batch_ge_context_value_index_list, device)

        # Append an all-ones column after the le/ge node outputs (multiplicative identity for the product).
        padding_tensor = torch.ones( (batch_size, 1) ).to(device)
        nodes_output = torch.cat( [le_nodes_output, ge_nodes_output, padding_tensor], dim = 1 ).requires_grad_(True)

        if (self.NSDT_masking == True):
            nodes_dropout = torch.tensor( np.random.choice( [0, 1], size = (len(rules), tree_depth), p=[0.3, 0.7] ) ).to(device)
            nodes_output = nodes_output[:, rule_look_up_indexes] * nodes_dropout
            nodes_output = torch.where( nodes_output != 0, nodes_output, random.random() ).requires_grad_(True)

            rule_outputs = torch.prod(nodes_output, dim = 2).requires_grad_(True)

        else:
            rule_outputs = torch.prod(nodes_output[:, rule_look_up_indexes], dim = 2).requires_grad_(True)

        return self.target_projection(rule_outputs)


In [8]:
def Train_NSDT_split_regs(masking, train_loader, val_loader, test_loader, early_stopper, new_train, cut_off_dict, feature_lookup_dict, cat_bins_total, total_numerical_nodes, total_levels, bins, embedding_dimension, n_iter, mlp_optimizer, hidden_init, hidden_nodes_layer1, hidden_nodes_layer2, activation, learning_rate, weight_decays, reg_frequency, le_reg_weight, ge_reg_weight, be_reg_weight):

    be_weight = 1.0
    if dataset == 'Higgs':
        be_weight = 2.0

    feature_indexes_le_nodes, feature_indexes_ge_nodes, feature_indexes_be_nodes, all_batch_le_context_value_index_list, all_batch_ge_context_value_index_list, all_batch_be_context_value_index_list, cat_features_Si_indexes, le_orderings_Si_indexes, ge_orderings_Si_indexes, le_ref_lookup_indexes, le_features_Si_indexes, ge_ref_lookup_indexes, ge_features_Si_indexes, le_current_feature_indexes, le_next_feature_indexes, le_expected_outputs_1, le_expected_outputs_2, ge_current_feature_indexes, ge_next_feature_indexes, ge_expected_outputs_1, ge_expected_outputs_2 = NSDT_modeling_preps(bins, train_loader, new_train, cut_off_dict, feature_lookup_dict, cat_bins_total, total_numerical_nodes, total_levels)

    clf = NSDT(masking, total_levels, embedding_dimension, hidden_init, hidden_nodes_layer1, hidden_nodes_layer2, activation).to('cuda')

    in_lookup_tensor = torch.tensor( list( range(total_numerical_nodes, be_node_counter) ), dtype = torch.long).to('cuda')
    total_cat_nodes = be_node_counter - total_numerical_nodes

    relu = torch.nn.ReLU()

    pos_weight = torch.tensor( [training_reweight] ).to('cuda') # training_reweight is global still
    bce_loss = nn.BCEWithLogitsLoss(pos_weight = pos_weight, reduction = 'mean')

    if (mlp_optimizer == 'adam'):
        optimizer = optim.Adam(clf.parameters(), lr = learning_rate, weight_decay = weight_decays)
    else:
        optimizer = optim.AdamW(clf.parameters(), lr = learning_rate, weight_decay = weight_decays)

    print('Start Training:')
    print()

    accs = []
    for epoch in range(n_iter):

        total_loss = 0.0
        batch_count = 0
        reg_count = 0

        for batch in train_loader:

            subset = batch[:,:num_of_predictors] # X
            batch_size = len(batch)

            target = batch[:,num_of_predictors].to('cuda') # y
            target = target.float()

            batch_le_feature_index_list = subset[:, feature_indexes_le_nodes].permute(1,0).to('cuda')
            batch_ge_feature_index_list = subset[:, feature_indexes_ge_nodes].permute(1,0).to('cuda')
            batch_be_feature_index_list = subset[:, feature_indexes_be_nodes].permute(1,0).to('cuda')

            batch_le_context_value_index_list = all_batch_le_context_value_index_list[batch_count]
            batch_ge_context_value_index_list = all_batch_ge_context_value_index_list[batch_count]
            batch_be_context_value_index_list = all_batch_be_context_value_index_list[batch_count]

            preds = clf.forward(batch_le_feature_index_list, batch_le_context_value_index_list,
                                batch_ge_feature_index_list, batch_ge_context_value_index_list,
                                batch_be_feature_index_list, batch_be_context_value_index_list,
                                batch_size)
            preds = torch.squeeze(preds)


            # add regularizers losses once every N batch:
            total_in_loss = torch.zeros(1).to('cuda')
            total_be_loss = torch.zeros(1).to('cuda')

            total_le_loss = torch.zeros(1).to('cuda')
            total_ge_loss = torch.zeros(1).to('cuda')

            le_total_ref_loss = torch.zeros(1).to('cuda')
            le_total_asym_loss = torch.zeros(1).to('cuda')
            le_total_trans_loss = torch.zeros(1).to('cuda')
            le_total_ordering_loss = torch.zeros(1).to('cuda')
            le_total_cutoff_loss = torch.zeros(1).to('cuda')
            le_total_comp_loss = torch.zeros(1).to('cuda')

            ge_total_ref_loss = torch.zeros(1).to('cuda')
            ge_total_asym_loss = torch.zeros(1).to('cuda')
            ge_total_trans_loss = torch.zeros(1).to('cuda')
            ge_total_ordering_loss = torch.zeros(1).to('cuda')
            ge_total_cutoff_loss = torch.zeros(1).to('cuda')
            ge_total_comp_loss = torch.zeros(1).to('cuda')

            if reg_count % reg_frequency == 0:

                # # compute in reg loss:
                # # randomly generate 50 (N x 50) tensor --> 50 x N x 50
                # cat_context_value_embed = clf.context_value_embeddings(in_lookup_tensor).expand(50, -1, -1)
                # negative_samples = torch.randn(50, total_cat_nodes, embedding_dimension).to('cuda') * 0.001 # rand vs randn

                # be_module_input = torch.cat([negative_samples, cat_context_value_embed], dim = 2)
                # total_in_loss += torch.sum( -1.0 * torch.log( 1.0 - clf.be_module(be_module_input) ) )


                # for i in be_Si_indexes:
                #     feature_bin_embed = clf.num_feature_embeddings( i[0] )
                #     feature_context_embed = clf.context_value_embeddings( i[1] )
                #     be_module_input = torch.cat( [feature_bin_embed, feature_context_embed], axis = 1 )
                #     be_module_output = clf.be_module(be_module_input)

                #     # compute set reg loss:
                #     total_be_loss += torch.sum( torch.min( torch.cat( [ (1.0 - be_module_output), (be_module_output - 0.0) ], axis = 1 ), dim = 1 )[0] )

                #     # compute equal reg loss (less efficient; only to verify):
                #     total_be_loss += torch.abs( 1.0 - torch.sum(be_module_output) )
                #     total_be_loss += torch.abs( 1.0 - ( torch.max(be_module_output) - torch.min(be_module_output) ) )

                # compute equal & set regs loss more efficiently:
                for i in cat_features_Si_indexes:
                    feature_bin_embed = clf.num_feature_embeddings( i[0] )
                    feature_context_embed = clf.context_value_embeddings( i[1] )
                    be_module_input = torch.cat( [feature_bin_embed, feature_context_embed], axis = 2 )
                    be_module_output = clf.be_module(be_module_input)

                    # compute set reg loss:
                    total_be_loss += torch.sum( torch.min( torch.cat( [ (1.0 - be_module_output), (be_module_output - 0.0) ], axis = 2 ), dim = 2 )[0] )

                    # # compute equal reg loss:
                    # total_be_loss += torch.sum( torch.abs( 1.0 - torch.sum(be_module_output, dim = 1) ) )
                    # total_be_loss += torch.sum( torch.abs( 1.0 - ( torch.max(be_module_output, 1)[0] - torch.min(be_module_output, 1)[0] ) ) )


                # compute ordering regs:
                if reg_count % (reg_frequency * 6) == 0:

                    starting_index = random.randint(0, bins - 2)
                    ending_index = random.randint(starting_index + 2, bins)

                    for i in le_orderings_Si_indexes:
                        feature_bin_embed = clf.num_feature_embeddings( i[0][starting_index:ending_index] )
                        feature_context_embed = clf.context_value_embeddings( i[1][starting_index:ending_index] )
                        num_module_input = torch.cat( [feature_bin_embed, feature_context_embed], axis = 1 )

                        le_module_output = clf.le_module(num_module_input) # all values should <= first value
                        le_total_ordering_loss += torch.sum( relu( le_module_output - le_module_output[0] ) )
                        le_total_ordering_loss += torch.sum( relu( le_module_output[-1] - le_module_output ) )

                        feature_bin_embed = clf.num_feature_embeddings( i[0] )
                        feature_context_embed = clf.context_value_embeddings( i[1] )
                        num_module_input_allbins = torch.cat( [feature_bin_embed, feature_context_embed], axis = 1 )

                        le_module_output = clf.le_module(num_module_input_allbins)
                        context_index = int( torch.argmin( torch.abs(le_module_output - 0.5) ) )
                        le_total_cutoff_loss += torch.sum( relu( le_module_output[context_index] - le_module_output[:context_index] ) )
                        le_total_cutoff_loss += torch.sum( relu( le_module_output[context_index + 1:] - le_module_output[context_index] ) )

                    for i in ge_orderings_Si_indexes:
                        feature_bin_embed = clf.num_feature_embeddings( i[0][starting_index:ending_index] )
                        feature_context_embed = clf.context_value_embeddings( i[1][starting_index:ending_index] )
                        num_module_input = torch.cat( [feature_bin_embed, feature_context_embed], axis = 1 )

                        ge_module_output = clf.ge_module(num_module_input) # all values should <= last value
                        ge_total_ordering_loss += torch.sum( relu( ge_module_output - ge_module_output[-1] ) )
                        ge_total_ordering_loss += torch.sum( relu( ge_module_output[0] - ge_module_output ) )

                        feature_bin_embed = clf.num_feature_embeddings( i[0] )
                        feature_context_embed = clf.context_value_embeddings( i[1] )
                        num_module_input_allbins = torch.cat( [feature_bin_embed, feature_context_embed], axis = 1 )

                        ge_module_output = clf.ge_module(num_module_input_allbins)
                        context_index = int( torch.argmin( torch.abs(ge_module_output - 0.5) ) )
                        ge_total_cutoff_loss += torch.sum( relu( ge_module_output[context_index] - ge_module_output[context_index + 1:] ) )
                        ge_total_cutoff_loss += torch.sum( relu( ge_module_output[:context_index] - ge_module_output[context_index] ) )


                # compute le nodes ref & asym & trans reg losses 1st:
                current_feature_bin_embed = clf.context_value_embeddings(le_ref_lookup_indexes)
                feature_bin_input = torch.cat( [current_feature_bin_embed, current_feature_bin_embed], axis = 1 )

                le_total_ref_loss += torch.sum( torch.abs( 0.5 - clf.le_module(feature_bin_input) ) )


                for i in range(num_of_numerical_predictors):
                    feature_bin_embed = clf.num_feature_embeddings( le_features_Si_indexes[i][0] )
                    feature_context_embed = clf.context_value_embeddings( le_features_Si_indexes[i][1] )

                    feature_all_embed = torch.cat( [feature_bin_embed, feature_context_embed], axis = 0 )
                    feature_all_embed_1 = feature_all_embed[ torch.randperm(feature_all_embed.size()[0]) ]
                    feature_all_embed_2 = feature_all_embed[ torch.randperm(feature_all_embed.size()[0]) ]

                    ab_module_input = torch.cat( [feature_all_embed_1, feature_all_embed_2], axis = 1 )
                    ba_module_input = torch.cat( [feature_all_embed_2, feature_all_embed_1], axis = 1 )

                    le_total_asym_loss += torch.sum( torch.abs( clf.le_module(ab_module_input) - ( 1.0 - clf.le_module(ba_module_input) ) ) )


                    feature_all_embed_1 = feature_all_embed[ torch.randperm(feature_all_embed.size()[0]) ]
                    feature_all_embed_2 = feature_all_embed[ torch.randperm(feature_all_embed.size()[0]) ]
                    feature_all_embed_3 = feature_all_embed[ torch.randperm(feature_all_embed.size()[0]) ]

                    ab_module_input = torch.cat( [feature_all_embed_1, feature_all_embed_2], axis = 1 )
                    bc_module_input = torch.cat( [feature_all_embed_2, feature_all_embed_3], axis = 1 )
                    ac_module_input = torch.cat( [feature_all_embed_1, feature_all_embed_3], axis = 1 )

                    ab_le_output = clf.le_module(ab_module_input)
                    bc_le_output = clf.le_module(bc_module_input)
                    ac_le_output = clf.le_module(ac_module_input)

                    le_total_trans_loss += torch.sum( torch.sigmoid( (ab_le_output - 0.5) * 1e7 ) * torch.sigmoid( (bc_le_output - 0.5) * 1e7 ) * relu( torch.max(torch.cat( [ ab_le_output, bc_le_output ], axis = 1 ), 1)[0].view(-1, 1) - ac_le_output ) )
                    le_total_trans_loss += torch.sum( torch.sigmoid( (0.5 - ab_le_output) * 1e7 ) * torch.sigmoid( (0.5 - bc_le_output) * 1e7 ) * relu( ac_le_output - torch.min(torch.cat( [ ab_le_output, bc_le_output ], axis = 1 ), 1)[0].view(-1, 1) ) )

                    ba_module_input = torch.cat( [feature_all_embed_2, feature_all_embed_1], axis = 1 )
                    cb_module_input = torch.cat( [feature_all_embed_3, feature_all_embed_2], axis = 1 )
                    ca_module_input = torch.cat( [feature_all_embed_3, feature_all_embed_1], axis = 1 )

                    ba_le_output = clf.le_module(ba_module_input)
                    cb_le_output = clf.le_module(cb_module_input)
                    ca_le_output = clf.le_module(ca_module_input)

                    le_total_trans_loss += torch.sum( torch.sigmoid( (ba_le_output - 0.5) * 1e7 ) * torch.sigmoid( (cb_le_output - 0.5) * 1e7 ) * relu( torch.max(torch.cat( [ ba_le_output, cb_le_output ], axis = 1 ), 1)[0].view(-1, 1) - ca_le_output ) )
                    le_total_trans_loss += torch.sum( torch.sigmoid( (0.5 - ba_le_output) * 1e7 ) * torch.sigmoid( (0.5 - cb_le_output) * 1e7 ) * relu( ca_le_output - torch.min(torch.cat( [ ba_le_output, cb_le_output ], axis = 1 ), 1)[0].view(-1, 1) ) )


                    # proposed but dropped due to redundancy
                    # feature_all_embed_1 = feature_all_embed[ torch.randperm(feature_all_embed.size()[0]) ]
                    # feature_all_embed_2 = feature_all_embed[ torch.randperm(feature_all_embed.size()[0]) ]
                    # feature_all_embed_3 = feature_all_embed[ torch.randperm(feature_all_embed.size()[0]) ]

                    # ab_module_input = torch.cat( [feature_all_embed_1, feature_all_embed_2], axis = 1 )
                    # bc_module_input = torch.cat( [feature_all_embed_2, feature_all_embed_3], axis = 1 )
                    # ac_module_input = torch.cat( [feature_all_embed_1, feature_all_embed_3], axis = 1 )

                    # ab_le_output = clf.le_module(ab_module_input)
                    # bc_le_output = clf.le_module(bc_module_input)
                    # ac_le_output = clf.le_module(ac_module_input)

                    # diff = torch.abs(0.5 - ab_le_output) - torch.abs(0.5 - bc_le_output)
                    # le_total_comp_loss += torch.sum( torch.sigmoid( (0.5 - ab_le_output) * 1e7 ) * torch.sigmoid( (bc_le_output - 0.5) * 1e7 ) * ( torch.sigmoid( ( torch.abs(0.5 - ab_le_output) - torch.abs(0.5 - bc_le_output) ) * 1e7 ) * torch.abs( ac_le_output - (0.5 - diff) ) + torch.sigmoid( ( torch.abs(0.5 - bc_le_output) - torch.abs(0.5 - ab_le_output) ) * 1e7 ) * torch.abs( (0.5 - diff) - ac_le_output ) ) )

                    # diff = torch.abs(0.5 - bc_le_output) - torch.abs(0.5 - ab_le_output)
                    # le_total_comp_loss += torch.sum( torch.sigmoid( (ab_le_output - 0.5) * 1e7 ) * torch.sigmoid( (0.5 - bc_le_output) * 1e7 ) * ( torch.sigmoid( ( torch.abs(0.5 - bc_le_output) - torch.abs(0.5 - ab_le_output) ) * 1e7 ) * torch.abs( ac_le_output - (0.5 - diff) ) + torch.sigmoid( ( torch.abs(0.5 - ab_le_output) - torch.abs(0.5 - bc_le_output) ) * 1e7 ) * torch.abs( (0.5 - diff) - ac_le_output ) ) )

                    # ba_module_input = torch.cat( [feature_all_embed_2, feature_all_embed_1], axis = 1 )
                    # cb_module_input = torch.cat( [feature_all_embed_3, feature_all_embed_2], axis = 1 )
                    # ca_module_input = torch.cat( [feature_all_embed_3, feature_all_embed_1], axis = 1 )

                    # ba_le_output = clf.le_module(ba_module_input)
                    # cb_le_output = clf.le_module(cb_module_input)
                    # ca_le_output = clf.le_module(ca_module_input)

                    # diff = torch.abs(0.5 - ba_le_output) - torch.abs(0.5 - cb_le_output)
                    # le_total_comp_loss += torch.sum( torch.sigmoid( (0.5 - ba_le_output) * 1e7 ) * torch.sigmoid( (cb_le_output - 0.5) * 1e7 ) * ( torch.sigmoid( ( torch.abs(0.5 - ba_le_output) - torch.abs(0.5 - cb_le_output) ) * 1e7 ) * torch.abs( ca_le_output - (0.5 - diff) ) + torch.sigmoid( ( torch.abs(0.5 - cb_le_output) - torch.abs(0.5 - ba_le_output) ) * 1e7 ) * torch.abs( (0.5 - diff) - ca_le_output ) ) )

                    # diff = torch.abs(0.5 - cb_le_output) - torch.abs(0.5 - ba_le_output)
                    # le_total_comp_loss += torch.sum( torch.sigmoid( (ba_le_output - 0.5) * 1e7 ) * torch.sigmoid( (0.5 - cb_le_output) * 1e7 ) * ( torch.sigmoid( ( torch.abs(0.5 - cb_le_output) - torch.abs(0.5 - ba_le_output) ) * 1e7 ) * torch.abs( ca_le_output - (0.5 - diff) ) + torch.sigmoid( ( torch.abs(0.5 - ba_le_output) - torch.abs(0.5 - cb_le_output) ) * 1e7 ) * torch.abs( (0.5 - diff) - ca_le_output ) ) )


                # now compute ge nodes ref & asym & trans reg losses:
                current_feature_bin_embed = clf.context_value_embeddings(ge_ref_lookup_indexes)
                feature_bin_input = torch.cat( [current_feature_bin_embed, current_feature_bin_embed], axis = 1 )

                ge_total_ref_loss += torch.sum( torch.abs( 0.5 - clf.ge_module(feature_bin_input) ) )


                for i in range(num_of_numerical_predictors):
                    feature_bin_embed = clf.num_feature_embeddings( ge_features_Si_indexes[i][0] )
                    feature_context_embed = clf.context_value_embeddings( ge_features_Si_indexes[i][1] )

                    feature_all_embed = torch.cat( [feature_bin_embed, feature_context_embed], axis = 0 )
                    feature_all_embed_1 = feature_all_embed[ torch.randperm(feature_all_embed.size()[0]) ]
                    feature_all_embed_2 = feature_all_embed[ torch.randperm(feature_all_embed.size()[0]) ]

                    ab_module_input = torch.cat( [feature_all_embed_1, feature_all_embed_2], axis = 1 )
                    ba_module_input = torch.cat( [feature_all_embed_2, feature_all_embed_1], axis = 1 )

                    ge_total_asym_loss += torch.sum( torch.abs( clf.ge_module(ab_module_input) - ( 1.0 - clf.ge_module(ba_module_input) ) ) )


                    feature_all_embed_1 = feature_all_embed[ torch.randperm(feature_all_embed.size()[0]) ]
                    feature_all_embed_2 = feature_all_embed[ torch.randperm(feature_all_embed.size()[0]) ]
                    feature_all_embed_3 = feature_all_embed[ torch.randperm(feature_all_embed.size()[0]) ]

                    ab_module_input = torch.cat( [feature_all_embed_1, feature_all_embed_2], axis = 1 )
                    bc_module_input = torch.cat( [feature_all_embed_2, feature_all_embed_3], axis = 1 )
                    ac_module_input = torch.cat( [feature_all_embed_1, feature_all_embed_3], axis = 1 )

                    ab_ge_output = clf.ge_module(ab_module_input)
                    bc_ge_output = clf.ge_module(bc_module_input)
                    ac_ge_output = clf.ge_module(ac_module_input)

                    ge_total_trans_loss += torch.sum( torch.sigmoid( (ab_ge_output - 0.5) * 1e7 ) * torch.sigmoid( (bc_ge_output - 0.5) * 1e7 ) * relu( torch.max(torch.cat( [ ab_ge_output, bc_ge_output ], axis = 1 ), 1)[0].view(-1, 1) - ac_ge_output ) )
                    ge_total_trans_loss += torch.sum( torch.sigmoid( (0.5 - ab_ge_output) * 1e7 ) * torch.sigmoid( (0.5 - bc_ge_output) * 1e7 ) * relu( ac_ge_output - torch.min(torch.cat( [ ab_ge_output, bc_ge_output ], axis = 1 ), 1)[0].view(-1, 1) ) )

                    ba_module_input = torch.cat( [feature_all_embed_2, feature_all_embed_1], axis = 1 )
                    cb_module_input = torch.cat( [feature_all_embed_3, feature_all_embed_2], axis = 1 )
                    ca_module_input = torch.cat( [feature_all_embed_3, feature_all_embed_1], axis = 1 )

                    ba_ge_output = clf.ge_module(ba_module_input)
                    cb_ge_output = clf.ge_module(cb_module_input)
                    ca_ge_output = clf.ge_module(ca_module_input)

                    ge_total_trans_loss += torch.sum( torch.sigmoid( (ba_ge_output - 0.5) * 1e7 ) * torch.sigmoid( (cb_ge_output - 0.5) * 1e7 ) * relu( torch.max(torch.cat( [ ba_ge_output, cb_ge_output ], axis = 1 ), 1)[0].view(-1, 1) - ca_ge_output ) )
                    ge_total_trans_loss += torch.sum( torch.sigmoid( (0.5 - ba_ge_output) * 1e7 ) * torch.sigmoid( (0.5 - cb_ge_output) * 1e7 ) * relu( ca_ge_output - torch.min(torch.cat( [ ba_ge_output, cb_ge_output ], axis = 1 ), 1)[0].view(-1, 1) ) )


                    # proposed but dropped due to redundancy
                    # feature_all_embed_1 = feature_all_embed[ torch.randperm(feature_all_embed.size()[0]) ]
                    # feature_all_embed_2 = feature_all_embed[ torch.randperm(feature_all_embed.size()[0]) ]
                    # feature_all_embed_3 = feature_all_embed[ torch.randperm(feature_all_embed.size()[0]) ]

                    # ab_module_input = torch.cat( [feature_all_embed_1, feature_all_embed_2], axis = 1 )
                    # bc_module_input = torch.cat( [feature_all_embed_2, feature_all_embed_3], axis = 1 )
                    # ac_module_input = torch.cat( [feature_all_embed_1, feature_all_embed_3], axis = 1 )

                    # ab_ge_output = clf.ge_module(ab_module_input)
                    # bc_ge_output = clf.ge_module(bc_module_input)
                    # ac_ge_output = clf.ge_module(ac_module_input)

                    # diff = torch.abs(0.5 - ab_ge_output) - torch.abs(0.5 - bc_ge_output)
                    # ge_total_comp_loss += torch.sum( torch.sigmoid( (0.5 - ab_ge_output) * 1e7 ) * torch.sigmoid( (bc_ge_output - 0.5) * 1e7 ) * ( torch.sigmoid( ( torch.abs(0.5 - ab_ge_output) - torch.abs(0.5 - bc_ge_output) ) * 1e7 ) * torch.abs( ac_ge_output - (0.5 - diff) ) + torch.sigmoid( ( torch.abs(0.5 - bc_ge_output) - torch.abs(0.5 - ab_ge_output) ) * 1e7 ) * torch.abs( (0.5 - diff) - ac_ge_output ) ) )

                    # diff = torch.abs(0.5 - bc_ge_output) - torch.abs(0.5 - ab_ge_output)
                    # ge_total_comp_loss += torch.sum( torch.sigmoid( (ab_ge_output - 0.5) * 1e7 ) * torch.sigmoid( (0.5 - bc_ge_output) * 1e7 ) * ( torch.sigmoid( ( torch.abs(0.5 - bc_ge_output) - torch.abs(0.5 - ab_ge_output) ) * 1e7 ) * torch.abs( ac_ge_output - (0.5 - diff) ) + torch.sigmoid( ( torch.abs(0.5 - ab_ge_output) - torch.abs(0.5 - bc_ge_output) ) * 1e7 ) * torch.abs( (0.5 - diff) - ac_ge_output ) ) )

                    # ba_module_input = torch.cat( [feature_all_embed_2, feature_all_embed_1], axis = 1 )
                    # cb_module_input = torch.cat( [feature_all_embed_3, feature_all_embed_2], axis = 1 )
                    # ca_module_input = torch.cat( [feature_all_embed_3, feature_all_embed_1], axis = 1 )

                    # ba_ge_output = clf.ge_module(ba_module_input)
                    # cb_ge_output = clf.ge_module(cb_module_input)
                    # ca_ge_output = clf.ge_module(ca_module_input)

                    # diff = torch.abs(0.5 - ba_ge_output) - torch.abs(0.5 - cb_ge_output)
                    # ge_total_comp_loss += torch.sum( torch.sigmoid( (0.5 - ba_ge_output) * 1e7 ) * torch.sigmoid( (cb_ge_output - 0.5) * 1e7 ) * ( torch.sigmoid( ( torch.abs(0.5 - ba_ge_output) - torch.abs(0.5 - cb_ge_output) ) * 1e7 ) * torch.abs( ca_ge_output - (0.5 - diff) ) + torch.sigmoid( ( torch.abs(0.5 - cb_ge_output) - torch.abs(0.5 - ba_ge_output) ) * 1e7 ) * torch.abs( (0.5 - diff) - ca_ge_output ) ) )

                    # diff = torch.abs(0.5 - bc_ge_output) - torch.abs(0.5 - ab_ge_output)
                    # ge_total_comp_loss += torch.sum( torch.sigmoid( (ba_ge_output - 0.5) * 1e7 ) * torch.sigmoid( (0.5 - cb_ge_output) * 1e7 ) * ( torch.sigmoid( ( torch.abs(0.5 - cb_ge_output) - torch.abs(0.5 - ba_ge_output) ) * 1e7 ) * torch.abs( ca_ge_output - (0.5 - diff) ) + torch.sigmoid( ( torch.abs(0.5 - ba_ge_output) - torch.abs(0.5 - cb_ge_output) ) * 1e7 ) * torch.abs( (0.5 - diff) - ca_ge_output ) ) )


                # compute new le & ge reg loss w/ static bin diff.:
                for i in range(bins):
                    current_feature_bin_embed = clf.num_feature_embeddings( le_current_feature_indexes[i] )
                    next_feature_bin_embed = clf.num_feature_embeddings( le_next_feature_indexes[i] )

                    feature_bin_input_1 = torch.cat( [current_feature_bin_embed, next_feature_bin_embed], axis = 2 )
                    feature_bin_input_2 = torch.cat( [next_feature_bin_embed, current_feature_bin_embed], axis = 2 )

                    total_le_loss += torch.sum( torch.abs( le_expected_outputs_1[i] - torch.squeeze( clf.le_module(feature_bin_input_1) ) ) )
                    total_le_loss += torch.sum( torch.abs( le_expected_outputs_2[i] - torch.squeeze( clf.le_module(feature_bin_input_2) ) ) )


                    current_feature_bin_embed = clf.num_feature_embeddings( ge_current_feature_indexes[i] )
                    next_feature_bin_embed = clf.num_feature_embeddings( ge_next_feature_indexes[i] )

                    feature_bin_input_1 = torch.cat( [current_feature_bin_embed, next_feature_bin_embed], axis = 2 )
                    feature_bin_input_2 = torch.cat( [next_feature_bin_embed, current_feature_bin_embed], axis = 2 )

                    total_ge_loss += torch.sum( torch.abs( ge_expected_outputs_1[i] - torch.squeeze( clf.ge_module(feature_bin_input_1) ) ) )
                    total_ge_loss += torch.sum( torch.abs( ge_expected_outputs_2[i] - torch.squeeze( clf.ge_module(feature_bin_input_2) ) ) )


                # batch_reweight = reg_weight * reg_frequency * ( 1.0/ batch_size )

                le_batch_reweight = le_reg_weight * reg_frequency *  ( 1.0/ batch_size )
                ge_batch_reweight = ge_reg_weight * reg_frequency *  ( 1.0/ batch_size )
                be_batch_reweight = be_weight * be_reg_weight * reg_frequency *  ( 1.0/ batch_size )

                loss = bce_loss(preds, target) + be_batch_reweight * total_be_loss + le_batch_reweight * (total_le_loss + le_total_ref_loss + le_total_asym_loss + le_total_trans_loss + 10.0 * le_total_ordering_loss + 10.0 * le_total_cutoff_loss) + ge_batch_reweight * (total_ge_loss + ge_total_ref_loss + ge_total_asym_loss + ge_total_trans_loss + 10.0 * ge_total_ordering_loss + 10.0 * ge_total_cutoff_loss)

                if (epoch == 0 and reg_count == 0):
                    init_num_regs_losses = (total_le_loss + total_ge_loss + le_total_ref_loss + le_total_asym_loss + le_total_trans_loss + 10.0 * le_total_ordering_loss + 10.0 * le_total_cutoff_loss + ge_total_ref_loss + ge_total_asym_loss + ge_total_trans_loss + 10.0 * ge_total_ordering_loss + 10.0 * ge_total_cutoff_loss) # + le_total_comp_loss + ge_total_comp_loss
                    init_cat_regs_losses = be_weight * total_be_loss

                if (epoch != 0):
                    end_num_regs_losses = (total_le_loss + total_ge_loss + le_total_ref_loss + le_total_asym_loss + le_total_trans_loss + 10.0 * le_total_ordering_loss + 10.0 * le_total_cutoff_loss + ge_total_ref_loss + ge_total_asym_loss + ge_total_trans_loss + 10.0 * ge_total_ordering_loss + 10.0 * ge_total_cutoff_loss) # + le_total_comp_loss + ge_total_comp_loss
                    end_cat_regs_losses = be_weight * total_be_loss

                # print(loss)
                # le_reg_weight = le_reg_weight * ( 1.0/ batch_size )
                # ge_reg_weight = ge_reg_weight * ( 1.0/ batch_size )
                # be_reg_weight = be_reg_weight * ( 1.0/ batch_size )
                # ordering_reg_weight = max(le_reg_weight, ge_reg_weight, be_reg_weight)

                # loss = bce_loss(preds, target) + le_reg_weight * (total_le_loss + le_total_ref_loss + le_total_asym_loss + le_total_trans_loss) + ge_reg_weight * (total_ge_loss + ge_total_ref_loss + ge_total_asym_loss + ge_total_trans_loss) + be_reg_weight * total_be_loss + ordering_reg_weight * (100.0 * le_total_ordering_loss + 30.0 * le_total_cutoff_loss + 100.0 * ge_total_ordering_loss + 30.0 * ge_total_cutoff_loss)
                # print(loss)
                # print()


                if reg_count == 0:
                    print('total_le_loss: ', total_le_loss)
                    print('total_ge_loss: ', total_ge_loss)
                    print('total_be_loss: ', total_be_loss)
                    print('le_total_ref_loss: ', le_total_ref_loss)
                    print('le_total_asym_loss: ', le_total_asym_loss)
                    print('le_total_trans_loss: ', le_total_trans_loss)
                    print('le_total_ordering_loss: ', le_total_ordering_loss)
                    print('le_total_cutoff_loss: ', le_total_cutoff_loss)
                    print('le_total_comp_loss: ', le_total_comp_loss)
                    print('ge_total_ref_loss: ', ge_total_ref_loss)
                    print('ge_total_asym_loss: ', ge_total_asym_loss)
                    print('ge_total_trans_loss: ', ge_total_trans_loss)
                    print('ge_total_ordering_loss: ', ge_total_ordering_loss)
                    print('ge_total_cutoff_loss: ', ge_total_cutoff_loss)
                    print('ge_total_comp_loss: ', ge_total_comp_loss)


            else:
                loss = bce_loss(preds, target)

            loss.backward()
            optimizer.step()
            clf.zero_grad()

            total_loss += loss
            batch_count += 1
            reg_count += 1


        # perform early-stopping:
        clf.NSDT_masking = False
        clf = clf.eval()

        # Create context value index lists for the validation set:
        val_all_batch_le_context_value_index_list = []
        val_all_batch_ge_context_value_index_list = []
        val_all_batch_be_context_value_index_list = []

        for batch in val_loader:

            val_batch_size = len(batch)
            val_batch_le_context_value_index_list = []
            val_batch_ge_context_value_index_list = []
            val_batch_be_context_value_index_list = []

            for node in le_nodes:
                le_context_value_index_list = [ node[2] ] * val_batch_size
                val_batch_le_context_value_index_list.append(le_context_value_index_list)

            for node in ge_nodes:
                ge_context_value_index_list = [ node[2] ] * val_batch_size
                val_batch_ge_context_value_index_list.append(ge_context_value_index_list)

            for node in be_nodes:
                be_context_value_index_list = [ node[2] ] * val_batch_size
                val_batch_be_context_value_index_list.append(be_context_value_index_list)

            val_all_batch_le_context_value_index_list.append(val_batch_le_context_value_index_list)
            val_all_batch_ge_context_value_index_list.append(val_batch_ge_context_value_index_list)
            val_all_batch_be_context_value_index_list.append(val_batch_be_context_value_index_list)


        val_pred_labels = []

        batch_len = 0

        val_feature_indexes_le_nodes  = []
        val_feature_indexes_ge_nodes  = []
        val_feature_indexes_be_nodes  = []

        for node in le_nodes:
            val_feature_indexes_le_nodes.append(node[0] - 1)

        for node in ge_nodes:
            val_feature_indexes_ge_nodes.append(node[0] - 1)

        for node in be_nodes:
            val_feature_indexes_be_nodes.append(node[0] - 1)


        for batch in val_loader:

            subset = batch[:, :] # X only; target already removed
            val_batch_size = len(batch)

            val_batch_le_feature_index_list = subset[:, val_feature_indexes_le_nodes].permute(1,0).to('cuda')
            val_batch_ge_feature_index_list = subset[:, val_feature_indexes_ge_nodes].permute(1,0).to('cuda')
            val_batch_be_feature_index_list = subset[:, val_feature_indexes_be_nodes].permute(1,0).to('cuda')

            val_batch_le_context_value_index_list = val_all_batch_le_context_value_index_list[batch_len]
            val_batch_ge_context_value_index_list = val_all_batch_ge_context_value_index_list[batch_len]
            val_batch_be_context_value_index_list = val_all_batch_be_context_value_index_list[batch_len]

            val_preds = clf.forward(val_batch_le_feature_index_list, val_batch_le_context_value_index_list,
                                val_batch_ge_feature_index_list, val_batch_ge_context_value_index_list,
                                val_batch_be_feature_index_list, val_batch_be_context_value_index_list,
                                val_batch_size )


            val_preds = torch.sigmoid(val_preds)
            val_preds = torch.squeeze(val_preds).tolist()

            batch_len += 1

            for i in val_preds:
                if i > 0.5:
                    val_pred_labels.append(1)
                if i <= 0.5:
                    val_pred_labels.append(0)

        # Compute Balanced Accuracy:
        val_accuracy = metrics.balanced_accuracy_score(val_data['target'], val_pred_labels)


        # now compute test accuracy for bayes optimization record
        # Create context value index lists for the test set:
        test_all_batch_le_context_value_index_list = []
        test_all_batch_ge_context_value_index_list = []
        test_all_batch_be_context_value_index_list = []

        for batch in test_loader:

            test_batch_size = len(batch)
            test_batch_le_context_value_index_list = []
            test_batch_ge_context_value_index_list = []
            test_batch_be_context_value_index_list = []

            for node in le_nodes:
                le_context_value_index_list = [ node[2] ] * test_batch_size
                test_batch_le_context_value_index_list.append(le_context_value_index_list)

            for node in ge_nodes:
                ge_context_value_index_list = [ node[2] ] * test_batch_size
                test_batch_ge_context_value_index_list.append(ge_context_value_index_list)

            for node in be_nodes:
                be_context_value_index_list = [ node[2] ] * test_batch_size
                test_batch_be_context_value_index_list.append(be_context_value_index_list)

            test_all_batch_le_context_value_index_list.append(test_batch_le_context_value_index_list)
            test_all_batch_ge_context_value_index_list.append(test_batch_ge_context_value_index_list)
            test_all_batch_be_context_value_index_list.append(test_batch_be_context_value_index_list)


        test_pred_labels = []

        batch_len = 0

        test_feature_indexes_le_nodes  = []
        test_feature_indexes_ge_nodes  = []
        test_feature_indexes_be_nodes  = []

        for node in le_nodes:
            test_feature_indexes_le_nodes.append(node[0] - 1)

        for node in ge_nodes:
            test_feature_indexes_ge_nodes.append(node[0] - 1)

        for node in be_nodes:
            test_feature_indexes_be_nodes.append(node[0] - 1)


        for batch in test_loader:

            subset = batch[:, :] # X only; target already removed
            test_batch_size = len(batch)

            test_batch_le_feature_index_list = subset[:, test_feature_indexes_le_nodes].permute(1,0).to('cuda')
            test_batch_ge_feature_index_list = subset[:, test_feature_indexes_ge_nodes].permute(1,0).to('cuda')
            test_batch_be_feature_index_list = subset[:, test_feature_indexes_be_nodes].permute(1,0).to('cuda')

            test_batch_le_context_value_index_list = test_all_batch_le_context_value_index_list[batch_len]
            test_batch_ge_context_value_index_list = test_all_batch_ge_context_value_index_list[batch_len]
            test_batch_be_context_value_index_list = test_all_batch_be_context_value_index_list[batch_len]

            test_preds = clf.forward(test_batch_le_feature_index_list, test_batch_le_context_value_index_list,
                                test_batch_ge_feature_index_list, test_batch_ge_context_value_index_list,
                                test_batch_be_feature_index_list, test_batch_be_context_value_index_list,
                                test_batch_size )


            test_preds = torch.sigmoid(test_preds)
            test_preds = torch.squeeze(test_preds).tolist()

            batch_len += 1

            for i in test_preds:
                if i > 0.5:
                    test_pred_labels.append(1)
                if i <= 0.5:
                    test_pred_labels.append(0)

        # Compute Balanced Accuracy:
        test_accuracy = metrics.balanced_accuracy_score(test_data['target'], test_pred_labels)

        clf = clf.train()
        clf.NSDT_masking = masking

        accs.append( [val_accuracy, test_accuracy] )

        if early_stopper.early_stop(-val_accuracy):
            if dataset == 'Higgs':
                return clf, accs[ len(accs) - early_stopper.patience - 1 ][0], accs[ len(accs) - early_stopper.patience - 1 ][1], init_num_regs_losses, end_num_regs_losses, init_cat_regs_losses, end_cat_regs_losses
            else:
                return clf, accs[ len(accs) - 1 ][0], accs[ len(accs) - 1 ][1], init_num_regs_losses, end_num_regs_losses, init_cat_regs_losses, end_cat_regs_losses

        print()
        print( epoch + 1, ': ', 'total loss: ', total_loss, '; val accuracy: ', accs[epoch][0] )

    return clf, accs[ len(accs) - 1 ][0], accs[ len(accs) - 1 ][1], init_num_regs_losses, end_num_regs_losses, init_cat_regs_losses, end_cat_regs_losses


In [9]:
def Train_num_feature_only_NSDT_split_regs(masking, train_loader, val_loader, test_loader, early_stopper, new_train, cut_off_dict, feature_lookup_dict, cat_bins_total, total_numerical_nodes, total_levels, bins, embedding_dimension, n_iter, mlp_optimizer, hidden_init, hidden_nodes_layer1, hidden_nodes_layer2, activation, learning_rate, weight_decays, reg_frequency, le_reg_weight, ge_reg_weight):

    feature_indexes_le_nodes, feature_indexes_ge_nodes, feature_indexes_be_nodes, all_batch_le_context_value_index_list, all_batch_ge_context_value_index_list, all_batch_be_context_value_index_list, le_orderings_Si_indexes, ge_orderings_Si_indexes, le_ref_lookup_indexes, le_features_Si_indexes, ge_ref_lookup_indexes, ge_features_Si_indexes, le_current_feature_indexes, le_next_feature_indexes, le_expected_outputs_1, le_expected_outputs_2, ge_current_feature_indexes, ge_next_feature_indexes, ge_expected_outputs_1, ge_expected_outputs_2 = NSDT_modeling_preps(bins, train_loader, new_train, cut_off_dict, feature_lookup_dict, cat_bins_total, total_numerical_nodes, total_levels)

    clf = NSDT_num_feature_only(masking, total_levels, embedding_dimension, hidden_init, hidden_nodes_layer1, hidden_nodes_layer2, activation).to('cuda')

    relu = torch.nn.ReLU()

    pos_weight = torch.tensor( [training_reweight] ).to('cuda')
    bce_loss = nn.BCEWithLogitsLoss(pos_weight = pos_weight, reduction = 'mean')

    if (mlp_optimizer == 'adam'):
        optimizer = optim.Adam(clf.parameters(), lr = learning_rate, weight_decay = weight_decays)
    else:
        optimizer = optim.AdamW(clf.parameters(), lr = learning_rate, weight_decay = weight_decays)

    print('Start Training:')
    print()

    accs = []

    for epoch in range(n_iter):

        total_loss = 0.0
        batch_count = 0
        reg_count = 0

        for batch in train_loader:

            subset = batch[:,:num_of_predictors] # X
            batch_size = len(batch)

            target = batch[:,num_of_predictors].to('cuda') # y
            target = target.float()

            batch_le_feature_index_list = subset[:, feature_indexes_le_nodes].permute(1,0).to('cuda')
            batch_ge_feature_index_list = subset[:, feature_indexes_ge_nodes].permute(1,0).to('cuda')

            batch_le_context_value_index_list = all_batch_le_context_value_index_list[batch_count]
            batch_ge_context_value_index_list = all_batch_ge_context_value_index_list[batch_count]

            preds = clf.forward(batch_le_feature_index_list, batch_le_context_value_index_list,
                                batch_ge_feature_index_list, batch_ge_context_value_index_list,
                                batch_size)
            preds = torch.squeeze(preds)


            # reset the regularizer loss accumulators for this batch; the regularizer losses are only added once every `reg_frequency` batches:
            total_le_loss = torch.zeros(1).to('cuda')
            total_ge_loss = torch.zeros(1).to('cuda')

            le_total_ref_loss = torch.zeros(1).to('cuda')
            le_total_asym_loss = torch.zeros(1).to('cuda')
            le_total_trans_loss = torch.zeros(1).to('cuda')
            le_total_ordering_loss = torch.zeros(1).to('cuda')
            le_total_cutoff_loss = torch.zeros(1).to('cuda')
            le_total_comp_loss = torch.zeros(1).to('cuda')

            ge_total_ref_loss = torch.zeros(1).to('cuda')
            ge_total_asym_loss = torch.zeros(1).to('cuda')
            ge_total_trans_loss = torch.zeros(1).to('cuda')
            ge_total_ordering_loss = torch.zeros(1).to('cuda')
            ge_total_cutoff_loss = torch.zeros(1).to('cuda')
            ge_total_comp_loss = torch.zeros(1).to('cuda')

            if reg_count % reg_frequency == 0:

                # compute ordering regs:
                if reg_count % (reg_frequency * 3) == 0:

                    starting_index = random.randint(0, bins - 2)
                    ending_index = random.randint(starting_index + 2, bins)

                    for i in le_orderings_Si_indexes:
                        feature_bin_embed = clf.num_feature_embeddings( i[0][starting_index:ending_index] )
                        feature_context_embed = clf.context_value_embeddings( i[1][starting_index:ending_index] )
                        num_module_input = torch.cat( [feature_bin_embed, feature_context_embed], axis = 1 )

                        le_module_output = clf.le_module(num_module_input) # all values should <= first value
                        le_total_ordering_loss += torch.sum( relu( le_module_output - le_module_output[0] ) )
                        le_total_ordering_loss += torch.sum( relu( le_module_output[-1] - le_module_output ) )

                        feature_bin_embed = clf.num_feature_embeddings( i[0] )
                        feature_context_embed = clf.context_value_embeddings( i[1] )
                        num_module_input_allbins = torch.cat( [feature_bin_embed, feature_context_embed], axis = 1 )

                        le_module_output = clf.le_module(num_module_input_allbins)
                        context_index = int( torch.argmin( torch.abs(le_module_output - 0.5) ) )
                        le_total_cutoff_loss += torch.sum( relu( le_module_output[context_index] - le_module_output[:context_index] ) )
                        le_total_cutoff_loss += torch.sum( relu( le_module_output[context_index + 1:] - le_module_output[context_index] ) )

                    for i in ge_orderings_Si_indexes:
                        feature_bin_embed = clf.num_feature_embeddings( i[0][starting_index:ending_index] )
                        feature_context_embed = clf.context_value_embeddings( i[1][starting_index:ending_index] )
                        num_module_input = torch.cat( [feature_bin_embed, feature_context_embed], axis = 1 )

                        ge_module_output = clf.ge_module(num_module_input) # all values should <= last value
                        ge_total_ordering_loss += torch.sum( relu( ge_module_output - ge_module_output[-1] ) )
                        ge_total_ordering_loss += torch.sum( relu( ge_module_output[0] - ge_module_output ) )

                        feature_bin_embed = clf.num_feature_embeddings( i[0] )
                        feature_context_embed = clf.context_value_embeddings( i[1] )
                        num_module_input_allbins = torch.cat( [feature_bin_embed, feature_context_embed], axis = 1 )

                        ge_module_output = clf.ge_module(num_module_input_allbins)
                        context_index = int( torch.argmin( torch.abs(ge_module_output - 0.5) ) )
                        ge_total_cutoff_loss += torch.sum( relu( ge_module_output[context_index] - ge_module_output[context_index + 1:] ) )
                        ge_total_cutoff_loss += torch.sum( relu( ge_module_output[:context_index] - ge_module_output[context_index] ) )


                # compute le nodes ref & asym & trans reg losses 1st:
                current_feature_bin_embed = clf.context_value_embeddings(le_ref_lookup_indexes)
                feature_bin_input = torch.cat( [current_feature_bin_embed, current_feature_bin_embed], axis = 1 )

                le_total_ref_loss += torch.sum( torch.abs( 0.5 - clf.le_module(feature_bin_input) ) )


                for i in range(num_of_numerical_predictors):
                    feature_bin_embed = clf.num_feature_embeddings( le_features_Si_indexes[i][0] )
                    feature_context_embed = clf.context_value_embeddings( le_features_Si_indexes[i][1] )

                    feature_all_embed = torch.cat( [feature_bin_embed, feature_context_embed], axis = 0 )
                    feature_all_embed_1 = feature_all_embed[ torch.randperm(feature_all_embed.size()[0]) ]
                    feature_all_embed_2 = feature_all_embed[ torch.randperm(feature_all_embed.size()[0]) ]

                    ab_module_input = torch.cat( [feature_all_embed_1, feature_all_embed_2], axis = 1 )
                    ba_module_input = torch.cat( [feature_all_embed_2, feature_all_embed_1], axis = 1 )

                    le_total_asym_loss += torch.sum( torch.abs( clf.le_module(ab_module_input) - ( 1.0 - clf.le_module(ba_module_input) ) ) )


                    feature_all_embed_1 = feature_all_embed[ torch.randperm(feature_all_embed.size()[0]) ]
                    feature_all_embed_2 = feature_all_embed[ torch.randperm(feature_all_embed.size()[0]) ]
                    feature_all_embed_3 = feature_all_embed[ torch.randperm(feature_all_embed.size()[0]) ]

                    ab_module_input = torch.cat( [feature_all_embed_1, feature_all_embed_2], axis = 1 )
                    bc_module_input = torch.cat( [feature_all_embed_2, feature_all_embed_3], axis = 1 )
                    ac_module_input = torch.cat( [feature_all_embed_1, feature_all_embed_3], axis = 1 )

                    ab_le_output = clf.le_module(ab_module_input)
                    bc_le_output = clf.le_module(bc_module_input)
                    ac_le_output = clf.le_module(ac_module_input)

                    le_total_trans_loss += torch.sum( torch.sigmoid( (ab_le_output - 0.5) * 1e7 ) * torch.sigmoid( (bc_le_output - 0.5) * 1e7 ) * relu( torch.max(torch.cat( [ ab_le_output, bc_le_output ], axis = 1 ), 1)[0].view(-1, 1) - ac_le_output ) )
                    le_total_trans_loss += torch.sum( torch.sigmoid( (0.5 - ab_le_output) * 1e7 ) * torch.sigmoid( (0.5 - bc_le_output) * 1e7 ) * relu( ac_le_output - torch.min(torch.cat( [ ab_le_output, bc_le_output ], axis = 1 ), 1)[0].view(-1, 1) ) )

                    ba_module_input = torch.cat( [feature_all_embed_2, feature_all_embed_1], axis = 1 )
                    cb_module_input = torch.cat( [feature_all_embed_3, feature_all_embed_2], axis = 1 )
                    ca_module_input = torch.cat( [feature_all_embed_3, feature_all_embed_1], axis = 1 )

                    ba_le_output = clf.le_module(ba_module_input)
                    cb_le_output = clf.le_module(cb_module_input)
                    ca_le_output = clf.le_module(ca_module_input)

                    le_total_trans_loss += torch.sum( torch.sigmoid( (ba_le_output - 0.5) * 1e7 ) * torch.sigmoid( (cb_le_output - 0.5) * 1e7 ) * relu( torch.max(torch.cat( [ ba_le_output, cb_le_output ], axis = 1 ), 1)[0].view(-1, 1) - ca_le_output ) )
                    le_total_trans_loss += torch.sum( torch.sigmoid( (0.5 - ba_le_output) * 1e7 ) * torch.sigmoid( (0.5 - cb_le_output) * 1e7 ) * relu( ca_le_output - torch.min(torch.cat( [ ba_le_output, cb_le_output ], axis = 1 ), 1)[0].view(-1, 1) ) )


                    # feature_all_embed_1 = feature_all_embed[ torch.randperm(feature_all_embed.size()[0]) ]
                    # feature_all_embed_2 = feature_all_embed[ torch.randperm(feature_all_embed.size()[0]) ]
                    # feature_all_embed_3 = feature_all_embed[ torch.randperm(feature_all_embed.size()[0]) ]

                    # ab_module_input = torch.cat( [feature_all_embed_1, feature_all_embed_2], axis = 1 )
                    # bc_module_input = torch.cat( [feature_all_embed_2, feature_all_embed_3], axis = 1 )
                    # ac_module_input = torch.cat( [feature_all_embed_1, feature_all_embed_3], axis = 1 )

                    # ab_le_output = clf.le_module(ab_module_input)
                    # bc_le_output = clf.le_module(bc_module_input)
                    # ac_le_output = clf.le_module(ac_module_input)

                    # diff = torch.abs(0.5 - ab_le_output) - torch.abs(0.5 - bc_le_output)
                    # le_total_comp_loss += torch.sum( torch.sigmoid( (0.5 - ab_le_output) * 1e7 ) * torch.sigmoid( (bc_le_output - 0.5) * 1e7 ) * ( torch.sigmoid( ( torch.abs(0.5 - ab_le_output) - torch.abs(0.5 - bc_le_output) ) * 1e7 ) * torch.abs( ac_le_output - (0.5 - diff) ) + torch.sigmoid( ( torch.abs(0.5 - bc_le_output) - torch.abs(0.5 - ab_le_output) ) * 1e7 ) * torch.abs( (0.5 - diff) - ac_le_output ) ) )

                    # diff = torch.abs(0.5 - bc_le_output) - torch.abs(0.5 - ab_le_output)
                    # le_total_comp_loss += torch.sum( torch.sigmoid( (ab_le_output - 0.5) * 1e7 ) * torch.sigmoid( (0.5 - bc_le_output) * 1e7 ) * ( torch.sigmoid( ( torch.abs(0.5 - bc_le_output) - torch.abs(0.5 - ab_le_output) ) * 1e7 ) * torch.abs( ac_le_output - (0.5 - diff) ) + torch.sigmoid( ( torch.abs(0.5 - ab_le_output) - torch.abs(0.5 - bc_le_output) ) * 1e7 ) * torch.abs( (0.5 - diff) - ac_le_output ) ) )

                    # ba_module_input = torch.cat( [feature_all_embed_2, feature_all_embed_1], axis = 1 )
                    # cb_module_input = torch.cat( [feature_all_embed_3, feature_all_embed_2], axis = 1 )
                    # ca_module_input = torch.cat( [feature_all_embed_3, feature_all_embed_1], axis = 1 )

                    # ba_le_output = clf.le_module(ba_module_input)
                    # cb_le_output = clf.le_module(cb_module_input)
                    # ca_le_output = clf.le_module(ca_module_input)

                    # diff = torch.abs(0.5 - ba_le_output) - torch.abs(0.5 - cb_le_output)
                    # le_total_comp_loss += torch.sum( torch.sigmoid( (0.5 - ba_le_output) * 1e7 ) * torch.sigmoid( (cb_le_output - 0.5) * 1e7 ) * ( torch.sigmoid( ( torch.abs(0.5 - ba_le_output) - torch.abs(0.5 - cb_le_output) ) * 1e7 ) * torch.abs( ca_le_output - (0.5 - diff) ) + torch.sigmoid( ( torch.abs(0.5 - cb_le_output) - torch.abs(0.5 - ba_le_output) ) * 1e7 ) * torch.abs( (0.5 - diff) - ca_le_output ) ) )

                    # diff = torch.abs(0.5 - cb_le_output) - torch.abs(0.5 - ba_le_output)
                    # le_total_comp_loss += torch.sum( torch.sigmoid( (ba_le_output - 0.5) * 1e7 ) * torch.sigmoid( (0.5 - cb_le_output) * 1e7 ) * ( torch.sigmoid( ( torch.abs(0.5 - cb_le_output) - torch.abs(0.5 - ba_le_output) ) * 1e7 ) * torch.abs( ca_le_output - (0.5 - diff) ) + torch.sigmoid( ( torch.abs(0.5 - ba_le_output) - torch.abs(0.5 - cb_le_output) ) * 1e7 ) * torch.abs( (0.5 - diff) - ca_le_output ) ) )


                # now compute ge nodes ref & asym & trans reg losses:
                current_feature_bin_embed = clf.context_value_embeddings(ge_ref_lookup_indexes)
                feature_bin_input = torch.cat( [current_feature_bin_embed, current_feature_bin_embed], axis = 1 )

                ge_total_ref_loss += torch.sum( torch.abs( 0.5 - clf.ge_module(feature_bin_input) ) )


                for i in range(num_of_numerical_predictors):
                    feature_bin_embed = clf.num_feature_embeddings( ge_features_Si_indexes[i][0] )
                    feature_context_embed = clf.context_value_embeddings( ge_features_Si_indexes[i][1] )

                    feature_all_embed = torch.cat( [feature_bin_embed, feature_context_embed], axis = 0 )
                    feature_all_embed_1 = feature_all_embed[ torch.randperm(feature_all_embed.size()[0]) ]
                    feature_all_embed_2 = feature_all_embed[ torch.randperm(feature_all_embed.size()[0]) ]

                    ab_module_input = torch.cat( [feature_all_embed_1, feature_all_embed_2], axis = 1 )
                    ba_module_input = torch.cat( [feature_all_embed_2, feature_all_embed_1], axis = 1 )

                    ge_total_asym_loss += torch.sum( torch.abs( clf.ge_module(ab_module_input) - ( 1.0 - clf.ge_module(ba_module_input) ) ) )


                    feature_all_embed_1 = feature_all_embed[ torch.randperm(feature_all_embed.size()[0]) ]
                    feature_all_embed_2 = feature_all_embed[ torch.randperm(feature_all_embed.size()[0]) ]
                    feature_all_embed_3 = feature_all_embed[ torch.randperm(feature_all_embed.size()[0]) ]

                    ab_module_input = torch.cat( [feature_all_embed_1, feature_all_embed_2], axis = 1 )
                    bc_module_input = torch.cat( [feature_all_embed_2, feature_all_embed_3], axis = 1 )
                    ac_module_input = torch.cat( [feature_all_embed_1, feature_all_embed_3], axis = 1 )

                    ab_ge_output = clf.ge_module(ab_module_input)
                    bc_ge_output = clf.ge_module(bc_module_input)
                    ac_ge_output = clf.ge_module(ac_module_input)

                    ge_total_trans_loss += torch.sum( torch.sigmoid( (ab_ge_output - 0.5) * 1e7 ) * torch.sigmoid( (bc_ge_output - 0.5) * 1e7 ) * relu( torch.max(torch.cat( [ ab_ge_output, bc_ge_output ], axis = 1 ), 1)[0].view(-1, 1) - ac_ge_output ) )
                    ge_total_trans_loss += torch.sum( torch.sigmoid( (0.5 - ab_ge_output) * 1e7 ) * torch.sigmoid( (0.5 - bc_ge_output) * 1e7 ) * relu( ac_ge_output - torch.min(torch.cat( [ ab_ge_output, bc_ge_output ], axis = 1 ), 1)[0].view(-1, 1) ) )

                    ba_module_input = torch.cat( [feature_all_embed_2, feature_all_embed_1], axis = 1 )
                    cb_module_input = torch.cat( [feature_all_embed_3, feature_all_embed_2], axis = 1 )
                    ca_module_input = torch.cat( [feature_all_embed_3, feature_all_embed_1], axis = 1 )

                    ba_ge_output = clf.ge_module(ba_module_input)
                    cb_ge_output = clf.ge_module(cb_module_input)
                    ca_ge_output = clf.ge_module(ca_module_input)

                    ge_total_trans_loss += torch.sum( torch.sigmoid( (ba_ge_output - 0.5) * 1e7 ) * torch.sigmoid( (cb_ge_output - 0.5) * 1e7 ) * relu( torch.max(torch.cat( [ ba_ge_output, cb_ge_output ], axis = 1 ), 1)[0].view(-1, 1) - ca_ge_output ) )
                    ge_total_trans_loss += torch.sum( torch.sigmoid( (0.5 - ba_ge_output) * 1e7 ) * torch.sigmoid( (0.5 - cb_ge_output) * 1e7 ) * relu( ca_ge_output - torch.min(torch.cat( [ ba_ge_output, cb_ge_output ], axis = 1 ), 1)[0].view(-1, 1) ) )


                    # feature_all_embed_1 = feature_all_embed[ torch.randperm(feature_all_embed.size()[0]) ]
                    # feature_all_embed_2 = feature_all_embed[ torch.randperm(feature_all_embed.size()[0]) ]
                    # feature_all_embed_3 = feature_all_embed[ torch.randperm(feature_all_embed.size()[0]) ]

                    # ab_module_input = torch.cat( [feature_all_embed_1, feature_all_embed_2], axis = 1 )
                    # bc_module_input = torch.cat( [feature_all_embed_2, feature_all_embed_3], axis = 1 )
                    # ac_module_input = torch.cat( [feature_all_embed_1, feature_all_embed_3], axis = 1 )

                    # ab_ge_output = clf.ge_module(ab_module_input)
                    # bc_ge_output = clf.ge_module(bc_module_input)
                    # ac_ge_output = clf.ge_module(ac_module_input)

                    # diff = torch.abs(0.5 - ab_ge_output) - torch.abs(0.5 - bc_ge_output)
                    # ge_total_comp_loss += torch.sum( torch.sigmoid( (0.5 - ab_ge_output) * 1e7 ) * torch.sigmoid( (bc_ge_output - 0.5) * 1e7 ) * ( torch.sigmoid( ( torch.abs(0.5 - ab_ge_output) - torch.abs(0.5 - bc_ge_output) ) * 1e7 ) * torch.abs( ac_ge_output - (0.5 - diff) ) + torch.sigmoid( ( torch.abs(0.5 - bc_ge_output) - torch.abs(0.5 - ab_ge_output) ) * 1e7 ) * torch.abs( (0.5 - diff) - ac_ge_output ) ) )

                    # diff = torch.abs(0.5 - bc_ge_output) - torch.abs(0.5 - ab_ge_output)
                    # ge_total_comp_loss += torch.sum( torch.sigmoid( (ab_ge_output - 0.5) * 1e7 ) * torch.sigmoid( (0.5 - bc_ge_output) * 1e7 ) * ( torch.sigmoid( ( torch.abs(0.5 - bc_ge_output) - torch.abs(0.5 - ab_ge_output) ) * 1e7 ) * torch.abs( ac_ge_output - (0.5 - diff) ) + torch.sigmoid( ( torch.abs(0.5 - ab_ge_output) - torch.abs(0.5 - bc_ge_output) ) * 1e7 ) * torch.abs( (0.5 - diff) - ac_ge_output ) ) )

                    # ba_module_input = torch.cat( [feature_all_embed_2, feature_all_embed_1], axis = 1 )
                    # cb_module_input = torch.cat( [feature_all_embed_3, feature_all_embed_2], axis = 1 )
                    # ca_module_input = torch.cat( [feature_all_embed_3, feature_all_embed_1], axis = 1 )

                    # ba_ge_output = clf.ge_module(ba_module_input)
                    # cb_ge_output = clf.ge_module(cb_module_input)
                    # ca_ge_output = clf.ge_module(ca_module_input)

                    # diff = torch.abs(0.5 - ba_ge_output) - torch.abs(0.5 - cb_ge_output)
                    # ge_total_comp_loss += torch.sum( torch.sigmoid( (0.5 - ba_ge_output) * 1e7 ) * torch.sigmoid( (cb_ge_output - 0.5) * 1e7 ) * ( torch.sigmoid( ( torch.abs(0.5 - ba_ge_output) - torch.abs(0.5 - cb_ge_output) ) * 1e7 ) * torch.abs( ca_ge_output - (0.5 - diff) ) + torch.sigmoid( ( torch.abs(0.5 - cb_ge_output) - torch.abs(0.5 - ba_ge_output) ) * 1e7 ) * torch.abs( (0.5 - diff) - ca_ge_output ) ) )

                    # diff = torch.abs(0.5 - bc_ge_output) - torch.abs(0.5 - ab_ge_output)
                    # ge_total_comp_loss += torch.sum( torch.sigmoid( (ba_ge_output - 0.5) * 1e7 ) * torch.sigmoid( (0.5 - cb_ge_output) * 1e7 ) * ( torch.sigmoid( ( torch.abs(0.5 - cb_ge_output) - torch.abs(0.5 - ba_ge_output) ) * 1e7 ) * torch.abs( ca_ge_output - (0.5 - diff) ) + torch.sigmoid( ( torch.abs(0.5 - ba_ge_output) - torch.abs(0.5 - cb_ge_output) ) * 1e7 ) * torch.abs( (0.5 - diff) - ca_ge_output ) ) )


                # compute new le & ge reg loss w/ static bin diff.:
                for i in range(bins):
                    current_feature_bin_embed = clf.num_feature_embeddings( le_current_feature_indexes[i] )
                    next_feature_bin_embed = clf.num_feature_embeddings( le_next_feature_indexes[i] )

                    feature_bin_input_1 = torch.cat( [current_feature_bin_embed, next_feature_bin_embed], axis = 2 )
                    feature_bin_input_2 = torch.cat( [next_feature_bin_embed, current_feature_bin_embed], axis = 2 )

                    total_le_loss += torch.sum( torch.abs( le_expected_outputs_1[i] - torch.squeeze( clf.le_module(feature_bin_input_1) ) ) )
                    total_le_loss += torch.sum( torch.abs( le_expected_outputs_2[i] - torch.squeeze( clf.le_module(feature_bin_input_2) ) ) )


                    current_feature_bin_embed = clf.num_feature_embeddings( ge_current_feature_indexes[i] )
                    next_feature_bin_embed = clf.num_feature_embeddings( ge_next_feature_indexes[i] )

                    feature_bin_input_1 = torch.cat( [current_feature_bin_embed, next_feature_bin_embed], axis = 2 )
                    feature_bin_input_2 = torch.cat( [next_feature_bin_embed, current_feature_bin_embed], axis = 2 )

                    total_ge_loss += torch.sum( torch.abs( ge_expected_outputs_1[i] - torch.squeeze( clf.ge_module(feature_bin_input_1) ) ) )
                    total_ge_loss += torch.sum( torch.abs( ge_expected_outputs_2[i] - torch.squeeze( clf.ge_module(feature_bin_input_2) ) ) )


                # batch_reweight = reg_weight * reg_frequency * ( 1.0/ batch_size )
                # loss = bce_loss(preds, target) + batch_reweight * (total_le_loss + total_ge_loss + le_total_ref_loss + le_total_asym_loss + le_total_trans_loss + 10.0 * le_total_ordering_loss + 10.0 * le_total_cutoff_loss + ge_total_ref_loss + ge_total_asym_loss + ge_total_trans_loss + 10.0 * ge_total_ordering_loss + 10.0 * ge_total_cutoff_loss) # + le_total_comp_loss + ge_total_comp_loss

                le_batch_reweight = le_reg_weight * reg_frequency *  ( 1.0/ batch_size )
                ge_batch_reweight = ge_reg_weight * reg_frequency *  ( 1.0/ batch_size )

                loss = bce_loss(preds, target) + le_batch_reweight * (total_le_loss + le_total_ref_loss + le_total_asym_loss + le_total_trans_loss + 10.0 * le_total_ordering_loss + 10.0 * le_total_cutoff_loss) + ge_batch_reweight * (total_ge_loss + ge_total_ref_loss + ge_total_asym_loss + ge_total_trans_loss + 10.0 * ge_total_ordering_loss + 10.0 * ge_total_cutoff_loss)


                if (epoch == 0 and reg_count == 0):
                    init_num_regs_losses = (total_le_loss + total_ge_loss + le_total_ref_loss + le_total_asym_loss + le_total_trans_loss + 10.0 * le_total_ordering_loss + 10.0 * le_total_cutoff_loss + ge_total_ref_loss + ge_total_asym_loss + ge_total_trans_loss + 10.0 * ge_total_ordering_loss + 10.0 * ge_total_cutoff_loss) # + le_total_comp_loss + ge_total_comp_loss

                if (epoch != 0):
                    end_num_regs_losses = (total_le_loss + total_ge_loss + le_total_ref_loss + le_total_asym_loss + le_total_trans_loss + 10.0 * le_total_ordering_loss + 10.0 * le_total_cutoff_loss + ge_total_ref_loss + ge_total_asym_loss + ge_total_trans_loss + 10.0 * ge_total_ordering_loss + 10.0 * ge_total_cutoff_loss) # + le_total_comp_loss + ge_total_comp_loss

                if reg_count == 0:
                    print('total_le_loss: ', total_le_loss)
                    print('total_ge_loss: ', total_ge_loss)
                    print('le_total_ref_loss: ', le_total_ref_loss)
                    print('le_total_asym_loss: ', le_total_asym_loss)
                    print('le_total_trans_loss: ', le_total_trans_loss)
                    print('le_total_ordering_loss: ', le_total_ordering_loss)
                    print('le_total_cutoff_loss: ', le_total_cutoff_loss)
                    print('le_total_comp_loss: ', le_total_comp_loss)
                    print('ge_total_ref_loss: ', ge_total_ref_loss)
                    print('ge_total_asym_loss: ', ge_total_asym_loss)
                    print('ge_total_trans_loss: ', ge_total_trans_loss)
                    print('ge_total_ordering_loss: ', ge_total_ordering_loss)
                    print('ge_total_cutoff_loss: ', ge_total_cutoff_loss)
                    print('ge_total_comp_loss: ', ge_total_comp_loss)

            else:
                loss = bce_loss(preds, target)

            loss.backward()
            optimizer.step()
            clf.zero_grad()

            total_loss += loss
            batch_count += 1
            reg_count += 1


        # perform early-stopping:
        clf.NSDT_masking = False
        clf = clf.eval()

        # Create context value index lists for training set:
        val_all_batch_le_context_value_index_list = []
        val_all_batch_ge_context_value_index_list = []
        val_all_batch_be_context_value_index_list = []

        for batch in val_loader:

            val_batch_size = len(batch)
            val_batch_le_context_value_index_list = []
            val_batch_ge_context_value_index_list = []
            val_batch_be_context_value_index_list = []

            for node in le_nodes:
                le_context_value_index_list = [ node[2] ] * val_batch_size
                val_batch_le_context_value_index_list.append(le_context_value_index_list)

            for node in ge_nodes:
                ge_context_value_index_list = [ node[2] ] * val_batch_size
                val_batch_ge_context_value_index_list.append(ge_context_value_index_list)

            for node in be_nodes:
                be_context_value_index_list = [ node[2] ] * val_batch_size
                val_batch_be_context_value_index_list.append(be_context_value_index_list)

            val_all_batch_le_context_value_index_list.append(val_batch_le_context_value_index_list)
            val_all_batch_ge_context_value_index_list.append(val_batch_ge_context_value_index_list)
            val_all_batch_be_context_value_index_list.append(val_batch_be_context_value_index_list)


        val_pred_labels = []

        batch_len = 0

        val_feature_indexes_le_nodes  = []
        val_feature_indexes_ge_nodes  = []
        val_feature_indexes_be_nodes  = []

        for node in le_nodes:
            val_feature_indexes_le_nodes.append(node[0] - 1)

        for node in ge_nodes:
            val_feature_indexes_ge_nodes.append(node[0] - 1)

        for node in be_nodes:
            val_feature_indexes_be_nodes.append(node[0] - 1)


        for batch in val_loader:

            subset = batch[:, :] # X only; target already removed
            val_batch_size = len(batch)

            val_batch_le_feature_index_list = subset[:, val_feature_indexes_le_nodes].permute(1,0).to('cuda')
            val_batch_ge_feature_index_list = subset[:, val_feature_indexes_ge_nodes].permute(1,0).to('cuda')

            val_batch_le_context_value_index_list = val_all_batch_le_context_value_index_list[batch_len]
            val_batch_ge_context_value_index_list = val_all_batch_ge_context_value_index_list[batch_len]

            val_preds = clf.forward(val_batch_le_feature_index_list, val_batch_le_context_value_index_list,
                                val_batch_ge_feature_index_list, val_batch_ge_context_value_index_list,
                                val_batch_size )


            val_preds = torch.sigmoid(val_preds)
            val_preds = torch.squeeze(val_preds).tolist()

            batch_len += 1

            for i in val_preds:
                if i > 0.5:
                    val_pred_labels.append(1)
                if i <= 0.5:
                    val_pred_labels.append(0)

        # Compute Balanced Accuracy:
        val_accuracy = metrics.balanced_accuracy_score(val_data['target'], val_pred_labels)


        # now compute test accuracy for bayes optimization record
        # Create context value index lists for training set:
        test_all_batch_le_context_value_index_list = []
        test_all_batch_ge_context_value_index_list = []
        test_all_batch_be_context_value_index_list = []

        for batch in test_loader:

            test_batch_size = len(batch)
            test_batch_le_context_value_index_list = []
            test_batch_ge_context_value_index_list = []
            test_batch_be_context_value_index_list = []

            for node in le_nodes:
                le_context_value_index_list = [ node[2] ] * test_batch_size
                test_batch_le_context_value_index_list.append(le_context_value_index_list)

            for node in ge_nodes:
                ge_context_value_index_list = [ node[2] ] * test_batch_size
                test_batch_ge_context_value_index_list.append(ge_context_value_index_list)

            for node in be_nodes:
                be_context_value_index_list = [ node[2] ] * test_batch_size
                test_batch_be_context_value_index_list.append(be_context_value_index_list)

            test_all_batch_le_context_value_index_list.append(test_batch_le_context_value_index_list)
            test_all_batch_ge_context_value_index_list.append(test_batch_ge_context_value_index_list)
            test_all_batch_be_context_value_index_list.append(test_batch_be_context_value_index_list)


        test_pred_labels = []

        batch_len = 0

        test_feature_indexes_le_nodes  = []
        test_feature_indexes_ge_nodes  = []
        test_feature_indexes_be_nodes  = []

        for node in le_nodes:
            test_feature_indexes_le_nodes.append(node[0] - 1)

        for node in ge_nodes:
            test_feature_indexes_ge_nodes.append(node[0] - 1)

        for node in be_nodes:
            test_feature_indexes_be_nodes.append(node[0] - 1)


        for batch in test_loader:

            subset = batch[:, :] # X only; target already removed
            test_batch_size = len(batch)

            test_batch_le_feature_index_list = subset[:, test_feature_indexes_le_nodes].permute(1,0).to('cuda')
            test_batch_ge_feature_index_list = subset[:, test_feature_indexes_ge_nodes].permute(1,0).to('cuda')

            test_batch_le_context_value_index_list = test_all_batch_le_context_value_index_list[batch_len]
            test_batch_ge_context_value_index_list = test_all_batch_ge_context_value_index_list[batch_len]

            test_preds = clf.forward(test_batch_le_feature_index_list, test_batch_le_context_value_index_list,
                                test_batch_ge_feature_index_list, test_batch_ge_context_value_index_list,
                                test_batch_size )


            test_preds = torch.sigmoid(test_preds)
            test_preds = torch.squeeze(test_preds).tolist()

            batch_len += 1

            for i in test_preds:
                if i > 0.5:
                    test_pred_labels.append(1)
                if i <= 0.5:
                    test_pred_labels.append(0)

        # Compute Balanced Accuracy:
        test_accuracy = metrics.balanced_accuracy_score(test_data['target'], test_pred_labels)

        clf = clf.train()
        clf.NSDT_masking = masking

        accs.append( [val_accuracy, test_accuracy] )

        if early_stopper.early_stop(-val_accuracy):
            if dataset == 'Higgs':
                return clf, accs[ len(accs) - early_stopper.patience - 1 ][0], accs[ len(accs) - early_stopper.patience - 1 ][1], init_num_regs_losses, end_num_regs_losses, init_cat_regs_losses, end_cat_regs_losses
            else:
                return clf, accs[ len(accs) - 1 ][0], accs[ len(accs) - 1 ][1], init_num_regs_losses, end_num_regs_losses, init_cat_regs_losses, end_cat_regs_losses

        print()
        print( epoch + 1, ': ', 'total loss: ', total_loss, '; val accuracy: ', accs[epoch][0] )


    return clf, accs[ len(accs) - 1 ][0], accs[ len(accs) - 1 ][1], init_num_regs_losses, end_num_regs_losses


In [10]:
if dataset == 'Higgs':

    # Read the raw table and expose its label column ('class') under the name 'target'
    data = pd.read_csv('higgs.csv', low_memory = False)
    data = data.rename( columns = {'class': 'target'} )

    # Move the target to the last position, keeping the predictors in their file order
    headers = list(data.columns)
    headers.append( headers.pop( headers.index('target') ) )
    data = data[headers]

    # '?' marks a missing cell: turn it into NaN and discard the affected samples (only 8 in total)
    data = data.replace('?', np.nan)
    data.dropna(inplace = True)

    # Treat jet1b-tag to jet4b-tag (4 in total) as Ordinal Categorical; they are picked out as
    # the predictors with exactly 3 distinct values. Every other predictor is cast to float.
    for column in headers[:-1]:
        print(column, data[column].dtype)
        new_dtype = 'object' if data[column].nunique() == 3 else 'float'
        data[column] = data[column].astype(new_dtype)

    # Group the predictors: all numerical ones first, all categorical ones after them
    nums = []
    cats = []
    for column in headers[:-1]:
        bucket = nums if pd.api.types.is_numeric_dtype(data[column]) else cats
        bucket.append(column)

    headers = nums + cats + ['target']
    data = data[headers]

    print(data)
    print()

    # Swap the descriptive names for Feature1 - FeatureN; original_features[k] keeps the
    # descriptive name that now hides behind 'Feature' + str(k + 1)
    predictors = list(data.columns)
    original_features = predictors[:-1]
    headers = [ 'Feature' + str(position) for position in range( 1, len(predictors) ) ] + ['target']
    data.columns = headers

    print(data)
    print()

    for feature_name in original_features:
        print(feature_name)
    print()

    # Stratified 80/20 train-test split with a fixed random seed
    predictors = headers[:-1]
    X_train, X_test, Y_train, Y_test = train_test_split(
        data[predictors], data['target'],
        test_size = 0.2, random_state = 1, stratify = data['target'] )

    # train_data / test_data are the split frames themselves, with the target re-attached
    X_train['target'] = Y_train
    train_data = X_train

    X_test['target'] = Y_test
    test_data = X_test

    # Carve the validation set out of the training part (0.125 of 80% = 10% of all samples)
    X_train, X_val, Y_train, Y_val = train_test_split(
        train_data[predictors], train_data['target'],
        test_size = 0.125, random_state = 1, stratify = train_data['target'] )

    X_train['target'] = Y_train
    train_data = X_train

    X_val['target'] = Y_val
    val_data = X_val

    print(val_data)
    print()
    print(test_data)

    training_reweight = 1.0 # well-balanced between 2 classes; no need to reweight


In [11]:
def Census_processing(train_data):
    """Clean one raw Census-Income table and return it with generic feature names.

    Steps, in order:
      1. label the 42 raw columns by position and drop the 25th one (the sample weight);
      2. cast the 7 continuous predictors to float and every other predictor to object;
      3. encode the target: ' - 50000.' -> 0 and ' 50000+.' -> 1 (any other value is kept);
      4. drop rows whose predictors duplicate an earlier row;
      5. move the numerical predictors in front of the categorical ones;
      6. drop rows holding the ' ?' placeholder in any categorical predictor;
      7. rename the predictors to Feature1 - FeatureN.

    The frame passed in is left untouched; all work happens on a copy.
    Progress is printed along the way: the row count before and after de-duplication,
    the row count after the ' ?' filter, and a per-column summary
    (name, dtype, number of distinct values, number of NaNs).

    Parameters:
        train_data: pandas DataFrame with 42 columns in the raw file's order, target last.

    Returns:
        (cleaned, original_features): the cleaned DataFrame and the list of descriptive
        predictor names, where original_features[k] is the name behind 'Feature' + str(k + 1).
    """
    # Work on a copy so the caller's frame keeps its own column labels and dtypes
    train_data = train_data.copy()

    # Positional labels; the 25th column is the sample weight and is dropped right away
    headers = ['Feature' + str(i) for i in range(1, 25)]
    headers.append('Feature_to_drop')
    headers.extend('Feature' + str(i - 1) for i in range(26, 42))
    headers.append('target')
    train_data.columns = headers
    train_data = train_data.drop( columns = ['Feature_to_drop'] )

    # Continuous predictors become float, every other predictor is categorical (object)
    cont_features = {'Feature1', 'Feature6', 'Feature17', 'Feature18', 'Feature19', 'Feature30', 'Feature39'}
    train_data = train_data.astype( {
        name: ('float' if name in cont_features else 'object')
        for name in train_data.columns if name != 'target'
    } )

    # Explicit label -> code mapping. Unlike Series.replace this does not depend on pandas
    # silently downcasting an object column to int; unknown labels are passed through as-is.
    label_codes = {' - 50000.': 0, ' 50000+.': 1}
    train_data['target'] = train_data['target'].map( lambda label: label_codes.get(label, label) )

    descriptive_names = ['age', 'class of worker', 'detailed industry recode', 'detailed occupation recode', 'education', 'wage per hour', 'enroll in edu inst last wk',
    'marital status', 'major industry code', 'major occupation code', 'race', 'hispanic origin', 'sex', 'member of a labor union', 'reason for unemployment',
    'full or part time employment status', 'capital gains', 'capital losses', 'dividends from stocks', 'tax filer status', 'region of previous residence',
    'state of previous residence', 'detailed household and family status', 'detailed household summary in household', 'migration code-change in msa',
    'migration code-change in reg', 'migration code-move within reg', 'live in this house 1 year ago', 'migration prev res in sunbelt', 'num persons worked for employer',
    'family members under 18', 'country of birth father', 'country of birth mother', 'country of birth self', 'citizenship', 'own business or self employed',
    'fill inc questionnaire for veterans admin', 'veterans benefits', 'weeks worked in year', 'year', 'target']
    train_data.columns = descriptive_names

    # Rows are duplicates when all predictors agree, whatever their target is
    predictors = [name for name in train_data.columns if name != 'target']

    print( len(train_data) )
    train_data = train_data.drop_duplicates(subset = predictors)
    print( len(train_data) )
    print()

    # place all categorical features after all numerical features
    nums = []
    cats = []
    for name in predictors:
        if pd.api.types.is_numeric_dtype(train_data[name]):
            nums.append(name)
        else:
            cats.append(name)
    train_data = train_data[nums + cats + ['target']]

    # ' ?' is the file's missing-value placeholder: keep only rows free of it in every categorical column
    train_data = train_data[ train_data[cats].ne(' ?').all(axis = 1) ]
    print( len(train_data) )

    # Change all feature names to Feature1 - FeatureN, remembering the descriptive ones
    original_features = nums + cats
    train_data.columns = [ 'Feature' + str(k) for k in range( 1, len(original_features) + 1 ) ] + ['target']

    for i in train_data.columns:
        print(i, train_data[i].dtype, train_data[i].nunique(), train_data[i].isna().sum()  )
    print()

    return train_data, original_features


def _drop_unseen_levels(frame, reference):
    """Return `frame` without the rows whose categorical values never occur in `reference`.

    Only columns of `reference` whose dtype is 'string' or 'object' are inspected. For each
    such column that has levels present in `frame` but absent from `reference`, the column
    name, the number of unseen levels and the levels themselves are printed, followed by the
    number of rows removed for each level.
    """
    for column in reference.columns:
        column_dtype = reference[column].dtype
        if not (column_dtype == 'string' or column_dtype == 'object'):
            continue # numerical columns have no levels to reconcile

        unseen_levels = list( set( frame[column].unique() ) - set( reference[column].unique() ) )
        if len(unseen_levels) > 0:
            print( column, ' ', len(unseen_levels), ' ', unseen_levels )

            for level in unseen_levels:
                print( len( frame[ frame[column] == level ] ) )
                frame = frame[ frame[column] != level ]

    return frame


if (dataset == 'Census'):
    train_data = pd.read_csv('census-income.csv')
    train_data, original_features = Census_processing(train_data)

    # Stratified 90/10 train-validation split with a fixed random seed
    predictors = list(train_data.columns)
    predictors.remove('target')
    X_train, X_val, Y_train, Y_val = train_test_split(train_data[predictors], train_data['target'], test_size = 0.1, random_state = 0, stratify = train_data['target'])

    # Balance the training classes by random over-sampling (cost-sensitive learning is the alternative).
    # The sampler is seeded like the split above, so the resampled training set is reproducible.
    ros = RandomOverSampler(random_state = 0)
    X_ros, y_ros = ros.fit_resample(X_train, Y_train)
    train_data = X_ros
    train_data['target'] = y_ros

    val_data = X_val
    val_data['target'] = Y_val

    # Validation rows carrying a categorical level the training set never saw are removed
    print(len(val_data))
    val_data = _drop_unseen_levels(val_data, train_data)
    print(len(val_data))
    print()

    # The test set ships as its own file and goes through the same cleaning and filtering
    test_data = pd.read_csv('census-income_test.csv')
    test_data, original_features = Census_processing(test_data)

    print(len(test_data))
    test_data = _drop_unseen_levels(test_data, train_data)
    print(len(test_data))
    print()

    for i in original_features:
        print(i)

    # Class ratio (class 0 / class 1) of the training set, also used as the training re-weight
    class1_count = len( train_data[ train_data['target'] == 0 ] )
    class2_count = len( train_data[ train_data['target'] == 1 ] )
    print(class1_count / class2_count)

    training_reweight = class1_count / class2_count

    # Class ratio of the test set, printed for reference only
    class1_count = len( test_data[ test_data['target'] == 0 ] )
    class2_count = len( test_data[ test_data['target'] == 1 ] )
    print(class1_count / class2_count)


In [None]:
if (dataset == 'Credit'):

    # Load the table, discard the saved index column and expose the label as 'target'
    data = pd.read_csv('credit_new.csv')
    data = data.drop( columns = ['Unnamed: 0'])
    data = data.rename( columns={'SeriousDlqin2yrs': 'target'} )

    # Move the target to the last position and drop samples with missing cells
    cols = list(data.columns)
    cols.remove('target')
    cols.append('target')
    data = data[cols]
    data.dropna(inplace = True)

    # Change all feature names to Feature1 - FeatureN; original_features[k] keeps the
    # descriptive name behind 'Feature' + str(k + 1)
    features = list(data.columns)
    original_features = features[:-1]

    headers = []
    for position, name in enumerate(original_features, start = 1):
        headers.append('Feature' + str(position))
        print(position, ' ', name)
    print()

    headers.append('target')
    data.columns = headers

    for i in original_features:
        print(i)

    # Perform stratified train-test split based on a random seed
    predictors = list(data.columns)
    predictors.remove('target')
    X_train, X_test, Y_train, Y_test = train_test_split(data[predictors], data['target'], test_size = 0.2, random_state = 0, stratify = data['target'])

    train_data = X_train
    train_data['target'] = Y_train

    test_data = X_test
    test_data['target'] = Y_test

    # Carve the validation set out of the training part (0.125 of 80% = 10% of all samples)
    X_train, X_val, Y_train, Y_val = train_test_split(train_data[predictors], train_data['target'], test_size = 0.125, random_state = 0, stratify = train_data['target'])

    # Balance the training classes by random over-sampling (cost-sensitive learning is the alternative).
    # The sampler is seeded like the splits above, so the resampled training set is reproducible.
    ros = RandomOverSampler(random_state = 0)
    X_ros, y_ros = ros.fit_resample(X_train, Y_train)
    train_data = X_ros
    train_data['target'] = y_ros

    val_data = X_val
    val_data['target'] = Y_val

    # Class ratio (class 0 / class 1) of the training set
    class1_count = len( train_data[ train_data['target'] == 0 ] )
    class2_count = len( train_data[ train_data['target'] == 1 ] )
    print(class1_count / class2_count)

    training_reweight = class1_count / class2_count # increase reg loss weight to set minority weight higher
    print(training_reweight)

    # Class ratio of the test set, printed for reference only
    class1_count = len( test_data[ test_data['target'] == 0 ] )
    class2_count = len( test_data[ test_data['target'] == 1 ] )
    print(class1_count / class2_count)


In [13]:
if (dataset == 'Cover'):

    # Load the table once and expose the label column as 'target'
    data = pd.read_csv('covtype.csv')
    data = data.rename( columns={'Cover_Type': 'target'} )

    # convert the 2 most common types (1 and 2) to 0, with the rest types (> 15%) being 1
    data.loc[data['target'] == 1, 'target'] = 0
    data.loc[data['target'] == 2, 'target'] = 0
    data.loc[data['target'] != 0, 'target'] = 1

    # Sanity output: the distinct labels, the size of each class and the class ratio
    print( data['target'].unique() )
    print(len(data[data['target'] == 0]))
    print(len(data[data['target'] == 1]))
    print(len(data[data['target'] == 0]) / len(data[data['target'] == 1]))

    columns = data.columns

    # Select column names from the 11th to the second-to-last column
    # (this relies on the target being the last column of the file)
    selected_columns = columns[10:-1]

    # Convert the selected columns to string type as they encode the presence or absence of a categorical feature
    data[selected_columns] = data[selected_columns].astype(str)

    # Move the target to the last position and drop samples with missing cells
    cols = list(data.columns)
    cols.remove('target')
    cols.append('target')
    data = data[cols]
    data.dropna(inplace = True)

    features = list(data.columns)

    # Change all feature names to Feature1 - FeatureN; original_features[k] keeps the
    # descriptive name behind 'Feature' + str(k + 1)
    headers = []
    original_features = []
    predictors = list(data.columns)
    for i in range( 1, len(data.columns) ):
        original_features.append( predictors[i - 1] )
        s = 'Feature' + str(i)
        headers.append(s)
        print(i, ' ', features[i-1])
    print()

    headers.append('target')
    data.columns = headers

    for i in original_features:
        print(i)

    # Perform stratified train-test split based on a random seed
    predictors = list(data.columns)
    predictors.remove('target')
    X_train, X_test, Y_train, Y_test = train_test_split(data[predictors], data['target'], test_size = 0.2, random_state = 1, stratify = data['target'])

    train_data = X_train
    train_data['target'] = Y_train

    test_data = X_test
    test_data['target'] = Y_test

    # Carve the validation set out of the training part (0.125 of 80% = 10% of all samples)
    X_train, X_val, Y_train, Y_val = train_test_split(train_data[predictors], train_data['target'], test_size = 0.125, random_state = 1, stratify = train_data['target'])

    train_data = X_train
    train_data['target'] = Y_train

    val_data = X_val
    val_data['target'] = Y_val

    print(val_data)
    print()
    print(test_data)

    # Class ratio (class 0 / class 1) of the training set
    class1_count = len( train_data[ train_data['target'] == 0 ] )
    class2_count = len( train_data[ train_data['target'] == 1 ] )
    print(class1_count / class2_count)

    training_reweight = class1_count / class2_count # increase reg loss weight to set minority weight higher


In [14]:
if (dataset == 'Insurance'):

    def _drop_unseen_levels(frame, reference):
        """Return `frame` without the rows whose categorical level never occurs in `reference`.

        For every column of `reference`, the levels present in `frame` but absent
        from `reference` are collected; for string/object columns they are printed
        (column, number of unseen levels, the levels, then the row count per level)
        and the matching rows are filtered out. Filtering is cumulative, so later
        columns are inspected on the already reduced frame.
        """
        for col in reference.columns:
            unseen = list( set( list( frame[col].unique() ) ) - set( list( reference[col].unique() ) ) )
            if len(unseen) > 0:
                if (reference[col].dtype == 'string' or reference[col].dtype == 'object'):
                    print( col, ' ', len(unseen), ' ', unseen )

                    for level in unseen:
                        print( len( frame[ frame[col] == level ] ) )
                        frame = frame[frame[col] != level]
        return frame

    data = pd.read_csv('all_state.csv')
    data = data.rename( columns={'loss': 'target'} )
    data = data.drop( columns = ['id'] )

    # convert target s/t the top 10% will be 1 (most severe ones), while others being 0;
    threshold = data['target'].quantile(.9)
    print( threshold, data['target'].max() )

    data.loc[data['target'] < threshold, 'target'] = 0
    data.loc[data['target'] >= threshold, 'target'] = 1
    data['target'] = data['target'].astype('int')
    print( data['target'].dtype )
    print()

    # place all categorical features after all numerical features
    feature_columns = [name for name in data.columns if name != 'target']
    nums = [name for name in feature_columns if pd.api.types.is_numeric_dtype(data[name])]
    cats = [name for name in feature_columns if not pd.api.types.is_numeric_dtype(data[name])]

    headers = nums + cats + ['target']
    data = data[headers]

    # Change all feature names to Feature1 - FeatureN, remembering the original names
    # (original_features[k] is the original name of Feature{k+1})
    predictors = list(data.columns)
    original_features = predictors[:len(data.columns) - 1]
    headers = ['Feature' + str(position) for position in range( 1, len(data.columns) )]
    headers.append('target')
    data.columns = headers

    # per column: name, dtype, number of distinct values, number of missing values
    for i in data.columns:
        print( i, data[i].dtype, data[i].nunique(), data[i].isna().sum() )
    print()

    # Perform stratified train-test split based on a random seed (80% / 20%)
    predictors = list(data.columns)
    predictors.remove('target')
    X_train, X_test, Y_train, Y_test = train_test_split(data[predictors], data['target'], test_size = 0.2, random_state = 0, stratify = data['target'])

    # NOTE: train_data / test_data alias X_train / X_test, so 'target' is added to those frames too
    train_data = X_train
    train_data['target'] = Y_train

    test_data = X_test
    test_data['target'] = Y_test

    # Carve the validation set out of the training part (12.5% of 80% = 10% of all rows)
    X_train, X_val, Y_train, Y_val = train_test_split(train_data[predictors], train_data['target'], test_size = 0.125, random_state = 0, stratify = train_data['target'])

    # Try both re-sampling & cost-sensitive learning.
    # The over-sampler is seeded like the splits above so that the resampled
    # training set (and everything derived from it) is reproducible across runs.
    ros = RandomOverSampler(random_state = 0)
    X_ros, y_ros = ros.fit_resample(X_train, Y_train)
    train_data = X_ros
    train_data['target'] = y_ros

    val_data = X_val
    val_data['target'] = Y_val

    # remove validation rows carrying categorical levels the training set has never seen
    print(len(val_data))
    val_data = _drop_unseen_levels(val_data, train_data)
    print(len(val_data))
    print()

    # same clean-up for the test set
    print(len(test_data))
    test_data = _drop_unseen_levels(test_data, train_data)
    print(len(test_data))
    print()

    for i in original_features:
        print(i)

    # class ratio (class 0 / class 1) on the resampled training set
    class1_count = len( train_data[ train_data['target'] == 0 ] )
    class2_count = len( train_data[ train_data['target'] == 1 ] )
    print(class1_count / class2_count)

    training_reweight = class1_count / class2_count # increase reg loss weight to set minority weight higher
    print(training_reweight)

    # class ratio (class 0 / class 1) on the cleaned test set
    class1_count = len( test_data[ test_data['target'] == 0 ] )
    class2_count = len( test_data[ test_data['target'] == 1 ] )
    print(class1_count / class2_count)


# Model Training:

In [None]:
# Compute DTree-based paths/rules to be adapted:
#   1. fit a depth-limited sk-learn tree on a tree-friendly copy of the training set,
#   2. drop every feature the tree never splits on and re-index the kept ones from Feature1,
#   3. re-fit the same tree on a copy whose columns carry the new names, so that the
#      extracted rules refer to the re-indexed features.

# make 2 copies of train sets for later use;
train_copy = train_data.copy()
train_copy_2 = train_data.copy()

# pre-process training copy to fit with sk-learn decision tree:
if (non_ordinal_cat == False):
    # for ordinal Cat feature, simply convert to float
    for i in train_copy.columns:
        if (i != 'target'):
            train_copy[i] = train_copy[i].astype('float64')

else:
    # for non-ordinal Cat feature, perform target encoding
    for i in train_copy.columns:
        if (i != 'target') and (pd.api.types.is_numeric_dtype(train_copy[i]) == False):
            print(i)
            encoder = TargetEncoder()
            train_copy[i] = encoder.fit_transform( train_copy[i], train_copy['target'] )


# compute & show performance for DTree on processed data
# (every column except the trailing 'target' is a predictor)
X_train = train_copy.iloc[ :, :-1 ]
Y_train = list( train_copy['target'] )

clf = DecisionTreeClassifier(max_depth = tree_depth, class_weight = 'balanced', random_state = 1)
clf.fit(X_train, Y_train)

tree_rules = export_text(clf, feature_names = list(X_train.columns) )
print(tree_rules)
print()
print()

# get all features used during tree generation;
# each rule condition is split on spaces and its first token is taken as the feature name
feature_set = []
rules = get_rules(clf, list(X_train.columns), Y_train)
for rule in rules:
    for position, condition in enumerate(rule):
        rule[position] = condition.split(' ')
        feature_set.append( rule[position][0] )

# get unused features based on generated tree structure
feature_list = list( set(feature_set) )
print(feature_list)
print()

# 'target' never appears in a rule, so it lands in the difference and must be taken out again
drop_list = list( set( list(train_data.columns) ) - set(feature_list) )
drop_list.remove('target')
print(drop_list)
print()

# build feature list for second DTree to ensure same performance while rules w/ updated feature indexes;
# unused features keep their column position but get a 'd_' prefix, kept ones are numbered consecutively
second_tree_features = []
count = 0
for i in train_data.columns:
    if (i != 'target'):
        if (i in drop_list):
            second_tree_features.append('d_' + str(i) )
        else:
            second_tree_features.append( 'Feature' + str(count + 1) )
            count += 1
second_tree_features.append('target')

# drop unused features on training, validation & testing sets to align with tree
train_data = train_data.drop(columns = drop_list)
val_data = val_data.drop(columns = drop_list)
test_data = test_data.drop(columns = drop_list)

# re-index the kept feature names from Feature1
num_of_predictors = len( list(train_data.columns) ) - 1 # last one is target variable
new_feature_names = [ 'Feature' + str(position + 1) for position in range(num_of_predictors) ]
new_feature_names.append('target')

train_data.columns = new_feature_names
val_data.columns = new_feature_names
test_data.columns = new_feature_names

print(train_data.columns)
print(second_tree_features)
print()

print(train_data)
print()

# with features re-indexed/renamed; generate DTree again
train_copy_2.columns = second_tree_features

# pre-process training copy to fit with sk-learn decision tree:
if (non_ordinal_cat == False):
    # for ordinal Cat feature, simply convert to float
    for i in train_copy_2.columns:
        if (i != 'target'):
            train_copy_2[i] = train_copy_2[i].astype('float64')

else:
    # for non-ordinal Cat feature, perform target encoding
    for i in train_copy_2.columns:
        if (i != 'target') and (pd.api.types.is_numeric_dtype(train_copy_2[i]) == False):
            encoder = TargetEncoder()
            train_copy_2[i] = encoder.fit_transform( train_copy_2[i], train_copy_2['target'] )


# slice the frame being fitted by its own width (it used to borrow the width of train_copy)
X_train = train_copy_2.iloc[ :, :-1 ]
Y_train = list( train_copy_2['target'] )

clf.fit(X_train, Y_train)

# We fit on training set and then evaluate on training set again, ONLY to measure whether the tree is able to learn the dataset;
preds = clf.predict(X_train)
accuracy = metrics.balanced_accuracy_score(Y_train, preds)
if accuracy > 0.5:
    print('DTree is able to learn this data')
print('SciKit-Learn verison: ', sklearn.__version__)


In [16]:
if 'sub' not in model_type:

    # Single-tree variant: turn the rules of the fitted tree `clf` into numbered
    # nodes ('<=' nodes first, then '>' nodes, then categorical 'be' nodes) and
    # into per-rule lists of node indexes padded to `tree_depth`.

    # get dict for kept features for future mapping:
    # new 1-based feature index -> original column name
    kept_feature_dict = {}
    counter = 0
    for position, original_name in enumerate(original_features, start = 1):
        if ('Feature' + str(position)) not in drop_list:
            counter += 1
            kept_feature_dict[counter] = original_name

    # per kept feature: new index, number of distinct training values, original name
    for k, v in kept_feature_dict.items():
        print(k, '  ', train_data[ 'Feature' + str(k) ].nunique(), '  ', v)
    print()
    print()

    # if we would like to match sk-learn DTree adapted by treating cat features as numericals:
    if cat_as_num == True:
        encoder_dict = {}
        # for non-ordinal Cat feature, perform target encoding (one fitted encoder is kept per column)
        for i in train_data.columns:
            if (i != 'target') and (pd.api.types.is_numeric_dtype(train_data[i]) == False):
                encoder = TargetEncoder()
                train_data[i] = encoder.fit_transform( train_data[i], train_data['target'] )
                encoder_dict[i] = encoder

    # with DTree clf generated & unused features dropped on both training & testing sets,
    # simply iterate through columns to append indexes (starting from 1 for this block) to cat_features list;
    cat_features = []
    for i in train_data.columns:
        column_dtype = train_data[i].dtype
        if (column_dtype == 'string' or column_dtype == 'object'):
            print(i)
            cat_features.append( int(i[7:]) ) # strip the 'Feature' prefix

    num_of_numerical_predictors = num_of_predictors - len(cat_features)

    print(train_data)
    print()
    print()


    # Tokenize every condition on spaces and replace the leading feature name
    # by its integer index (the part after the 'Feature' prefix).
    rules = get_rules(clf, list(X_train.columns), Y_train)
    print('rule pruning --------------------------------------------------')
    for rule in rules:
        for j, condition in enumerate(rule):
            tokens = condition.split(' ')
            tokens[0] = int( tokens[0][7:] )
            rule[j] = tokens

    # Within one rule, drop a condition whenever a later condition tests the same
    # feature with the same operator (only the last such condition is kept).
    for rule in rules:
        superseded = []
        for j, condition in enumerate(rule):
            for later in rule[j + 1:]:
                same_test = (condition[0] == later[0]) and (condition[1] == later[1])
                if same_test and condition not in superseded:
                    superseded.append(condition)
        for condition in superseded:
            rule.remove(condition)
    # --------------------------------------------------

    # change all cat features' symbol to 'be':
    for rule in rules:
        for node in rule:
            if node[0] in cat_features:
                node[1] = 'be'

    # Node indexes are laid out as [ '<=' nodes | '>' nodes | 'be' nodes ],
    # so the two numerical groups must be counted before numbering starts.
    le_node_count = sum(1 for rule in rules for node in rule if node[1] == '<=')
    ge_node_count = sum(1 for rule in rules for node in rule if node[1] == '>')

    le_node_counter = 0
    ge_node_counter = le_node_count
    be_node_counter = le_node_count + ge_node_count

    le_nodes = []
    ge_nodes = []
    be_nodes = []

    rule_look_up_indexes = copy.deepcopy(rules)

    # Give every node its index: the index overwrites the third token of the node
    # and replaces the whole node in the look-up copy.
    for i, rule in enumerate(rules):
        for j, node in enumerate(rule):
            if node[1] == '<=':
                node[2] = le_node_counter
                rule_look_up_indexes[i][j] = le_node_counter
                le_nodes.append(node)
                le_node_counter += 1

            elif node[1] == '>':
                node[2] = ge_node_counter
                rule_look_up_indexes[i][j] = ge_node_counter
                ge_nodes.append(node)
                ge_node_counter += 1

            elif node[1] == 'be':
                node[2] = be_node_counter
                rule_look_up_indexes[i][j] = be_node_counter
                be_nodes.append(node)
                be_node_counter += 1


    for rule in rules:
        for node in rule:
            print(node)
        print()

    print()
    print()
    print()

    for rule in rule_look_up_indexes:
        print(rule)

    print()
    print()
    print()

    # perform padding to make all rules of same length for batch processing;
    # the pad value is be_node_counter, i.e. one past the last real node index
    for index_row in rule_look_up_indexes:
        missing = tree_depth - len(index_row)
        if missing > 0:
            index_row.extend( [be_node_counter] * missing )

    for rule in rule_look_up_indexes:
        print(rule)


In [None]:
if 'sub' in model_type:

    # Add rules from different subtrees:
    # the rules of the main tree `clf` are merged with the rules of three shallower
    # trees (depth tree_depth - 2 / - 4 / - 6), each fitted on its own 80% sample of
    # the training set; all nodes are then numbered ('<=' nodes first, then '>'
    # nodes, then categorical 'be' nodes) and every rule is padded to `tree_depth`.

    def _prune_superseded_conditions(rule_list):
        """Drop, in place, every condition that a later one in the same rule supersedes.

        A condition is removed when a later condition of the same rule tests the
        same feature (token 0) with the same operator (token 1); only the last
        such condition of the rule is kept.
        """
        for rule in rule_list:
            superseded = []
            for j, condition in enumerate(rule):
                for later in rule[j + 1:]:
                    if (condition[0] == later[0]) and (condition[1] == later[1]):
                        if condition not in superseded:
                            superseded.append(condition)
            for condition in superseded:
                rule.remove(condition)

    def _extract_pruned_rules(tree_clf, feature_names, labels):
        """Return the rules of `tree_clf` as token lists with integer feature ids, pruned.

        Each condition returned by `get_rules` is split on spaces and its first
        token has the 7-character 'Feature' prefix stripped and is cast to int
        (presumably conditions look like 'Feature3 <= 0.5' - confirm against get_rules).
        """
        extracted = get_rules(tree_clf, feature_names, labels)
        print('rule pruning --------------------------------------------------')
        for rule in extracted:
            for j, condition in enumerate(rule):
                tokens = condition.split(' ')
                tokens[0] = int( tokens[0][7:] )
                rule[j] = tokens
        _prune_superseded_conditions(extracted)
        return extracted

    # get dict for kept features for future mapping:
    # new 1-based feature index -> original column name
    kept_feature_dict = {}
    counter = 0
    for position, original_name in enumerate(original_features, start = 1):
        if ('Feature' + str(position)) not in drop_list:
            counter += 1
            kept_feature_dict[counter] = original_name

    # per kept feature: new index, number of distinct training values, original name
    for k, v in kept_feature_dict.items():
        print(k, '  ', train_data[ 'Feature' + str(k) ].nunique(), '  ', v)
    print()
    print()

    # if we would like to match sk-learn DTree adapted by treating cat features as numericals:
    if cat_as_num == True:
        encoder_dict = {}
        # for non-ordinal Cat feature, perform target encoding (one fitted encoder is kept per column)
        for i in train_data.columns:
            if (i != 'target') and (pd.api.types.is_numeric_dtype(train_data[i]) == False):
                encoder = TargetEncoder()
                train_data[i] = encoder.fit_transform( train_data[i], train_data['target'] )
                encoder_dict[i] = encoder

    # with DTree clf generated & unused features dropped on both training & testing sets,
    # simply iterate through columns to append indexes (starting from 1 for this block) to cat_features list;
    cat_features = []
    for i in train_data.columns:
        if (train_data[i].dtype == 'string' or train_data[i].dtype == 'object'):
            print(i)
            cat_features.append( int(i[7:]) ) # strip the 'Feature' prefix

    num_of_numerical_predictors = num_of_predictors - len(cat_features)

    print(train_data)
    print()
    print()

    # rules of the main tree (X_train / Y_train are the ones the main tree was fitted on)
    rules = _extract_pruned_rules(clf, list(X_train.columns), Y_train)
    # --------------------------------------------------


    # Add subtrees on train_data w/ kept features only:

    # make 3 differently seeded 80% samples (without replacement) of the train set, one per subtree;
    train_copy = train_data.sample(frac=0.8, replace = False, random_state = 0)
    train_copy_2 = train_data.sample(frac=0.8, replace = False, random_state = 1)
    train_copy_3 = train_data.sample(frac=0.8, replace = False, random_state = 2)

    # pre-process training copies to fit with sk-learn decision tree:
    if (non_ordinal_cat == False):
        # for ordinal Cat feature, simply convert to float
        for i in train_copy.columns:
            if (i != 'target'):
                train_copy[i] = train_copy[i].astype('float64')
                train_copy_2[i] = train_copy_2[i].astype('float64')
                train_copy_3[i] = train_copy_3[i].astype('float64')

    else:
        # for non-ordinal Cat feature, perform target encoding (a fresh encoder per sample)
        for i in train_copy.columns:
            if (i != 'target') and (pd.api.types.is_numeric_dtype(train_copy[i]) == False):
                print(i)
                encoder = TargetEncoder()
                train_copy[i] = encoder.fit_transform( train_copy[i], train_copy['target'] )
                encoder = TargetEncoder()
                train_copy_2[i] = encoder.fit_transform( train_copy_2[i], train_copy_2['target'] )
                encoder = TargetEncoder()
                train_copy_3[i] = encoder.fit_transform( train_copy_3[i], train_copy_3['target'] )


    # Each subtree's rules are parsed AND pruned through the same helper as the main
    # tree. Previously the pruning loops after each subtree re-visited `rules`
    # (already pruned, hence a no-op) and the subtree rules were merged unpruned.

    # subtree of depth tree_depth - 2 on the first sample
    X_train = train_copy.iloc[ :, :len(train_copy.columns) - 1 ]
    Y_train = list( train_copy['target'] )

    clf_minus_2 = DecisionTreeClassifier(max_depth = tree_depth - 2, class_weight = 'balanced', random_state = 1)
    clf_minus_2.fit(X_train, Y_train)

    rules_minus_2 = _extract_pruned_rules(clf_minus_2, list(X_train.columns), Y_train)


    # subtree of depth tree_depth - 4 on the second sample
    X_train = train_copy_2.iloc[ :, :len(train_copy_2.columns) - 1 ]
    Y_train = list( train_copy_2['target'] )

    clf_minus_4 = DecisionTreeClassifier(max_depth = tree_depth - 4, class_weight = 'balanced', random_state = 1)
    clf_minus_4.fit(X_train, Y_train)

    rules_minus_4 = _extract_pruned_rules(clf_minus_4, list(X_train.columns), Y_train)


    # subtree of depth tree_depth - 6 on the third sample
    X_train = train_copy_3.iloc[ :, :len(train_copy_3.columns) - 1 ]
    Y_train = list( train_copy_3['target'] )

    clf_minus_6 = DecisionTreeClassifier(max_depth = tree_depth - 6, class_weight = 'balanced', random_state = 1)
    clf_minus_6.fit(X_train, Y_train)

    rules_minus_6 = _extract_pruned_rules(clf_minus_6, list(X_train.columns), Y_train)

    rules = rules + rules_minus_2 + rules_minus_4 + rules_minus_6 # merge rules from the main tree & its subtrees

    # change all cat features' symbol to 'be':
    for rule in rules:
        for node in rule:
            if node[0] in cat_features:
                node[1] = 'be'

    # Node indexes are laid out as [ '<=' nodes | '>' nodes | 'be' nodes ],
    # so the two numerical groups must be counted before numbering starts.
    le_node_count = sum(1 for rule in rules for node in rule if node[1] == '<=')
    ge_node_count = sum(1 for rule in rules for node in rule if node[1] == '>')

    le_node_counter = 0
    ge_node_counter = le_node_count
    be_node_counter = le_node_count + ge_node_count

    le_nodes = []
    ge_nodes = []
    be_nodes = []

    rule_look_up_indexes = copy.deepcopy(rules)

    # Give every node its index: the index overwrites the third token of the node
    # and replaces the whole node in the look-up copy.
    for i, rule in enumerate(rules):
        for j, node in enumerate(rule):
            if (node[1] == '<='):
                node[2] = le_node_counter
                rule_look_up_indexes[i][j] = le_node_counter
                le_nodes.append(node)
                le_node_counter += 1

            elif (node[1] == '>'):
                node[2] = ge_node_counter
                rule_look_up_indexes[i][j] = ge_node_counter
                ge_nodes.append(node)
                ge_node_counter += 1

            elif (node[1] == 'be'):
                node[2] = be_node_counter
                rule_look_up_indexes[i][j] = be_node_counter
                be_nodes.append(node)
                be_node_counter += 1


    for rule in rules:
        for node in rule:
            print(node)
        print()

    print()
    print()
    print()

    for rule in rule_look_up_indexes:
        print(rule)

    print()
    print()
    print()

    # perform padding to make all rules of same length for batch processing;
    # the pad value is be_node_counter, i.e. one past the last real node index
    for index_row in rule_look_up_indexes:
        missing = tree_depth - len(index_row)
        if missing > 0:
            index_row.extend( [be_node_counter] * missing )

    for rule in rule_look_up_indexes:
        print(rule)


In [18]:
def FeatureTransfers(train_data, bins): # change bin_number to a map (feature, bin_number)
    """Discretize the training frame and assign an embedding index to every feature level.

    Every column whose name does not contain 'target' is processed:

    * numeric columns are binned with ``pd.cut`` (quantile cut-offs when the
      column has more than ``bins`` distinct values, otherwise one bin per
      distinct value) and each value is replaced by ``'<column>_<interval>'``;
    * non-numeric columns have each value replaced by ``'<column>_<value>'``.

    Indexes are handed out consecutively, first to the levels of all numeric
    columns (in column order, intervals in ascending order), then to the levels
    of all non-numeric columns (in order of first appearance).

    Only statistics of ``train_data`` are used, so that the returned cut-offs can
    later be applied to other splits without leaking information from them.
    ``train_data`` itself is not modified.

    Args:
        train_data: pandas DataFrame holding the predictors and the target column(s).
        bins: number of quantile bins for continuous numeric columns.

    Returns:
        A list ``[df, cut_off_dict, feature_lookup_dict, cat_bins_total]`` where
        ``df`` is the converted copy of ``train_data``, ``cut_off_dict`` maps each
        numeric column to its list of cut-offs, ``feature_lookup_dict`` maps each
        ``'<column>_<level>'`` string to its index, and ``cat_bins_total`` lists
        the number of levels of each non-numeric column.
    """
    df = train_data.copy()

    print( len(df) )

    cut_off_dict = {}
    feature_lookup_dict = {} # lookup dict for index access of the embedding matrix
    cat_bins_total = []
    feature_level_index = 0

    predictor_names = [name for name in df.columns if 'target' not in name]

    # Pass 1 - numeric features. `train_data` is read (never written) so the
    # dtype checks are unaffected by the string conversion applied to `df`.
    for name in predictor_names:
        column = train_data[name]
        if not pd.api.types.is_numeric_dtype(column):
            continue

        if column.nunique() > bins: # Cont. Num. feature; compute bins
            # quantiles at 0, 1/bins, ..., 1 so that both ends are covered
            cut_offs = [ column.quantile( step * (1.0 / float(bins)) ) for step in range(0, bins + 1) ]
            if len( set(cut_offs) ) == 2:
                # all quantiles collapsed onto two values: add an edge below the
                # minimum so that each of the two values gets a bin of its own
                cut_offs.insert(0, column.min() - 0.01)

        else: # if <= bin number, just use original levels, no need to compute
            cut_offs = list( column.unique() )
            cut_offs.append(column.min() - 0.01) # extra edge below the minimum; s/t every level will be a unique bin
            cut_offs.sort() # sort bins as they need to increase monotonically

        cut_off_dict[name] = cut_offs

        # NOTE (from the original author's notes): other splits are converted later
        # with these same cut-offs, after clipping values below the first / above
        # the last cut-off to that cut-off.
        binned = pd.cut(column, cut_offs, duplicates = 'drop', right = True, include_lowest = True)
        df[name] = name + '_' + binned.astype(str)

        # intervals are indexed in ascending order
        for interval in sorted( binned.unique() ):
            feature_lookup_dict[ name + '_' + str(interval) ] = feature_level_index
            feature_level_index += 1

        # reserve the unused indexes when the feature yields fewer than `bins` bins
        distinct_cut_offs = len( set(cut_offs) )
        if distinct_cut_offs <= bins:
            feature_level_index += bins - distinct_cut_offs + 1

    # Pass 2 - categorical (non-numeric) features, indexed after all numeric ones.
    for name in predictor_names:
        if pd.api.types.is_numeric_dtype(train_data[name]):
            continue

        levels = list( train_data[name].unique() )
        cat_bins_total.append( len(levels) )

        df[name] = name + '_' + df[name].astype(str)

        for level in levels:
            feature_lookup_dict[ name + '_' + str(level) ] = feature_level_index
            feature_level_index += 1

    return [df, cut_off_dict, feature_lookup_dict, cat_bins_total]


In [19]:
def _bin_pair_indexes(feature_offsets, widths, shift_next):
    """Build the per-bin (current, next) level-index tensors used by the le / ge regularizers.

    feature_offsets : index of the first level of each numerical feature.
    widths          : row width to use for bin position 0, 1, 2, ...
    shift_next      : when True the "next" row starts at the current bin position,
                      otherwise it starts at the feature's first level.

    For bin position p with width w, the "current" tensor holds, per feature, that feature's
    p-th level index repeated w times; the "next" tensor holds, per feature, w consecutive
    level indexes. Returns two lists (current, next) of long tensors on 'cuda'.
    """
    current_indexes = []
    next_indexes = []

    for bin_position, width in enumerate(widths):
        start = bin_position if shift_next else 0

        current_rows = [ [offset + bin_position] * width for offset in feature_offsets ]
        next_rows = [ list( range(offset + start, offset + start + width) ) for offset in feature_offsets ]

        current_indexes.append( torch.tensor(current_rows, dtype = torch.long).to('cuda') )
        next_indexes.append( torch.tensor(next_rows, dtype = torch.long).to('cuda') )

    return current_indexes, next_indexes


def _expected_outputs(index_tensors, one_bin_diff, descending):
    """Build the expected-output tensors that match the shapes produced by _bin_pair_indexes.

    Every row is 0.5 +/- k * one_bin_diff, where k runs 0, 1, 2, ... (ascending) or
    width-1, ..., 1, 0 (descending). Returns (plus_outputs, minus_outputs): two lists of
    float tensors on 'cuda', one tensor per entry of index_tensors.
    """
    plus_outputs = []
    minus_outputs = []

    for index_tensor in index_tensors:
        steps = list( range(index_tensor.shape[-1]) )
        if descending:
            steps.reverse()

        row_count = len(index_tensor) # one identical row per numerical feature

        plus_outputs.append( torch.FloatTensor( [ [0.5 + step * one_bin_diff for step in steps] ] * row_count ).to('cuda') )
        minus_outputs.append( torch.FloatTensor( [ [0.5 - step * one_bin_diff for step in steps] ] * row_count ).to('cuda') )

    return plus_outputs, minus_outputs


def _threshold_si_indexes(operator_symbol, num_features, bins, tree_rules):
    """Collect the reference / Si / ordering-Si index tensors for one threshold operator.

    operator_symbol : '<=' or '>' (compared against node[1]).
    num_features    : 0-based column positions of the numerical features.
    tree_rules      : iterable of rules, each an iterable of nodes (feature, operator, context index).

    Returns (ref_lookup_indexes, features_Si_indexes, orderings_Si_indexes).
    """
    ref_lookup_indexes = []
    features_Si_indexes = []
    orderings_Si_indexes = []

    for position, feature in enumerate(num_features):
        feature_bin_indexes = list( range(position * bins, position * bins + bins) )

        # nodes store the feature position 1-based, hence the "+ 1"
        feature_context_indexes = [ node[2] for rule in tree_rules for node in rule
                                    if node[1] == operator_symbol and node[0] == feature + 1 ]
        ref_lookup_indexes.extend(feature_context_indexes)

        print(position, ': ', feature_context_indexes)

        features_Si_indexes.append( [ torch.LongTensor(feature_bin_indexes).to('cuda'),
                                      torch.LongTensor(feature_context_indexes).to('cuda') ] )

        for context_value in feature_context_indexes:
            orderings_Si_indexes.append( [ torch.LongTensor(feature_bin_indexes).to('cuda'),
                                           torch.LongTensor( [context_value] * bins ).to('cuda') ] )

    return torch.LongTensor(ref_lookup_indexes).to('cuda'), features_Si_indexes, orderings_Si_indexes


def NSDT_modeling_preps(bins, train_loader, new_train, cut_off_dict, feature_lookup_dict, cat_bins_total, total_numerical_nodes, total_levels):
    """Pre-compute every index list / tensor needed for the forward pass and the logical regularizers.

    Parameters
    ----------
    bins : number of levels per numerical feature.
    train_loader : iterable of batches; it is iterated exactly once and only len(batch) is used.
    new_train : frame whose columns are the predictors followed by the target (last column).
    cut_off_dict, feature_lookup_dict, total_numerical_nodes : accepted for interface
        compatibility; not used here.
    cat_bins_total : number of levels of each categorical feature, in feature order.
    total_levels : total number of levels over all features.

    Also reads the notebook globals le_nodes, ge_nodes, be_nodes, rules, cat_features and
    num_of_numerical_predictors. A node is indexed as node[0] = 1-based feature position,
    node[1] = operator ('<=', '>' or 'be'), node[2] = context value index.

    Returns
    -------
    A tuple of 21 items when cat_features is non-empty, or the same tuple without
    cat_features_Si_indexes (20 items) otherwise, in the order:
    feature_indexes_le_nodes, feature_indexes_ge_nodes, feature_indexes_be_nodes,
    all_batch_le_context_value_index_list, all_batch_ge_context_value_index_list,
    all_batch_be_context_value_index_list, [cat_features_Si_indexes,] le_orderings_Si_indexes,
    ge_orderings_Si_indexes, le_ref_lookup_indexes, le_features_Si_indexes,
    ge_ref_lookup_indexes, ge_features_Si_indexes, le_current_feature_indexes,
    le_next_feature_indexes, le_expected_outputs_1, le_expected_outputs_2,
    ge_current_feature_indexes, ge_next_feature_indexes, ge_expected_outputs_1,
    ge_expected_outputs_2.

    Notes
    -----
    All tensors are created on 'cuda'. The debug prints below index fixed positions, so they
    require at least two numerical predictors and bins >= 6.
    """
    print( len(new_train) )


    # Create context value index lists for training set:
    # be nodes would simply be empty if the dataset contains num features only
    all_batch_le_context_value_index_list = []
    all_batch_ge_context_value_index_list = []
    all_batch_be_context_value_index_list = []

    # single pass over the loader, so a shuffling loader consumes its RNG exactly once
    for batch in train_loader:
        rows_in_batch = len(batch)

        all_batch_le_context_value_index_list.append( [ [ node[2] ] * rows_in_batch for node in le_nodes ] )
        all_batch_ge_context_value_index_list.append( [ [ node[2] ] * rows_in_batch for node in ge_nodes ] )
        all_batch_be_context_value_index_list.append( [ [ node[2] ] * rows_in_batch for node in be_nodes ] )


    # column position (0-based) of the feature tested by each node
    feature_indexes_le_nodes = [ node[0] - 1 for node in le_nodes ]
    feature_indexes_ge_nodes = [ node[0] - 1 for node in ge_nodes ]
    feature_indexes_be_nodes = [ node[0] - 1 for node in be_nodes ]


    # le & ge regs:
    # every numerical feature owns `bins` consecutive level indexes
    feature_offsets = [ position * bins for position in range(num_of_numerical_predictors) ]

    le_current_feature_indexes, le_next_feature_indexes = _bin_pair_indexes(
        feature_offsets, range(bins, 0, -1), shift_next = True )

    print( le_current_feature_indexes[0][0] )
    print( le_next_feature_indexes[0][1] )
    print()

    ge_current_feature_indexes, ge_next_feature_indexes = _bin_pair_indexes(
        feature_offsets, range(1, bins + 1), shift_next = False )

    print( (ge_current_feature_indexes[0][0]) )
    print( (ge_current_feature_indexes[0][1]) )
    print()

    print( (ge_current_feature_indexes[1][0]) )
    print( (ge_current_feature_indexes[1][1]) )
    print()


    # compute le & ge x matrices for regularization:
    one_bin_diff = round(0.5 / (bins - 1), 6)

    le_expected_outputs_1, le_expected_outputs_2 = _expected_outputs(
        le_current_feature_indexes, one_bin_diff, descending = False )

    print((le_expected_outputs_1[2][0]))
    print((le_expected_outputs_2[2][0]))

    ge_expected_outputs_1, ge_expected_outputs_2 = _expected_outputs(
        ge_current_feature_indexes, one_bin_diff, descending = True )

    print((ge_expected_outputs_1[5][0]))
    print((ge_expected_outputs_2[5][0]))


    # ref, Si (asym & trans), and ordering_Si;
    num_features = list( range(len(new_train.columns) - 1) ) # target feature at the end
    for cat_feature in cat_features:
        num_features.remove(cat_feature - 1)

    le_ref_lookup_indexes, le_features_Si_indexes, le_orderings_Si_indexes = _threshold_si_indexes(
        '<=', num_features, bins, rules )

    print()
    print()

    ge_ref_lookup_indexes, ge_features_Si_indexes, ge_orderings_Si_indexes = _threshold_si_indexes(
        '>', num_features, bins, rules )

    print( len(le_features_Si_indexes), len(ge_features_Si_indexes) )
    print( len(le_orderings_Si_indexes), len(ge_orderings_Si_indexes) )


    # Be reg prep:
    # For each Cat. feature, we need: feature bin indexes & context value indexes;
    # Create a 2D tensor per feature: all feature bin indexes (as a row), repeated once per context value index;
    # Create another 2D tensor per feature: each row = a context value index repeated cardinality-of-feature times;
    # num_of_numerical_predictors * bins = index of the first level of the 1st cat feature
    if len(cat_features) > 0:
        # make feature bin lookup indexes for all cat features (not needed for num features, always `bins`):
        cat_features_bin_indexes = []

        next_level = num_of_numerical_predictors * bins
        cat_position = 0
        while next_level < total_levels:
            cardinality = cat_bins_total[cat_position]
            cat_features_bin_indexes.append( list( range(next_level, next_level + cardinality) ) )
            next_level += cardinality
            cat_position += 1

        cat_features_Si_indexes = []
        for position, cat_feature in enumerate(cat_features):
            be_bin_indexes = cat_features_bin_indexes[position]

            # distinct context value indexes of every 'be' node that tests this feature
            be_context_indexes = list( set( [ node[2] for rule in rules for node in rule
                                              if node[1] == 'be' and node[0] == cat_feature ] ) )

            # A categorical feature that never appears in a 'be' node contributes no entry
            # (the previous regrouping loop raised IndexError in that situation).
            # NOTE(review): confirm the consumer iterates this list rather than indexing it by feature.
            if len(be_context_indexes) == 0:
                continue

            bin_rows = [be_bin_indexes] * len(be_context_indexes)
            context_rows = [ [context_value] * len(be_bin_indexes) for context_value in be_context_indexes ]

            cat_features_Si_indexes.append( [ torch.LongTensor(bin_rows).to('cuda'),
                                              torch.LongTensor(context_rows).to('cuda') ] )

        return feature_indexes_le_nodes, feature_indexes_ge_nodes, feature_indexes_be_nodes, all_batch_le_context_value_index_list, \
        all_batch_ge_context_value_index_list, all_batch_be_context_value_index_list, cat_features_Si_indexes, le_orderings_Si_indexes, \
        ge_orderings_Si_indexes, le_ref_lookup_indexes, le_features_Si_indexes, ge_ref_lookup_indexes, ge_features_Si_indexes, le_current_feature_indexes, \
        le_next_feature_indexes, le_expected_outputs_1, le_expected_outputs_2, ge_current_feature_indexes, ge_next_feature_indexes, ge_expected_outputs_1, \
        ge_expected_outputs_2

    return feature_indexes_le_nodes, feature_indexes_ge_nodes, feature_indexes_be_nodes, all_batch_le_context_value_index_list, \
    all_batch_ge_context_value_index_list, all_batch_be_context_value_index_list, le_orderings_Si_indexes, \
    ge_orderings_Si_indexes, le_ref_lookup_indexes, le_features_Si_indexes, ge_ref_lookup_indexes, ge_features_Si_indexes, le_current_feature_indexes, \
    le_next_feature_indexes, le_expected_outputs_1, le_expected_outputs_2, ge_current_feature_indexes, ge_next_feature_indexes, ge_expected_outputs_1, \
    ge_expected_outputs_2


In [None]:
def _levels_to_lookup_indexes(frame, lookup, skip_columns = ()):
    """Replace, in place, every level string of `frame` with its integer index from `lookup`.

    Uses one vectorised Series.map per column instead of one Series.replace per level.
    Raises KeyError (carrying the offending level) when a level is missing from `lookup`.
    Columns named in `skip_columns` are left untouched. Returns `frame`.
    """
    for column in frame.columns:
        if column in skip_columns:
            continue

        indexed = frame[column].map(lookup)
        unknown = indexed.isna() # map() yields NaN for levels that are not keys of `lookup`
        if unknown.any():
            raise KeyError( frame.loc[unknown, column].iloc[0] )

        frame[column] = indexed.astype('int64')

    return frame


def _relative_reg_decrease(init_loss, end_loss):
    """Return the fraction by which the regularizer loss decreased (0.0 when it did not decrease)."""
    if end_loss >= init_loss:
        return 0.0
    return float( (init_loss - end_loss) / init_loss )


def _build_eval_loader(split_data, nums, quantile_transformer, cut_off_dict, feature_lookup_dict, batch_size):
    """Turn a validation / test frame into a non-shuffled DataLoader of level indexes.

    split_data : frame whose last column is the target; it is copied, never modified.
    nums : numerical columns to pass through `quantile_transformer` (ignored when it is None).
    cut_off_dict : per-column bin edges produced from the training set.
    feature_lookup_dict : level string -> index mapping produced from the training set.

    Reads the notebook globals cat_as_num and encoder_dict.
    """
    frame = split_data.copy()

    if quantile_transformer is not None:
        frame[nums] = quantile_transformer.transform( frame[nums] )

    if cat_as_num == True:
        # for non-ordinal Cat feature, perform target encoding via encoders from training set
        for column in frame.columns:
            if column != 'target' and not pd.api.types.is_numeric_dtype(frame[column]):
                frame[column] = encoder_dict[column].transform( frame[column] )

    for column in frame.columns:
        if 'target' in column:
            continue

        if pd.api.types.is_numeric_dtype(frame[column]):
            cut_offs = cut_off_dict[column]

            # clamp every value below the 1st cut-off up to the 1st cut-off,
            # and every value above the last cut-off down to the last cut-off
            frame.loc[frame[column] < cut_offs[0], column] = cut_offs[0]
            frame.loc[frame[column] > cut_offs[len(cut_offs)-1], column] = cut_offs[len(cut_offs)-1]

            # use cut-offs generated from only train set to perform conversion to avoid leakage
            frame[column] = pd.cut(frame[column], cut_offs, duplicates = 'drop', right = True, include_lowest = True)

        frame[column] = column + '_' + frame[column].astype(str)

    # get all but last (target) feature; copy so the column writes below hit an independent frame
    indexed = frame.iloc[:,:-1].copy()
    _levels_to_lookup_indexes(indexed, feature_lookup_dict)

    print(indexed.head(5))

    return torch.utils.data.DataLoader(indexed.to_numpy(), batch_size = batch_size, shuffle = False)


def objective(trial):
    """Optuna objective: sample hyper-params, train one NSDT model and return its validation accuracy.

    Reads the notebook globals dataset, model_type, masking, cat_features, train_data, val_data,
    test_data, le_nodes, ge_nodes, num_of_numerical_predictors and be_node_counter.

    Side effects: sets the global bayes_opt to True, stores the fitted QuantileTransformer on
    `objective.qt` (only when normalisation is sampled), prints progress, and pickles the trained
    classifier to '<trial number>_<dataset>_<model_type>.pickle'.
    """

    torch_seed = 0
    torch.manual_seed(torch_seed)

    # You may also tune epoch as a hyper-param by adding it below
    if (dataset == 'Higgs'):
        epochs = 150
        p = 40
    elif (dataset == 'Cover'):
        epochs = 40
        p = 15
    else:
        epochs = 30
        p = 10
    early_stopper = EarlyStopper(patience = p, min_delta = 0)

    # set hyper-params to tune: -------------------------------------------
    global bayes_opt
    bayes_opt = True

    normalize = trial.suggest_categorical( 'normalize num features', ['0', '1'] )

    if (dataset == 'Higgs'):
        batch_size = trial.suggest_int('training batch size', 128, 256, step = 128)
    else:
        batch_size = trial.suggest_int('training batch size', 128, 512, step = 128)

    bins = trial.suggest_int('num feature levels', 20, 35)
    embedding_dimension = trial.suggest_int('feature embedding_dimension', 150, 250)

    optimizer = trial.suggest_categorical('module optimizer', [ 'adam', 'adamw'])
    hidden_init = trial.suggest_categorical('MLP initialization', [ 'he', 'xavier'])
    hidden_nodes_layer1 = trial.suggest_int('num of hidden nodes in 1st hidden layer', 30, 150)
    hidden_nodes_layer2 = trial.suggest_int('num of hidden nodes in 2nd hidden layer', 30, hidden_nodes_layer1)
    activation = trial.suggest_categorical('MLP activation function', [ 'relu', 'leakyrelu', 'elu', 'gelu', 'tanh'])
    learning_rate = trial.suggest_float('Adam learning rate', 0.0001, 0.0003, step = 0.00005)
    weight_decays = trial.suggest_float("Adam weight_decay", 1e-6, 1e-3, log = True)
    reg_frequency = trial.suggest_int('reg computation frequency', 50, 55)
    le_reg_weight = trial.suggest_float( 'le reg re-weight', 0.001, 0.021, step = 0.001)
    ge_reg_weight = trial.suggest_float( 'ge reg re-weight', 0.001, 0.021, step = 0.001)

    if (len(cat_features) > 0):
        be_reg_weight = trial.suggest_float( 'be reg re-weight', 0.00001, 0.001, step = 0.0001)
    # --------------------------------------------------------------------

    # # Or, comment out above and manually set the hyper-param for some initial results (please tune for best performance):
    # global bayes_opt
    # bayes_opt = False

    # fake_tune = trial.suggest_categorical( 'fake tune', [ 'a', 'b'] )
    # normalize = 0
    # batch_size = 128
    # bins = 21
    # embedding_dimension = 241
    # optimizer = 'adam'
    # hidden_init = 'he'
    # hidden_nodes_layer1 = 58
    # hidden_nodes_layer2 = 47
    # activation = 'elu'
    # learning_rate = 0.001
    # weight_decays = 1.41e-05
    # reg_frequency = 33 # 11 x 3
    # le_reg_weight = 0.005
    # ge_reg_weight = 0.005
    # if (len(cat_features) > 0):
    #     be_reg_weight = 0.00005
    # # --------------------------------------------------------------------

    train_copy = train_data.copy()

    nums = []
    quantile_transformer = None

    if normalize == '1':
        # perform quantile transform same as in the FT-Trans paper:
        objective.qt = QuantileTransformer(n_quantiles = 1000, output_distribution = 'normal', subsample = int(1e9), random_state = 0)
        quantile_transformer = objective.qt

        nums = [ column for column in train_copy.columns
                 if column != 'target' and pd.api.types.is_numeric_dtype(train_copy[column]) ]

        # fit on the training split only; val / test are transformed with the fitted object
        train_copy[nums] = quantile_transformer.fit_transform( train_copy[nums] )


    new_train, cut_off_dict, feature_lookup_dict, cat_bins_total = FeatureTransfers(train_copy, bins)

    for k,v in feature_lookup_dict.items():
        print(k, '   ', v)
    print()
    print()

    total_numerical_nodes = len(le_nodes) + len(ge_nodes)
    total_levels = num_of_numerical_predictors * bins + sum(cat_bins_total)
    print(total_levels, embedding_dimension)
    print(be_node_counter, embedding_dimension)

    # convert levels to indexes according to the lookup dict; for embedding vector retrival later
    levels_to_index_df = new_train.copy()
    _levels_to_lookup_indexes(levels_to_index_df, feature_lookup_dict, skip_columns = ('target',))

    print(levels_to_index_df.head(5))
    levels_to_index_df_np = levels_to_index_df.to_numpy()

    train_loader = torch.utils.data.DataLoader(levels_to_index_df_np, batch_size = batch_size, shuffle = True)


    # setup val_loader, then test_loader (same preparation, training-set statistics only)
    val_loader = _build_eval_loader(val_data, nums, quantile_transformer, cut_off_dict, feature_lookup_dict, batch_size)
    test_loader = _build_eval_loader(test_data, nums, quantile_transformer, cut_off_dict, feature_lookup_dict, batch_size)


    # get clf & evaluate performance
    if len(cat_features) > 0:
        clf, val_accuracy, test_accuracy, init_num_regs_losses, end_num_regs_losses, init_cat_regs_losses, end_cat_regs_losses = Train_NSDT_split_regs(masking, train_loader, val_loader, test_loader, early_stopper, new_train, cut_off_dict, feature_lookup_dict, cat_bins_total, total_numerical_nodes, total_levels, bins,
                                    embedding_dimension, epochs, optimizer, hidden_init, hidden_nodes_layer1, hidden_nodes_layer2, activation, learning_rate, weight_decays, reg_frequency, le_reg_weight, ge_reg_weight, be_reg_weight)

        # numerical and categorical regularizers each contribute half of the reported decrease
        reg_accuracy = 0.5 * _relative_reg_decrease(init_num_regs_losses, end_num_regs_losses)
        reg_accuracy += 0.5 * _relative_reg_decrease(init_cat_regs_losses, end_cat_regs_losses)
    else:
        clf, val_accuracy, test_accuracy, init_num_regs_losses, end_num_regs_losses = Train_num_feature_only_NSDT_split_regs(masking, train_loader, val_loader, test_loader, early_stopper, new_train, cut_off_dict, feature_lookup_dict, cat_bins_total, total_numerical_nodes, total_levels, bins,
                                    embedding_dimension, epochs, optimizer, hidden_init, hidden_nodes_layer1, hidden_nodes_layer2, activation, learning_rate, weight_decays, reg_frequency, le_reg_weight, ge_reg_weight)

        reg_accuracy = _relative_reg_decrease(init_num_regs_losses, end_num_regs_losses)

    # In case of internet break or memory loss; display test accuracy for each search round
    # Note: please report the test accuracy for trail with highest performance on val set
    print('----------------------- For Trial ', trial.number, ': ', 'test accuracy:', test_accuracy, ' reg loss decreased by (%):', reg_accuracy)
    print()
    print()

    # save clf to a file for future use
    with open( ('{}' + '_' + dataset + '_' + model_type + '.pickle').format(trial.number), 'wb' ) as fout:
        pickle.dump(clf, fout)

    return val_accuracy # use val accuracy to search for best hyper-param set


# Launch the hyper-param search; validation accuracy (the value returned by objective) is maximised
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials = 36) # set n_trials = 1 if not tuning (we have set a dummy variable named 'fake_tune' to enable model training)

# Reload the classifier that the best trial pickled to disk
best_clf_path = ('{}' + '_' + dataset + '_' + model_type + '.pickle').format(study.best_trial.number)
with open(best_clf_path, 'rb') as fin:
    clf = pickle.load(fin)

if bayes_opt:
    # Take over the tuned values needed by the following cells, then report the search outcome
    best_params = study.best_params
    batch_size = best_params['training batch size']
    bins = best_params['num feature levels']

    print()
    for key, value in best_params.items():
        print( "    {}: {}".format(key, value) )

    print()
    print()
    importance_dict = optuna.importance.get_param_importances(study)
    for key, value in importance_dict.items():
        print( "    {}: {}".format(key, value) )

elif bayes_opt == False:
    # In case you did not perform bayes opt while using the pre-defined hyper-param values;
    batch_size = 128
    bins = 21


In [None]:
# Re-apply the best trial's preprocessing to the train / val frames (modified in place)
if bayes_opt and study.best_params['normalize num features'] == '1':

    # perform quantile transform as in the FT-Trans paper:
    nums = [ column for column in train_data.columns
             if column != 'target' and pd.api.types.is_numeric_dtype(train_data[column]) ]

    # objective.qt is the transformer stored by objective(); it is only applied here, not re-fitted
    for split in (train_data, val_data):
        split[nums] = objective.qt.transform( split[nums] )


# Rebuild the binned training frame and the level -> index lookup with the selected number of bins
new_train, cut_off_dict, feature_lookup_dict, cat_bins_total = FeatureTransfers( train_data, bins )

for level_name, level_index in feature_lookup_dict.items():
    print(level_name, '   ', level_index)
print()
print()

total_numerical_nodes = len(le_nodes) + len(ge_nodes)
total_levels = sum(cat_bins_total) + num_of_numerical_predictors * bins

# double-check to verify if matching with above
print(total_levels, bins)
print(be_node_counter, bins)


# Model Testing:

In [None]:
# Prepare the test set with training-set statistics only, then wrap it in a DataLoader.
# NOTE: test_data is modified in place, so this cell must not be run twice on the same frame.
if bayes_opt:
    if study.best_params['normalize num features'] == '1':

        converted_nums = objective.qt.transform( test_data[nums] )
        test_data[nums] = converted_nums


if cat_as_num == True:
    # for non-ordinal Cat feature, perform target encoding via encoders from training set
    for i in test_data.columns:
        if (i != 'target'):
            if pd.api.types.is_numeric_dtype(test_data[i]) == False:
                encoder = encoder_dict[i]
                test_data[i] = encoder.transform( test_data[i] )


for i in test_data.columns:
    if ('target' in i) == False:
        if pd.api.types.is_numeric_dtype(test_data[i]):

            cut_offs = cut_off_dict[i]

            # clamp every value below the 1st cut-off up to the 1st cut-off,
            # and every value above the last cut-off down to the last cut-off
            test_data.loc[test_data[i] < cut_offs[0], i] = cut_offs[0]
            test_data.loc[test_data[i] > cut_offs[len(cut_offs)-1], i] = cut_offs[len(cut_offs)-1]

            # use cut-offs generated from only train set to perform conversion to avoid leakage
            test_data[i] = pd.cut(test_data[i], cut_offs, duplicates = 'drop', right = True, include_lowest = True)
            test_data[i] = i + '_' + test_data[i].astype(str)

        else:
            test_data[i] = i + '_' + test_data[i].astype(str)


# Now create  data loader for testing set
test_copy = test_data.copy()
# get all but last (target) feature; copy so the column writes below hit an independent frame
levels_to_index_df_test = test_copy.iloc[:,:-1].copy()

for i in levels_to_index_df_test.columns:
    # one vectorised lookup per column instead of one replace() call per level
    level_indexes = levels_to_index_df_test[i].map(feature_lookup_dict)

    # map() yields NaN for a level the training lookup does not contain; fail loudly on it
    unseen_levels = level_indexes.isna()
    if unseen_levels.any():
        raise KeyError( levels_to_index_df_test.loc[unseen_levels, i].iloc[0] )

    levels_to_index_df_test[i] = level_indexes.astype('int64')

print(levels_to_index_df_test.head(5))
levels_to_index_df_test_np = levels_to_index_df_test.to_numpy()

test_loader = torch.utils.data.DataLoader(levels_to_index_df_test_np, batch_size = batch_size, shuffle = False)


In [23]:
# Create context value index lists for the test set:
# for every batch and every le / ge / be node, the node's context value index (node[2])
# is repeated once per row of that batch
all_batch_le_context_value_index_list = []
all_batch_ge_context_value_index_list = []
all_batch_be_context_value_index_list = []

for batch in test_loader:

    # kept in its own name so the global batch_size hyper-param is not overwritten
    # by the size of the last (possibly smaller) batch
    rows_in_batch = len(batch)

    batch_le_context_value_index_list = [ [ node[2] ] * rows_in_batch for node in le_nodes ]
    batch_ge_context_value_index_list = [ [ node[2] ] * rows_in_batch for node in ge_nodes ]
    batch_be_context_value_index_list = [ [ node[2] ] * rows_in_batch for node in be_nodes ]

    all_batch_le_context_value_index_list.append(batch_le_context_value_index_list)
    all_batch_ge_context_value_index_list.append(batch_ge_context_value_index_list)
    all_batch_be_context_value_index_list.append(batch_be_context_value_index_list)

In [None]:
# Evaluate the reloaded classifier on the test set.
# The test loader already holds level indexes of the predictor features (X) only.

# column position (0-based) of the feature tested by each node
feature_indexes_le_nodes = [ node[0] - 1 for node in le_nodes ]
feature_indexes_ge_nodes = [ node[0] - 1 for node in ge_nodes ]
feature_indexes_be_nodes = [ node[0] - 1 for node in be_nodes ]

clf = clf.eval()

pred_labels = []

# inference only: no autograd graph is needed
with torch.no_grad():
    for batch_count, batch in enumerate(test_loader):

        # kept in its own name so the global batch_size hyper-param is not overwritten
        rows_in_batch = len(batch)

        # one row per node, one column per sample
        forward_inputs = [
            batch[:, feature_indexes_le_nodes].permute(1,0).to('cuda'),
            all_batch_le_context_value_index_list[batch_count],
            batch[:, feature_indexes_ge_nodes].permute(1,0).to('cuda'),
            all_batch_ge_context_value_index_list[batch_count],
        ]

        if (len(cat_features) > 0):
            forward_inputs.append( batch[:, feature_indexes_be_nodes].permute(1,0).to('cuda') )
            forward_inputs.append( all_batch_be_context_value_index_list[batch_count] )

        preds = clf.forward(*forward_inputs, rows_in_batch)

        # reshape(-1) always yields a flat list; squeeze() collapsed a one-row batch to a bare
        # float, which made the loop over predictions fail on a final batch of size 1
        probabilities = torch.sigmoid(preds).reshape(-1).tolist()

        pred_labels.extend( 1 if probability > 0.5 else 0 for probability in probabilities )

# Compute Balanced Accuracy:
accuracy = metrics.balanced_accuracy_score(test_data['target'], pred_labels)
print('Balanced Accuracy: ', accuracy)
