## Creating The Model

Write One Soft Tree From Scratch.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math


class SoftTreeEnsemble(torch.nn.Module):
    """Ensemble of soft (differentiable) binary decision trees, built recursively.

    One instance represents a single node position shared by all ``num_trees``
    trees. Nodes are numbered like a binary heap: the children of node ``i``
    are ``2*i + 1`` and ``2*i + 2``. A node is a leaf once its index reaches
    ``2**max_depth - 1``, so a full tree has ``2**(max_depth + 1) - 1`` nodes.
    Children are reachable as attributes, e.g.
    ``root.right_child.right_child.node_index``.

    Args:
        num_trees: Number of trees evaluated in parallel at every node.
        max_depth: Number of edges from the root to a leaf.
        leaf_dims: Length of the vector stored in each leaf (per tree).
        input_dim: Number of input features.
        activation: Stored on the instance but not used; the routing function
            is always a sigmoid.
        node_index: Heap-style index of this node (0 for the root).
        internal_eps: Routing probabilities are clamped to
            ``[internal_eps, 1 - internal_eps]`` at every internal node.
        combine_output: If True the trees are summed and the output has shape
            ``(batch_size, leaf_dims)``; otherwise the output has shape
            ``(batch_size, leaf_dims, num_trees)``.
        subset_selection: If True every internal node only looks at a random
            subset of ``floor(sqrt(input_dim))`` features, as in a random
            forest. The same subset is shared by all trees at that node.
        device: Device on which this node's tensors are created.
    """

    def __init__(self, num_trees, max_depth, leaf_dims, input_dim, activation='sigmoid', node_index=0,
                 internal_eps=0, combine_output=True, subset_selection=False, device='cpu'):
        super().__init__()
        self.num_trees = num_trees
        self.combine_output = combine_output
        self.max_depth = max_depth
        self.leaf_dims = leaf_dims
        self.activation = activation
        self.node_index = node_index
        self.internal_eps = internal_eps
        self.leaf = node_index >= 2**max_depth - 1
        self.subset_selection = subset_selection
        self.device = device
        # Only leaves call this layer in forward(); it is still created on every
        # node so that existing state_dict keys remain unchanged.
        self.batch_norm = nn.BatchNorm1d(self.leaf_dims).to(self.device)

        if not self.leaf:
            # One linear routing function w'x + b per tree at this node.
            self.fc = nn.Linear(input_dim, num_trees).to(self.device)

            if self.subset_selection:
                # Keep floor(sqrt(p)) randomly chosen features and mask out the rest.
                num_features = math.floor(math.sqrt(input_dim))
                feature_mask = torch.zeros(input_dim)
                kept_indices = torch.randperm(input_dim)[:num_features]
                feature_mask[kept_indices] = 1
                # A buffer follows the module across devices and is saved in the
                # state_dict, but is never handed to the optimizer.
                self.register_buffer('mask', feature_mask.to(self.device))

            # Build both subtrees; every constructor option is forwarded so the
            # whole tree shares one configuration (including internal_eps).
            self.left_child = SoftTreeEnsemble(
                num_trees, max_depth, leaf_dims, input_dim, activation,
                2 * node_index + 1,
                internal_eps=self.internal_eps,
                combine_output=self.combine_output,
                subset_selection=self.subset_selection,
                device=self.device,
            )
            self.right_child = SoftTreeEnsemble(
                num_trees, max_depth, leaf_dims, input_dim, activation,
                2 * node_index + 2,
                internal_eps=self.internal_eps,
                combine_output=self.combine_output,
                subset_selection=self.subset_selection,
                device=self.device,
            )
        else:
            # Per-tree vote vector of this leaf, initialised with small random values.
            self.leaf_weights = nn.Parameter(
                torch.randn(1, self.leaf_dims, self.num_trees).to(self.device) * 0.1
            )

    def forward(self, x, prob=1.0):
        """Route ``x`` through the subtree rooted at this node.

        Args:
            x: Input batch of shape ``(batch_size, input_dim)``.
            prob: Probability of reaching this node, shape
                ``(batch_size, num_trees)``. Callers normally leave the default
                ``1.0`` at the root.

        Returns:
            Sum over all leaves below this node of that leaf's output (see
            ``combine_output`` for the shape).
        """
        if not self.leaf:
            if self.subset_selection:
                # Hadamard product zeroes the weights of the masked-out features.
                masked_weights = self.fc.weight * self.mask
                logits = F.linear(x, masked_weights, self.fc.bias)
            else:
                logits = self.fc(x)

            # Clamp so that a routing decision is never a hard 0 or 1.
            current_prob = torch.clamp(
                torch.sigmoid(logits), min=self.internal_eps, max=1 - self.internal_eps
            )
            return (self.left_child(x, prob * current_prob)
                    + self.right_child(x, prob * (1 - current_prob)))

        # A root that is itself a leaf (max_depth == 0) still holds the scalar default.
        if not torch.is_tensor(prob):
            prob = x.new_full((x.shape[0], self.num_trees), float(prob))

        # (batch, 1, num_trees) * (1, leaf_dims, num_trees) -> (batch, leaf_dims, num_trees)
        output = prob.unsqueeze(1) * self.leaf_weights

        if self.combine_output:
            output = torch.sum(output, dim=2)

        # NOTE(review): batch norm and softmax are applied per leaf and the
        # results are then summed by the parent nodes, so the final output is a
        # sum of softmax vectors rather than one distribution. Confirm this is
        # intended before feeding it to a loss that expects logits.
        output = self.batch_norm(output)
        return F.softmax(output, dim=1)


## Grabbing The Data

In [None]:
!pip install pmlb

Collecting pmlb
  Downloading pmlb-1.0.1.post3-py3-none-any.whl.metadata (1.7 kB)
Downloading pmlb-1.0.1.post3-py3-none-any.whl (19 kB)
Installing collected packages: pmlb
Successfully installed pmlb-1.0.1.post3


In [None]:
######################################### GOAL #################################
# Put all of the datasets in a hdf5 file.
# Makes it super easy to grab whatever we want
################################################################################

import h5py
from sklearn.model_selection import train_test_split
from pmlb import dataset_names,fetch_data
import torch
from torch.utils.data import TensorDataset, DataLoader
import xgboost as xgb
import tensorflow as tf


# The 23 datasets used in TEL
pmlb_datasets = ['ann_thyroid', 'breast_cancer','car_evaluation','churn', 'crx', 'dermatology', 'diabetes',
                 'dna', 'ecoli', 'flare', 'heart_c', 'hypothyroid', 'nursery', 'optdigits', 'pima', 'satimage', 'sleep',
                 'solar_flare_2', 'spambase', 'texture', 'twonorm', 'vehicle', 'yeast']

# Names of the datasets that could not be fetched and need to be checked by hand.
failed_datasets = []

# Write every dataset into one HDF5 file, one group per dataset, each holding
# the four arrays train_X / test_X / train_y / test_y.
with h5py.File('datasets.h5', 'w') as h5file:
    for classification_data_set in pmlb_datasets:

        print(f"Processing dataset: {classification_data_set}")

        try:
            X, y = fetch_data(classification_data_set, return_X_y=True)
        except Exception as err:
            # Catch Exception rather than everything so that Ctrl-C still stops
            # the cell, and report why the fetch failed instead of hiding it.
            print(f"Dataset {classification_data_set} failed to load")
            print(f"  reason: {type(err).__name__}: {err}")
            failed_datasets.append(classification_data_set)
            continue

        # 70/30 train/test split. No random_state is set, so the split differs
        # between runs.
        train_X, test_X, train_y, test_y = train_test_split(X, y, train_size=0.7)

        # Create a group for each dataset
        group = h5file.create_group(classification_data_set)

        # Save train and test data
        group.create_dataset('train_X', data=train_X)
        group.create_dataset('test_X', data=test_X)
        group.create_dataset('train_y', data=train_y)
        group.create_dataset('test_y', data=test_y)

Processing dataset: ann_thyroid
Processing dataset: breast_cancer
Processing dataset: car_evaluation
Processing dataset: churn
Processing dataset: crx
Dataset crx failed to load
Processing dataset: dermatology
Processing dataset: diabetes
Dataset diabetes failed to load
Processing dataset: dna
Processing dataset: ecoli
Processing dataset: flare
Dataset flare failed to load
Processing dataset: heart_c
Dataset heart_c failed to load
Processing dataset: hypothyroid
Processing dataset: nursery
Processing dataset: optdigits
Processing dataset: pima
Dataset pima failed to load
Processing dataset: satimage
Processing dataset: sleep
Processing dataset: solar_flare_2
Dataset solar_flare_2 failed to load
Processing dataset: spambase
Processing dataset: texture
Processing dataset: twonorm
Processing dataset: vehicle
Processing dataset: yeast


In [None]:
# Write a function to retrieve the data set

def load_data(dataset_name, framework='sklearn', file_path='datasets.h5', batch_size=32):
    """Load one dataset from the HDF5 file and format it for a framework.

    Args:
        dataset_name: Name of the group (dataset) inside the HDF5 file.
        framework: Output format. One of
            'sklearn' -> dict of NumPy arrays with keys
                         'train_X', 'test_X', 'train_y', 'test_y'
            'torch'   -> (train_loader, test_loader) DataLoaders
            'boost'   -> (train, test) xgboost DMatrix objects
            'tf'      -> (train, test) batched tf.data.Dataset objects
            The long names 'pytorch', 'xgboost' and 'tensorflow' are accepted
            as aliases.
        file_path: Path to the HDF5 file.
        batch_size: Batch size for the 'torch' and 'tf' formats.

    Raises:
        ValueError: If ``framework`` is not supported or ``dataset_name`` is
            not present in the file.
    """
    # Validate the framework before touching the file so a typo fails fast.
    aliases = {'pytorch': 'torch', 'xgboost': 'boost', 'tensorflow': 'tf'}
    framework = aliases.get(framework, framework)
    if framework not in ('sklearn', 'torch', 'boost', 'tf'):
        raise ValueError("Unsupported framework. Choose from 'sklearn', 'torch', 'boost', or 'tf'.")

    with h5py.File(file_path, 'r') as h5file:
        try:
            group = h5file[dataset_name]
        except KeyError as err:
            raise ValueError(f"Dataset '{dataset_name}' not found in the HDF5 file.") from err

        # [:] copies the data into NumPy arrays so they outlive the open file.
        train_X = group['train_X'][:]
        test_X = group['test_X'][:]
        train_y = group['train_y'][:]
        test_y = group['test_y'][:]

    if framework == 'sklearn':
        return {'train_X': train_X, 'test_X': test_X, 'train_y': train_y, 'test_y': test_y}

    if framework == 'torch':
        # Features as float32, labels as int64 class indices.
        train_dataset = TensorDataset(torch.tensor(train_X, dtype=torch.float32),
                                      torch.tensor(train_y, dtype=torch.long))
        test_dataset = TensorDataset(torch.tensor(test_X, dtype=torch.float32),
                                     torch.tensor(test_y, dtype=torch.long))

        # Only the training loader is shuffled.
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
        return train_loader, test_loader

    if framework == 'boost':
        train_dmatrix = xgb.DMatrix(train_X, label=train_y)
        test_dmatrix = xgb.DMatrix(test_X, label=test_y)
        return train_dmatrix, test_dmatrix

    # framework == 'tf'
    train_dataset = tf.data.Dataset.from_tensor_slices((train_X, train_y))
    test_dataset = tf.data.Dataset.from_tensor_slices((test_X, test_y))

    # Only the training set is shuffled; both are batched.
    train_dataset = train_dataset.shuffle(buffer_size=1024).batch(batch_size)
    test_dataset = test_dataset.batch(batch_size)
    return train_dataset, test_dataset

In [None]:
# Load the breast_cancer dataset as PyTorch DataLoaders with batch size 16.
# With framework='torch', load_data returns a (train_loader, test_loader) tuple.
data = load_data('breast_cancer', framework='torch', batch_size=16)

In [None]:
data

(<torch.utils.data.dataloader.DataLoader at 0x7a89fd065f50>,
 <torch.utils.data.dataloader.DataLoader at 0x7a89fd6e1b50>)

### Understanding the Weight Matrices

The weights are organized as follows. Every internal node (for example the root) holds one linear layer whose weight matrix has shape (num_trees, num_features), i.e. one row of split weights per tree. Every leaf holds a weight tensor of shape (1, leaf_dims, num_trees); the leading singleton dimension lets it broadcast over the batch.

### Training
* Must always follow batch normalization.
* Must use cross-entropy loss.
* Looking at the

In [None]:
# Build the model: 3 trees of depth 2 over 9 input features, with random
# feature masking enabled at every internal node and the trees summed together.
# NOTE(review): leaf_dims=5 here, while later cells use leaf_dims=2 for the same dataset — confirm which is intended.
model = SoftTreeEnsemble(num_trees=3,max_depth=2, leaf_dims=5, input_dim=9, combine_output=True, subset_selection=True)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

################################################################################
# GOAL: Train the model
################################################################################

def train_model(model, train_loader, test_loader, epochs=10, learning_rate=0.001, device='cpu'):
    """Train ``model`` with Adam and cross-entropy loss, printing losses per epoch.

    Args:
        model: PyTorch module mapping a batch of inputs to per-class scores.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        test_loader: Iterable of ``(inputs, labels)`` batches, scored after
            every epoch with ``evaluate``.
        epochs: Number of passes over ``train_loader``.
        learning_rate: Learning rate for Adam.
        device: Device the model and every batch are moved to.

    Returns:
        None. One line per epoch is printed with the mean training loss per
        batch and the test loss.
    """
    # Move the model first so the optimizer is built from the parameters that
    # live on the target device.
    model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    # NOTE(review): nn.CrossEntropyLoss expects raw, unnormalised scores. If the
    # model already applies a softmax, normalisation is effectively applied twice.
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        model.train()  # evaluate() below switches to eval mode, so reset every epoch
        running_loss = 0.0

        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        # Mean of the per-batch losses (not weighted by batch size).
        avg_loss = running_loss / len(train_loader)

        test_loss = evaluate(model, test_loader, criterion, device)

        print(f"Epoch [{epoch+1}/{epochs}], Training Loss: {avg_loss:.4f} | Test Loss: {test_loss:.4f}")



def evaluate(model, test_loader, criterion, device):
    """Return the mean per-batch loss of ``model`` over ``test_loader``.

    The model is switched to evaluation mode and left in that mode. Gradients
    are disabled while the batches are scored.

    Args:
        model: PyTorch module to score.
        test_loader: Iterable of ``(inputs, labels)`` batches; must support ``len``.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        device: Device every batch is moved to before the forward pass.
    """
    model.eval()

    with torch.no_grad():
        batch_losses = [
            criterion(model(features.to(device)), targets.to(device)).item()
            for features, targets in test_loader
        ]

    # Average over the number of batches, not the number of samples.
    return sum(batch_losses, 0.0) / len(test_loader)









In [None]:
# Train the model built above for 5 epochs on CPU;
# data[0] is the training loader and data[1] the test loader.
train_model(model, data[0], data[1], epochs=5, learning_rate=0.001, device='cpu')


Epoch [1/5], Training Loss: 1.0626 | Test Lost: 1.1231
Epoch [2/5], Training Loss: 1.0222 | Test Lost: 1.0996
Epoch [3/5], Training Loss: 0.9952 | Test Lost: 1.0878
Epoch [4/5], Training Loss: 1.0068 | Test Lost: 1.0847
Epoch [5/5], Training Loss: 1.0093 | Test Lost: 1.0612


In [None]:
import numpy as np
# Search space for the random hyperparameter search: each key maps to the
# candidate values one is drawn from.
hyperparameters = {
            "learning_rate": np.logspace(-1, -5, 5),  # 1e-1, 1e-2, 1e-3, 1e-4, 1e-5
            "batch_size": [16, 32],
            "num_epochs": range(5,20),  # 5 through 19
            "tree_depth": range(2,15)  # 2 through 14
            }

In [None]:
import numpy as np
import random
from sklearn.model_selection import train_test_split

# Random grid search function
def random_grid_search(model, hyperparameters, train_loader, test_loader, epochs=10, device='cpu', n_iter=10):
    """Run a random hyperparameter search on the breast_cancer dataset.

    Each trial draws one value per hyperparameter, reloads the data with the
    sampled batch size, builds a fresh single-tree ``SoftTreeEnsemble`` of the
    sampled depth, and trains it with ``train_model``.

    Args:
        model: Unused; a new model is built for every trial. Kept so existing
            calls keep working.
        hyperparameters: Dict mapping "learning_rate", "batch_size",
            "num_epochs" and "tree_depth" to sequences of candidate values.
        train_loader: Unused; the loaders are rebuilt per trial so that the
            sampled batch size takes effect. Kept for backward compatibility.
        test_loader: Unused, see ``train_loader``.
        epochs: Unused; the sampled "num_epochs" is used instead.
        device: Device to train on ('cpu' or 'cuda').
        n_iter: Number of random hyperparameter combinations to try.

    Returns:
        None. Progress is printed.
    """
    hyperparameter_keys = list(hyperparameters.keys())

    for i in range(n_iter):
        # Draw one candidate per hyperparameter, independently and uniformly.
        sample = {key: random.choice(hyperparameters[key]) for key in hyperparameter_keys}

        print(f"Trial {i+1}/{n_iter}: Hyperparameters - {sample}")

        learning_rate = sample["learning_rate"]
        batch_size = sample["batch_size"]
        num_epochs = sample["num_epochs"]
        tree_depth = sample["tree_depth"]

        # Rebuild the loaders so this trial really trains with the sampled batch size.
        trial_train_loader, trial_test_loader = load_data(
            'breast_cancer', framework='torch', batch_size=batch_size
        )

        # Fresh model per trial so that no weights carry over between trials.
        trial_model = SoftTreeEnsemble(num_trees=1, max_depth=tree_depth,
                                       leaf_dims=2, input_dim=9, combine_output=True, subset_selection=True)

        print(f"Training model with learning rate: {learning_rate}, batch size: {batch_size}, "
              f"epochs: {num_epochs}, tree depth: {tree_depth}")

        train_model(trial_model, trial_train_loader, trial_test_loader,
                    epochs=num_epochs, learning_rate=learning_rate, device=device)

        print(f"Completed Trial {i+1}/{n_iter}.\n")



In [None]:
# Run 10 random-search trials on CPU.
# NOTE(review): inside random_grid_search the `model` argument is replaced by a freshly
# built SoftTreeEnsemble, and `epochs` is superseded by the sampled "num_epochs".
random_grid_search(model, hyperparameters, data[0], data[1], epochs=10, device='cpu', n_iter=10)


Trial 1/10: Hyperparameters - {'learning_rate': 9.999999999999999e-05, 'batch_size': 32, 'num_epochs': 10, 'tree_depth': 8}
Training model with learning rate: 9.999999999999999e-05, batch size: 32, epochs: 10, tree depth: 8
Epoch [1/10], Training Loss: 0.7311 | Test Lost: 0.6572
Epoch [2/10], Training Loss: 0.6478 | Test Lost: 0.6452
Epoch [3/10], Training Loss: 0.6284 | Test Lost: 0.6452
Epoch [4/10], Training Loss: 0.5940 | Test Lost: 0.6502
Epoch [5/10], Training Loss: 0.5816 | Test Lost: 0.6527
Epoch [6/10], Training Loss: 0.5853 | Test Lost: 0.6546
Epoch [7/10], Training Loss: 0.5861 | Test Lost: 0.6508
Epoch [8/10], Training Loss: 0.5740 | Test Lost: 0.6442
Epoch [9/10], Training Loss: 0.5720 | Test Lost: 0.6339
Epoch [10/10], Training Loss: 0.5507 | Test Lost: 0.6324
Completed Trial 1/10.

Trial 2/10: Hyperparameters - {'learning_rate': 0.001, 'batch_size': 32, 'num_epochs': 12, 'tree_depth': 5}
Training model with learning rate: 0.001, batch size: 32, epochs: 12, tree depth: 5


In [None]:
# Second run of the random search with the same arguments as the previous cell;
# the sampled hyperparameters differ because no random seed is fixed.
random_grid_search(model, hyperparameters, data[0], data[1], epochs=10, device='cpu', n_iter=10)


Trial 1/10: Hyperparameters - {'learning_rate': 0.001, 'batch_size': 16, 'num_epochs': 6, 'tree_depth': 14}
Training model with learning rate: 0.001, batch size: 16, epochs: 6, tree depth: 14
Epoch [1/6], Training Loss: 2.7794 | Test Lost: 2.6957
Epoch [2/6], Training Loss: 1.0208 | Test Lost: 0.7694
Epoch [3/6], Training Loss: 0.5884 | Test Lost: 1.2189
Epoch [4/6], Training Loss: 0.6640 | Test Lost: 1.0855
Epoch [5/6], Training Loss: 0.6841 | Test Lost: 1.7482
Epoch [6/6], Training Loss: 1.5776 | Test Lost: 2.2663
Completed Trial 1/10.

Trial 2/10: Hyperparameters - {'learning_rate': 9.999999999999999e-06, 'batch_size': 16, 'num_epochs': 12, 'tree_depth': 7}
Training model with learning rate: 9.999999999999999e-06, batch size: 16, epochs: 12, tree depth: 7
Epoch [1/12], Training Loss: 0.7998 | Test Lost: 0.6923
Epoch [2/12], Training Loss: 0.7854 | Test Lost: 0.6916
Epoch [3/12], Training Loss: 0.7612 | Test Lost: 0.6924
Epoch [4/12], Training Loss: 0.7618 | Test Lost: 0.6958
Epoch [

In [None]:
# Load the same dataset as plain NumPy arrays; with framework='sklearn' the result
# is a dict with keys 'train_X', 'test_X', 'train_y' and 'test_y'.
t = load_data('breast_cancer', framework='sklearn')

In [None]:
t['train_y']

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0])

In [None]:
{'learning_rate': 9.999999999999999e-05, 'batch_size': 16, 'num_epochs': 15, 'tree_depth': 7}

In [None]:
# Single soft tree of depth 7 with two leaf outputs over 9 input features,
# this time without random feature masking.
model = SoftTreeEnsemble(num_trees=1,max_depth=7,
                                 leaf_dims=2, input_dim=9, combine_output=True, subset_selection=False)

In [None]:
def testing(model, hyperparameters, train_loader, test_loader, device='cpu'):
    """Train one model on breast_cancer with a single, fixed hyperparameter set.

    Args:
        model: Unused; a new single-tree ``SoftTreeEnsemble`` is built from
            ``hyperparameters``. Kept so the signature is unchanged.
        hyperparameters: Dict with one value each for "learning_rate",
            "batch_size", "num_epochs" and "tree_depth".
        train_loader: Unused; the loaders are rebuilt so that the requested
            batch size takes effect. Kept so the signature is unchanged.
        test_loader: Unused, see ``train_loader``.
        device: Device to train on ('cpu' or 'cuda').

    Returns:
        None. Progress is printed.
    """
    learning_rate = hyperparameters["learning_rate"]
    batch_size = hyperparameters["batch_size"]
    num_epochs = hyperparameters["num_epochs"]
    tree_depth = hyperparameters["tree_depth"]

    # Build the loaders with the requested batch size.
    run_train_loader, run_test_loader = load_data(
        'breast_cancer', framework='torch', batch_size=batch_size
    )

    run_model = SoftTreeEnsemble(num_trees=1, max_depth=tree_depth,
                                 leaf_dims=2, input_dim=9, combine_output=True, subset_selection=False)

    print(f"Training model with learning rate: {learning_rate}, batch size: {batch_size}, "
          f"epochs: {num_epochs}, tree depth: {tree_depth}")

    train_model(run_model, run_train_loader, run_test_loader,
                epochs=num_epochs, learning_rate=learning_rate, device=device)

    print("Completed training run.\n")