# Second Implementation of Decision Tree.
This Notebook experiments with my second implementation of a decision tree that builds on the first draft but tries to improve performance by dealing with categorical features and missing values inside the decision tree instead of as a separate preprocessing step. 

In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score, GridSearchCV
import time 
from sklearn.tree import DecisionTreeClassifier
from sklearn.base import BaseEstimator, ClassifierMixin

from CART_Decision_Tree.dataset import preprocess 
from collections import Counter
from typing import Optional, List
import scipy.stats as stats
from itertools import combinations



ModuleNotFoundError: No module named 'nbimporter'

## First improvement: Dealing with missing values on a node-specific imputation.  
I Implemented within my decision tree a functionality to handle missing feature values based on the current node. I inferred missing values based on mode of the feature values of data points sorted
to the current node (I chose mode to deal with categorical features).

In this first implementation, missing values are inferred at each new node.

NOTE: After reading more there are way more robust ways, that are very specific to the data at hand, where in each node you can have a specific imputation method, in one node you can have mean, in another median, creating a separate branch for missing values and so forth, this also can be taxing in training time and complexity. These imputation strategies should be stored and used when training and testing. 

In [2]:
class DecisionTreeMissing:
    def __init__(self, max_depth: int = 3, feature_labels: Optional[List[str]]=None, max_features: Optional[int]=None,
                 cat_cols=None):
        if cat_cols is None:
            cat_cols = []
        self.cat_cols = cat_cols
        self.max_depth = max_depth
        self.max_features = max_features
        self.feature_labels = feature_labels
        self.left: Optional = None
        self.right: Optional = None
        self.split_idx: Optional = None
        self.thresh: Optional =  None
        self.prediction: Optional[int] = None

    @staticmethod
    def entropy(y: np.ndarray[int]) -> float:
        length = len(y)
        if length == 0:
            return 0

        counts = np.bincount(y)
        probabilities = counts[counts > 0] / length
        return -np.sum(probabilities * np.log2(probabilities))

    @staticmethod
    def entropy_o1(counter: Counter[int], labels: np.ndarray, total_count: int) -> float:
        entropy = sum(
            count * np.log2(count / total_count) for count in (counter[label] for label in labels) if count > 0)
        return entropy

    
    @staticmethod
    def fill_infer(X):
        indices = np.where(X == -1)
        X_with_nan = np.where(X == -1, np.nan, X)
        mode = stats.mode(X_with_nan, nan_policy='omit')
        X_with_nan[indices] = mode[0]
        return X_with_nan
    
    @staticmethod
    def split(X: np.ndarray, y: np.ndarray[int], idx: int, thresh: float) -> tuple:
        left_mask = X[:, idx] < thresh
        right_mask = ~left_mask
        return X[left_mask], y[left_mask], X[right_mask], y[right_mask]
    
    def fit(self, X: np.ndarray, y: np.ndarray[int]) -> 'DecisionTree':
        nb_samples, nb_features = X.shape
        labels = np.unique(y)

        if self.max_depth == 0 or self.entropy(y) == 0:
            self.prediction = Counter(y).most_common(1)[0][0]
            return self

        best_split = {'gain': 0}
        features_indices = np.random.choice(nb_features, size=min(self.max_features, nb_features),
                                            replace=False) if self.max_features else range(nb_features)

        for idx in features_indices:
            X_j = X[:, idx]
            X_j = self.fill_infer(X_j)
            sorted_indices = np.argsort(X_j)
            X_j_sorted, y_sorted = X_j[sorted_indices], y[sorted_indices]
            unique_values, indices = np.unique(X_j, return_index=True)
            y_unique = y[indices]
            if len(unique_values) < 2:
                continue

            y_entropy = self.entropy(y_sorted)
            total_sum = len(y_sorted)

            left_classes, right_classes = Counter(), Counter(y_sorted)
            counts_left, counts_right = 0, total_sum

            for i in range(len(unique_values) - 1):
                threshold = (unique_values[i] + unique_values[i + 1]) / 2
                if i == 0:
                    left_classes = Counter(y_sorted[X_j_sorted < threshold])
                    right_classes = Counter(y_sorted[X_j_sorted >= threshold])
                    counts_left, counts_right = sum(left_classes.values()), sum(right_classes.values())
                else:
                    left_classes[y_unique[i]] += 1
                    counts_left += 1
                    right_classes[y_unique[i]] = right_classes[y_unique[i]] - 1
                    counts_right -= 1

                weighted_entropy = (-1 / total_sum) * (
                            self.entropy_o1(left_classes, labels, counts_left) + self.entropy_o1(right_classes, labels,
                                                                                                 counts_right))
                information_gain = y_entropy - weighted_entropy

                if information_gain > best_split['gain']:
                    best_split = {'gain': information_gain, 'threshold': threshold, 'feature': idx}

        if best_split['gain'] == 0:
            self.prediction = Counter(y).most_common(1)[0][0]
        else:
            self.split_idx = best_split['feature']
            self.thresh = best_split['threshold']
            X_left, y_left, X_right, y_right = self.split(X, y, self.split_idx, self.thresh)

            self.left = DecisionTreeMissing(self.max_depth - 1, self.feature_labels, self.max_features).fit(X_left, y_left)
            self.right = DecisionTreeMissing(self.max_depth - 1, self.feature_labels, self.max_features).fit(X_right, y_right)

        return self
       
    def predict(self, X: np.ndarray) -> np.ndarray:
        
        if not hasattr(self, '_imputed'):
            for feature in range(X.shape[1]):
                X[:, feature] = self.fill_infer(X[:, feature])
            self._imputed = True
        
        if self.prediction is not None:
            return np.full(X.shape[0], self.prediction)

        predictions = np.empty(X.shape[0], dtype=int)
        feature_values = X[:, self.split_idx]
        left_mask = feature_values < self.thresh
        right_mask = ~left_mask

        predictions[left_mask] = self.left.predict(X[left_mask])
        predictions[right_mask] = self.right.predict(X[right_mask])

        return predictions

### Data preprocessing:

In [None]:
path_train = '../data/raw/titanic_training.csv'
data = np.genfromtxt(path_train, delimiter=',', dtype=None)
y = data[1:, 0]  # label = survived

labeled_idx = np.where(y != b'')[0]
X, onehot_features = preprocess(data[1:, 1:], onehot_cols=[0,1, 5, 7, 8], fill_mode=False)

X = X[labeled_idx, :]

features = list(data[0, 1:]) + onehot_features
df_train = pd.DataFrame(X, columns=features)

df_train.to_csv('../data/interim/titanic_training_with_missing.csv', index=False)

### Validation scores:

In [16]:
X = np.genfromtxt('../data/processed/titanic_training_cleaned.csv', delimiter=",", dtype=float)[1:, :]
X_with_missing = np.genfromtxt('../data/interim/titanic_training_with_missing.csv', delimiter=",", dtype=float)[1:, ]
y = np.genfromtxt('../data/processed/titanic_training_labels.csv', delimiter=",", dtype=int)[1:]


print("sklearn's decision tree")
clf = DecisionTreeClassifier(random_state=0, max_depth=3, min_samples_leaf= 7, criterion='entropy')
clf.fit(X, y)
start_time = time.time()  # Record the start time
validation_scores = cross_val_score(clf, X, y)
print("Cross validation", validation_scores)
print('mean:' ,np.mean(validation_scores))
print('std:', np.std(validation_scores))
end_time = time.time()  # Record the end time
execution_time = end_time - start_time
print(f"Execution time sklearn: {execution_time} seconds")


print("\n\nMy decision tree")
clf = DecisionTreeMissing(max_depth=3)
start_time = time.time()  # Record the start time
clf.fit(X_with_missing, y)
validation_scores = cross_val_score(clf, X_with_missing, y)
print("Cross validation", validation_scores)
print('mean:' ,np.mean(validation_scores))
print('std:', np.std(validation_scores))
end_time = time.time()  # Record the end time
execution_time = end_time - start_time
print(f"Execution time my implementation: {execution_time} seconds")

sklearn's decision tree
Cross validation [0.82673267 0.83663366 0.79207921 0.78712871 0.7761194 ]
mean: 0.8037387320821633
std: 0.023602806684993195
Execution time sklearn: 0.0024559497833251953 seconds


My decision tree
Cross validation [0.83663366 0.82673267 0.78217822 0.8019802  0.7960199 ]
mean: 0.8087089305945521
std: 0.02007392361949834
Execution time my implementation: 0.7439107894897461 seconds


Results are the same as scores I got with an implementation that replaced all missing values with mode as part of preprocessing. Maybe dealing with categorical features and quantitative features differently can lead to better results. 

## Dealing with missing values with node-specific imputation with a distinction between categorical and quantitative features.  
We inferred missing values based on mode of the feature values for categorical features and based on the mean for quantitative.

In [12]:
class DecisionTreeMissingV2(DecisionTreeMissing):
   def __init__(self, max_depth: int = 3, feature_labels: Optional[List[str]]=None, max_features: Optional[int]=None, cat_cols = None):
        super().__init__(max_depth, feature_labels, max_features)
        if cat_cols is None:
            cat_cols = []
        self.cat_cols = cat_cols

   def fill_infer_cat(self, X, idx):
        indices = np.where(X == -1)
        X_with_nan = np.where(X == -1, np.nan, X)
        non_nan_values = X_with_nan[~np.isnan(X_with_nan)]              
        
        if len(non_nan_values) == 0:
            return X, True
    
        if idx in self.cat_cols:
          fill_with = stats.mode(non_nan_values)[0]
        else:
          fill_with = np.mean(non_nan_values)
            
        X_with_nan[indices] = fill_with
        return X_with_nan, False 
    
   def fit(self, X: np.ndarray, y: np.ndarray[int]) -> 'DecisionTreeMissingV2':
        nb_samples, nb_features = X.shape
        labels = np.unique(y)
        if self.max_depth == 0 or self.entropy(y) == 0:
            self.prediction = Counter(y).most_common(1)[0][0]
            return self

        best_split = {'gain': 0}
        features_indices = np.random.choice(nb_features, size=min(self.max_features, nb_features),
                                            replace=False) if self.max_features else range(nb_features)

        for idx in features_indices:
            X_j = X[:, idx]
            X_j, stop = self.fill_infer_cat(X_j, idx)
            # When all values are missing I'm skipping the node
            if stop: 
               continue
            sorted_indices = np.argsort(X_j)
            X_j_sorted, y_sorted = X_j[sorted_indices], y[sorted_indices]
            unique_values, indices = np.unique(X_j, return_index=True)
            y_unique = y[indices]
            if len(unique_values) < 2:
                continue
                
            y_entropy = self.entropy(y_sorted)
            total_sum = len(y_sorted)

            left_classes, right_classes = Counter(), Counter(y_sorted)
            counts_left, counts_right = 0, total_sum

            for i in range(len(unique_values) - 1):
                threshold = (unique_values[i] + unique_values[i + 1]) / 2
                if i == 0:
                    left_classes = Counter(y_sorted[X_j_sorted < threshold])
                    right_classes = Counter(y_sorted[X_j_sorted >= threshold])
                    counts_left, counts_right = sum(left_classes.values()), sum(right_classes.values())
                else:
                    left_classes[y_unique[i]] += 1
                    counts_left += 1
                    right_classes[y_unique[i]] = right_classes[y_unique[i]] - 1
                    counts_right -= 1

                weighted_entropy = (-1 / total_sum) * (
                            self.entropy_o1(left_classes, labels, counts_left) + self.entropy_o1(right_classes, labels,
                                                                                                 counts_right))
                information_gain = y_entropy - weighted_entropy
                
                left_mask = X_j < threshold
                right_mask = ~left_mask
    
                if information_gain > best_split['gain']:
                    best_split = {'gain': information_gain, 'threshold': threshold, 'feature': idx,
                                      'left_mask': left_mask, 'right_mask': right_mask }
                
        if best_split['gain'] == 0:
            self.prediction = Counter(y).most_common(1)[0][0]
        else:
            
            self.split_idx = best_split['feature']
            self.thresh = best_split['threshold']
            left_mask = best_split['left_mask']
            right_mask = best_split['right_mask']
            X_left, y_left, X_right, y_right = X[left_mask], y[left_mask], X[right_mask], y[right_mask]
            if len(y_left) == 0 :
                print('Well',best_split['gain'])
                print('Well',best_split['threshold'])
            self.left = DecisionTreeMissingV2(self.max_depth - 1, self.feature_labels, self.max_features).fit(X_left, y_left)
            self.right = DecisionTreeMissingV2(self.max_depth - 1, self.feature_labels, self.max_features).fit(X_right, y_right)

        return self
   
   def predict(self, X: np.ndarray) -> np.ndarray:
        
        if not hasattr(self, '_imputed'):
            for feature in range(X.shape[1]):
                X[:, feature], _ = self.fill_infer_cat(X[:, feature], feature)
            self._imputed = True
        
        if self.prediction is not None:
            return np.full(X.shape[0], self.prediction)

        predictions = np.empty(X.shape[0], dtype=int)
        feature_values = X[:, self.split_idx]
        left_mask = feature_values < self.thresh
        right_mask = ~left_mask

        predictions[left_mask] = self.left.predict(X[left_mask])
        predictions[right_mask] = self.right.predict(X[right_mask])

        return predictions

I encountered the problem of a node having no more meaningful values, all missing, I dealt with this by skipping the feature. There are other approaches such as having a fallback values for each feature 


NOTE: Big mistake, I was not consistent between training and prediction, When predicting values I used data that is either globally imputed, which made my implementation without meaning or predicted with data that is not imputed but did not impute it inside the predict method.

### validation Scores:

In [13]:
X = np.genfromtxt('../data/processed/titanic_training_cleaned.csv', delimiter=",", dtype=float)[1:, :]
X_with_missing = np.genfromtxt('../data/interim/titanic_training_with_missing.csv', delimiter=",", dtype=float)[1:, ]
y = np.genfromtxt('../data/processed/titanic_training_labels.csv', delimiter=",", dtype=int)[1:]


print("sklearn's decision tree")
clf = DecisionTreeClassifier(random_state=0, max_depth=3, min_samples_leaf= 7, criterion='entropy')
clf.fit(X, y)
start_time = time.time()  # Record the start time
end_time = time.time()  # Record the end time
execution_time = end_time - start_time
print(f"Execution time sklearn: {execution_time} seconds")



validation_scores = cross_val_score(clf, X, y)
print("Cross validation", validation_scores)
print('mean:' ,np.mean(validation_scores))
print('std:', np.std(validation_scores))


print("\n\nMy decision tree")
clf = DecisionTreeMissingV2(max_depth=3, cat_cols= [9, 10 ,11 , 12, 13, 14, 15, 16])
start_time = time.time()  # Record the start time
clf.fit(X_with_missing, y) 
end_time = time.time()  # Record the end time
execution_time = end_time - start_time
print(f"Execution time my implementation: {execution_time} seconds")

validation_scores = cross_val_score(clf, X_with_missing, y)
print("Cross validation", validation_scores)
print('mean:' ,np.mean(validation_scores))
print('std:', np.std(validation_scores))


sklearn's decision tree
Execution time sklearn: 0.0 seconds
Cross validation [0.82673267 0.83663366 0.79207921 0.78712871 0.7761194 ]
mean: 0.8037387320821633
std: 0.023602806684993195


My decision tree
Execution time my implementation: 0.08470678329467773 seconds
Cross validation [0.80693069 0.82673267 0.79207921 0.81188119 0.80597015]
mean: 0.8087187823259938
std: 0.011154629178836396


Although I thought the identical performance was due to not being distinctive between categ/quant features. It might be due to using a shallow tree. I will cross-validate to search for depth to see.

## Grid search for Hyperparameters

In [15]:
param_grid = {
    'max_depth': [3, 5, 7, 8],
}


print("Decision tree with mode")
model = DecisionTreeMissing()
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
grid_search.fit(X, y)
print("Best hyperparameter:",grid_search.best_params_)





print('\n\n Decision tree with mode/mean')
model = DecisionTreeMissingV2()
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
grid_search.fit(X_with_missing, y)
print("Best hyperparameter:",grid_search.best_params_)

Decision tree with mode
Best hyperparameter: {'max_depth': 3}


 Decision tree with mode/mean
Best hyperparameter: {'max_depth': 3}


**Conclusion:**  The model performs best with a shallow tree which means that a node-specific imputation does not improve performance. Node-specific imputation converges to a global imputation in shallow decision trees.  

## Second Improvement: Subset selection for categorical features.
I implemented functionality in decision trees to determine split rules based on the subsets of categorical variables that maximize information gain

In [35]:
class DecisionTreeCategorical(DecisionTreeMissingV2):
    def __init__(self, max_depth: int = 3, feature_labels: Optional[List[str]]=None, max_features: Optional[int]=None, cat_cols = None):
        super().__init__(max_depth, feature_labels, max_features, cat_cols)

    @staticmethod
    def get_subsets_combinations(arr):
        all_subsets = []
        for r in range(1, len(arr)):
            subsets = combinations(arr, r)
            all_subsets.extend(subsets)
        return [np.array(subset) for subset in all_subsets]    


    def information_gain(self, X, y, subset):
        total_entropy = self.entropy(y)

        left_mask = np.isin(X, subset[0])
        right_mask = ~left_mask

        y_right = y[right_mask]
        y_left = y[left_mask]

        if not len(y_right) or not len(y_left):
            return 0, left_mask, right_mask

        weighted_entropy = (len(y_right) * self.entropy(y_right) + len(y_left) * self.entropy(y_left)) / len(y)
        return total_entropy - weighted_entropy, left_mask, right_mask


    def fit(self, X: np.ndarray, y: np.ndarray[int]) -> 'DecisionTreeCategorical':
        nb_samples, nb_features = X.shape
        labels = np.unique(y)

        if self.max_depth == 0 or self.entropy(y) == 0:
            self.prediction = Counter(y).most_common(1)[0][0]
            return self

        best_split = {'gain': 0}
        features_indices = np.random.choice(nb_features, size=min(self.max_features, nb_features),
                                            replace=False) if self.max_features else range(nb_features)

        for idx in features_indices:
            X_j = X[:, idx]
            X_j, stop = self.fill_infer_cat(X_j, idx)
            # When all values are missing I'm skipping the node
            if stop:
                continue
            sorted_indices = np.argsort(X_j)
            X_j_sorted, y_sorted = X_j[sorted_indices], y[sorted_indices]
            unique_values, indices = np.unique(X_j, return_index=True)
            y_unique = y[indices]

            if len(unique_values) < 2:
                continue

            if idx in self.cat_cols:
                subsets_combinations = self.get_subsets_combinations(unique_values)
                n = len(subsets_combinations)
                for index, combination in enumerate(subsets_combinations):
                    if index == n / 2:
                        break
                    split = (combination, subsets_combinations[n - index - 1])
                    information_gain_2, left_mask, right_mask = self.information_gain(X_j, y, split)   
                    if information_gain_2 > best_split['gain']:
                        best_split = {'gain': information_gain_2, 'threshold':split , 'feature': idx,
                                      'left_mask': left_mask, 'right_mask': right_mask }
            else:
                y_entropy = self.entropy(y_sorted)
                total_sum = len(y_sorted)

                left_classes, right_classes = Counter(), Counter(y_sorted)
                counts_left, counts_right = 0, total_sum

                for i in range(len(unique_values) - 1):
                    threshold = (unique_values[i] + unique_values[i + 1]) / 2
                    if i == 0:
                        left_classes = Counter(y_sorted[X_j_sorted < threshold])
                        right_classes = Counter(y_sorted[X_j_sorted >= threshold])
                        counts_left, counts_right = sum(left_classes.values()), sum(right_classes.values())
                    else:
                        left_classes[y_unique[i]] += 1
                        counts_left += 1
                        right_classes[y_unique[i]] = right_classes[y_unique[i]] - 1
                        counts_right -= 1

                    weighted_entropy = (-1 / total_sum) * (
                                self.entropy_o1(left_classes, labels, counts_left) + self.entropy_o1(right_classes, labels, counts_right))
                    information_gain = y_entropy - weighted_entropy
                    left_mask = X_j < threshold
                    right_mask = ~left_mask

                    if information_gain > best_split['gain']:
                        best_split = {'gain': information_gain, 'threshold': threshold, 'feature': idx,
                                      'left_mask': left_mask, 'right_mask': right_mask }

        if best_split['gain'] == 0:
            self.prediction = Counter(y).most_common(1)[0][0]
        else:
            self.split_idx = best_split['feature']
            self.thresh = best_split['threshold']
            left_mask = best_split['left_mask']
            right_mask = best_split['right_mask']
            X_left, y_left, X_right, y_right = X[left_mask], y[left_mask], X[right_mask], y[right_mask]
            self.left = DecisionTreeCategorical(self.max_depth - 1, self.feature_labels, self.max_features).fit(X_left, y_left)
            self.right = DecisionTreeCategorical(self.max_depth - 1, self.feature_labels, self.max_features).fit(X_right, y_right)
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        if not hasattr(self, '_imputed'):
            for feature in range(X.shape[1]):
                X[:, feature], _ = self.fill_infer_cat(X[:, feature], feature)
            self._imputed = True
        if self.prediction is not None:
            return np.full(X.shape[0], self.prediction)

        if isinstance(self.thresh, tuple):  # Categorical split
            left_mask = np.isin(X[:, self.split_idx], self.thresh[0])
        else:
            left_mask = X[:, self.split_idx] < self.thresh

        predictions = np.empty(X.shape[0], dtype=int)
        predictions[left_mask] = self.left.predict(X[left_mask])
        predictions[~left_mask] = self.right.predict(X[~left_mask])
        return predictions


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Load the data
data = pd.read_csv('../data/raw/titanic_testing_data.csv')

# Select features
features = ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
X = data[features].copy()

# Replace empty strings with NaN
X = X.replace('', np.nan)

# Encode categorical variables with LabelEncoder
le_sex = LabelEncoder()
le_embarked = LabelEncoder()

# Fit and transform categorical features, handling NaN separately by filling with 'missing'
X['sex'] = le_sex.fit_transform(X['sex'].fillna('missing'))
X['embarked'] = le_embarked.fit_transform(X['embarked'].fillna('missing'))

# Map 'missing' to -1 for categorical features
if 'missing' in le_sex.classes_:
    X['sex'] = X['sex'].replace(le_sex.transform(['missing'])[0], -1)
if 'missing' in le_embarked.classes_:
    X['embarked'] = X['embarked'].replace(le_embarked.transform(['missing'])[0], -1)

# Fill all remaining missing values in the dataset with -1
X = X.fillna(-1)

# Convert all features to float (for compatibility)
X = X.astype(float)

# Save the processed DataFrame to CSV
X.to_csv('../data/interim/titanic_testing_V3.csv', index=False)


In [31]:
X = np.genfromtxt('../data/interim/titanic_training_V3.csv', delimiter=",", dtype=float)[1:, ]
X_validation = np.genfromtxt('../data/processed/titanic_training_cleaned.csv', delimiter=",", dtype=float)[1:, ]
y = np.genfromtxt('../data/processed/titanic_training_labels.csv', delimiter=",", dtype=int)[1:]

print("\nsklearn's decision tree")
clf = DecisionTreeClassifier(random_state=0, max_depth=3, min_samples_leaf=7, criterion='entropy')
start_time = time.time()
clf.fit(X, y)
end_time = time.time()
execution_time = end_time - start_time
validation_scores = cross_val_score(clf, X, y)
print("Cross validation", validation_scores)
print('mean:' ,np.mean(validation_scores))
print('std:', np.std(validation_scores))
print(f"Execution time sklearn: {execution_time} seconds")

print("\nMy decision tree")
clf = DecisionTreeCategorical(max_depth=8, cat_cols=[0, 1, 6])
start_time = time.time()
clf.fit(X, y)
end_time = time.time()
execution_time = end_time - start_time

validation_scores = cross_val_score(clf, X, y)
print("Cross validation", validation_scores)
print('mean:' ,np.mean(validation_scores))
print('std:', np.std(validation_scores))
print(f"Execution time my implementation: {execution_time} seconds")


sklearn's decision tree
Cross validation [0.81188119 0.78217822 0.77722772 0.75742574 0.77114428]
mean: 0.7799714299788187
std: 0.01798253112472074
Execution time sklearn: 0.020105838775634766 seconds

My decision tree
0
1
6
0
1
6
0
1
6
0
1
6
0
1
6
0
1
6
Cross validation [0.81683168 0.82178218 0.8019802  0.80693069 0.7761194 ]
mean: 0.8047288310920646
std: 0.01592606814699095
Execution time my implementation: 1.0001866817474365 seconds


In [None]:
param_grid = {
    'max_depth': [3, 5,6,  7, 8, 9, 10],
}

print('\n\n Decision tree categorical')
model = DecisionTreeCategorical()
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
grid_search.fit(X, y)
print("Best hyperparameter:",grid_search.best_params_)

## Random Forest


In [33]:
class BaggedTrees(BaseEstimator, ClassifierMixin):
    def __init__(self, Tree, params: Optional[dict] = None, n: Optional[int] = 200):
        if params is None:
            params = {}
        self.params: Optional[dict] = params
        self.n: Optional[int] = n
        self.Tree = Tree
        self.decision_trees = [
            Tree(**self.params) for _ in range(self.n)
        ]

    def fit(self, X: np.ndarray, y: np.ndarray) -> 'BaggedTrees':
        nb_samples = X.shape[0]
        for decisionTree in self.decision_trees:
            indices = np.random.choice(nb_samples, size=nb_samples, replace=True)
            decisionTree.fit(X[indices], y[indices])
        return self

    def predict(self, X: np.ndarray) -> np.ndarray:
        decisions = np.array([decisionTree.predict(X) for decisionTree in self.decision_trees])
        return np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=decisions)


class RandomForest(BaggedTrees):
    def __init__(self, Tree, params: Optional[dict] = None, n: Optional[int] = 200, m: Optional[int] = 1):
        if params is None:
            params = {}
        self.m: Optional[int] = m
        self.n: Optional[int] = n
        self.Tree = Tree
        super().__init__(Tree, params, n)

    def fit(self, X: np.ndarray, y: np.ndarray):
        for decisionTree in self.decision_trees:
            if hasattr(decisionTree, 'max_features'):
                setattr(decisionTree, 'max_features', self.m)
        return super().fit(X, y)


In [38]:
X = np.genfromtxt('../data/interim/titanic_training_V3.csv', delimiter=",", dtype=float)[1:, ]
y = np.genfromtxt('../data/processed/titanic_training_labels.csv', delimiter=",", dtype=int)[1:]
test = np.genfromtxt('../data/interim/titanic_testing_V3.csv', delimiter=",", dtype=float)[1:, ]

params = {
    'max_depth': 8, 
    'cat_cols':[0, 1, 6]
}
print("\n Categorical Decision Tree")
clf = RandomForest(DecisionTreeCategorical, params, 100, 3)
start_time = time.time()
clf.fit(X, y)
end_time = time.time()
execution_time = end_time - start_time

validation_scores = cross_val_score(clf, X, y)
print("Cross validation", validation_scores)
print('mean:' ,np.mean(validation_scores))
print('std:', np.std(validation_scores))
print(f"Execution time my implementation: {execution_time} seconds")


X = np.genfromtxt('../data/processed/titanic_training_cleaned.csv', delimiter=",", dtype=float)[1:, :]

print("\n Simple Decision Tree")
params = {
    'max_depth': 12
}
clf = RandomForest(DecisionTreeMissing, params, 300, 3)
start_time = time.time()
clf.fit(X, y)
end_time = time.time()
execution_time = end_time - start_time
validation_scores = cross_val_score(clf, X, y)


print("Cross validation", validation_scores)
print('mean:' ,np.mean(validation_scores))
print('std:', np.std(validation_scores))
print(f"Execution time my implementation: {execution_time} seconds")


 Categorical Decision Tree


### Random forest: Grid Search for hyperparameters 

In [None]:
X = np.genfromtxt('../data/interim/titanic_training_V3.csv', delimiter=",", dtype=float)[1:, ]
y = np.genfromtxt('../data/processed/titanic_training_labels.csv', delimiter=",", dtype=int)[1:]

params = {
    'max_depth': 10, 
    'cat_cols':[0, 1, 6]
}

param_grid = {
    'n': [50, 100, 200],
    'm': [3 , 7]
}

print('\n\n Decision tree categorical')
model = RandomForest(DecisionTreeCategorical, params)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
grid_search.fit(X, y)
print("Best hyperparameter:",grid_search.best_params_)