In [1]:
import numpy as np
import pandas as pd
from copy import deepcopy
from statistics import mode
from sklearn.metrics import confusion_matrix

In [2]:
test = 0

In [3]:
if not test:
    # df = pd.read_pickle(r'/home/jason/Desktop/school/dc/distrf/dataset/cancer.pkl')
    df = pd.read_pickle(r"/Users/duong-jason/Desktop/dc/distrf/dataset/cancer.pkl")
    X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [4]:
if not test:
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

In [5]:
if test:
    df = pd.read_csv(r"/Users/duong-jason/Desktop/dc/distrf/dataset/golf.csv")
    X_train, y_train = df.iloc[:, :-1], df.iloc[:, -1]

# Decision Tree Implementation
### Entropy *(Bits)*
$\mathcal{H}(t, \mathcal{D})=-\sum_{l\in levels(t)\\}^{}{P(t=l)\cdot\log_2(P(t=l))}\ \text{bits}$

### Rem
$rem(d,\mathcal{D})=\sum_{l\in levels(t)}{}\frac{|\mathcal{D}_{d=l}|}{\mathcal{D}}\cdot \mathcal{H}(t, \mathcal{D}_{d=l})$

### Information Gain
$IG(d, \mathcal{D})=\mathcal{H}(t, \mathcal{D})-rem(d, \mathcal{D})$

In [6]:
class Node:
    def __init__(
        self,
        *,
        feature=None,
        data=None,
        branch=None,
        parent=None,
        leaf=False,
        children=[]
    ):
        self.feature = feature
        self.data = data
        self.branch = branch
        self.parent = parent
        self.leaf = leaf
        self.children = children

    @property
    def isLeaf(self):
        return self.leaf

    @property
    def X(self):
        return self.data.iloc[:, :-1]

    @property
    def y(self):
        return self.data.iloc[:, -1]

In [7]:
class DecisionTree:
    """A Rudimentary Decision Tree Classifier"""
    def __init__(self, *, criterion=None):
        """Pre-pruning criterion = {max_depth, partition_threshold, low_gain}"""
        self.root = None
        self.levels = None
        self.criterion = criterion

    def __repr__(self, node=None, depth=0):
        """Displays the decision tree"""
        if not node:
            node = self.root

        print(depth * '\t', node.feature, f"(Branch={node.branch})")
        for child in node.children:
            self.__repr__(child, depth+1)

        return ""
    
    def partition(self, X, y, d, t):
        """Returns a subset of the training data with feature (d) of level (t)"""
        D = pd.concat([X.loc[X[d]==t], y.loc[X[d]==t]], axis=1)
        D = D.drop([d], axis=1)
        return D.iloc[:, :-1], D.iloc[:, -1], t

    def entropy(self, X, y):
        """Measures the amount of uncertainty/impurity/heterogeneity in (X, y)"""
        proba = lambda t: len(X.loc[y==t]) / len(X)
        return -sum([proba(t) * np.log2(proba(t)) for t in y.unique()])

    def rem(self, X, y, d):
        """Measures the entropy after partitioning (X, y) on feature (d)"""
        weight = lambda t: len(X.loc[X[d]==t]) / len(X)
        return sum([weight(t) * self.entropy(X.loc[X[d]==t], y.loc[X[d]==t]) for t in X[d].unique()])

    def information_gain(self, X, y, d):
        """Measures the reduction in the overall entropy in (X, y) achieved by testing on feature (d)"""
        # if debug:
        #     print(f"{d} = {self.entropy(X, y):.3f} - {self.rem(X, y, d):.3f} = {self.entropy(X, y) - self.rem(X, y, d):.3f}") 
        return self.entropy(X, y) - self.rem(X, y, d)

    def build_tree(self, X, y, *, parent=None, branch=None, depth=0):
        """Performs the ID3 algorithm"""
        if len(y.unique()) == 1:  # all instances have the same target feature values
            # if debug:
            #     print("All instances have the same target feature value\n")
            return Node(feature=y.iat[0],
                        data=pd.concat([X, y], axis=1),
                        branch=branch,
                        parent=parent,
                        leaf=True)
        elif X.empty:  # dataset is empty, return a leaf node labeled with the majority class of the parent
            # if debug:
            #     print("Dataset is empty\n")
            return Node(feature=mode(parent.y),
                        branch=branch,
                        parent=parent,
                        leaf=True)
        elif all((X[d] == X[d].iloc[0]).all() for d in X.columns):  # if all feature values are identical
            # if debug:
            #     print("All instances have the same descriptive features\n")
                return Node(feature=mode(y),
                            data=pd.concat([X, y], axis=1),
                            branch=branch,
                            parent=parent,
                            leaf=True)
        elif self.criterion.get("max_depth"):
            if depth >= self.criterion["max_depth"]:
                # if debug:
                #     print("Stopping at Max Depth\n")
                return Node(feature=mode(y),
                            data=pd.concat([X, y], axis=1),
                            branch=branch,
                            parent=parent,
                            leaf=True)
        elif self.criterion.get("partition_threshold"):
            if len(X) < self.criterion["partition_threshold"]:
                # if debug:
                #     print(f"Stopping at {len(X)} instances\n")
                return Node(feature=mode(y),
                            data=pd.concat([X, y], axis=1),
                            branch=branch,
                            parent=parent,
                            leaf=True)

        # if debug:
        #     print("===Information Gain===")
        gain = np.argmax([self.information_gain(X, y, d) for d in X.columns])

        if self.criterion.get('low_gain'):
            if gain <= self.criterion["low_gain"]:
                # if debug:
                #     print(f"Stopping at Gain={gain}\n")
                return Node(feature=mode(y),
                            data=pd.concat([X, y], axis=1),
                            branch=branch,
                            parent=parent,
                            leaf=True)

        best_feature = X.columns[gain]
        best_node = deepcopy(Node(feature=best_feature,
                                  data=pd.concat([X, y], axis=1),
                                  branch=branch,
                                  parent=parent))

        # if debug:
        #     print()
        #     print("===Best Feature===")
        #     print(best_feature)
        #     print()

        X_levels = [self.partition(X, y, best_feature, level) for level in self.levels[best_feature]]

        for *d, level in X_levels:
            # if debug:
            #     print(f"===Partitioned Dataset ({t})===")
            #     print(pd.concat(d, axis=1).head())
            #     print()
            best_node.children.append(self.build_tree(*d, parent=best_node, branch=level, depth=depth+1))
        return best_node

    def fit(self, X, y):
        self.levels = {k: X[k].unique() for k in X.columns}
        self.root = self.build_tree(X, y)
        return self

    def predict(self, x):
        node = self.root
        while not node.isLeaf:
            for child in node.children:
                if child.branch == x.get(node.feature).values:
                    node = child
                    break
        return node.feature

    def score(self, X, y):
        y_hat = [self.predict(X.iloc[x].to_frame().T) for x in range(len(X))]
        # return confusion_matrix(y, y_hat, labels=[1, 0])
        return confusion_matrix(y, y_hat, labels=y.unique())  # for the golf dataset

In [9]:
%%time
dt = DecisionTree(criterion={'max_depth': 4}).fit(X_train, y_train)
dt.score(X_test, y_test)

CPU times: user 6.75 s, sys: 109 ms, total: 6.86 s
Wall time: 6.87 s


# Random Forest Implementation

In [12]:
class RandomForest:
    def __init__(self, n_estimators=5, n_sample=2, criterion=None):
        self.n_estimators = n_estimators
        self.n_sample = n_sample
        self.forest = [DecisionTree(criterion=criterion) for _ in range(n_estimators)]

    def sub_sample(self, X, n_sample=2):
        """Enforces feature randomness"""
        return np.random.choice(X.columns.to_numpy(), n_sample, replace=False)

    def bootstrap_sample(self, X, y, n_sample, key=True):
        feature_subset = self.sub_sample(X, int(np.log2(len(X))))
        d = pd.concat([X, y], axis=1)
        d = d.sample(n=n_sample, replace=key)
        return d.iloc[:, :-1][feature_subset], d.iloc[:, -1]

    def fit(self, X, y):
        for tree in self.forest:
            tree.fit(*self.bootstrap_sample(X, y, self.n_sample))
        return self

    def predict(self, x):
        assert all(isinstance(model, DecisionTree) for model in self.forest)
        return mode([dt.predict(x) for dt in self.forest])

    def score(self, X, y):
        y_hat = [self.predict(X.iloc[x].to_frame().T) for x in range(len(X))]
        # return confusion_matrix(y, y_hat, labels=[1, 0])
        return confusion_matrix(y, y_hat, labels=y.unique())

In [17]:
%%time

rf = RandomForest(n_estimators=10, n_sample=len(X_train), criterion={'partition_threshold': 5}).fit(X_train, y_train)
rf.score(X_test, y_test)

CPU times: user 19.7 s, sys: 1.85 s, total: 21.5 s
Wall time: 23.5 s


<__main__.RandomForest at 0x2c44373a0>