In [1]:
import numpy as np
import pandas as pd
from copy import deepcopy
from statistics import mode, mean

from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.model_selection import train_test_split

debug = 0

In [2]:
# NOTE(review): absolute, machine-specific path -- this cell only runs on the
# author's machine. Consider a configurable data directory.
# NOTE(review): pd.read_pickle executes arbitrary code from the file; only load
# pickles from a trusted source.
df = pd.read_pickle(r"/Users/duong-jason/Desktop/dc/project_2/dataset/cancer.pkl")

# Every column but the last is a descriptive feature; the last column is the target.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [3]:
# data = {
#     'Stream': ['false', 'true', 'true', 'false', 'false', 'true', 'true'],
#     'Slope': ['steep', 'moderate', 'steep', 'steep', 'flat', 'steep', 'steep'],
#     'Elevation': ['high', 'low', 'medium', 'medium', 'high', 'highest', 'high'],
#     'Vegetation': ['chapparal', 'riparian', 'riparian', 'chapparal', 'conifer', 'conifer', 'chapparal']
# }

# df = pd.DataFrame(data)
# X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [4]:
# Hold out 33% of the rows for evaluation. The seed is fixed so the split (and
# every metric computed from it) is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)

# Decision Tree Implementation

In [5]:
class Node:
    """
    A Decision Tree Node

    Parameters
    ----------
    feature - the value of a descriptive/target feature of a node
    data - the partitioned dataset resulting from the parent node on a feature value
    branch - the feature value from the parent node
    parent - the immediate adjacent node along the path from the root
    leaf - denotes a terminal node whose prediction is based on the path from the root to the node
    depth - the number of levels from the root to a node
    children - the nodes resulting from each unique feature value of the parent;
               when omitted, every node gets its own fresh empty list
    """
    def __init__(
        self,
        *,
        feature=None,
        data=None,
        branch=None,
        parent=None,
        leaf=False,
        depth=0,
        children=None
    ):
        self.feature = feature
        self.data = data
        self.branch = branch
        self.parent = parent
        self.leaf = leaf
        self.depth = depth
        # A literal `[]` default would be created once and shared by every node
        # built without an explicit list, so appending a child to one node would
        # silently add it to all of them.
        self.children = [] if children is None else children

    def __str__(self):
        # One tab per level so a pre-order dump reads as an indented tree.
        return self.depth * '\t' + f" {self.feature} (Branch={self.branch})"

    @property
    def is_leaf(self):
        """Returns whether a node is terminal"""
        return self.leaf

    @property
    def X(self):
        """Returns the partitioned feature matrix of a node (all columns of `data` but the last)"""
        return self.data.iloc[:, :-1]

    @property
    def y(self):
        """Returns the partitioned target vector of a node (the last column of `data`)"""
        return self.data.iloc[:, -1]

In [6]:
class DecisionTreeEstimator:
    """
    A Decision Tree Estimator

    Base class holding the tree-independent machinery (partitioning, traversal,
    display). `fit` calls `self.make_tree(X, y)`, which this class does not
    define; subclasses are expected to provide it.
    """
    def __init__(self, criterion=None):
        """
        Parameters
        ----------
        root: the starting node of the decision tree
        n_levels: contains a list of all unique feature values for each descriptive feature
        criterion (pre-pruning): {max_depth, partition_threshold, low_gain}
                                 (defaults to no pre-pruning)
        """
        self.root = None
        self.n_levels = None
        # Avoid a shared mutable default: each estimator gets its own dict
        # unless the caller supplies one.
        self.criterion = {} if criterion is None else criterion

    def __repr__(self, node=None):
        """Displays the decision tree (Pre-Order Traversal)"""
        if node is None:
            node = self.root
        if node is None:
            # Not fitted yet: there is no tree to walk.
            return f"{type(self).__name__}(unfitted)"
        return str(node) + ''.join('\n' + self.__repr__(child) for child in node.children)

    def partition(self, X, y, d, t):
        """
        Returns a subset of the training data with feature (d) of level (t)

        The result is a tuple (X_subset, y_subset, t) where column `d` has been
        dropped from X_subset.
        """
        mask = X[d] == t
        D = pd.concat([X.loc[mask], y.loc[mask]], axis=1)
        D = D.drop([d], axis=1)
        return D.iloc[:, :-1], D.iloc[:, -1], t

    def fit(self, X, y):
        """Records every level of every descriptive feature, builds the tree and returns self"""
        self.n_levels = {d: X[d].unique() for d in X.columns}
        self.root = self.make_tree(X, y)
        return self

    def predict(self, x):
        """
        Walks the tree for a single query and returns the leaf Node reached.

        Parameters
        ----------
        x - a one-row DataFrame holding the query's descriptive feature values

        Raises
        ------
        ValueError - if the query's value for a tested feature matches no branch
        """
        node = self.root
        while not node.is_leaf:
            value = x[node.feature].iloc[0]
            for child in node.children:
                if child.branch == value:
                    node = child
                    break
            else:
                # Report the feature being tested and the offending value.
                raise ValueError(f"Branch {node.feature} -> {value} does not exist")
        return node

    def score(self, X, y):
        """Returns the list of predicted values (leaf labels) for every row of X; y is unused here"""
        return [self.predict(X.iloc[x].to_frame().T).feature for x in range(len(X))]

## Decision Tree Classifier

###  Gini Index
$Gini(t, \mathcal{D})=1-\sum_{l\in levels(t)}P(t=l)^2$

### Entropy *(Bits)*
$\mathcal{H}(t, \mathcal{D})=-\sum_{l\in levels(t)}P(t=l)\cdot\log_2(P(t=l))$

### Rem
$rem(d,\mathcal{D})=\sum_{l\in levels(d)}\frac{|\mathcal{D}_{d=l}|}{|\mathcal{D}|}\cdot \mathcal{H}(t, \mathcal{D}_{d=l})$

### Information Gain
$IG(d, \mathcal{D})=\mathcal{H}(t, \mathcal{D})-rem(d, \mathcal{D})$

### Information Gain Ratio
$GR(d, \mathcal{D})=\frac{IG(d, \mathcal{D})}{\mathcal{H}(d, \mathcal{D})}$

In [7]:
class DecisionTreeClassifier(DecisionTreeEstimator):
    """A Rudimentary Decision Tree Classifier"""
    def __init__(self, *, metric="entropy", eval="info_gain", criterion=None):
        """
        Metric: {entropy, gini} (anything other than "entropy" selects gini)
        Eval: {info_gain, gain_ratio} (anything other than "info_gain" selects gain_ratio)
        Criterion: pre-pruning options {max_depth, partition_threshold, low_gain}
        """
        super().__init__({} if criterion is None else criterion)
        self.metric = self.entropy if metric == "entropy" else self.gini
        self.eval = self.information_gain if eval == "info_gain" else self.information_gain_ratio

    def gini(self, X, y):
        """Measures the Gini index of the target (y) over the instances in X"""
        probabilities = [len(X.loc[y == t]) / len(X) for t in y.unique()]
        return 1 - np.sum([p ** 2 for p in probabilities])

    def entropy(self, X, y):
        """Measures the amount of uncertainty/impurity/heterogeneity in (X, y)"""
        probabilities = [len(X.loc[y == t]) / len(X) for t in y.unique()]
        return -np.sum([p * np.log2(p) for p in probabilities])

    def rem(self, X, y, d):
        """Measures the entropy after partitioning (X, y) on feature (d)"""
        terms = []
        for t in X[d].unique():
            mask = X[d] == t
            weight = len(X.loc[mask]) / len(X)
            terms.append(weight * self.metric(X.loc[mask], y.loc[mask]))
        return np.sum(terms)

    def information_gain(self, X, y, d):
        """Measures the reduction in the overall entropy in (X, y) achieved by testing on feature (d)"""
        impurity = self.metric(X, y)
        remainder = self.rem(X, y, d)
        if debug:
            print(f"{d} = {impurity:.3f} - {remainder:.3f} = {impurity - remainder:.3f}")
        return impurity - remainder

    def information_gain_ratio(self, X, y, d):
        """
        Measures the information gain of feature (d) divided by the entropy of
        the feature's own level distribution.

        Returns 0.0 when the feature has a single level in X: the denominator
        is zero there and the gain is zero as well.
        """
        proportions = [len(X.loc[X[d] == t]) / len(X) for t in X[d].unique()]
        split_entropy = -np.sum([p * np.log2(p) for p in proportions])

        impurity = self.metric(X, y)
        remainder = self.rem(X, y, d)

        if split_entropy == 0:
            return 0.0

        # The whole gain is divided by the split entropy (not just the remainder).
        ratio = (impurity - remainder) / split_entropy
        if debug:
            print(f"{d} = ({impurity:.3f} - {remainder:.3f}) / {split_entropy} = {ratio}")
        return ratio

    def make_tree(self, X, y, *, parent=None, branch=None, depth=0):
        """
        Performs the ID3 algorithm

        Recursively builds the subtree for the partition (X, y) and returns its
        root Node. `parent`, `branch` and `depth` describe where the subtree
        hangs in the overall tree.
        """
        def make_node(f, t):
            # An explicit fresh `children` list keeps nodes independent of each
            # other, so no defensive deepcopy of the node is needed.
            return Node(
                feature=f,
                data=pd.concat([X, y], axis=1),
                branch=branch,
                parent=parent,
                depth=depth,
                leaf=t,
                children=[],
            )

        if len(y.unique()) == 1:  # all instances have the same target feature values
            if debug:
                print("All instances have the same target feature value\n")
            return make_node(y.iat[0], True)
        elif len(X) == 0:  # no rows in this partition, label with the majority class of the parent
            if debug:
                print("Dataset is empty\n")
            if parent is None:
                raise ValueError("Cannot build a decision tree from an empty dataset")
            return make_node(mode(parent.y), True)
        elif all((X[d] == X[d].iloc[0]).all() for d in X.columns):
            # All feature values are identical. This is also (vacuously) true when
            # no descriptive features are left, in which case the node is labelled
            # with the majority class of its own instances.
            if debug:
                print("All instances have the same descriptive features\n")
            return make_node(mode(y), True)
        elif self.criterion.get("max_depth", float('inf')) <= depth:  # max depth reached
            if debug:
                print("Stopping at Max Depth\n")
            return make_node(mode(y), True)
        elif self.criterion.get("partition_threshold", float('-inf')) >= len(X):  # partition too small to split
            if debug:
                print(f"Stopping at {len(X)} instances\n")
            return make_node(mode(y), True)

        if debug:
            print("===Information Gain===")

        gains = [self.eval(X, y, d) for d in X.columns]
        best_index = int(np.argmax(gains))
        best_gain = gains[best_index]

        # Pre-pruning compares the threshold with the gain itself, not with the
        # position of the best feature.
        if self.criterion.get('low_gain', float('-inf')) >= best_gain:
            if debug:
                print(f"Stopping at Gain={best_gain}\n")
            return make_node(mode(y), True)

        best_feature = X.columns[best_index]
        best_node = make_node(best_feature, False)

        if debug:
            print()
            print("===Best Feature===")
            print(best_feature)
            print()

        # One child per level seen at fit time; levels absent from this
        # partition yield an empty dataset and become majority-class leaves.
        for level in self.n_levels[best_feature]:
            X_part, y_part, _ = self.partition(X, y, best_feature, level)
            if debug:
                print(f"===Partitioned Dataset ({level})===")
                print(pd.concat([X_part, y_part], axis=1).head())
                print()
            best_node.children.append(
                self.make_tree(X_part, y_part, parent=best_node, branch=level, depth=depth + 1)
            )
        return best_node

    def score(self, X, y):
        """Returns the confusion matrix of the predictions on X against y (labels ordered as y.unique())"""
        y_hat = super().score(X, y)
        return confusion_matrix(y, y_hat, labels=y.unique())

In [8]:
%%time
# Pre-pruning: a partition holding at most 5% of the training rows becomes a leaf.
dt_clf = DecisionTreeClassifier(
    metric='entropy',
    criterion={'partition_threshold': len(X_train) * 5e-2},
).fit(X_train, y_train)
dt_clf.score(X_test, y_test)

CPU times: user 1.45 s, sys: 9.25 ms, total: 1.46 s
Wall time: 1.46 s


array([[ 56,   9],
       [  7, 116]])

In [9]:
# dt_clf = DecisionTreeClassifier(metric='entropy').fit(X, y)

In [10]:
dt_clf

 worst perimeter (Branch=None)
	 1 (Branch=105.95000076293945)
	 compactness error (Branch=251.2)
		 0 (Branch=0.01373999984934926)
		 0 (Branch=0.04922000132501125)
		 0 (Branch=0.1354)
		 0 (Branch=0.024205000139772892)
		 1 (Branch=0.009051499888300896)
		 0 (Branch=0.01838000025600195)
		 0 (Branch=0.014734999742358923)
	 perimeter error (Branch=87.36999893188477)
		 1 (Branch=2.76200008392334)
		 1 (Branch=21.98)
		 1 (Branch=4.102499961853027)
		 1 (Branch=1.5145000219345093)
		 1 (Branch=2.3585000038146973)
		 1 (Branch=3.45550000667572)
		 1 (Branch=1.7509999871253967)
	 mean concavity (Branch=101.64999771118164)
		 1 (Branch=0.04660499840974808)
		 0 (Branch=0.4268)
		 1 (Branch=0.07226499915122986)
		 1 (Branch=0.11919999867677689)
		 1 (Branch=0.09329499676823616)
		 1 (Branch=0.1550999954342842)
		 1 (Branch=0.10614999756217003)
	 mean texture (Branch=117.44999694824219)
		 1 (Branch=15.045000076293945)
		 0 (Branch=24.0)
		 0 (Branch=18.460000038146973)
		 0 (Branch=39.28)

## Decision Tree Regressor

$var(t, \mathcal{D})=\frac{\sum_{i=1}^n(t_i-\bar{t})^2}{n-1}$

$weighted\ var(t, \mathcal{D}) = \sum_{l\in levels(d)}{} \frac{|\mathcal{D}_{d=l}|}{|\mathcal{D}|} \times var(t, \mathcal{D}_{d=l})$

In [11]:
class DecisionTreeRegressor(DecisionTreeEstimator):
    """A Rudimentary Decision Tree Regressor"""
    def __init__(self, *, metric="variance", criterion=None):
        """
        Metric = {variance}
        Criterion: pre-pruning options {max_depth, partition_threshold}
        """
        super().__init__({} if criterion is None else criterion)
        self.metric = metric

    def variance(self, X, y, d):
        """
        Measures the sample variance (n-1 denominator) of the target (y).

        Returns 0 for partitions with fewer than two instances. `d` is only
        used to label the debug output.
        """
        n = len(X)
        if n <= 1:
            return 0
        y_bar = mean(y)  # computed once instead of once per instance
        var = np.sum([(t - y_bar) ** 2 for t in y]) / (n - 1)
        if debug:
            print(f"{d} = {var}")
        return var

    def weighted_variance(self, X, y, d):
        """Measures the variance of the target after partitioning on feature (d), weighted by partition size"""
        terms = []
        for t in X[d].unique():
            mask = X[d] == t
            weight = len(X.loc[mask]) / len(X)
            terms.append(weight * self.variance(X.loc[mask], y.loc[mask], d))
        return np.sum(terms)

    def make_tree(self, X, y, *, parent=None, branch=None, depth=0):
        """
        Performs the ID3 algorithm (variance-reduction variant)

        Recursively builds the subtree for the partition (X, y) and returns its
        root Node. Leaves are labelled with the mean target value.
        """
        def make_node(f, t):
            # An explicit fresh `children` list keeps nodes independent of each
            # other, so no defensive deepcopy of the node is needed.
            return Node(
                feature=f,
                data=pd.concat([X, y], axis=1),
                branch=branch,
                parent=parent,
                depth=depth,
                leaf=t,
                children=[],
            )

        if len(y.unique()) == 1:  # all instances have the same target feature values
            if debug:
                print("All instances have the same target feature value\n")
            return make_node(y.iat[0], True)
        elif len(y) == 0:  # no rows in this partition, fall back to the parent's mean target
            if debug:
                print("Dataset is empty\n")
            if parent is None:
                raise ValueError("Cannot build a decision tree from an empty dataset")
            return make_node(mean(parent.y), True)
        elif X.shape[1] == 0:  # every descriptive feature has been used on this path
            if debug:
                print("No descriptive features left\n")
            return make_node(mean(y), True)
        elif self.criterion.get("max_depth", float('inf')) <= depth:
            if debug:
                print("Stopping at Max Depth\n")
            return make_node(mean(y), True)
        elif self.criterion.get("partition_threshold", float('-inf')) >= len(X):
            if debug:
                print(f"Stopping at {len(X)} instances\n")
            return make_node(mean(y), True)

        if debug:
            print("===Variance===")

        min_var = np.argmin([self.weighted_variance(X, y, d) for d in X.columns])

        best_feature = X.columns[min_var]
        best_node = make_node(best_feature, False)

        if debug:
            print()
            print("===Best Feature===")
            print(best_feature)
            print()

        # One child per level seen at fit time; levels absent from this
        # partition yield an empty dataset and inherit the parent's mean.
        for level in self.n_levels[best_feature]:
            X_part, y_part, _ = self.partition(X, y, best_feature, level)
            if debug:
                print(f"===Partitioned Dataset ({level})===")
                print(pd.concat([X_part, y_part], axis=1).head())
                print()
            best_node.children.append(
                self.make_tree(X_part, y_part, parent=best_node, branch=level, depth=depth + 1)
            )
        return best_node

    def score(self, X, y):
        """Returns the root mean squared error of the predictions on X against y"""
        y_hat = super().score(X, y)
        # Take the square root explicitly: the `squared=False` keyword is
        # deprecated/removed in recent scikit-learn releases.
        return np.sqrt(mean_squared_error(y, y_hat))

In [12]:
# Toy regression table: two descriptive features and a numeric target, three
# rows per season.
data = {
    'Season': ['winter'] * 3 + ['spring'] * 3 + ['summer'] * 3 + ['autumn'] * 3,
    'Work Day': [
        'false', 'false', 'true',
        'false', 'true', 'true',
        'false', 'true', 'true',
        'false', 'false', 'true',
    ],
    'Rentals': [
        800, 826, 900,
        2100, 4740, 4900,
        3000, 5800, 6200,
        2910, 2880, 2820,
    ],
}

df = pd.DataFrame(data)

# A: every column but the last (features); b: the last column (target).
feature_columns = df.columns[:-1]
target_column = df.columns[-1]
A, b = df[feature_columns], df[target_column]

In [13]:
# Grow the regression tree on the feature frame A and target vector b
# with the default settings (no pre-pruning).
dt_regr = (
    DecisionTreeRegressor()
    .fit(A, b)
)

In [14]:
dt_regr

 Season (Branch=None)
	 Work Day (Branch=winter)
		 813 (Branch=false)
		 900 (Branch=true)
	 Work Day (Branch=spring)
		 2100 (Branch=false)
		 4820 (Branch=true)
	 Work Day (Branch=summer)
		 3000 (Branch=false)
		 6000 (Branch=true)
	 Work Day (Branch=autumn)
		 2895 (Branch=false)
		 2820 (Branch=true)

# Random Forest Implementation

In [15]:
class RandomForest:
    """A Rudimentary Random Forest Classifier built from DecisionTreeClassifier instances"""
    def __init__(self, n_estimators=5, n_sample=2, eval="info_gain", criterion=None):
        """
        Parameters
        ----------
        n_estimators: the number of decision trees in the forest
        n_sample: the number of rows drawn for each tree's bootstrap sample
        eval: the feature-selection measure passed to every tree
        criterion: the pre-pruning options passed to every tree
        """
        criterion = {} if criterion is None else criterion
        self.n_estimators = n_estimators
        self.n_sample = n_sample
        self.forest = [DecisionTreeClassifier(eval=eval, criterion=criterion) for _ in range(n_estimators)]

    def sub_sample(self, X, n_sample=2):
        """Enforces feature randomness by drawing n_sample distinct column names from X"""
        return np.random.choice(X.columns.to_numpy(), n_sample, replace=False)

    def bootstrap_sample(self, X, y, n_sample, key=True):
        """
        Returns (X_sample, y_sample): n_sample rows drawn from (X, y), with
        replacement when `key` is true, restricted to a random feature subset.
        """
        # NOTE(review): the subset size is derived from the number of ROWS
        # (log2(len(X))), kept as in the original -- confirm whether the number
        # of columns was intended. It is clamped to [1, n_columns] because
        # drawing more distinct features than exist makes np.random.choice raise.
        n_rows = len(X)
        wanted = int(np.log2(n_rows)) if n_rows > 1 else 1
        n_features = min(X.shape[1], max(1, wanted))

        feature_subset = self.sub_sample(X, n_features)
        d = pd.concat([X, y], axis=1)
        d = d.sample(n=n_sample, replace=key)
        return d.iloc[:, :-1][feature_subset], d.iloc[:, -1]

    def fit(self, X, y):
        """Fits every tree on its own bootstrap sample and returns self"""
        for index, tree in enumerate(self.forest):
            print(f"Decision Tree #{index}")
            tree.fit(*self.bootstrap_sample(X, y, self.n_sample))
        return self

    def predict(self, x):
        """Returns the majority vote of the trees for a single query (a one-row DataFrame)"""
        assert all(isinstance(model, DecisionTreeClassifier) for model in self.forest)
        return mode([dt.predict(x).feature for dt in self.forest])

    def score(self, X, y):
        """Returns the confusion matrix of the predictions on X against y (labels ordered as y.unique())"""
        y_hat = [self.predict(X.iloc[x].to_frame().T) for x in range(len(X))]
        return confusion_matrix(y, y_hat, labels=y.unique())

In [16]:
%%time
# 1000 trees, each grown on a bootstrap sample as large as the training set and
# stopped after a single split (max_depth=1).
rf = RandomForest(
    n_estimators=1000,
    n_sample=len(X_train),
    criterion={'max_depth': 1},
).fit(X_train, y_train)
rf.score(X_test, y_test)

Decision Tree #0
Decision Tree #1
Decision Tree #2
Decision Tree #3
Decision Tree #4
Decision Tree #5
Decision Tree #6
Decision Tree #7
Decision Tree #8
Decision Tree #9
Decision Tree #10
Decision Tree #11
Decision Tree #12
Decision Tree #13
Decision Tree #14
Decision Tree #15
Decision Tree #16
Decision Tree #17
Decision Tree #18
Decision Tree #19
Decision Tree #20
Decision Tree #21
Decision Tree #22
Decision Tree #23
Decision Tree #24
Decision Tree #25
Decision Tree #26
Decision Tree #27
Decision Tree #28
Decision Tree #29
Decision Tree #30
Decision Tree #31
Decision Tree #32
Decision Tree #33
Decision Tree #34
Decision Tree #35
Decision Tree #36
Decision Tree #37
Decision Tree #38
Decision Tree #39
Decision Tree #40
Decision Tree #41
Decision Tree #42
Decision Tree #43
Decision Tree #44
Decision Tree #45
Decision Tree #46
Decision Tree #47
Decision Tree #48
Decision Tree #49
Decision Tree #50
Decision Tree #51
Decision Tree #52
Decision Tree #53
Decision Tree #54
Decision Tree #55
De

array([[ 59,   6],
       [  8, 115]])

In [17]:
# rf = RandomForest(n_estimators=20, n_sample=len(X)).fit(X, y)