In [12]:
import numpy as np
import pandas as pd
from statistics import mode
from copy import deepcopy

In [2]:
# Toy vegetation dataset: three categorical descriptive features followed by
# the target column ('Vegetation'). Column order matters because later cells
# treat the last column as the target.
data = dict(
    Stream=['false', 'true', 'true', 'false', 'false', 'true', 'true'],
    Slope=['steep', 'moderate', 'steep', 'steep', 'flat', 'steep', 'steep'],
    Elevation=['high', 'low', 'medium', 'medium', 'high', 'highest', 'high'],
    Vegetation=['chapparal', 'riparian', 'riparian', 'chapparal', 'conifer', 'conifer', 'chapparal'],
)

df = pd.DataFrame.from_dict(data)
df

Unnamed: 0,Stream,Slope,Elevation,Vegetation
0,False,steep,high,chapparal
1,True,moderate,low,riparian
2,True,steep,medium,riparian
3,False,steep,medium,chapparal
4,False,flat,high,conifer
5,True,steep,highest,conifer
6,True,steep,high,chapparal


In [3]:
# Descriptive features are every column except the last; the last column is the target.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

### Entropy *(Bits)*
$\mathcal{H}(t, \mathcal{D})=-\sum_{I\in levels(t)}{P(t=I)\cdot\log_2(P(t=I))}$

### Rem
$rem(d,\mathcal{D})=\sum_{I\in levels(d)}\frac{|\mathcal{D}_{d=I}|}{|\mathcal{D}|}\cdot \mathcal{H}(t, \mathcal{D}_{d=I})$

### Information Gain
$IG(d, \mathcal{D})=\mathcal{H}(t, \mathcal{D})-rem(d, \mathcal{D})$

In [4]:
class Node:
    """A single node of a decision tree.

    Parameters
    ----------
    feature : object, optional
        Stored as-is on the node. (In this notebook the tree builder stores
        the splitting feature name on internal nodes and the predicted class
        on leaves.)
    data : pandas.DataFrame, optional
        Rows associated with this node. The ``X`` and ``y`` properties treat
        the last column as the target and all other columns as features.
    branch : object, optional
        Value of the parent's feature that leads to this node.
    parent : Node, optional
        The node this one hangs from.
    leaf : bool, default False
        Whether the node is a leaf.
    children : list, optional
        Child nodes. When omitted, every node gets its **own** fresh empty
        list (a shared mutable default would make unrelated nodes see each
        other's children).
    """

    def __init__(
        self,
        feature=None,
        data=None,
        branch=None,
        parent=None,
        leaf=False,
        children=None
    ):
        self.feature = feature
        self.data = data
        self.branch = branch
        self.parent = parent
        self.leaf = leaf
        # Never share one list object between nodes: build a new one per instance.
        self.children = [] if children is None else children

    @property
    def isLeaf(self):
        """Return the ``leaf`` flag given at construction time."""
        return self.leaf

    @property
    def X(self):
        """All columns of ``data`` except the last one."""
        return self.data.iloc[:, :-1]

    @property
    def y(self):
        """The last column of ``data``."""
        return self.data.iloc[:, -1]

In [5]:
class DecisionTree:
    """
    A rudimentary ID3 decision tree classifier for categorical features.

    Attributes
    ----------
    root : Node or None
        Root of the tree built by ``fit``; ``None`` before fitting.
    levels : dict or None
        Maps each training column name to the array of distinct values seen
        for it in the data passed to ``fit``. Every split creates one child
        per entry, even when the partition for that value is empty.

    TODO
        - Pruning
        - Continuous Feature Values
    """
    def __init__(self):
        self.root = None
        self.levels = None

    def __repr__(self, node=None, depth=0):
        """Return the (sub)tree as text, one node per line.

        Each line is ``<depth tabs> <feature> (<branch>)``. ``node`` and
        ``depth`` are only used by the recursion; call ``repr(tree)`` normally.
        The text is returned rather than printed, so ``repr`` has no side effects.
        """
        if node is None:
            node = self.root
        if node is None:
            return "DecisionTree(unfitted)"

        indent = depth * '\t'
        lines = [f"{indent} {node.feature} ({node.branch})"]
        lines.extend(self.__repr__(child, depth + 1) for child in node.children)
        return "\n".join(lines)

    def partition(self, X, y, d, t):
        """Select the rows where feature ``d`` equals ``t`` and drop column ``d``.

        Returns
        -------
        (X_part, y_part, t) : the filtered features without ``d``, the matching
        targets, and the level ``t`` itself.
        """
        # A positional boolean mask avoids index alignment, so frames with
        # repeated index labels (e.g. bootstrap samples) are handled too.
        mask = (X[d] == t).to_numpy()
        return X.loc[mask].drop(columns=[d]), y.loc[mask], t

    def entropy(self, X, y):
        """Measures the amount of uncertainty/impurity/heterogeneity in (X, y).

        Computed in bits from the class proportions of ``y``. ``X`` is accepted
        for interface compatibility only. An empty ``y`` yields 0.
        """
        proportions = y.value_counts(normalize=True).to_numpy()
        # Skip zero proportions (possible for unused categories): 0 * log2(0) is NaN.
        proportions = proportions[proportions > 0]
        return float(-np.sum(proportions * np.log2(proportions)))

    def rem(self, X, y, d):
        """Measures the entropy after partitioning (X, y) on feature (d).

        This is the sum, over the distinct values of ``X[d]``, of each
        partition's entropy weighted by its share of the rows.
        """
        column = X[d].to_numpy()
        remainder = 0.0
        for level in X[d].unique():
            mask = column == level
            weight = mask.sum() / len(X)
            remainder += weight * self.entropy(X.loc[mask], y.loc[mask])
        return float(remainder)

    def information_gain(self, X, y, d):
        """Measures the reduction in the overall entropy in (X, y) achieved by testing on feature (d)"""
        return self.entropy(X, y) - self.rem(X, y, d)

    def build_tree(self, X, y, *, parent=None, branch=None):
        """Performs the ID3 algorithm and returns the root ``Node`` of the (sub)tree.

        Base cases, in order:

        1. No rows: a leaf labelled with the majority class of ``parent``.
        2. All targets equal: a leaf labelled with that class.
        3. No features left, or all rows have identical feature values: a
           leaf labelled with the majority class of ``y`` (this partition's
           own rows, not the parent's).

        Otherwise the feature with the highest information gain becomes an
        internal node with one child per level recorded in ``self.levels``.

        Raises
        ------
        ValueError
            If called with no rows and no ``parent`` to inherit a label from.
        """
        # ``children=[]`` is passed explicitly everywhere so that each node owns
        # its list regardless of how Node's default argument is defined.
        if len(y) == 0:
            if parent is None:
                raise ValueError("Cannot build a decision tree from an empty dataset")
            return Node(feature=mode(parent.y),
                        branch=branch,
                        parent=parent,
                        leaf=True,
                        children=[])

        data = pd.concat([X, y], axis=1)

        if len(y.unique()) == 1:  # all instances have the same target feature value
            return Node(feature=y.iat[0],
                        data=data,
                        branch=branch,
                        parent=parent,
                        leaf=True,
                        children=[])

        no_features_left = X.shape[1] == 0
        if no_features_left or all((X[d] == X[d].iloc[0]).all() for d in X.columns):
            # Nothing left to split on: predict this partition's majority class.
            return Node(feature=mode(y),
                        data=data,
                        branch=branch,
                        parent=parent,
                        leaf=True,
                        children=[])

        gains = [self.information_gain(X, y, d) for d in X.columns]
        best_feature = X.columns[int(np.argmax(gains))]
        best_node = Node(feature=best_feature,
                         data=data,
                         branch=branch,
                         parent=parent,
                         children=[])

        for level in self.levels[best_feature]:
            X_part, y_part, _ = self.partition(X, y, best_feature, level)
            best_node.children.append(
                self.build_tree(X_part, y_part, parent=best_node, branch=level)
            )
        return best_node

    def fit(self, X, y):
        """Record the levels of every column of ``X`` and build the tree.

        Returns ``self`` so calls can be chained.

        Raises
        ------
        ValueError
            If ``X`` has no rows or ``X`` and ``y`` differ in length.
        """
        if len(X) == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset")
        if len(X) != len(y):
            raise ValueError(f"X and y must have the same length, got {len(X)} and {len(y)}")
        self.levels = {k: X[k].unique() for k in X.columns}
        self.root = self.build_tree(X, y)
        return self

    @staticmethod
    def _feature_value(x, feature):
        """Extract the scalar value of ``feature`` from a single instance ``x``."""
        if feature not in x:
            raise KeyError(f"Instance has no value for feature {feature!r}")
        value = x[feature]
        if isinstance(value, pd.Series):  # x was a DataFrame: must hold exactly one row
            if len(value) != 1:
                raise ValueError(f"predict expects exactly one instance, got {len(value)}")
            value = value.iloc[0]
        return value

    def predict(self, x):
        """Predict the class of one instance.

        Parameters
        ----------
        x : one-row DataFrame, Series or dict
            Maps feature names to the instance's values.

        Returns
        -------
        The label stored on the leaf reached by following the branches that
        match ``x``. If a node has no branch for the instance's value (a value
        never seen while fitting), the majority class of that node's data is
        returned instead of searching forever.

        Raises
        ------
        ValueError
            If the tree is not fitted or ``x`` holds more than one row.
        KeyError
            If ``x`` lacks a feature the tree needs to test.
        """
        if self.root is None:
            raise ValueError("DecisionTree is not fitted yet; call fit() first")

        node = self.root
        while not node.isLeaf:
            value = self._feature_value(x, node.feature)
            match = next((child for child in node.children if child.branch == value), None)
            if match is None:
                return mode(node.y)
            node = match
        return node.feature

In [6]:
debug = 1  # NOTE(review): no live code visible in this notebook reads this flag
# Build the tree on the full toy dataset; fit() returns the fitted instance.
dt = DecisionTree()
dt = dt.fit(X, y)

In [7]:
# Show the fitted tree; the notebook renders it through DecisionTree.__repr__.
dt

 Elevation (None)
	 Slope (high)
		 chapparal (steep)
		 chapparal (moderate)
		 conifer (flat)
	 riparian (low)
	 Stream (medium)
		 chapparal (false)
		 riparian (true)
	 conifer (highest)




In [8]:
# A single query instance, given as a one-row frame with the descriptive features only.
X_test = pd.DataFrame([
    {'Stream': 'true', 'Slope': 'steep', 'Elevation': 'high'},
])
dt.predict(X_test)

'chapparal'

In [9]:
class RandomForest():
    """A bagged ensemble of ``DecisionTree`` classifiers with random feature subsets.

    Parameters
    ----------
    n_estimators : int, default 5
        Number of trees built by ``fit``.
    n_sample : int, default 2
        Number of rows drawn for each tree's bootstrap sample.
    random_state : int, optional
        Seed for a private ``numpy.random.RandomState``. When omitted, NumPy's
        global random state is used, exactly as before this parameter existed.

    Attributes
    ----------
    forest : list
        The fitted trees; empty until ``fit`` is called.
    """
    def __init__(self, n_estimators=5, n_sample=2, random_state=None):
        self.n_estimators = n_estimators
        self.n_sample = n_sample
        self.random_state = random_state
        self._rng = None if random_state is None else np.random.RandomState(random_state)
        self.forest = []

    def __repr__(self):
        """Return a one-line summary (``__repr__`` must return a string, not the list of trees)."""
        return (f"RandomForest(n_estimators={self.n_estimators}, "
                f"n_sample={self.n_sample}, fitted_trees={len(self.forest)})")

    def subsample(self, X, n_sample=2):
        """Draw ``n_sample`` distinct column names of ``X`` at random.

        Raises ``ValueError`` (from NumPy) when ``n_sample`` exceeds the number
        of columns, because sampling is done without replacement.
        """
        sampler = np.random if self._rng is None else self._rng
        return sampler.choice(X.columns.to_numpy(), n_sample, replace=False)

    def make_bootstrap(self, X, y, n_sample, key=True):
        """Build one training set: a random feature subset and ``n_sample`` sampled rows.

        Parameters
        ----------
        X, y : feature frame and target series with matching rows.
        n_sample : int
            Number of rows to draw.
        key : bool, default True
            Whether rows are drawn with replacement.

        Returns
        -------
        (X_boot, y_boot) : the sampled rows restricted to the chosen features,
        and the matching targets.

        Raises
        ------
        ValueError
            If ``X`` has no columns.
        """
        n_features = X.shape[1]
        if n_features == 0:
            raise ValueError("Cannot draw a feature subset from a frame with no columns")
        # Subset size follows the number of *features* (int(log2(M)) + 1), which is
        # always within 1..M; deriving it from the row count can exceed the column
        # count (sampling error) or drop to 0 for a single row.
        n_subset = int(np.log2(n_features)) + 1
        feature_subset = list(self.subsample(X, n_subset))
        d = pd.concat([X, y], axis=1)
        d = d.sample(n=n_sample, replace=key, random_state=self._rng)
        return d.iloc[:, :-1][feature_subset], d.iloc[:, -1]

    def fit(self, X, y, verbose=True):
        """Train ``n_estimators`` trees, each on its own bootstrap sample.

        Any previously fitted trees are discarded first, so calling ``fit``
        twice does not accumulate trees. With ``verbose`` (the default) each
        bootstrap sample and the resulting tree are printed. Returns ``self``.
        """
        self.forest = []
        for index in range(self.n_estimators):
            X_boot, y_boot = self.make_bootstrap(X, y, self.n_sample)
            tree = DecisionTree().fit(X_boot, y_boot)
            if verbose:
                print(f"===Decision Tree {index+1}===\n")
                print(pd.concat([X_boot, y_boot], axis=1))
                print()
                print(tree)
                print()
            self.forest.append(tree)
        return self

    def predict(self, X):
        """Aggregation: return the most common prediction among all trees.

        Raises
        ------
        ValueError
            If the forest has not been fitted.
        """
        if not self.forest:
            raise ValueError("RandomForest is not fitted yet; call fit() first")
        assert all(isinstance(m, DecisionTree) for m in self.forest)
        return mode([tree.predict(X) for tree in self.forest])

In [10]:
# Ten trees, each trained on a bootstrap sample as large as the training set.
rf = RandomForest(n_estimators=10, n_sample=X.shape[0])
rf = rf.fit(X, y)

===Decision Tree 1===

  Stream Elevation Vegetation
0  false      high  chapparal
4  false      high    conifer
5   true   highest    conifer
5   true   highest    conifer
0  false      high  chapparal
0  false      high  chapparal
5   true   highest    conifer

 Stream (None)
	 chapparal (false)
	 conifer (true)


===Decision Tree 2===

  Stream Elevation Vegetation
5   true   highest    conifer
2   true    medium   riparian
3  false    medium  chapparal
5   true   highest    conifer
2   true    medium   riparian
3  false    medium  chapparal
4  false      high    conifer

 Elevation (None)
	 conifer (highest)
	 Stream (medium)
		 riparian (true)
		 chapparal (false)
	 conifer (high)


===Decision Tree 3===

  Elevation     Slope Vegetation
1       low  moderate   riparian
0      high     steep  chapparal
1       low  moderate   riparian
4      high      flat    conifer
5   highest     steep    conifer
0      high     steep  chapparal
4      high      flat    conifer

 Slope (None)
	

In [11]:
# Same query instance as for the single tree, this time classified by the forest's vote.
X_test = pd.DataFrame([dict(Stream='true', Slope='steep', Elevation='high')])
rf.predict(X_test)

'chapparal'