In [1]:
import numpy as np
import pandas as pd
import statistics as stat
from copy import deepcopy

In [2]:
# Toy training set: three categorical descriptive features followed by the
# target column ('Vegetation'). Every value is kept as a plain string.
data = dict(
    Stream=['false', 'true', 'true', 'false', 'false', 'true', 'true'],
    Slope=['steep', 'moderate', 'steep', 'steep', 'flat', 'steep', 'steep'],
    Elevation=['high', 'low', 'medium', 'medium', 'high', 'highest', 'high'],
    Vegetation=['chapparal', 'riparian', 'riparian', 'chapparal', 'conifer', 'conifer', 'chapparal'],
)

# Column order follows the insertion order of `data`, so the target stays last.
df = pd.DataFrame(data=data)
df

Unnamed: 0,Stream,Slope,Elevation,Vegetation
0,False,steep,high,chapparal
1,True,moderate,low,riparian
2,True,steep,medium,riparian
3,False,steep,medium,chapparal
4,False,flat,high,conifer
5,True,steep,highest,conifer
6,True,steep,high,chapparal


In [3]:
# Descriptive features are every column except the last one ...
X = df.iloc[:, :-1]
# ... and the last column is the target feature.
y = df.iloc[:, -1]

In [4]:
class Node:
    """A single vertex of the decision tree.

    Parameters
    ----------
    feature : object, optional
        For an internal node, the name of the feature tested at this node;
        for a leaf, the predicted target value.
    data : pandas.DataFrame, optional
        The training rows that reached this node, with the target stored in
        the last column (see ``X_data`` / ``y_data``).
    arc : object, optional
        The value of the parent's feature that leads to this node.
    leaf : bool, default False
        Whether this node is a leaf.
    parent : Node, optional
        The node directly above this one (``None`` for the root).
    children : list, optional
        Child nodes. When omitted, every node gets its own new empty list.
    """
    def __init__(
        self,
        feature=None,
        data=None,
        arc=None,
        leaf=False,
        parent=None,
        children=None
    ):
        self.feature = feature
        self.data = data
        self.arc = arc
        self.leaf = leaf
        self.parent = parent
        # A literal ``[]`` default is evaluated once and would be shared by
        # every node created without an explicit list, so build it per call.
        self.children = [] if children is None else children

    def __str__(self):
        """Return the path from this node up to the root.

        The format is ``"<feature> <--<arc>-- "`` for every node that has a
        parent, followed by the root's feature. The string is built and
        returned (nothing is printed), and is always a ``str`` even when the
        root's feature is not one.
        """
        pieces = []
        curr = self
        while curr.parent is not None:
            pieces.append(f"{curr.feature} <--{curr.arc}-- ")
            curr = curr.parent
        pieces.append(str(curr.feature))
        return ''.join(pieces)

    @property
    def isLeaf(self):
        """Whether this node was flagged as a leaf at construction."""
        return self.leaf

    @property
    def y_data(self):
        """Target values of the rows stored at this node (last column of ``data``)."""
        return self.data.iloc[:, -1]

    @property
    def X_data(self):
        """Descriptive features of the rows stored at this node (all but the last column)."""
        return self.data.iloc[:, :-1]

In [5]:
class DecisionTree:
    """
    A Rudimentary Decision Tree Classifier

    Builds an ID3 tree over categorical features: at every node the feature
    with the highest information gain is chosen and the data is partitioned
    on each of that feature's observed values.

    Parameters
    ----------
    debug : bool, optional
        When given, controls whether the training trace is printed. When
        left as ``None`` the module-level ``debug`` flag is consulted instead
        (treated as off if no such global exists).

    TODO
        - Pruning
        - Continuous Feature Values
    """
    def __init__(self, debug=None):
        self.root = None
        self.debug = debug

    @property
    def _verbose(self):
        """Whether the step-by-step trace should be printed."""
        own_flag = getattr(self, 'debug', None)
        if own_flag is not None:
            return bool(own_flag)
        # Fall back to the notebook-level flag without raising NameError
        # when it has not been defined yet.
        return bool(globals().get('debug', False))

    def display_tree(self, node=None, depth=''):
        """Displays the decision tree

        Prints one node per line, prefixed by ``depth`` (one tab per level).
        Starts from the root when ``node`` is omitted; prints nothing when
        the tree has not been fitted.
        """
        if node is None:
            node = self.root
        if node is None:
            return

        print(depth, node.feature)
        for child in node.children:
            self.display_tree(child, depth + '\t')

    def partition(self, X, y, d, t):
        """Select the rows where feature ``d`` equals ``t`` and drop column ``d``.

        Returns
        -------
        tuple
            ``(X_subset, y_subset, t)``.
        """
        mask = X[d] == t
        p = pd.concat([X.loc[mask], y.loc[mask]], axis=1)
        p = p.drop([d], axis=1)
        return p.iloc[:, :-1], p.iloc[:, -1], t

    def entropy(self, X, y):
        """Measures the amount of uncertainty/impurity/heterogeneity in (X, y)"""
        total = len(X)
        # Each class probability is computed once and reused for p * log2(p).
        probabilities = [(y == t).sum() / total for t in y.unique()]
        return -sum([p * np.log2(p) for p in probabilities])

    def rem(self, X, y, d):
        """Measures the entropy after partitioning (X, y) on feature (d)"""
        total = len(X)
        weighted = []
        for t in X[d].unique():
            mask = X[d] == t
            weighted.append(mask.sum() / total * self.entropy(X.loc[mask], y.loc[mask]))
        return sum(weighted)

    def information_gain(self, X, y, d):
        """Measures the reduction in the overall entropy in (X, y) achieved by testing on feature (d)"""
        before = self.entropy(X, y)
        after = self.rem(X, y, d)
        gain = before - after
        if self._verbose:
            print(f"{d} = {before:.3f} - {after:.3f} = {gain:.3f}")
        return gain

    def fit(self, X, y, *, parent=None, level=None):
        """Performs the ID3 algorithm

        Parameters
        ----------
        X : pandas.DataFrame
            Descriptive (categorical) features.
        y : pandas.Series
            Target values, aligned with ``X``.
        parent : Node, optional
            Node above the subtree being built; ``None`` for a top-level call.
        level : object, optional
            Value of the parent's feature that leads to this subtree.

        Returns
        -------
        DecisionTree or Node
            ``self`` for a top-level call (``parent is None``), so the result
            can always be used for prediction even when the whole tree is a
            single leaf; otherwise the root ``Node`` of the built subtree.

        Raises
        ------
        ValueError
            If a top-level call receives no rows.
        """
        subtree = self._grow(X, y, parent=parent, level=level)
        if parent is None:
            # Always (re)assign the root so that re-fitting replaces the old
            # tree and a single-leaf tree is still stored on the instance.
            self.root = subtree
            return self
        return subtree

    def _grow(self, X, y, *, parent, level):
        """Recursively build and return the subtree for (X, y)."""
        verbose = self._verbose

        # The zero-row check must come first: the checks below index the
        # first row and would fail on an empty partition.
        if len(y) == 0:
            if parent is None:
                raise ValueError("Cannot fit a decision tree on an empty dataset")
            if verbose:
                print("Dataset is empty\n")
            # Label with the majority class of the parent.
            return Node(feature=stat.mode(parent.y_data),
                        arc=level,
                        leaf=True,
                        parent=parent,
                        children=[])

        if len(y.unique()) == 1:  # all instances have the same target feature values
            if verbose:
                print("All instances have the same target feature value\n")
            return Node(feature=y.iloc[0],
                        data=pd.concat([X, y], axis=1),
                        arc=level,
                        leaf=True,
                        parent=parent,
                        children=[])

        # Also true when no descriptive features are left (zero columns).
        if all((X[d] == X[d].iloc[0]).all() for d in X.columns):
            if verbose:
                print("All instances have the same descriptive features\n")
            return Node(feature=stat.mode(y),
                        data=pd.concat([X, y], axis=1),
                        arc=level,
                        leaf=True,
                        parent=parent,
                        children=[])

        if verbose:
            print("===Information Gain===")
        best_feature = X.columns[np.argmax([self.information_gain(X, y, d) for d in X.columns])]

        # An explicit fresh list gives every node its own children and keeps
        # the parent link pointing at the real parent object.
        best_node = Node(feature=best_feature,
                         data=pd.concat([X, y], axis=1),
                         arc=level,
                         parent=parent,
                         children=[])

        if verbose:
            print()
            print("===Best Feature===")
            print(best_feature)
            print()

        partitions = [self.partition(X, y, best_feature, t) for t in X[best_feature].unique()]

        for X_part, y_part, value in partitions:
            if verbose:
                print(f"===Partitioned Dataset {value}===")
                print(pd.concat([X_part, y_part], axis=1))
                print()
            best_node.children.append(self._grow(X_part, y_part, parent=best_node, level=value))

        return best_node

    def predict(self, X):
        """Classify the first row of ``X`` and return its predicted label.

        Walks from the root, following at each internal node the child whose
        arc equals the row's value for that node's feature. If the value was
        never seen at that node during training, the majority class of the
        training rows stored at that node is returned instead of looping
        forever.

        Raises
        ------
        ValueError
            If the tree has not been fitted.
        """
        node = self.root
        if node is None:
            raise ValueError("DecisionTree.predict called before fit")

        while not node.isLeaf:
            arc = X[node.feature].iloc[0]
            for child in node.children:
                if arc == child.arc:
                    node = child
                    break
            else:
                # No branch for this value: fall back to the node's majority class.
                return stat.mode(node.y_data)
        return node.feature

In [6]:
# Module-level flag read by DecisionTree and RandomForest during training;
# a falsy value suppresses the step-by-step trace.
debug = 0
# fit() returns the DecisionTree instance when the root is an internal node,
# as it is for this dataset, so `dt` is the trained tree.
dt = DecisionTree().fit(X, y)

In [7]:
# Print the learned tree, one node per line, indented by one tab per level.
dt.display_tree()

 Elevation
	 Slope
		 chapparal
		 conifer
	 riparian
	 Stream
		 riparian
		 chapparal
	 conifer


In [8]:
# A single query instance, described by the same three features used in training.
query = {'Stream': 'true', 'Slope': 'steep', 'Elevation': 'high'}
X_test = pd.DataFrame([query])
dt.predict(X_test)

'chapparal'

In [9]:
class RandomForest():
    """Bagging ensemble of ``DecisionTree`` classifiers.

    Every tree is trained on a bootstrap sample of the rows restricted to a
    random subset of the feature columns; prediction is a majority vote.

    Parameters
    ----------
    n_estimators : int, default 5
        Number of trees to train.
    n_sample : int, default 2
        Number of rows drawn for each bootstrap sample.
    random_state : int, optional
        Seed for a private random generator. When omitted, NumPy's global
        random state is used (so ``np.random.seed`` still applies).
    """
    def __init__(self, n_estimators=5, n_sample=2, random_state=None):
        self.n_estimators = n_estimators
        self.n_sample = n_sample
        self.forest = []
        self._rng = None if random_state is None else np.random.RandomState(random_state)

    def __repr__(self):
        # __repr__ must return a str; returning the list of trees raises TypeError.
        return (f"{type(self).__name__}(n_estimators={self.n_estimators}, "
                f"n_sample={self.n_sample}, fitted_trees={len(self.forest)})")

    def subsample(self, X, n_sample=2):
        """Pick ``n_sample`` distinct column names of ``X`` at random."""
        rng = getattr(self, '_rng', None)
        source = np.random if rng is None else rng
        return source.choice(X.columns.to_numpy(), n_sample, replace=False)

    def make_bootstrap(self, X, y, n_sample, key=True):
        """Draw one training set for a single tree.

        Parameters
        ----------
        X, y : pandas.DataFrame, pandas.Series
            Full training data.
        n_sample : int
            Number of rows to draw.
        key : bool, default True
            Whether rows are drawn with replacement.

        Returns
        -------
        tuple
            ``(X_sample, y_sample)`` where ``X_sample`` holds only the
            randomly chosen feature columns.
        """
        n_rows, n_cols = X.shape
        # The feature count is int(log2(number of rows)), clamped so that it
        # never exceeds the available columns (sampling without replacement
        # would raise) and never drops to zero while columns exist.
        wanted = int(np.log2(n_rows)) if n_rows > 0 else 0
        n_features = min(n_cols, max(1, wanted))

        feature_subset = self.subsample(X, n_features)
        d = pd.concat([X, y], axis=1)
        d = d.sample(n=n_sample, replace=key, random_state=getattr(self, '_rng', None))
        return d.iloc[:, :-1][feature_subset], d.iloc[:, -1]

    def fit(self, X, y):
        """Train ``n_estimators`` trees, replacing any previously fitted forest.

        When the module-level ``debug`` flag is truthy, each bootstrap sample
        and the resulting tree are printed. Returns ``self``.
        """
        verbose = bool(globals().get('debug', False))
        forest = []
        for index in range(self.n_estimators):
            X_boot, y_boot = self.make_bootstrap(X, y, self.n_sample)
            if verbose:
                print(f"===Decision Tree {index+1}===")
                print(pd.concat([X_boot, y_boot], axis=1))

            tree = DecisionTree()
            fitted = tree.fit(X_boot, y_boot)
            # Guard: if fit() handed back a node rather than the tree itself,
            # adopt it as the root so the forest only ever holds trees.
            if fitted is not tree:
                tree.root = fitted

            if verbose:
                tree.display_tree()
            forest.append(tree)

        # Assign once at the end so a re-fit never accumulates old trees.
        self.forest = forest
        return self

    def predict(self, X):
        """Aggregation

        Collects one prediction per tree for ``X``, prints the list of votes
        and returns the most common one.

        Raises
        ------
        ValueError
            If the forest has not been fitted.
        """
        if not self.forest:
            raise ValueError("RandomForest.predict called before fit: the forest is empty")
        pred = [model.predict(X) for model in self.forest]
        print(pred)
        return stat.mode(pred)

In [10]:
# 50 trees; each bootstrap sample draws as many rows as the full dataset has.
rf = RandomForest(n_estimators=50, n_sample=len(X))

In [11]:
# Train the forest. No random seed is set, so the sampled rows and feature
# subsets (and therefore the individual votes) can differ between runs.
rf.fit(X, y)

In [12]:
# The same single query instance, this time classified by the forest's majority vote.
query = {'Stream': 'true', 'Slope': 'steep', 'Elevation': 'high'}
X_test = pd.DataFrame([query])
rf.predict(X_test)

['conifer', 'chapparal', 'conifer', 'conifer', 'riparian', 'riparian', 'chapparal', 'conifer', 'chapparal', 'chapparal', 'chapparal', 'chapparal', 'conifer', 'chapparal', 'chapparal', 'conifer', 'chapparal', 'conifer', 'chapparal', 'chapparal', 'conifer', 'chapparal', 'conifer', 'chapparal', 'conifer', 'chapparal', 'chapparal', 'conifer', 'chapparal', 'chapparal', 'chapparal', 'chapparal', 'chapparal', 'conifer', 'chapparal', 'chapparal', 'chapparal', 'chapparal', 'chapparal', 'chapparal', 'chapparal', 'conifer', 'conifer', 'riparian', 'chapparal', 'chapparal', 'chapparal', 'riparian', 'chapparal', 'conifer']


'chapparal'