In [103]:
import numpy as np
import pandas as pd
import statistics as stat

In [104]:
np.random.seed(42)

In [105]:
data = {
    'Stream': ['false', 'true', 'true', 'false', 'false', 'true', 'true'],
    'Slope': ['steep', 'moderate', 'steep', 'steep', 'flat', 'steep', 'steep'],
    'Elevation': ['high', 'low', 'medium', 'medium', 'high', 'highest', 'high'],
    'Vegetation': ['chapparal', 'riparian', 'riparian', 'chapparal', 'conifer', 'conifer', 'chapparal']
}

df = pd.DataFrame(data)
df

Unnamed: 0,Stream,Slope,Elevation,Vegetation
0,False,steep,high,chapparal
1,True,moderate,low,riparian
2,True,steep,medium,riparian
3,False,steep,medium,chapparal
4,False,flat,high,conifer
5,True,steep,highest,conifer
6,True,steep,high,chapparal


In [106]:
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [107]:
class Node:
    def __init__(self, feature=None, data=None, arc=None, leaf=False, parent=None, children=[]):
        self.feature = feature
        self.data = data
        self.arc = arc
        self.leaf = leaf
        # self.threshold = threshold
        self.parent = parent
        self.children = children
    
    @property
    def isLeaf(self):
        return self.leaf

    @property
    def y_data(self):
        return self.data.iloc[:, -1]
    
    @property
    def X_data(self):
        return self.data.iloc[:, :-1]

In [108]:
class DecisionTree:
    """
    A Rudimentary Decision Tree Classifier
    
    TODO
        - Pruning
        - Continuous Feature Values
    """
    def __init__(self):
        self.root = None
        # self.max_depth = float('inf')

    def display(self, node, depth=0):
        """Displays the decision tree"""
        if node is None:
            return
        for child in node.children:
            print(child.parent.feature)
            print(' ' * 4 * depth + '-> ' + child.feature)
            # self.display(child, depth+1)

    def entropy(self, X, y):
        """Measures the amount of uncertainty/impurity/heterogeneity in (X, y)"""
        proba = lambda t: len(X.loc[y==t]) / len(X)
        return -sum([proba(t) * np.log2(proba(t)) for t in y.unique()])
    
    def rem(self, X, y, d):
        """Measures the entropy after partitioning (X, y) on feature (d)"""
        weight = lambda t: len(X.loc[X[d]==t]) / len(X)
        return sum([weight(t) * self.entropy(X.loc[X[d]==t], y.loc[X[d]==t]) for t in X[d].unique()])

    def information_gain(self, X, y, d):
        """Measures the reduction in the overall entropy in (X, y) achieved by testing on feature (d)"""
        print(f"Info-Gain({d}):", self.entropy(X, y) - self.rem(X, y, d))
        return self.entropy(X, y) - self.rem(X, y, d)

    def fit(self, X, y, *, level=None, parent=None):
        """Performs the ID3 algorithm"""
        if len(y.unique()) == 1:  # all instances have the same target feature values
            return Node(feature=y.iloc[0],
                        data=pd.concat([X, y], axis=1),
                        arc=level,
                        leaf=True,
                        parent=parent)
        elif all((X[d] == X[d].iloc[0]).all() for d in X.columns):  # if all feature values are identical
            return Node(feature=stat.mode(y),
                        data=pd.concat([X, y], axis=1),
                        arc=level,
                        leaf=True,
                        parent=parent)
        elif X.empty:  # dataset is empty, return a leaf node labeled with the majority class of the parent
            return Node(feature=stat.mode(parent.y_data),
                        leaf=True,
                        arc=level,
                        parent=parent)

        best_feature = X.columns[np.argmax([self.information_gain(X, y, d) for d in X.columns])]
        best_node = Node(feature=best_feature,
                         data=pd.concat([X, y], axis=1),
                         arc=level,
                         parent=parent)
        
        print('Best Feature =', best_feature)

        partitions = [[X.loc[X[best_feature]==t], y.loc[X[best_feature]==t], t]
                      for t in X[best_feature].unique()]
        for i, _ in enumerate(partitions):
            partitions[i][0] = partitions[i][0].drop([best_feature], axis=1)

        # if not self.root:
        #     self.root = best_node

        for *d, t in partitions:
            print(pd.concat([*d], axis=1))
            best_node.children.append(self.fit(*d, level=t, parent=best_node))

        # if self.root is best_node:
        #     self.root = best_node

        return best_node

    def predict(self, X):
        return X

In [109]:
dt = DecisionTree()

In [110]:
root = dt.fit(X, y)

Info-Gain(Stream): 0.30595849286804166
Info-Gain(Slope): 0.5774062828523452
Info-Gain(Elevation): 0.8773870642966131
Best Feature = Elevation
  Stream  Slope Vegetation
0  false  steep  chapparal
4  false   flat    conifer
6   true  steep  chapparal
Info-Gain(Stream): 0.2516291673878229
Info-Gain(Slope): 0.9182958340544896
Best Feature = Slope
  Stream Vegetation
0  false  chapparal
6   true  chapparal
  Stream Vegetation
4  false    conifer
  Stream     Slope Vegetation
1   true  moderate   riparian
  Stream  Slope Vegetation
2   true  steep   riparian
3  false  steep  chapparal
Info-Gain(Stream): 1.0
Info-Gain(Slope): 0.0
Best Feature = Stream
   Slope Vegetation
2  steep   riparian
   Slope Vegetation
3  steep  chapparal
  Stream  Slope Vegetation
5   true  steep    conifer


In [111]:
X_test = pd.DataFrame({'Stream': ['true'],
        'Slope': ['moderate'],
        'Elevation': ['high']})

dt.predict(X_test)

Unnamed: 0,Stream,Slope,Elevation
0,True,moderate,high


In [112]:
class RandomForest():
    def __init__(self, n_estimators=5, n_sample=2):
        self.n_estimators = n_estimators
        self.n_sample = n_sample
        self.forest = []

    def __repr__(self):
        return self.forest

    def subsample(self, X, n_sample=2):
        return np.random.choice(X.columns.to_numpy(), n_sample, replace=False)

    def make_bootstrap(self, X, y, n_sample, key=True):
        feature_subset = self.subsample(X, int(np.floor(np.log2(len(X)))))
        X = X.sample(n=n_sample, replace=key, random_state=42)
        y = y.sample(n=n_sample, replace=key, random_state=42)
        return X[feature_subset], y

    def fit(self, X, y):
        for _ in range(self.n_estimators):
            print(f"===Decision Tree {_+1}===")
            a, b = self.make_bootstrap(X, y, self.n_sample)
            print(pd.concat([a, b], axis=1))
            DecisionTree().fit(a, b)
        # self.forest = [DecisionTree().fit(*self.make_bootstrap(X, y, self.n_sample))
        #                for _ in range(self.n_estimators)]

    def predict(self, X):
        """Aggregation"""
        return stat.mode([dt.predict(X) for dt in self.forest])

In [113]:
rf = RandomForest(n_sample=len(X))

In [114]:
rf.fit(X, y)

===Decision Tree 1===
  Stream  Slope Vegetation
6   true  steep  chapparal
3  false  steep  chapparal
4  false   flat    conifer
6   true  steep  chapparal
2   true  steep   riparian
4  false   flat    conifer
4  false   flat    conifer
Info-Gain(Stream): 0.5916727785823276
Info-Gain(Slope): 0.9852281360342516
Best Feature = Slope
  Stream Vegetation
6   true  chapparal
3  false  chapparal
6   true  chapparal
2   true   riparian
Info-Gain(Stream): 0.12255624891826566
Best Feature = Stream
  Vegetation
6  chapparal
6  chapparal
2   riparian
  Vegetation
3  chapparal
  Stream Vegetation
4  false    conifer
4  false    conifer
4  false    conifer
===Decision Tree 2===
   Slope Elevation Vegetation
6  steep      high  chapparal
3  steep    medium  chapparal
4   flat      high    conifer
6  steep      high  chapparal
2  steep    medium   riparian
4   flat      high    conifer
4   flat      high    conifer
Info-Gain(Slope): 0.9852281360342516
Info-Gain(Elevation): 0.46956521111470717
Best F