In [31]:
import numpy as np
import pandas as pd
import statistics as stat

In [32]:
np.random.seed(42)

In [33]:
data = {
    'Stream': ['false', 'true', 'true', 'false', 'false', 'true', 'true'],
    'Slope': ['steep', 'moderate', 'steep', 'steep', 'flat', 'steep', 'steep'],
    'Elevation': ['high', 'low', 'medium', 'medium', 'high', 'highest', 'high'],
    'Vegetation': ['chapparal', 'riparian', 'riparian', 'chapparal', 'conifer', 'conifer', 'chapparal']
}

df = pd.DataFrame(data)
df

Unnamed: 0,Stream,Slope,Elevation,Vegetation
0,False,steep,high,chapparal
1,True,moderate,low,riparian
2,True,steep,medium,riparian
3,False,steep,medium,chapparal
4,False,flat,high,conifer
5,True,steep,highest,conifer
6,True,steep,high,chapparal


In [34]:
X, y = df.iloc[:, :-1], df.iloc[:, -1]
X['Stream'][0]

'false'

In [35]:
class Node:
    def __init__(self, data=None, feature=None, parent=None, leaf=False, arc=None, children=[]):
        self.data = data
        self.feature = feature
        self.parent = parent
        self.leaf = leaf
        self.arc = arc
        self.children = children
    
    @property
    def isLeaf(self):
        return self.leaf

    @property
    def y_data(self):
        return self.data.iloc[:, -1]
    
    @property
    def X_data(self):
        return self.data.iloc[:, :-1]

In [36]:
class DecisionTree:
    """
    A Rudimentary Decision Tree Classifier
    
    TODO
        - Pruning
        - Continuous Feature Values
    """
    def __init__(self):
        self.root = None
        # self.max_depth = float('inf')

    def entropy(self, X, y):
        proba = lambda t: len(X.loc[y==t]) / len(X)
        return -sum([proba(t) * np.log2(proba(t)) for t in y.unique()])
    
    def rem(self, X, y, d):
        weight = lambda t: len(X.loc[X[d]==t]) / len(X)
        return sum([weight(t) * self.entropy(X.loc[X[d]==t], y.loc[X[d]==t]) for t in X[d].unique()])

    def information_gain(self, X, y, d):
        return self.entropy(X, y) - self.rem(X, y, d)

    def fit(self, X, y, *, arc=None, parent=None):
        """Performs the ID3 algorithm"""
        if (len(y.unique())) == 1:  # all instances have the same target feature values
            return Node(data=pd.concat([X, y], axis=1),
                        feature=y.iloc[0],
                        leaf=True,
                        arc=arc,
                        parent=parent)
        elif all((X[d] == X[d].iloc[0]).all() for d in X.columns):  # if all feature values are identical
            return Node(feature=stat.mode(parent.y_data),
                        leaf=True,
                        arc=arc,
                        parent=parent)
        elif not len(X):  # dataset is empty, return a leaf node labeled with the majority class of the parent
            return Node(feature=stat.mode(parent.y_data),
                        leaf=True,
                        arc=arc,
                        parent=parent)

        best_feature = X.columns[np.argmax([self.information_gain(X, y, d) for d in X.columns])]
        best_node = Node(
            data=pd.concat([X, y], axis=1),
            feature=best_feature,
            arc=arc,
            parent=parent)
        
        print('Best Feature', best_feature)

        partitions = [[X.loc[X[best_feature]==t], y.loc[X[best_feature]==t], t]
                      for t in X[best_feature].unique()]
        for i, _ in enumerate(partitions):
            partitions[i][0] = partitions[i][0].drop([best_feature], axis=1)

        # if not self.root:
        #     self.root = best_node

        for *d, t in partitions:
            # print(pd.concat([*d], axis=1))
            best_node.children.append(self.fit(*d, arc=t, parent=best_node))

        # if self.root is best_node:
        #     self.root = best_node

        return best_node

    def predict(self, X):
        pass

In [37]:
dt = DecisionTree()

In [38]:
root = dt.fit(X, y)

Best Feature Elevation
Best Feature Slope
Best Feature Stream


In [39]:
for i in root.children:
    print(i.parent.feature, '->', i.arc)
    print('\t', i.feature)

Slope -> steep
	 chapparal
Slope -> flat
	 conifer
Elevation -> high
	 Slope
Elevation -> low
	 riparian
Stream -> true
	 riparian
Stream -> false
	 chapparal
Elevation -> medium
	 Stream
Elevation -> highest
	 conifer


In [40]:
class RandomForest():
    def __init__(self, n_estimators=5):
        self.forest = []
        self.n_estimators = n_estimators

    def subsample(self, X, n_sample=2):
        return np.random.choice(X.columns.to_numpy(), n_sample, replace=False)

    def make_bootstrap(self, X, y, n_sample, key=True):
        feature_subset = self.subsample(X, int(np.floor(np.log2(len(X)))))
        X = X.sample(n=n_sample, replace=key, random_state=42)
        y = y.sample(n=n_sample, replace=key, random_state=42)
        return X[feature_subset], y

    def fit(self, X, y):
        test_sample = 7
        for _ in range(self.n_estimators):
            print("===Bootstrap Sample===")
            a, b = self.make_bootstrap(X, y, test_sample)
            print(pd.concat([a, b], axis=1))
            DecisionTree().fit(a, b)
        # self.forest = [DecisionTree().fit(*self.make_bootstrap(X, y, test_sample))
        #                for _ in range(self.n_estimators)]

    def predict(self, X):
        """Aggregation"""
        return stat.mode([dt.predict(X) for dt in self.forest])

In [41]:
rf = RandomForest()

In [42]:
rf.fit(X, y)

===Bootstrap Sample===
  Stream  Slope Vegetation
6   true  steep  chapparal
3  false  steep  chapparal
4  false   flat    conifer
6   true  steep  chapparal
2   true  steep   riparian
4  false   flat    conifer
4  false   flat    conifer
Best Feature Slope
Best Feature Stream
===Bootstrap Sample===
   Slope Elevation Vegetation
6  steep      high  chapparal
3  steep    medium  chapparal
4   flat      high    conifer
6  steep      high  chapparal
2  steep    medium   riparian
4   flat      high    conifer
4   flat      high    conifer
Best Feature Slope
Best Feature Elevation
===Bootstrap Sample===
  Stream  Slope Vegetation
6   true  steep  chapparal
3  false  steep  chapparal
4  false   flat    conifer
6   true  steep  chapparal
2   true  steep   riparian
4  false   flat    conifer
4  false   flat    conifer
Best Feature Slope
Best Feature Stream
===Bootstrap Sample===
   Slope Elevation Vegetation
6  steep      high  chapparal
3  steep    medium  chapparal
4   flat      high    coni