In [1]:
import numpy as np
import pandas as pd
import statistics as stat
from copy import deepcopy

In [2]:
# Toy dataset: three categorical descriptive features plus the target
# feature (Vegetation), which is kept as the last column.
data = dict(
    Stream=['false', 'true', 'true', 'false', 'false', 'true', 'true'],
    Slope=['steep', 'moderate', 'steep', 'steep', 'flat', 'steep', 'steep'],
    Elevation=['high', 'low', 'medium', 'medium', 'high', 'highest', 'high'],
    Vegetation=['chapparal', 'riparian', 'riparian', 'chapparal', 'conifer', 'conifer', 'chapparal'],
)

df = pd.DataFrame(data)
df

Unnamed: 0,Stream,Slope,Elevation,Vegetation
0,False,steep,high,chapparal
1,True,moderate,low,riparian
2,True,steep,medium,riparian
3,False,steep,medium,chapparal
4,False,flat,high,conifer
5,True,steep,highest,conifer
6,True,steep,high,chapparal


In [3]:
# Descriptive features are every column but the last; the target is the last column.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [4]:
# Base Case 1
# Every instance in the partition shares one target value -> the node becomes a leaf.
low_elevation = X['Elevation'] == 'low'
a = y.loc[low_elevation]
a.unique().size == 1

True

In [5]:
# Base Case 2
# Every column is constant (each value equals the column's first value),
# so no descriptive feature can split the instances any further.
data = {'A': [0, 0], 'B': [1, 1]}

a = pd.DataFrame(data)
print(a)

print(all(a[d].eq(a[d].iloc[0]).all() for d in a.columns))

# Counter-example: both columns vary, so the check must come out False.
data = {'A': [0, 1], 'B': [0, 1]}

a = pd.DataFrame(data)
print(a)
all(a[d].eq(a[d].iloc[0]).all() for d in a.columns)

   A  B
0  0  1
1  0  1
True
   A  B
0  0  0
1  1  1


False

In [6]:
# Base Case 3
# A frame built from no columns at all reports itself as empty.
a = pd.DataFrame(dict())
a.empty

True

In [7]:
class Node:
    """A vertex of the decision tree, linked to its parent and its children."""

    def __init__(
        self,
        feature=None,
        data=None,
        arc=None,
        leaf=False,
        parent=None,
        children=None
    ):
        # Label shown for this node: the tested feature for an internal node,
        # the predicted target value for a leaf.
        self.feature = feature
        # Frame of the instances that reached this node; the last column is the target.
        self.data = data
        # Feature value on the edge leading from the parent to this node.
        self.arc = arc
        self.leaf = leaf
        # self.threshold = threshold
        self.parent = parent
        # A fresh list per node. A literal `[]` default would be created once
        # and shared by every node that did not pass its own list.
        self.children = [] if children is None else children

    def __str__(self):
        """Returns the path from this node up to the root as
        ``feature <--arc-- parent_feature <--arc-- ... root_feature``."""
        pieces = []
        curr = self
        while curr.parent is not None:
            pieces.append(f"{curr.feature} <--{curr.arc}-- ")
            curr = curr.parent
        # __str__ must return a str even when the label is not one (e.g. a number).
        pieces.append(str(curr.feature))
        return ''.join(pieces)

    @property
    def isLeaf(self):
        """Whether this node was created as a leaf."""
        return self.leaf

    @property
    def y_data(self):
        """Last column of ``data`` (the target feature)."""
        return self.data.iloc[:, -1]

    @property
    def X_data(self):
        """Every column of ``data`` except the last (the descriptive features)."""
        return self.data.iloc[:, :-1]

In [8]:
class DecisionTree:
    """
    A Rudimentary Decision Tree Classifier

    Grows an ID3 tree: at each node the descriptive feature with the highest
    information gain is tested, and one child is created per observed value
    of that feature. Verbose tracing is controlled by a notebook-level
    variable named ``debug``; when that variable is not defined, tracing is off.

    TODO
        - Pruning
        - Continuous Feature Values
    """
    def __init__(self):
        self.root = None

    @staticmethod
    def _verbose():
        """Reads the module-level ``debug`` flag, treating a missing flag as off."""
        return bool(globals().get('debug', False))

    def display_tree(self, node=None, depth=''):
        """Displays the decision tree

        Prints ``node`` (the root when omitted) and its descendants, one node
        per line, each level prefixed with one more tab. Prints nothing when
        the tree has not been fitted.
        """
        if node is None:
            node = self.root
        if node is None:
            return

        print(depth, node.feature)
        for child in node.children:
            self.display_tree(child, depth+'\t')

    def partition(self, X, y, d, t):
        """Selects the instances where feature (d) equals (t)

        Returns a tuple ``(X_part, y_part, t)`` where ``X_part`` no longer
        contains column (d).
        """
        mask = X[d] == t
        return X.loc[mask].drop(columns=[d]), y.loc[mask], t

    def entropy(self, X, y):
        """Measures the amount of uncertainty/impurity/heterogeneity in (X, y)

        Only the target values in (y) enter the computation; (X) is kept in
        the signature for existing callers.
        """
        total = len(y)
        probabilities = [int((y == t).sum()) / total for t in y.unique()]
        return -sum([p * np.log2(p) for p in probabilities])

    def rem(self, X, y, d):
        """Measures the entropy after partitioning (X, y) on feature (d)"""
        total = len(X)

        def weighted_entropy(t):
            mask = X[d] == t
            return (int(mask.sum()) / total) * self.entropy(X.loc[mask], y.loc[mask])

        return sum([weighted_entropy(t) for t in X[d].unique()])

    def information_gain(self, X, y, d):
        """Measures the reduction in the overall entropy in (X, y) achieved by testing on feature (d)"""
        # Computed once and reused for both the trace line and the result.
        before = self.entropy(X, y)
        after = self.rem(X, y, d)
        if self._verbose():
            print(f"{d} = {before:.3f} - {after:.3f} = {before - after:.3f}")

        return before - after

    def fit(self, X, y, *, parent=None, level=None):
        """Performs the ID3 algorithm

        Called without ``parent`` it (re)builds the whole tree, stores it in
        ``self.root`` and returns ``self``, including when the data is already
        pure and the tree is a single leaf. Called with ``parent`` it returns
        the sub-tree ``Node`` for that partition.

        Raises ValueError when the top-level call receives no instances.
        """
        if parent is None and len(y) == 0:
            raise ValueError("cannot fit a decision tree on an empty dataset")

        node = self._grow(X, y, parent, level)
        if parent is None:
            self.root = node
            return self
        return node

    def _grow(self, X, y, parent, level):
        """Builds and returns the node (with its sub-tree) for the partition (X, y)."""
        verbose = self._verbose()

        # This check comes first: the later checks index the first row.
        if len(y) == 0:  # dataset is empty, return a leaf node labeled with the majority class of the parent
            if verbose:
                print("Dataset is empty\n")
            return Node(feature=stat.mode(parent.y_data),
                        arc=level,
                        leaf=True,
                        parent=parent,
                        children=[])

        if len(y.unique()) == 1:  # all instances have the same target feature values
            if verbose:
                print("All instances have the same target feature value\n")
            return Node(feature=y.iloc[0],
                        data=pd.concat([X, y], axis=1),
                        arc=level,
                        leaf=True,
                        parent=parent,
                        children=[])

        if all((X[d] == X[d].iloc[0]).all() for d in X.columns):  # if all feature values are identical
            if verbose:
                print("All instances have the same descriptive features\n")
            return Node(feature=stat.mode(y),
                        data=pd.concat([X, y], axis=1),
                        arc=level,
                        leaf=True,
                        parent=parent,
                        children=[])

        if X.columns.size == 1:
            best_feature = X.columns[0]
        else:
            if verbose:
                print("===Information Gain===")
            best_feature = X.columns[np.argmax([self.information_gain(X, y, d) for d in X.columns])]

        # An explicit fresh `children` list keeps siblings from sharing one
        # list, so the node no longer needs to be deep-copied (which also
        # cloned the whole parent chain and its data).
        best_node = Node(feature=best_feature,
                         data=pd.concat([X, y], axis=1),
                         arc=level,
                         parent=parent,
                         children=[])

        if verbose:
            print()
            print("===Best Feature===")
            print(best_feature)
            print()

        partitions = [self.partition(X, y, best_feature, t) for t in X[best_feature].unique()]

        for X_part, y_part, t in partitions:
            if verbose:
                print(f"===Partitioned Dataset {t}===")
                print(pd.concat([X_part, y_part], axis=1))
                print()
            best_node.children.append(self._grow(X_part, y_part, best_node, t))

        return best_node

    def predict(self, X):
        """Classifies the first row of (X) by walking the tree from the root

        When a node has no branch for the row's value (a value not seen at
        that node during fitting), the majority target value of the instances
        stored at that node is returned.

        Raises ValueError when the tree has not been fitted.
        """
        node = self.root
        if node is None:
            raise ValueError("DecisionTree has not been fitted; call fit() first")

        while not node.isLeaf:
            arc = X[node.feature].iloc[0]
            for child in node.children:
                if arc == child.arc:
                    node = child
                    break
            else:
                # No matching branch: without this fallback the loop never ends.
                return stat.mode(node.y_data)
        return node.feature

In [9]:
# Switch on the verbose trace that the classes read, then grow a tree on the full dataset.
debug = 1
dt = DecisionTree()
dt = dt.fit(X, y)

===Information Gain===
Stream = 1.557 - 1.251 = 0.306
Slope = 1.557 - 0.979 = 0.577
Elevation = 1.557 - 0.679 = 0.877

===Best Feature===
Elevation

===Partitioned Dataset high===
  Stream  Slope Vegetation
0  false  steep  chapparal
4  false   flat    conifer
6   true  steep  chapparal

===Information Gain===
Stream = 0.918 - 0.667 = 0.252
Slope = 0.918 - 0.000 = 0.918

===Best Feature===
Slope

===Partitioned Dataset steep===
  Stream Vegetation
0  false  chapparal
6   true  chapparal

All instances have the same target feature value

===Partitioned Dataset flat===
  Stream Vegetation
4  false    conifer

All instances have the same target feature value

===Partitioned Dataset low===
  Stream     Slope Vegetation
1   true  moderate   riparian

All instances have the same target feature value

===Partitioned Dataset medium===
  Stream  Slope Vegetation
2   true  steep   riparian
3  false  steep  chapparal

===Information Gain===
Stream = 1.000 - 0.000 = 1.000
Slope = 1.000 - 1.000 = 0

In [10]:
# Print the fitted tree, one node per line; each level is indented by one more tab.
dt.display_tree()

 Elevation
	 Slope
		 chapparal
		 conifer
	 riparian
	 Stream
		 riparian
		 chapparal
	 conifer


In [11]:
# One query instance: a single row with a value for each descriptive feature.
X_test = pd.DataFrame(
    [['true', 'steep', 'high']],
    columns=['Stream', 'Slope', 'Elevation'],
)
dt.predict(X_test)

'chapparal'

In [12]:
class RandomForest():
    """
    A bagged ensemble of DecisionTree classifiers

    Each tree is grown on a bootstrap sample of the rows restricted to a
    random subset of the descriptive features; predictions are combined by
    majority vote. Verbose tracing is controlled by a notebook-level variable
    named ``debug``; when that variable is not defined, tracing is off.

    Parameters
        n_estimators: number of trees to grow
        n_sample: number of rows drawn for each bootstrap sample
        random_state: optional seed; when given, all sampling is drawn from a
            private ``np.random.RandomState`` so results are repeatable. When
            omitted, numpy's global random state is used.
    """
    def __init__(self, n_estimators=5, n_sample=2, random_state=None):
        self.n_estimators = n_estimators
        self.n_sample = n_sample
        self.random_state = random_state
        self._rng = None if random_state is None else np.random.RandomState(random_state)
        self.forest = []

    def __repr__(self):
        # __repr__ has to return a str; returning the list of trees raises TypeError.
        return (f"RandomForest(n_estimators={self.n_estimators}, "
                f"n_sample={self.n_sample}, fitted_trees={len(self.forest)})")

    def subsample(self, X, n_sample=2):
        """Draws (n_sample) distinct column names of (X) at random"""
        chooser = np.random if self._rng is None else self._rng
        return chooser.choice(X.columns.to_numpy(), n_sample, replace=False)

    def make_bootstrap(self, X, y, n_sample, key=True):
        """Draws one training sample for a single tree

        Picks a feature subset, then samples (n_sample) rows of (X, y), with
        replacement when (key) is true. Returns ``(X_sample, y_sample)``.
        """
        n_rows = len(X)
        n_features = int(np.log2(n_rows)) if n_rows > 0 else 0
        # Keep the subset size usable: at least one feature and never more
        # than the number of columns available.
        n_features = max(1, min(n_features, X.shape[1]))

        feature_subset = self.subsample(X, n_features)
        d = pd.concat([X, y], axis=1)
        d = d.sample(n=n_sample, replace=key, random_state=self._rng)
        return d.iloc[:, :-1][feature_subset], d.iloc[:, -1]

    def fit(self, X, y):
        """Grows ``n_estimators`` trees, replacing any previously fitted forest"""
        verbose = bool(globals().get('debug', False))
        forest = []
        for i in range(self.n_estimators):
            if verbose:
                print(f"===Decision Tree {i+1}===")
            X_boot, y_boot = self.make_bootstrap(X, y, self.n_sample)
            if verbose:
                print(pd.concat([X_boot, y_boot], axis=1))

            tree = DecisionTree()
            fitted = tree.fit(X_boot, y_boot)
            if fitted is not tree:
                # fit handed back a bare node instead of the tree (a sample
                # that needs no split); keep it as this tree's root so the
                # forest only ever holds DecisionTree objects.
                tree.root = fitted

            if verbose:
                tree.display_tree()
            forest.append(tree)
        self.forest = forest

    def predict(self, X):
        """Aggregation

        Returns the most common prediction among the trees for the first row
        of (X). Raises ValueError when the forest has not been fitted.
        """
        if not self.forest:
            raise ValueError("RandomForest has no trees; call fit() before predict()")
        assert all(isinstance(m, DecisionTree) for m in self.forest)
        pred = [model.predict(X) for model in self.forest]
        if globals().get('debug', False):
            print(pred)
        return stat.mode(pred)

In [13]:
# 50 trees; each bootstrap sample draws as many rows as the training set has.
rf = RandomForest(n_estimators=50, n_sample=len(X))

In [14]:
# Grow the forest; while `debug` is truthy every tree's bootstrap sample and structure are printed.
rf.fit(X, y)

===Decision Tree 1===
      Slope Stream Vegetation
0     steep  false  chapparal
2     steep   true   riparian
1  moderate   true   riparian
1  moderate   true   riparian
4      flat  false    conifer
3     steep  false  chapparal
3     steep  false  chapparal
===Information Gain===
Slope = 1.449 - 0.464 = 0.985
Stream = 1.449 - 0.464 = 0.985

===Best Feature===
Slope

===Partitioned Dataset steep===
  Stream Vegetation
0  false  chapparal
2   true   riparian
3  false  chapparal
3  false  chapparal


===Best Feature===
Stream

===Partitioned Dataset false===
  Vegetation
0  chapparal
3  chapparal
3  chapparal

All instances have the same target feature value

===Partitioned Dataset true===
  Vegetation
2   riparian

All instances have the same target feature value

===Partitioned Dataset moderate===
  Stream Vegetation
1   true   riparian
1   true   riparian

All instances have the same target feature value

===Partitioned Dataset flat===
  Stream Vegetation
4  false    conifer

All i

All instances have the same target feature value

===Partitioned Dataset low===
      Slope Vegetation
1  moderate   riparian
1  moderate   riparian
1  moderate   riparian

All instances have the same target feature value

===Partitioned Dataset highest===
   Slope Vegetation
5  steep    conifer

All instances have the same target feature value

===Partitioned Dataset medium===
   Slope Vegetation
2  steep   riparian

All instances have the same target feature value

 Elevation
	 chapparal
	 riparian
	 conifer
	 riparian
===Decision Tree 11===
  Stream  Slope Vegetation
0  false  steep  chapparal
6   true  steep  chapparal
6   true  steep  chapparal
4  false   flat    conifer
4  false   flat    conifer
2   true  steep   riparian
3  false  steep  chapparal
===Information Gain===
Stream = 1.379 - 0.965 = 0.414
Slope = 1.379 - 0.516 = 0.863

===Best Feature===
Slope

===Partitioned Dataset steep===
  Stream Vegetation
0  false  chapparal
6   true  chapparal
6   true  chapparal
2   true   

All instances have the same target feature value

===Partitioned Dataset high===
   Slope Vegetation
6  steep  chapparal
0  steep  chapparal

All instances have the same target feature value

===Partitioned Dataset medium===
   Slope Vegetation
2  steep   riparian
3  steep  chapparal

All instances have the same descriptive features

===Partitioned Dataset low===
      Slope Vegetation
1  moderate   riparian

All instances have the same target feature value

 Elevation
	 conifer
	 chapparal
	 riparian
	 riparian
===Decision Tree 20===
  Elevation     Slope Vegetation
4      high      flat    conifer
3    medium     steep  chapparal
1       low  moderate   riparian
1       low  moderate   riparian
0      high     steep  chapparal
2    medium     steep   riparian
2    medium     steep   riparian
===Information Gain===
Elevation = 1.379 - 0.679 = 0.700
Slope = 1.379 - 0.571 = 0.807

===Best Feature===
Slope

===Partitioned Dataset flat===
  Elevation Vegetation
4      high    conifer

All

All instances have the same target feature value

 Elevation
	 riparian
	 conifer
	 riparian
===Decision Tree 28===
      Slope Elevation Vegetation
1  moderate       low   riparian
2     steep    medium   riparian
0     steep      high  chapparal
6     steep      high  chapparal
0     steep      high  chapparal
4      flat      high    conifer
6     steep      high  chapparal
===Information Gain===
Slope = 1.379 - 0.516 = 0.863
Elevation = 1.379 - 0.516 = 0.863

===Best Feature===
Slope

===Partitioned Dataset moderate===
  Elevation Vegetation
1       low   riparian

All instances have the same target feature value

===Partitioned Dataset steep===
  Elevation Vegetation
2    medium   riparian
0      high  chapparal
6      high  chapparal
0      high  chapparal
6      high  chapparal


===Best Feature===
Elevation

===Partitioned Dataset medium===
  Vegetation
2   riparian

All instances have the same target feature value

===Partitioned Dataset high===
  Vegetation
0  chapparal
6  ch

Stream = 1.557 - 1.251 = 0.306

===Best Feature===
Slope

===Partitioned Dataset steep===
  Stream Vegetation
2   true   riparian
6   true  chapparal
0  false  chapparal
5   true    conifer
0  false  chapparal


===Best Feature===
Stream

===Partitioned Dataset true===
  Vegetation
2   riparian
6  chapparal
5    conifer

All instances have the same descriptive features

===Partitioned Dataset false===
  Vegetation
0  chapparal
0  chapparal

All instances have the same target feature value

===Partitioned Dataset moderate===
  Stream Vegetation
1   true   riparian

All instances have the same target feature value

===Partitioned Dataset flat===
  Stream Vegetation
4  false    conifer

All instances have the same target feature value

 Slope
	 Stream
		 riparian
		 chapparal
	 riparian
	 conifer
===Decision Tree 39===
  Stream     Slope Vegetation
5   true     steep    conifer
3  false     steep  chapparal
4  false      flat    conifer
2   true     steep   riparian
2   true     steep   r

All instances have the same target feature value

===Partitioned Dataset false===
  Vegetation
0  chapparal
4    conifer
0  chapparal
0  chapparal

All instances have the same descriptive features

===Partitioned Dataset medium===
  Stream Vegetation
3  false  chapparal

All instances have the same target feature value

===Partitioned Dataset low===
  Stream Vegetation
1   true   riparian

All instances have the same target feature value

 Elevation
	 Stream
		 chapparal
		 chapparal
	 chapparal
	 riparian
===Decision Tree 48===
  Stream     Slope Vegetation
0  false     steep  chapparal
0  false     steep  chapparal
5   true     steep    conifer
6   true     steep  chapparal
1   true  moderate   riparian
5   true     steep    conifer
4  false      flat    conifer
===Information Gain===
Stream = 1.449 - 1.251 = 0.198
Slope = 1.449 - 0.694 = 0.755

===Best Feature===
Slope

===Partitioned Dataset steep===
  Stream Vegetation
0  false  chapparal
0  false  chapparal
5   true    conifer
6 

In [15]:
# Same single-row query as before, this time classified by the forest's majority vote.
X_test = pd.DataFrame(
    [['true', 'steep', 'high']],
    columns=['Stream', 'Slope', 'Elevation'],
)
rf.predict(X_test)

['riparian', 'conifer', 'chapparal', 'chapparal', 'chapparal', 'riparian', 'chapparal', 'conifer', 'chapparal', 'chapparal', 'chapparal', 'chapparal', 'chapparal', 'chapparal', 'conifer', 'chapparal', 'chapparal', 'chapparal', 'chapparal', 'chapparal', 'chapparal', 'conifer', 'chapparal', 'chapparal', 'chapparal', 'conifer', 'conifer', 'chapparal', 'chapparal', 'riparian', 'riparian', 'riparian', 'conifer', 'chapparal', 'chapparal', 'chapparal', 'chapparal', 'riparian', 'riparian', 'chapparal', 'chapparal', 'riparian', 'conifer', 'chapparal', 'chapparal', 'conifer', 'chapparal', 'conifer', 'riparian', 'conifer']


'chapparal'