In [53]:
import numpy as np
import pandas as pd
from copy import deepcopy
from statistics import mode
from sklearn.metrics import confusion_matrix

In [54]:
df = pd.read_pickle(r"/Users/duong-jason/Desktop/dc/project_2/dataset/cancer.pkl")
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [55]:
data = {
    'Stream': ['false', 'true', 'true', 'false', 'false', 'true', 'true'],
    'Slope': ['steep', 'moderate', 'steep', 'steep', 'flat', 'steep', 'steep'],
    'Elevation': ['high', 'low', 'medium', 'medium', 'high', 'highest', 'high'],
    'Vegetation': ['chapparal', 'riparian', 'riparian', 'chapparal', 'conifer', 'conifer', 'chapparal']
}

df = pd.DataFrame(data)
X, y = df.iloc[:, :-1], df.iloc[:, -1]
X.loc[X["Stream"]=="true"]
X["Stream"].unique()

array(['false', 'true'], dtype=object)

In [56]:
# from sklearn.model_selection import train_test_split
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# Decision Tree Implementation

###  Gini Index
$Gini(t, \mathcal{D})=1-\sum_{l\in levels(t)}P(t=l)^2$

### Entropy *(Bits)*
$\mathcal{H}(t, \mathcal{D})=-\sum_{l\in levels(t)\\}^{}{P(t=l)\cdot\log_2(P(t=l))}$

### Rem
$rem(d,\mathcal{D})=\sum_{l\in levels(t)}{}\frac{|\mathcal{D}_{d=l}|}{\mathcal{D}}\cdot \mathcal{H}(t, \mathcal{D}_{d=l})$

### Information Gain
$IG(d, \mathcal{D})=\mathcal{H}(t, \mathcal{D})-rem(d, \mathcal{D})$

### Information Gain Ratio
$GR(d, \mathcal{D})=\frac{IG(d, \mathcal{D})}{\mathcal{H}(d, \mathcal{D})}$

In [57]:
class Node:
    def __init__(
        self,
        *,
        feature=None,
        data=None,
        branch=None,
        parent=None,
        leaf=False,
        children=[]
    ):
        self.feature = feature
        self.data = data
        self.branch = branch
        self.parent = parent
        self.leaf = leaf
        self.children = children

    @property
    def isLeaf(self):
        return self.leaf

    @property
    def X(self):
        return self.data.iloc[:, :-1]

    @property
    def y(self):
        return self.data.iloc[:, -1]

In [68]:
class DecisionTree:
    """A Rudimentary Decision Tree Classifier"""
    def __init__(self, *, metric="entropy", info="info_gain", criterion={}):
        """
        Criterion (Pre-Pruning) = {max_depth, partition_threshold, low_gain}
        Metric = {gain, gini}
        Info = {info_gain, info_gain_ratio}
        """
        self.root = None
        self.levels = None
        self.metric = self.entropy if metric == "entropy" else self.gini
        self.info = info
        self.criterion = criterion

    def __repr__(self, node=None, depth=0):
        """Displays the decision tree"""
        if not node:
            node = self.root

        print(depth * '\t', node.feature, f"(Branch={node.branch})")
        for child in node.children:
            self.__repr__(child, depth+1)

        return ""
    
    def partition(self, X, y, d, t):
        """Returns a subset of the training data with feature (d) of level (t)"""
        D = pd.concat([X.loc[X[d]==t], y.loc[X[d]==t]], axis=1)
        D = D.drop([d], axis=1)
        return D.iloc[:, :-1], D.iloc[:, -1], t

    def gini(self, X, y):
        proba = lambda t: len(X.loc[y==t]) / len(X)
        return 1-sum([proba(t)**2 for t in y.unique()])

    def entropy(self, X, y):
        """Measures the amount of uncertainty/impurity/heterogeneity in (X, y)"""
        proba = lambda t: len(X.loc[y==t]) / len(X)
        return -sum([proba(t) * np.log2(proba(t)) for t in y.unique()])

    def rem(self, X, y, d):
        """Measures the entropy after partitioning (X, y) on feature (d)"""
        weight = lambda t: len(X.loc[X[d]==t]) / len(X)
        return sum([weight(t) * self.metric(X.loc[X[d]==t], y.loc[X[d]==t]) for t in X[d].unique()])

    def information_gain(self, X, y, d):
        """Measures the reduction in the overall entropy in (X, y) achieved by testing on feature (d)"""
        if debug:
            print(f"{d} = {self.metric(X, y):.3f} - {self.rem(X, y, d):.3f} = {self.metric(X, y) - self.rem(X, y, d):.3f}") 
        return self.metric(X, y) - self.rem(X, y, d)

    def information_gain_ratio(self, X, y, d):
        proba = lambda t: len(X.loc[X[d]==t]) / len(X)
        entropy = lambda: -sum([proba(t) * np.log2(proba(t)) for t in X[d].unique()])

        if debug:
            print(f"{d} = ({self.metric(X, y):.3f} - {self.rem(X, y, d):.3f}) / {entropy()} = {(self.metric(X, y) - self.rem(X, y, d)) / entropy()}")
        return self.metric(X, y) - self.rem(X, y, d) / entropy()


    def build_tree(self, X, y, *, parent=None, branch=None, depth=0):
        """Performs the ID3 algorithm"""
        if len(y.unique()) == 1:  # all instances have the same target feature values
            if debug:
                print("All instances have the same target feature value\n")
            return Node(feature=y.iat[0],
                        data=pd.concat([X, y], axis=1),
                        branch=branch,
                        parent=parent,
                        leaf=True)
        elif X.empty:  # dataset is empty, return a leaf node labeled with the majority class of the parent
            if debug:
                print("Dataset is empty\n")
            return Node(feature=mode(parent.y),
                        branch=branch,
                        parent=parent,
                        leaf=True)
        elif all((X[d] == X[d].iloc[0]).all() for d in X.columns):  # if all feature values are identical
            if debug:
                print("All instances have the same descriptive features\n")
                return Node(feature=mode(y),
                            data=pd.concat([X, y], axis=1),
                            branch=branch,
                            parent=parent,
                            leaf=True)
        elif self.criterion.get("max_depth"):
            if depth >= self.criterion["max_depth"]:
                if debug:
                    print("Stopping at Max Depth\n")
                return Node(feature=mode(y),
                            data=pd.concat([X, y], axis=1),
                            branch=branch,
                            parent=parent,
                            leaf=True)
        elif self.criterion.get("partition_threshold"):
            if len(X) < self.criterion["partition_threshold"]:
                if debug:
                    print(f"Stopping at {len(X)} instances\n")
                return Node(feature=mode(y),
                            data=pd.concat([X, y], axis=1),
                            branch=branch,
                            parent=parent,
                            leaf=True)

        if debug:
            print("===Information Gain===")

        if self.info == "info_gain":
            gain = np.argmax([self.information_gain(X, y, d) for d in X.columns])
        elif self.info == "info_gain_ratio":
            gain = np.argmax([self.information_gain_ratio(X, y, d) for d in X.columns])

        if self.criterion.get('low_gain'):
            if gain <= self.criterion["low_gain"]:
                if debug:
                    print(f"Stopping at Gain={gain}\n")
                return Node(feature=mode(y),
                            data=pd.concat([X, y], axis=1),
                            branch=branch,
                            parent=parent,
                            leaf=True)

        best_feature = X.columns[gain]
        best_node = deepcopy(Node(feature=best_feature,
                                  data=pd.concat([X, y], axis=1),
                                  branch=branch,
                                  parent=parent))

        if debug:
            print()
            print("===Best Feature===")
            print(best_feature)
            print()

        X_levels = [self.partition(X, y, best_feature, level) for level in self.levels[best_feature]]

        for *d, level in X_levels:
            if debug:
                print(f"===Partitioned Dataset ({level})===")
                print(pd.concat(d, axis=1).head())
                print()
            best_node.children.append(self.build_tree(*d, parent=best_node, branch=level, depth=depth+1))
        return best_node

    def fit(self, X, y):
        self.levels = {k: X[k].unique() for k in X.columns}
        self.root = self.build_tree(X, y)
        return self

    def predict(self, x):
        node = self.root
        while not node.isLeaf:
            for child in node.children:
                if child.branch == x.get(node.feature).values:
                    node = child
                    break
        return node

    def predict_prob(self, x):
        node = self.predict(x)
        proba = lambda t: len(node.X.loc[node.y==t]) / len(node.X)
        return np.array([proba(t) for t in node.y.unique()])

    def score(self, X, y):
        y_hat = [self.predict(X.iloc[x].to_frame().T).feature for x in range(len(X))]
        return confusion_matrix(y, y_hat, labels=y.unique())

In [69]:
# %%time
# dt = DecisionTree(metric='entropy', criterion={'partition_threshold': 5}).fit(X_train, y_train)
# dt.score(X_test, y_test)

In [70]:
debug = 1
dt = DecisionTree(metric='entropy', info="info_gain_ratio").fit(X, y)

===Information Gain===
Stream = (1.557 - 1.251) / 0.9852281360342515 = 0.310545833678267
Slope = (1.557 - 0.979) / 1.1488348542809166 = 0.5026016408718359
Elevation = (1.557 - 0.679) / 1.8423709931771084 = 0.4762271375015451

===Best Feature===
Elevation

===Partitioned Dataset (high)===
  Stream  Slope Vegetation
0  false  steep  chapparal
4  false   flat    conifer
6   true  steep  chapparal

===Information Gain===
Stream = (0.918 - 0.667) / 0.9182958340544896 = 0.274017542121281
Slope = (0.918 - 0.000) / 0.9182958340544896 = 1.0

===Best Feature===
Slope

===Partitioned Dataset (steep)===
  Stream Vegetation
0  false  chapparal
6   true  chapparal

All instances have the same target feature value

===Partitioned Dataset (moderate)===
Empty DataFrame
Columns: [Stream, Vegetation]
Index: []

Dataset is empty

===Partitioned Dataset (flat)===
  Stream Vegetation
4  false    conifer

All instances have the same target feature value

===Partitioned Dataset (low)===
  Stream     Slope Veg

  print(f"{d} = ({self.metric(X, y):.3f} - {self.rem(X, y, d):.3f}) / {entropy()} = {(self.metric(X, y) - self.rem(X, y, d)) / entropy()}")
  return self.metric(X, y) - self.rem(X, y, d) / entropy()


In [71]:
dt

 Elevation (Branch=None)
	 Slope (Branch=high)
		 chapparal (Branch=steep)
		 chapparal (Branch=moderate)
		 conifer (Branch=flat)
	 riparian (Branch=low)
	 Slope (Branch=medium)
		 Stream (Branch=steep)
			 chapparal (Branch=false)
			 riparian (Branch=true)
		 riparian (Branch=moderate)
		 riparian (Branch=flat)
	 conifer (Branch=highest)




In [72]:
dt.predict_prob(X.iloc[0].to_frame().T)

array([1.])

# Random Forest Implementation

In [73]:
class RandomForest:
    def __init__(self, n_estimators=5, n_sample=2, info="info_gain", criterion={}):
        self.n_estimators = n_estimators
        self.n_sample = n_sample
        self.forest = [DecisionTree(info=info, criterion=criterion) for _ in range(n_estimators)]

    def sub_sample(self, X, n_sample=2):
        """Enforces feature randomness"""
        return np.random.choice(X.columns.to_numpy(), n_sample, replace=False)

    def bootstrap_sample(self, X, y, n_sample, key=True):
        feature_subset = self.sub_sample(X, int(np.log2(len(X))))
        d = pd.concat([X, y], axis=1)
        d = d.sample(n=n_sample, replace=key)
        return d.iloc[:, :-1][feature_subset], d.iloc[:, -1]

    def fit(self, X, y):
        for tree in self.forest:
            tree.fit(*self.bootstrap_sample(X, y, self.n_sample))
        return self

    def predict(self, x):
        assert all(isinstance(model, DecisionTree) for model in self.forest)
        return mode([dt.predict(x).feature for dt in self.forest])

    def score(self, X, y):
        y_hat = [self.predict(X.iloc[x].to_frame().T) for x in range(len(X))]
        return confusion_matrix(y, y_hat, labels=y.unique())

In [74]:
# %%time
# rf = RandomForest(n_estimators=20, n_sample=len(X_train), criterion={'partition_threshold': 5}).fit(X_train, y_train)
# rf.score(X_test, y_test)

In [75]:
rf = RandomForest(n_estimators=20, n_sample=len(X), info="info_gain_ratio").fit(X, y)

===Information Gain===
Slope = (1.379 - 0.857) / 0.9852281360342515 = 0.5294617736385712
Elevation = (1.379 - 0.694) / 1.1488348542809166 = 0.5964715920974528

===Best Feature===
Elevation

===Partitioned Dataset (medium)===
   Slope Vegetation
2  steep   riparian

All instances have the same target feature value

===Partitioned Dataset (high)===
   Slope Vegetation
6  steep  chapparal
4   flat    conifer
4   flat    conifer
4   flat    conifer
6  steep  chapparal

===Information Gain===
Slope = (0.971 - 0.000) / 0.9709505944546686 = 1.0

===Best Feature===
Slope

===Partitioned Dataset (steep)===
  Vegetation
6  chapparal
6  chapparal

All instances have the same target feature value

===Partitioned Dataset (flat)===
  Vegetation
4    conifer
4    conifer
4    conifer

All instances have the same target feature value

===Partitioned Dataset (highest)===
   Slope Vegetation
5  steep    conifer

All instances have the same target feature value

===Information Gain===
Stream = (1.379 - 1

  print(f"{d} = ({self.metric(X, y):.3f} - {self.rem(X, y, d):.3f}) / {entropy()} = {(self.metric(X, y) - self.rem(X, y, d)) / entropy()}")
  return self.metric(X, y) - self.rem(X, y, d) / entropy()
