In [27]:
import pickle
import numpy as np
import pandas as pd
from pathlib import Path
from copy import deepcopy
from statistics import mode
from optbinning import OptimalBinning

from sklearn.metrics import confusion_matrix
from sklearn.datasets import load_breast_cancer

pd.set_option('mode.chained_assignment', None)

In [28]:
debug = 0

In [29]:
data = load_breast_cancer(as_frame=True)
X, y = data.data, data.target

# Data Preprocessing
Source: http://gnpalencia.org/optbinning/tutorials/tutorial_binary.html

In [30]:
for d in X.columns:
    op = OptimalBinning(name=d, dtype="numerical", solver="cp")
    op.fit(X[d].values, y)
    bins = [0] + list(op.splits) + [X[d].max()]
    X[d] = pd.cut(X[d], bins, labels=range(len(bins)-1))

In [31]:
for d in ['mean concavity', 'mean concave points', 'concavity error', 'concave points error', 'worst concavity', 'worst concave points']:
    X[d].fillna(mode(X[d].values), inplace=True)

In [32]:
path = Path(r'/home/jason/Desktop/school/dc/distrf/dataset/cancer.pkl')

df = pd.concat([X, y], axis=1)
df.to_pickle(path)

In [33]:
df = pd.read_pickle(path)
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Train-Test Split

In [34]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

# Decision Tree Implementation
### Entropy *(Bits)*
$\mathcal{H}(t, \mathcal{D})=-\sum_{I\in levels(t)\\}^{}{P(t=I)\cdot\log_2(P(t=I))}$

### Rem
$rem(d,\mathcal{D})=\sum_{I\in levels(t)}{}\frac{|\mathcal{D}_{d=I}|}{\mathcal{D}}\cdot \mathcal{H}(t, \mathcal{D}_{d=I})$

### Information Gain
$IG(d, \mathcal{D})=\mathcal{H}(t, \mathcal{D})-rem(d, \mathcal{D})$

In [35]:
class Node:
    def __init__(
        self,
        *,
        feature=None,
        data=None,
        branch=None,
        parent=None,
        leaf=False,
        children=[]
    ):
        self.feature = feature
        self.data = data
        self.branch = branch
        self.parent = parent
        self.leaf = leaf
        self.children = children

    @property
    def isLeaf(self):
        return self.leaf

    @property
    def X(self):
        return self.data.iloc[:, :-1]

    @property
    def y(self):
        return self.data.iloc[:, -1]

In [36]:
class DecisionTree:
    """A Rudimentary Decision Tree Classifier"""
    def __init__(self, *, criterion=None):
        """
        Pre-pruning criterion = {max_depth, partition_threshold, low_gain}
        """
        self.root = None
        self.levels = None
        self.criterion = criterion

    def __repr__(self, node=None, depth=0):
        """Displays the decision tree"""
        if not node:
            node = self.root

        print(depth * '\t', node.feature, f"(Branch={node.branch})")
        for child in node.children:
            self.__repr__(child, depth+1)

        return ""

    def partition(self, X, y, d, t):
        D = pd.concat([X.loc[X[d]==t], y.loc[X[d]==t]], axis=1)
        D = D.drop([d], axis=1)
        return D.iloc[:, :-1], D.iloc[:, -1], t

    def entropy(self, X, y):
        """Measures the amount of uncertainty/impurity/heterogeneity in (X, y)"""
        proba = lambda t: len(X.loc[y==t]) / len(X)
        return -sum([proba(t) * np.log2(proba(t)) for t in y.unique()])

    def rem(self, X, y, d):
        """Measures the entropy after partitioning (X, y) on feature (d)"""
        weight = lambda t: len(X.loc[X[d]==t]) / len(X)
        return sum([weight(t) * self.entropy(X.loc[X[d]==t], y.loc[X[d]==t]) for t in X[d].unique()])

    def information_gain(self, X, y, d):
        """Measures the reduction in the overall entropy in (X, y) achieved by testing on feature (d)"""
        if debug:
            print(f"{d} = {self.entropy(X, y):.3f} - {self.rem(X, y, d):.3f} = {self.entropy(X, y) - self.rem(X, y, d):.3f}") 

        return self.entropy(X, y) - self.rem(X, y, d)

    def build_tree(self, X, y, *, parent=None, branch=None, depth=0):
        """Performs the ID3 algorithm"""
        if len(y.unique()) == 1:  # all instances have the same target feature values
            if debug:
                print("All instances have the same target feature value\n")
            return Node(feature=y.iat[0],
                        data=pd.concat([X, y], axis=1),
                        branch=branch,
                        parent=parent,
                        leaf=True)
        elif X.empty:  # dataset is empty, return a leaf node labeled with the majority class of the parent
            if debug:
                print("Dataset is empty\n")
            return Node(feature=mode(parent.y),
                        branch=branch,
                        parent=parent,
                        leaf=True)
        elif all((X[d] == X[d].iloc[0]).all() for d in X.columns):  # if all feature values are identical
            if debug:
                print("All instances have the same descriptive features\n")
                return Node(feature=mode(y),
                            data=pd.concat([X, y], axis=1),
                            branch=branch,
                            parent=parent,
                            leaf=True)
        elif self.criterion.get("max_depth"):
            if depth >= self.criterion["max_depth"]:
                if debug:
                    print("Stopping at Max Depth\n")
                return Node(feature=mode(y),
                            data=pd.concat([X, y], axis=1),
                            branch=branch,
                            parent=parent,
                            leaf=True)
        elif self.criterion.get("partition_threshold"):
            if len(X) < self.criterion["partition_threshold"]:
                if debug:
                    print(f"Stopping at {len(X)} instances\n")
                return Node(feature=mode(y),
                            data=pd.concat([X, y], axis=1),
                            branch=branch,
                            parent=parent,
                            leaf=True)

        if debug:
            print("===Information Gain===")

        gain = np.argmax([self.information_gain(X, y, d) for d in X.columns])

        if self.criterion.get('low_gain'):
            if gain <= self.criterion["low_gain"]:
                if debug:
                    print(f"Stopping at Gain={gain}\n")
                return Node(feature=mode(y),
                            data=pd.concat([X, y], axis=1),
                            branch=branch,
                            parent=parent,
                            leaf=True)

        best_feature = X.columns[gain]
        best_node = deepcopy(Node(feature=best_feature,
                                  data=pd.concat([X, y], axis=1),
                                  branch=branch,
                                  parent=parent))

        if debug:
            print()
            print("===Best Feature===")
            print(best_feature)
            print()

        partitions = [self.partition(X, y, best_feature, t) for t in self.levels[best_feature]]

        for *d, t in partitions:
            if debug:
                print(f"===Partitioned Dataset ({t})===")
                print(pd.concat(d, axis=1).head())
                print()
            best_node.children.append(self.build_tree(*d, parent=best_node, branch=t, depth=depth+1))
        return best_node

    def fit(self, X, y):
        self.levels = {k: X[k].unique() for k in X.columns}
        self.root = self.build_tree(X, y)
        return self

    def predict(self, x):
        node = self.root
        while not node.isLeaf:
            for child in node.children:
                if child.branch == x.get(node.feature).values:
                    node = child
                    break
        return node.feature

    def score(self, X, y):
        y_hat = [self.predict(X.iloc[x].to_frame().T) for x in range(len(X))]
        return confusion_matrix(y, y_hat, labels=[1, 0])

In [37]:
%%time
dt = DecisionTree(criterion={'low_gain': 5e-2}).fit(X_train, y_train)

===Information Gain===
mean radius = 0.964 - 0.454 = 0.510
mean texture = 0.964 - 0.737 = 0.227
mean perimeter = 0.964 - 0.427 = 0.537
mean area = 0.964 - 0.448 = 0.516
mean smoothness = 0.964 - 0.843 = 0.121
mean compactness = 0.964 - 0.662 = 0.302
mean concavity = 0.964 - 0.460 = 0.503
mean concave points = 0.964 - 0.373 = 0.590
mean symmetry = 0.964 - 0.837 = 0.127
mean fractal dimension = 0.964 - 0.929 = 0.034
radius error = 0.964 - 0.626 = 0.338
texture error = 0.964 - 0.927 = 0.037
perimeter error = 0.964 - 0.618 = 0.345
area error = 0.964 - 0.483 = 0.481
smoothness error = 0.964 - 0.937 = 0.026
compactness error = 0.964 - 0.836 = 0.127
concavity error = 0.964 - 0.769 = 0.195
concave points error = 0.964 - 0.773 = 0.191
symmetry error = 0.964 - 0.923 = 0.041
fractal dimension error = 0.964 - 0.888 = 0.076
worst radius = 0.964 - 0.316 = 0.648
worst texture = 0.964 - 0.761 = 0.202
worst perimeter = 0.964 - 0.304 = 0.660
worst area = 0.964 - 0.307 = 0.656
worst smoothness = 0.964 - 

In [38]:
dt

 worst perimeter (Branch=None)
	 mean concavity (Branch=1)
		 1 (Branch=1)
		 0 (Branch=6)
		 mean area (Branch=0)
			 1 (Branch=2)
			 1 (Branch=6)
			 1 (Branch=3)
			 0 (Branch=5)
			 1 (Branch=0)
			 1 (Branch=4)
			 1 (Branch=1)
		 1 (Branch=4)
		 1 (Branch=5)
		 1 (Branch=2)
		 1 (Branch=3)
	 fractal dimension error (Branch=4)
		 0 (Branch=8)
		 0 (Branch=5)
		 0 (Branch=2)
		 0 (Branch=1)
		 0 (Branch=6)
		 0 (Branch=3)
		 0 (Branch=4)
		 0 (Branch=7)
		 mean smoothness (Branch=0)
			 1 (Branch=3)
			 1 (Branch=5)
			 1 (Branch=0)
			 1 (Branch=2)
			 0 (Branch=1)
			 1 (Branch=4)
	 worst texture (Branch=3)
		 0 (Branch=3)
		 mean texture (Branch=8)
			 0 (Branch=3)
			 0 (Branch=8)
			 0 (Branch=7)
			 0 (Branch=1)
			 0 (Branch=6)
			 0 (Branch=2)
			 0 (Branch=0)
			 1 (Branch=9)
			 0 (Branch=5)
			 0 (Branch=10)
			 0 (Branch=4)
		 mean concave points (Branch=7)
			 1 (Branch=1)
			 0 (Branch=6)
			 0 (Branch=0)
			 0 (Branch=5)
			 0 (Branch=2)
			 0 (Branch=4)
			 0 (Bran



In [39]:
dt.score(X_test, y_test)

array([[122,   2],
       [  1,  63]])

# Random Forest Implementation

In [40]:
class RandomForest:
    def __init__(self, n_estimators=5, n_sample=2, criterion=None):
        self.n_estimators = n_estimators
        self.n_sample = n_sample
        self.forest = [DecisionTree(criterion=criterion) for _ in range(n_estimators)]

    def sub_sample(self, X, n_sample=2):
        """Enforces feature randomness"""
        return np.random.choice(X.columns.to_numpy(), n_sample, replace=False)

    def bootstrap_sample(self, X, y, n_sample, key=True):
        feature_subset = self.sub_sample(X, int(np.log2(len(X))))
        d = pd.concat([X, y], axis=1)
        d = d.sample(n=n_sample, replace=key)
        return d.iloc[:, :-1][feature_subset], d.iloc[:, -1]

    def fit(self, X, y):
        for tree in self.forest:
            tree.fit(*self.bootstrap_sample(X, y, self.n_sample))
        return self

    def predict(self, x):
        assert all(isinstance(model, DecisionTree) for model in self.forest)
        return mode([dt.predict(x) for dt in self.forest])

    def score(self, X, y):
        y_hat = [self.predict(X.iloc[x].to_frame().T) for x in range(len(X))]
        return confusion_matrix(y, y_hat, labels=[1, 0])

In [42]:
%%time

rf = RandomForest(n_estimators=10, n_sample=len(X_train), criterion={'partition_threshold': 5})
rf.fit(X_train, y_train)

===Information Gain===
worst smoothness = 0.964 - 0.829 = 0.135
fractal dimension error = 0.964 - 0.857 = 0.107
worst compactness = 0.964 - 0.610 = 0.354
worst symmetry = 0.964 - 0.760 = 0.204
mean symmetry = 0.964 - 0.800 = 0.164
worst area = 0.964 - 0.257 = 0.707
mean perimeter = 0.964 - 0.362 = 0.602
worst perimeter = 0.964 - 0.245 = 0.719

===Best Feature===
worst perimeter

===Partitioned Dataset (1)===
    worst smoothness fractal dimension error worst compactness worst symmetry  \
111                3                       7                 4              0   
528                4                       7                 2              0   
58                 1                       3                 0              2   
221                4                       3                 4              5   
81                 6                       7                 7              6   

    mean symmetry worst area mean perimeter  target  
111             3          0              2    

<__main__.RandomForest at 0x7f951a9fb2e0>

In [43]:
rf.score(X_test, y_test)

array([[123,   1],
       [  2,  62]])