In [1]:
import numpy as np
import pandas as pd
from statistics import mode
from copy import deepcopy

from sklearn.metrics import confusion_matrix

pd.set_option('mode.chained_assignment', None)

In [2]:
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer(as_frame=True)

X, y = data.data, data.target

In [3]:
X.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [4]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

# Data Preprocessing

In [5]:
# Source: http://gnpalencia.org/optbinning/tutorials/tutorial_binary.html
from optbinning import OptimalBinning

for d in X.columns:
    op = OptimalBinning(name=d, dtype="numerical", solver="cp")
    op.fit(X[d].values, y)
    bins = [0] + list(op.splits) + [X[d].max()]
    X[d] = pd.cut(X[d], bins, labels=range(len(bins)-1))

In [6]:
for d in ['mean concavity', 'mean concave points', 'concavity error', 'concave points error', 'worst concavity', 'worst concave points']:
    X[d].fillna(mode(X[d].values), inplace=True)

# Train-Test Split

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

### Entropy *(Bits)*
$\mathcal{H}(t, \mathcal{D})=-\sum_{I\in levels(t)\\}^{}{P(t=I)\cdot\log_2(P(t=I))}$

### Rem
$rem(d,\mathcal{D})=\sum_{I\in levels(t)}{}\frac{|\mathcal{D}_{d=I}|}{\mathcal{D}}\cdot \mathcal{H}(t, \mathcal{D}_{d=I})$

### Information Gain
$IG(d, \mathcal{D})=\mathcal{H}(t, \mathcal{D})-rem(d, \mathcal{D})$

In [8]:
class Node:
    def __init__(
        self,
        feature=None,
        data=None,
        branch=None,
        parent=None,
        leaf=False,
        children=[]
    ):
        self.feature = feature
        self.data = data
        self.branch = branch
        self.parent = parent
        self.leaf = leaf
        self.children = children

    @property
    def isLeaf(self):
        return self.leaf

    @property
    def X(self):
        return self.data.iloc[:, :-1]

    @property
    def y(self):
        return self.data.iloc[:, -1]

In [9]:
class DecisionTree:
    """A Rudimentary Decision Tree Classifier"""
    def __init__(self):
        self.root = None
        self.levels = None

    def __repr__(self, node=None, depth=0):
        """Displays the decision tree"""
        if not node:
            node = self.root

        print(depth * '\t', node.feature, f"(Branch={node.branch})")
        for child in node.children:
            self.__repr__(child, depth+1)

        return ""

    def partition(self, X, y, d, t):
        D = pd.concat([X.loc[X[d]==t], y.loc[X[d]==t]], axis=1)
        D = D.drop([d], axis=1)
        return D.iloc[:, :-1], D.iloc[:, -1], t

    def entropy(self, X, y):
        """Measures the amount of uncertainty/impurity/heterogeneity in (X, y)"""
        proba = lambda t: len(X.loc[y==t]) / len(X)
        return -sum([proba(t) * np.log2(proba(t)) for t in y.unique()])

    def rem(self, X, y, d):
        """Measures the entropy after partitioning (X, y) on feature (d)"""
        weight = lambda t: len(X.loc[X[d]==t]) / len(X)
        return sum([weight(t) * self.entropy(X.loc[X[d]==t], y.loc[X[d]==t]) for t in X[d].unique()])

    def information_gain(self, X, y, d):
        """Measures the reduction in the overall entropy in (X, y) achieved by testing on feature (d)"""
        if debug:
            print(f"{d} = {self.entropy(X, y):.3f} - {self.rem(X, y, d):.3f} = {self.entropy(X, y) - self.rem(X, y, d):.3f}") 

        return self.entropy(X, y) - self.rem(X, y, d)

    def build_tree(self, X, y, *, parent=None, branch=None):
        """Performs the ID3 algorithm"""
        if len(y.unique()) == 1:  # all instances have the same target feature values
            if debug:
                print("All instances have the same target feature value\n")
            best_node = Node(feature=y.iat[0],
                             data=pd.concat([X, y], axis=1),
                             branch=branch,
                             parent=parent,
                             leaf=True)
        elif X.empty:  # dataset is empty, return a leaf node labeled with the majority class of the parent
            if debug:
                print("Dataset is empty\n")
            best_node = Node(feature=mode(parent.y),
                              branch=branch,
                              parent=parent,
                              leaf=True)
        elif all((X[d] == X[d].iloc[0]).all() for d in X.columns):  # if all feature values are identical
            if debug:
                print("All instances have the same descriptive features\n")
            best_node = Node(feature=mode(y),
                             data=pd.concat([X, y], axis=1),
                             branch=branch,
                             parent=parent,
                             leaf=True)

        else:
            if debug:
                print("===Information Gain===")
            best_feature = X.columns[np.argmax([self.information_gain(X, y, d) for d in X.columns])]
            best_node = deepcopy(Node(feature=best_feature,
                                 data=pd.concat([X, y], axis=1),
                                 branch=branch,
                                 parent=parent))

            if debug:
                print()
                print("===Best Feature===")
                print(best_feature)
                print()

            partitions = [self.partition(X, y, best_feature, t) for t in self.levels[best_feature]]

            for *d, t in partitions:
                if debug:
                    print(f"===Partitioned Dataset ({t})===")
                    print(pd.concat([*d], axis=1).head())
                    print()
                best_node.children.append(self.build_tree(*d, parent=best_node, branch=t))
        return best_node

    def fit(self, X, y):
        self.levels = {k: X[k].unique() for k in X.columns}
        self.root = self.build_tree(X, y)
        return self

    def predict(self, x):
        node = self.root
        while not node.isLeaf:
            for child in node.children:
                if child.branch == x.get(node.feature).values:
                    node = child
                    break
        return node.feature

    def score(self, X, y):
        y_hat = [self.predict(X.iloc[x].to_frame().T) for x in range(len(X))]
        return confusion_matrix(y, y_hat, labels=[1, 0])

In [10]:
%%time
debug = 0
dt = DecisionTree().fit(X_train, y_train)

CPU times: user 10.7 s, sys: 115 ms, total: 10.8 s
Wall time: 10.8 s


In [11]:
dt

 worst perimeter (Branch=None)
	 mean concave points (Branch=0)
		 1 (Branch=1)
		 1 (Branch=6)
		 radius error (Branch=2)
			 1 (Branch=0)
			 1 (Branch=4)
			 1 (Branch=2)
			 0 (Branch=6)
			 1 (Branch=3)
			 1 (Branch=1)
			 1 (Branch=5)
		 1 (Branch=0)
		 1 (Branch=5)
		 mean symmetry (Branch=4)
			 1 (Branch=2)
			 1 (Branch=4)
			 0 (Branch=5)
			 1 (Branch=1)
			 1 (Branch=0)
			 1 (Branch=3)
			 1 (Branch=6)
		 1 (Branch=3)
	 fractal dimension error (Branch=4)
		 0 (Branch=7)
		 0 (Branch=4)
		 0 (Branch=3)
		 0 (Branch=8)
		 0 (Branch=2)
		 0 (Branch=1)
		 mean smoothness (Branch=0)
			 1 (Branch=3)
			 1 (Branch=4)
			 1 (Branch=0)
			 0 (Branch=1)
			 1 (Branch=2)
			 1 (Branch=5)
		 0 (Branch=5)
		 0 (Branch=6)
	 mean concavity (Branch=1)
		 1 (Branch=0)
		 1 (Branch=5)
		 mean texture (Branch=1)
			 1 (Branch=0)
			 1 (Branch=6)
			 1 (Branch=5)
			 1 (Branch=4)
			 1 (Branch=2)
			 1 (Branch=1)
			 1 (Branch=10)
			 1 (Branch=8)
			 1 (Branch=3)
			 1 (Branch=9)
			 mean



In [12]:
dt.score(X_test, y_test)

array([[68,  2],
       [ 7, 37]])

In [13]:
class RandomForest:
    def __init__(self, n_estimators=5, n_sample=2):
        self.n_estimators = n_estimators
        self.n_sample = n_sample
        self.forest = [DecisionTree() for _ in range(n_estimators)]

    def sub_sample(self, X, n_sample=2):
        """Enforces feature randomness"""
        return np.random.choice(X.columns.to_numpy(), n_sample, replace=False)

    def bootstrap_sample(self, X, y, n_sample, key=True):
        feature_subset = self.sub_sample(X, int(np.log2(len(X))))
        d = pd.concat([X, y], axis=1)
        d = d.sample(n=n_sample, replace=key)
        return d.iloc[:, :-1][feature_subset], d.iloc[:, -1]

    def fit(self, X, y):
        for tree in self.forest:
            tree.fit(*self.bootstrap_sample(X, y, self.n_sample))
        return self

    def predict(self, x):
        assert all(isinstance(model, DecisionTree) for model in self.forest)
        return mode([dt.predict(x) for dt in self.forest])

    def score(self, X, y):
        y_hat = [self.predict(X.iloc[x].to_frame().T) for x in range(len(X))]
        return confusion_matrix(y, y_hat, labels=[1, 0])

In [14]:
%%time
rf = RandomForest(n_estimators=20, n_sample=len(X_train)).fit(X_train, y_train)

CPU times: user 2min 18s, sys: 2.01 s, total: 2min 20s
Wall time: 2min 20s


In [15]:
rf.score(X_test, y_test)

array([[70,  0],
       [ 3, 41]])