In [211]:
import numpy as np
import pandas as pd
from copy import deepcopy
from statistics import mode, mean

from sklearn.metrics import confusion_matrix, mean_squared_error
from sklearn.model_selection import train_test_split

In [212]:
# Load the pickled dataset: every column except the last is treated as a
# descriptive feature, and the last column is the target.
# NOTE(review): the path is absolute and machine-specific, and unpickling runs
# arbitrary code -- only load pickle files you trust.
df = pd.read_pickle(r"/Users/duong-jason/Desktop/dc/project_2/dataset/cancer.pkl")
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [213]:
# data = {
#     'Stream': ['false', 'true', 'true', 'false', 'false', 'true', 'true'],
#     'Slope': ['steep', 'moderate', 'steep', 'steep', 'flat', 'steep', 'steep'],
#     'Elevation': ['high', 'low', 'medium', 'medium', 'high', 'highest', 'high'],
#     'Vegetation': ['chapparal', 'riparian', 'riparian', 'chapparal', 'conifer', 'conifer', 'chapparal']
# }

# df = pd.DataFrame(data)
# X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [214]:
# data = {
#     'Season': ['winter', 'winter', 'winter', 'spring', 'spring', 'spring', 'summer', 'summer', 'summer', 'autumn', 'autumn', 'autumn'],
#     'Work Day': ['false', 'false', 'true', 'false', 'true', 'true', 'false', 'true', 'true', 'false', 'false', 'true'],
#     'Rentals': [800, 826, 900, 2100, 4740, 4900, 3000, 5800, 6200, 2910, 2880, 2820]
# }

# df = pd.DataFrame(data)
# X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [215]:
# Hold out a third of the rows for evaluation. The split is seeded so that a
# "Restart & Run All" reproduces the same partition (and therefore the same
# confusion matrices further down).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Decision Tree Implementation

In [216]:
class Node:
    """A single vertex of a decision tree.

    Attributes
    ----------
    feature : the feature this node splits on, or the predicted value for a leaf.
    data    : DataFrame holding the descriptive features followed by the target
              column (the target is always the last column), or None.
    branch  : the level of the parent's feature that leads to this node.
    parent  : the parent ``Node`` (None for the root).
    leaf    : True when the node is a leaf.
    depth   : distance from the root, used for indentation when printing.
    children: list of child nodes.
    """

    def __init__(
        self,
        *,
        feature=None,
        data=None,
        branch=None,
        parent=None,
        leaf=False,
        depth=0,
        children=None
    ):
        self.feature = feature
        self.data = data
        self.branch = branch
        self.parent = parent
        self.leaf = leaf
        self.depth = depth
        # Default arguments are evaluated once, so a `children=[]` default would
        # be one list shared by every node; build a fresh list per instance.
        self.children = [] if children is None else children

    def __str__(self, depth=0):
        """Return the node as one line, indented by one tab per tree level.

        The ``depth`` parameter is unused (``self.depth`` is what matters); it is
        kept only so existing callers do not break.
        """
        return self.depth * '\t' + f" {self.feature} (Branch={self.branch})"

    @property
    def isLeaf(self):
        """True when this node is a leaf."""
        return self.leaf

    @property
    def X(self):
        """Descriptive features: every column of ``data`` except the last."""
        return self.data.iloc[:, :-1]

    @property
    def y(self):
        """Target values: the last column of ``data``."""
        return self.data.iloc[:, -1]

In [217]:
class DecisionTreeEstimator:
    """Shared plumbing for the decision tree models in this notebook.

    Subclasses must provide ``build_tree(X, y, *, parent, branch, depth)``,
    which ``fit`` calls to grow the tree.

    Parameters
    ----------
    criterion : dict, optional
        Early-stopping (pre-pruning) settings consulted by the subclasses.
    """

    def __init__(self, criterion=None):
        self.root = None
        self.levels = None
        # A `{}` default would be a single dict shared by every estimator.
        self.criterion = {} if criterion is None else criterion

    def __repr__(self, node=None):
        """Return the tree as text, one node per line (pre-order traversal).

        ``node`` selects the subtree to render and defaults to the root. An
        unfitted estimator renders as ``ClassName(unfitted)`` instead of failing.
        """
        if node is None:
            node = self.root
        if node is None:
            return f"{type(self).__name__}(unfitted)"

        lines = [str(node)]
        lines.extend(self.__repr__(child) for child in node.children)
        return "\n".join(lines)

    def makeLeaf(self):
        pass

    def partition(self, X, y, d, t):
        """Return ``(X_sub, y_sub, t)``: the rows where feature ``d`` equals level
        ``t``, with column ``d`` removed from the features."""
        mask = X[d] == t
        D = pd.concat([X.loc[mask], y.loc[mask]], axis=1)
        D = D.drop([d], axis=1)
        return D.iloc[:, :-1], D.iloc[:, -1], t

    def fit(self, X, y):
        """Record every level of every feature, grow the tree, and return self."""
        # Levels come from the full training set so that each split creates a
        # branch for every known level, even one absent from the local subset.
        self.levels = {k: X[k].unique() for k in X.columns}
        self.root = self.build_tree(X, y)
        return self

    def predict(self, x):
        """Walk from the root to a leaf for a single instance and return that leaf.

        Parameters
        ----------
        x : one-row DataFrame (or a Series) indexed by feature name.

        Raises
        ------
        ValueError
            If the instance carries a level for which the current node has no
            branch.
        """
        node = self.root
        while not node.isLeaf:
            value = x[node.feature]
            if hasattr(value, "iloc"):  # one-row DataFrame -> scalar
                value = value.iloc[0]

            for child in node.children:
                if child.branch == value:
                    node = child
                    break
            else:
                raise ValueError(f"Branch {node.feature} -> {value} does not exist")
        return node

    def score(self, X, y):
        """Return the list of predicted values, one per row of ``X``.

        ``y`` is unused here; subclasses combine it with these predictions to
        compute their metric.
        """
        return [self.predict(X.iloc[[i]]).feature for i in range(len(X))]

## Decision Tree Classifier

###  Gini Index
$Gini(t, \mathcal{D})=1-\sum_{l\in levels(t)}P(t=l)^2$

### Entropy *(Bits)*
$\mathcal{H}(t, \mathcal{D})=-\sum_{l\in levels(t)}{P(t=l)\cdot\log_2(P(t=l))}$

### Rem
$rem(d,\mathcal{D})=\sum_{l\in levels(d)}\frac{|\mathcal{D}_{d=l}|}{|\mathcal{D}|}\cdot \mathcal{H}(t, \mathcal{D}_{d=l})$

### Information Gain
$IG(d, \mathcal{D})=\mathcal{H}(t, \mathcal{D})-rem(d, \mathcal{D})$

### Information Gain Ratio
$GR(d, \mathcal{D})=\frac{IG(d, \mathcal{D})}{\mathcal{H}(d, \mathcal{D})}$

In [218]:
class DecisionTreeClassifier(DecisionTreeEstimator):
    """A rudimentary ID3-style decision tree classifier.

    Every feature is treated as categorical: a split creates one branch per
    level that the feature took in the training data passed to ``fit``.
    """

    def __init__(self, *, metric="entropy", info="info_gain", criterion=None):
        """
        Parameters
        ----------
        metric : {"entropy", "gini"}
            Impurity measure; any value other than "entropy" selects gini.
        info : {"info_gain", "gain_ratio"}
            Split-selection score; any value other than "info_gain" selects the
            gain ratio.
        criterion : dict, optional
            Early stopping (pre-pruning). Recognised keys:
            ``max_depth`` and ``partition_threshold`` (falsy values disable the
            check) and ``low_gain`` (None / missing disables the check).
        """
        # A `{}` default would be one dict shared by every classifier.
        super().__init__({} if criterion is None else criterion)
        self.metric = self.entropy if metric == "entropy" else self.gini
        self.info = self.information_gain if info == "info_gain" else self.information_gain_ratio

    @staticmethod
    def _debug_enabled():
        """Truthiness of the notebook-level ``debug`` flag; False when it was never defined."""
        return bool(globals().get("debug", False))

    def _leaf(self, label, X, y, *, parent, branch, depth, note=None):
        """Build a leaf predicting ``label``; print ``note`` when debugging is on."""
        if note is not None and self._debug_enabled():
            print(note)
        return Node(feature=label,
                    data=pd.concat([X, y], axis=1),
                    branch=branch,
                    parent=parent,
                    depth=depth,
                    leaf=True,
                    children=[])

    def gini(self, X, y):
        """Gini index of the target: 1 - sum_l P(t=l)^2."""
        proba = y.value_counts(normalize=True).to_numpy()
        return 1 - np.sum(proba ** 2)

    def entropy(self, X, y):
        """Measures the amount of uncertainty/impurity/heterogeneity in (X, y), in bits"""
        proba = y.value_counts(normalize=True).to_numpy()
        return -np.sum(proba * np.log2(proba))

    def rem(self, X, y, d):
        """Measures the weighted impurity remaining after partitioning (X, y) on feature (d)"""
        total = len(X)
        remainder = 0.0
        for level in X[d].unique():
            mask = X[d] == level
            remainder += (mask.sum() / total) * self.metric(X.loc[mask], y.loc[mask])
        return remainder

    def information_gain(self, X, y, d):
        """Measures the reduction in the overall impurity of (X, y) achieved by testing on feature (d)"""
        before = self.metric(X, y)
        after = self.rem(X, y, d)
        gain = before - after
        if self._debug_enabled():
            print(f"{d} = {before:.3f} - {after:.3f} = {gain:.3f}")
        return gain

    def information_gain_ratio(self, X, y, d):
        """Information gain of feature (d) divided by the entropy of (d) itself.

        Returns 0.0 when (d) takes a single level: its entropy is then 0 and the
        ratio would otherwise be NaN, which ``np.argmax`` treats as the maximum.
        """
        before = self.metric(X, y)
        after = self.rem(X, y, d)
        gain = before - after

        proba = X[d].value_counts(normalize=True).to_numpy()
        split_info = -np.sum(proba * np.log2(proba))
        ratio = gain / split_info if split_info > 0 else 0.0

        if self._debug_enabled():
            print(f"{d} = ({before:.3f} - {after:.3f}) / {split_info} = {ratio}")
        return ratio

    def build_tree(self, X, y, *, parent=None, branch=None, depth=0):
        """Grow the (sub)tree for (X, y) with the ID3 algorithm and return its root node.

        Each stopping rule is tested independently, in this order: pure target,
        nothing left to split on, identical descriptive features, ``max_depth``,
        ``partition_threshold``, then ``low_gain`` once the gains are known.
        """
        debug_on = self._debug_enabled()
        where = dict(parent=parent, branch=branch, depth=depth)

        if len(y.unique()) == 1:  # all instances have the same target feature value
            return self._leaf(y.iat[0], X, y, **where,
                              note="All instances have the same target feature value\n")

        if X.empty:
            # Either no rows reached this branch (fall back to the parent's
            # majority class) or rows remain but every feature has been used
            # (use the majority class of those rows).
            if len(y) > 0:
                label = mode(y)
            elif parent is not None:
                label = mode(parent.y)
            else:
                raise ValueError("Cannot build a decision tree from an empty dataset")
            return self._leaf(label, X, y, **where, note="Dataset is empty\n")

        if all((X[d] == X[d].iloc[0]).all() for d in X.columns):  # no feature can separate the rows
            return self._leaf(mode(y), X, y, **where,
                              note="All instances have the same descriptive features\n")

        max_depth = self.criterion.get("max_depth")
        if max_depth and depth >= max_depth:
            return self._leaf(mode(y), X, y, **where, note="Stopping at Max Depth\n")

        threshold = self.criterion.get("partition_threshold")
        if threshold and len(X) < threshold:
            return self._leaf(mode(y), X, y, **where, note=f"Stopping at {len(X)} instances\n")

        if debug_on:
            print("===Information Gain===")

        gains = [self.info(X, y, d) for d in X.columns]
        best_index = int(np.argmax(gains))
        best_gain = gains[best_index]

        # Compare the gain itself (not its position in the list) to the threshold.
        low_gain = self.criterion.get("low_gain")
        if low_gain is not None and best_gain <= low_gain:
            return self._leaf(mode(y), X, y, **where, note=f"Stopping at Gain={best_gain}\n")

        best_feature = X.columns[best_index]
        # An explicit fresh `children` list keeps siblings independent without
        # deep-copying the node (which would also clone every ancestor and its data).
        best_node = Node(feature=best_feature,
                         data=pd.concat([X, y], axis=1),
                         branch=branch,
                         parent=parent,
                         depth=depth,
                         children=[])

        if debug_on:
            print()
            print("===Best Feature===")
            print(best_feature)
            print()

        for level in self.levels[best_feature]:
            X_part, y_part, _ = self.partition(X, y, best_feature, level)
            if debug_on:
                print(f"===Partitioned Dataset ({level})===")
                print(pd.concat([X_part, y_part], axis=1).head())
                print()
            best_node.children.append(
                self.build_tree(X_part, y_part, parent=best_node, branch=level, depth=depth+1)
            )
        return best_node

    def score(self, X, y):
        """Return the confusion matrix of the predictions for ``X`` against ``y``
        (label order follows ``y.unique()``)."""
        y_hat = super().score(X, y)
        return confusion_matrix(y, y_hat, labels=y.unique())

In [219]:
%%time
# Train an entropy-based tree that stops splitting once a partition holds
# fewer than 5% of the training rows, then show the test confusion matrix.
dt_clf = DecisionTreeClassifier(
    metric='entropy',
    criterion={'partition_threshold': len(X_train) * 5e-2},
)
dt_clf = dt_clf.fit(X_train, y_train)
dt_clf.score(X_test, y_test)

===Information Gain===
mean radius = 0.962 - 0.396 = 0.566
mean texture = 0.962 - 0.751 = 0.211
mean perimeter = 0.962 - 0.382 = 0.581
mean area = 0.962 - 0.390 = 0.572
mean smoothness = 0.962 - 0.832 = 0.130
mean compactness = 0.962 - 0.645 = 0.317
mean concavity = 0.962 - 0.421 = 0.541
mean concave points = 0.962 - 0.330 = 0.632
mean symmetry = 0.962 - 0.854 = 0.109
mean fractal dimension = 0.962 - 0.901 = 0.061
radius error = 0.962 - 0.569 = 0.393
texture error = 0.962 - 0.917 = 0.045
perimeter error = 0.962 - 0.541 = 0.421
area error = 0.962 - 0.403 = 0.559
smoothness error = 0.962 - 0.931 = 0.031
compactness error = 0.962 - 0.794 = 0.168
concavity error = 0.962 - 0.736 = 0.226
concave points error = 0.962 - 0.708 = 0.254
symmetry error = 0.962 - 0.935 = 0.027
fractal dimension error = 0.962 - 0.893 = 0.069
worst radius = 0.962 - 0.272 = 0.691
worst texture = 0.962 - 0.777 = 0.185
worst perimeter = 0.962 - 0.257 = 0.705
worst area = 0.962 - 0.257 = 0.705
worst smoothness = 0.962 - 

array([[ 54,  11],
       [ 10, 113]])

In [220]:
# debug = 1
# dt_clf = DecisionTreeClassifier(metric='entropy').fit(X, y)

In [221]:
# Display the fitted classifier; the rendering comes from DecisionTreeEstimator.__repr__.
dt_clf

 worst perimeter (Branch=None)
	 worst concavity (Branch=105.95000076293945)
		 1 (Branch=0.1457500010728836)
		 0 (Branch=1.252)
		 0 (Branch=0.49140000343322754)
		 1 (Branch=0.20795000344514847)
		 1 (Branch=0.37815000116825104)
		 1 (Branch=0.2604999989271164)
		 0 (Branch=0.31530000269412994)
	 perimeter error (Branch=87.36999893188477)
		 1 (Branch=1.7509999871253967)
		 1 (Branch=1.5145000219345093)
		 0 (Branch=21.98)
		 1 (Branch=2.3585000038146973)
		 1 (Branch=2.76200008392334)
		 1 (Branch=3.45550000667572)
		 1 (Branch=4.102499961853027)
	 worst concave points (Branch=251.2)
		 1 (Branch=0.08554999902844429)
		 0 (Branch=0.07441999763250351)
		 0 (Branch=0.291)
		 0 (Branch=0.1423499956727028)
		 0 (Branch=0.16029999405145645)
		 0 (Branch=0.10954999923706055)
	 mean texture (Branch=117.44999694824219)
		 1 (Branch=15.045000076293945)
		 1 (Branch=26.979999542236328)
		 0 (Branch=19.469999313354492)
		 1 (Branch=18.460000038146973)
		 1 (Branch=17.03499984741211)
		 0 (Bra



## Decision Tree Regressor

$var(t, \mathcal{D})=\frac{\sum_{i=1}^n(t_i-\bar{t})^2}{n-1}$

$weighted\ var(t, \mathcal{D}) = \sum_{l\in levels(d)}{} \frac{|\mathcal{D}_{d=l}|}{|\mathcal{D}|} \times var(t, \mathcal{D}_{d=l})$

In [222]:
class DecisionTreeRegressor(DecisionTreeEstimator):
    """A rudimentary ID3-style decision tree regressor.

    Features are treated as categorical; a split picks the feature whose
    partitions have the smallest weighted variance of the target, and a leaf
    predicts the mean target of the rows that reach it.
    """

    def __init__(self, *, metric="variance", criterion=None):
        """
        Parameters
        ----------
        metric : str
            Stored on the instance; variance is the only measure implemented.
        criterion : dict, optional
            Early stopping (pre-pruning). Recognised keys: ``max_depth`` and
            ``partition_threshold`` (falsy values disable the check).
        """
        # A `{}` default would be one dict shared by every regressor.
        super().__init__({} if criterion is None else criterion)
        self.metric = metric

    @staticmethod
    def _debug_enabled():
        """Truthiness of the notebook-level ``debug`` flag; False when it was never defined."""
        return bool(globals().get("debug", False))

    def _leaf(self, value, X, y, *, parent, branch, depth, note=None):
        """Build a leaf predicting ``value``; print ``note`` when debugging is on."""
        if note is not None and self._debug_enabled():
            print(note)
        return Node(feature=value,
                    data=pd.concat([X, y], axis=1),
                    branch=branch,
                    parent=parent,
                    depth=depth,
                    leaf=True,
                    children=[])

    def variance(self, X, y, d):
        """Sample variance (n - 1 denominator) of the target ``y``.

        ``d`` is only used to label the debug output. Partitions with fewer than
        two rows have a variance of 0.
        """
        n = len(X)
        if n <= 1:
            return 0
        centre = mean(y)  # computed once rather than once per element
        var = sum([(t - centre) ** 2 for t in y]) / (n - 1)
        if self._debug_enabled():
            print(f"{d} = {var}")
        return var

    def weighted_variance(self, X, y, d):
        """Variance of the target within each level of feature ``d``, weighted by
        the share of rows in that level."""
        total = len(X)
        weighted = 0
        for level in X[d].unique():
            mask = X[d] == level
            weighted += (mask.sum() / total) * self.variance(X.loc[mask], y.loc[mask], d)
        return weighted

    def build_tree(self, X, y, *, parent=None, branch=None, depth=0):
        """Grow the (sub)tree for (X, y) with the ID3 algorithm and return its root node.

        Each stopping rule is tested independently, in this order: constant
        target, nothing left to split on, ``max_depth``, ``partition_threshold``.
        """
        debug_on = self._debug_enabled()
        where = dict(parent=parent, branch=branch, depth=depth)

        if len(y.unique()) == 1:  # all instances have the same target feature value
            return self._leaf(y.iat[0], X, y, **where,
                              note="All instances have the same target feature value\n")

        if X.empty:
            # Rows may remain with every feature already used (predict their
            # mean), or no row reached this branch (fall back to the parent's mean).
            if len(y) > 0:
                value = mean(y)
            elif parent is not None:
                value = mean(parent.y)
            else:
                raise ValueError("Cannot build a decision tree from an empty dataset")
            return self._leaf(value, X, y, **where, note="Dataset is empty\n")

        max_depth = self.criterion.get("max_depth")
        if max_depth and depth >= max_depth:
            return self._leaf(mean(y), X, y, **where, note="Stopping at Max Depth\n")

        threshold = self.criterion.get("partition_threshold")
        if threshold and len(X) < threshold:
            return self._leaf(mean(y), X, y, **where, note=f"Stopping at {len(X)} instances\n")

        if debug_on:
            print("===Information Gain===")

        min_var = np.argmin([self.weighted_variance(X, y, d) for d in X.columns])

        best_feature = X.columns[min_var]
        # An explicit fresh `children` list keeps siblings independent without
        # deep-copying the node (which would also clone every ancestor and its data).
        best_node = Node(feature=best_feature,
                         data=pd.concat([X, y], axis=1),
                         branch=branch,
                         depth=depth,
                         parent=parent,
                         children=[])

        if debug_on:
            print()
            print("===Best Feature===")
            print(best_feature)
            print()

        for level in self.levels[best_feature]:
            X_part, y_part, _ = self.partition(X, y, best_feature, level)
            if debug_on:
                print(f"===Partitioned Dataset ({level})===")
                print(pd.concat([X_part, y_part], axis=1).head())
                print()
            best_node.children.append(
                self.build_tree(X_part, y_part, parent=best_node, branch=level, depth=depth+1)
            )
        return best_node

    def score(self, X, y):
        """Return the root-mean-squared error of the predictions for ``X`` against ``y``."""
        y_hat = super().score(X, y)
        # `squared=False` is deprecated/removed in recent scikit-learn releases;
        # taking the square root of the MSE gives the same value on every version.
        return np.sqrt(mean_squared_error(y, y_hat))

In [223]:
# Toy bike-rental dataset: two categorical descriptive features and a numeric
# target. NOTE: this rebinds `data`, `df`, `X` and `y`, replacing the dataset
# loaded at the top of the notebook.
data = {
    'Season': ['winter'] * 3 + ['spring'] * 3 + ['summer'] * 3 + ['autumn'] * 3,
    'Work Day': [
        'false', 'false', 'true',
        'false', 'true', 'true',
        'false', 'true', 'true',
        'false', 'false', 'true',
    ],
    'Rentals': [
        800, 826, 900,
        2100, 4740, 4900,
        3000, 5800, 6200,
        2910, 2880, 2820,
    ],
}

df = pd.DataFrame(data)
X = df.iloc[:, :-1]
y = df.iloc[:, -1]



In [224]:
# Fit the regressor with its default settings on whatever `X` / `y` currently
# hold (the toy rentals frame when the cells are run in order).
dt_regr = DecisionTreeRegressor().fit(X, y)

===Information Gain===
Season = 2692.0
Season = 2472533.3333333335
Season = 3040000.0
Season = 2100.0
Work Day = 1077280.0
Work Day = 4026346.666666667

===Best Feature===
Season

===Partitioned Dataset (winter)===
  Work Day  Rentals
0    false      800
1    false      826
2     true      900

===Information Gain===
Work Day = 338.0

===Best Feature===
Work Day

===Partitioned Dataset (false)===
   Rentals
0      800
1      826

Dataset is empty

===Partitioned Dataset (true)===
   Rentals
2      900

All instances have the same target feature value

===Partitioned Dataset (spring)===
  Work Day  Rentals
3    false     2100
4     true     4740
5     true     4900

===Information Gain===
Work Day = 12800.0

===Best Feature===
Work Day

===Partitioned Dataset (false)===
   Rentals
3     2100

All instances have the same target feature value

===Partitioned Dataset (true)===
   Rentals
4     4740
5     4900

Dataset is empty

===Partitioned Dataset (summer)===
  Work Day  Rentals
6    fa

In [225]:
# Display the fitted regressor; the rendering comes from DecisionTreeEstimator.__repr__.
dt_regr

 Season (Branch=None)
	 Work Day (Branch=winter)
		 813 (Branch=false)
		 900 (Branch=true)
	 Work Day (Branch=spring)
		 2100 (Branch=false)
		 4820 (Branch=true)
	 Work Day (Branch=summer)
		 3000 (Branch=false)
		 6000 (Branch=true)
	 Work Day (Branch=autumn)
		 2895 (Branch=false)
		 2820 (Branch=true)




# Random Forest Implementation

In [226]:
class RandomForest:
    """A bagged ensemble of ``DecisionTreeClassifier`` trees with feature sub-sampling.

    Parameters
    ----------
    n_estimators : int
        Number of trees in the forest.
    n_sample : int
        Number of rows drawn (with replacement) for each tree's bootstrap sample.
    info : {"info_gain", "gain_ratio"}
        Split-selection score forwarded to every tree.
    criterion : dict, optional
        Early-stopping settings; each tree receives its own copy.
    random_state : int, optional
        Seed for the row and feature sampling. With the default ``None`` the
        global NumPy random state is used.
    """

    def __init__(self, n_estimators=5, n_sample=2, info="info_gain", criterion=None, random_state=None):
        self.n_estimators = n_estimators
        self.n_sample = n_sample
        self.random_state = random_state
        # `np.random` (the module) stands for "use the global state".
        self._rng = np.random if random_state is None else np.random.RandomState(random_state)
        settings = {} if criterion is None else criterion
        self.forest = [DecisionTreeClassifier(info=info, criterion=dict(settings)) for _ in range(n_estimators)]

    def sub_sample(self, X, n_sample=2):
        """Enforces feature randomness: pick ``n_sample`` distinct column names of ``X``"""
        return self._rng.choice(X.columns.to_numpy(), n_sample, replace=False)

    def bootstrap_sample(self, X, y, n_sample, key=True):
        """Return ``(X_boot, y_boot)``: ``n_sample`` rows drawn from (X, y), with
        replacement when ``key`` is True, restricted to a random subset of the features.

        NOTE(review): the subset size is log2 of the number of *rows*; the usual
        heuristic is based on the number of features -- confirm this is intended.
        It is clamped to [1, number of columns] so that small or narrow datasets
        do not make the sampling fail.
        """
        n_rows, n_cols = X.shape
        n_features = int(np.log2(n_rows)) if n_rows > 1 else 1
        n_features = max(1, min(n_features, n_cols))
        feature_subset = self.sub_sample(X, n_features)

        d = pd.concat([X, y], axis=1)
        # pandas falls back to the global NumPy state when random_state is None.
        seed = None if self._rng is np.random else self._rng
        d = d.sample(n=n_sample, replace=key, random_state=seed)
        return d.iloc[:, :-1][feature_subset], d.iloc[:, -1]

    def fit(self, X, y):
        """Fit every tree on its own bootstrap sample and return self."""
        for tree in self.forest:
            tree.fit(*self.bootstrap_sample(X, y, self.n_sample))
        return self

    def predict(self, x):
        """Return the majority vote of the trees for a single one-row instance ``x``."""
        assert all(isinstance(model, DecisionTreeClassifier) for model in self.forest)
        return mode([dt.predict(x).feature for dt in self.forest])

    def score(self, X, y):
        """Return the confusion matrix of the forest's predictions for ``X`` against
        ``y`` (label order follows ``y.unique()``)."""
        y_hat = [self.predict(X.iloc[[i]]) for i in range(len(X))]
        return confusion_matrix(y, y_hat, labels=y.unique())

In [227]:
%%time
# Train a 20-tree forest on bootstrap samples as large as the training set,
# using the same 5%-of-rows stopping rule as the single tree, then show the
# test confusion matrix.
rf = RandomForest(
    n_estimators=20,
    n_sample=len(X_train),
    criterion={'partition_threshold': len(X_train) * 5e-2},
)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

===Information Gain===
mean smoothness = 0.965 - 0.825 = 0.140
mean concavity = 0.965 - 0.403 = 0.562
worst fractal dimension = 0.965 - 0.852 = 0.113
worst smoothness = 0.965 - 0.777 = 0.188
fractal dimension error = 0.965 - 0.882 = 0.084
mean symmetry = 0.965 - 0.829 = 0.136
worst radius = 0.965 - 0.284 = 0.682
mean perimeter = 0.965 - 0.360 = 0.605

===Best Feature===
worst radius

===Partitioned Dataset (36.04)===
    mean smoothness mean concavity worst fractal dimension worst smoothness  \
108        0.163400         0.4268                0.100155          0.22260   
35         0.099965         0.1551                0.088975          0.16445   
446        0.108300         0.4268                0.092790          0.14870   
563        0.115900         0.4268                0.100155          0.14870   
122        0.163400         0.4268                0.081960          0.22260   

    fractal dimension error mean symmetry mean perimeter  target  
108                0.006436       0.3

array([[ 62,   3],
       [  1, 122]])

In [228]:
# rf = RandomForest(n_estimators=20, n_sample=len(X)).fit(X, y)