# Decision Tree

### Importing libraries

In [63]:
# Calling the scikit learn datasets of forest covertypes
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.utils import Bunch
pd.set_option('display.max_rows', None)  # Mostrar todas las filas
pd.set_option('display.max_columns', None)  # Mostrar todas las columnas
pd.set_option('display.width', None)  # No truncar las columnas

### Apliying feature engineering and selecting features on tree different datasets

In [64]:
breast_cancer = load_breast_cancer()

df = pd.DataFrame(breast_cancer.data, columns=breast_cancer.feature_names)
#df['target'] = breast_cancer.target

#print(df.head(30))
f1 = df[["mean radius","mean texture","mean perimeter","mean area","mean smoothness"]]
f2 = df[["mean compactness","mean concavity","mean concave points","mean symmetry"]]
f3 = df[["mean fractal dimension","radius error","texture error","perimeter error"]]
f4 = df[["area error","smoothness error","compactness error","concavity error"]]
f5 = df[["concave points error","symmetry error","fractal dimension error"]]
f6 = df[["worst radius","worst texture","worst perimeter","worst area"]]
f7 = df[["worst smoothness","worst compactness","worst concavity"]]
f8 = df[["worst concave points","worst symmetry","worst fractal dimension"]]

#df = f1
# Selecting columns
# df = df[[]]
data_dict = {'data': df.values, 'target': breast_cancer.target, 'feature_names': df.columns, 'target_names': breast_cancer.feature_names}

breast_cancer = Bunch(**data_dict)
#print(breast_cancer.target_names)
#print(type(breast_cancer))
#print(df.shape)
#print(f1.head(30))
#print(y)
#print(df.iloc[50:101,:])
#print(df)


In [65]:
def switch_case(argument):
    switcher = {
        0: breast_cancer,
        1: "second bunch",
        2: "third bunch"
    }
    return switcher.get(argument, "Invalid option")

selected_bunch = switch_case(0)

x = breast_cancer.data
y = breast_cancer.target

In [66]:
import math
import numpy as np
from pprint import pprint
from sklearn.model_selection import train_test_split

def entropy_func(c, n):
    return -(c*1.0/n)*math.log(c*1.0/n, 2)

def entropy_cal(c1, c2):
    #entropy between class 1 and 2
    if c1== 0 or c2 == 0:  # when there is only one class in the group, entropy is 0
        return 0
    return entropy_func(c1, c1+c2) + entropy_func(c2, c1+c2)

# One versus All
# c1,c2,c3, .., cm
# c1, *
# c2, *
# c3, *
# ...
# cm, *

#each class versus the others
def entropy_of_one_division(division): 
    s = 0
    n = len(division)
    classes = set(division)
    for c in classes:   # for each class, get entropy
        n_c = sum(division==c)
        e = n_c*1.0/n * entropy_cal(sum(division==c), sum(division!=c)) # weighted avg
        s += e
    return s, n

# The whole entropy
def get_entropy(y_predict, y_real):
    if len(y_predict) != len(y_real):
        print('They have to be the same length')
        return None
    n = len(y_real)
    s_true, n_true = entropy_of_one_division(y_real[y_predict]) # left hand side entropy
    s_false, n_false = entropy_of_one_division(y_real[~y_predict]) # right hand side entropy
    s = n_true*1.0/n * s_true + n_false*1.0/n * s_false # overall entropy, again weighted average
    return s



class DecisionTreeClassifier(object):
    def __init__(self, max_depth,bunch):
        self.depth = 0
        self.bunch = bunch
        self.max_depth = max_depth
    
    def fit(self, x, y, par_node={}, depth=0):
        if par_node is None: 
            return None
        elif len(y) == 0:
            return None
        elif self.all_same(y):
            return {'val':y[0]}
        elif depth >= self.max_depth:
            return None
        else: 
            col, cutoff, entropy = self.find_best_split_of_all(x, y)    # find one split given an information gain 
            y_left = y[x[:, col] < cutoff]
            y_right = y[x[:, col] >= cutoff]
            par_node = {'col': self.bunch.feature_names[col], 'index_col':col,
                        'cutoff':cutoff,
                       'val': np.round(np.mean(y))}
            par_node['left'] = self.fit(x[x[:, col] < cutoff], y_left, {}, depth+1)
            par_node['right'] = self.fit(x[x[:, col] >= cutoff], y_right, {}, depth+1)
            self.depth += 1 
            self.trees = par_node
            return par_node
    
    #all features versus values, get best
    def find_best_split_of_all(self, x, y):
        #print(x.shape, y.shape)
        col = None
        min_entropy = 1
        cutoff = None
        for i, c in enumerate(x.T):
            entropy, cur_cutoff = self.find_best_split(c, y)
            if entropy == 0:    # find the first perfect cutoff. Stop Iterating
                return i, cur_cutoff, entropy
            elif entropy <= min_entropy:
                min_entropy = entropy
                col = i
                cutoff = cur_cutoff
        return col, cutoff, min_entropy
    
    #one feature versus values
    def find_best_split(self, col, y):
        min_entropy = 10
        n = len(y)
        for value in set(col):
            y_predict = col < value #get which ones are less than
            my_entropy = get_entropy(y_predict, y)
            if my_entropy <= min_entropy:
                min_entropy = my_entropy
                cutoff = value
        return min_entropy, cutoff
    
    def all_same(self, items):
        return all(x == items[0] for x in items)
                                           
    def predict(self, x):
        tree = self.trees
        results = np.array([0]*len(x))
        for i, c in enumerate(x):
            results[i] = self._get_prediction(c)
        return results
    
    def _get_prediction(self, row):
        cur_layer = self.trees
        while cur_layer.get('cutoff'):
            if row[cur_layer['index_col']] < cur_layer['cutoff']:
                cur_layer = cur_layer['left']
            else:
                cur_layer = cur_layer['right']
        else:
            return cur_layer.get('val')

In [67]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=45)

model = DecisionTreeClassifier(max_depth=10,bunch=selected_bunch)
tree = model.fit(X_train, y_train)

pprint(tree)

X_train.shape

{'col': 'mean perimeter',
 'cutoff': 98.64,
 'index_col': 2,
 'left': {'col': 'mean perimeter',
          'cutoff': 90.2,
          'index_col': 2,
          'left': {'col': 'mean smoothness',
                   'cutoff': 0.1088,
                   'index_col': 4,
                   'left': {'col': 'mean texture',
                            'cutoff': 20.28,
                            'index_col': 1,
                            'left': {'val': 1},
                            'right': {'col': 'mean area',
                                      'cutoff': 477.4,
                                      'index_col': 3,
                                      'left': {'val': 1},
                                      'right': {'col': 'mean smoothness',
                                                'cutoff': 0.09714,
                                                'index_col': 4,
                                                'left': {'col': 'mean area',
                                        

(426, 5)

In [69]:
y_pred = model.predict(X_test)
print(y_pred)
print(y_test)

AttributeError: 'NoneType' object has no attribute 'get'

In [71]:
def score(y_pred, y_test):        
    return float(sum(y_pred == y_test))/ float(len(y_test))

score(y_pred, y_test)

0.8671328671328671

# Metricas

## Accuracy

## Precisión

## Recall

## AUC

## ROC