## Bibliotecas

In [1]:
import numpy as np
import pandas as pd
import math

## Ajuste de Colunas

In [2]:
data = pd.read_csv("data.csv", skiprows=1, header=None, names=['id', 'col1', 'col2', 'col3','col4','col5','result','resultClass'])
data_class = data.drop(columns=['id','col1','col2','result'])
data_reg = data.drop(columns=['id','col1','col2','resultClass'])

## Nó

Foram utilizadas duas árvores - Uma para decidir a gravidade (regressão) e outra para decidir a classe de gravidade (classificação).
A informação no nó nas duas árvores é muito semelhante, sendo possível utilizar a mesma classe para as duas, mas foi escolhido separá-las para melhor entendimento.
O nó guarda a informação da feature de divisão, o threshold utilizado para tal feature (Apenas para nós intermediários), o apontamento para os nós filhos, o ganho de informação e o valor (Apenas para nós folha)

In [3]:
class Node():
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, info_gain=None, value=None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.info_gain = info_gain
        self.value = value

In [4]:
class Node_Reg():
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, var_red=None, value=None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.var_red = var_red
        self.value = value

## Funções da Árvore

In [5]:
class DecisionTreeClassifier():
    def __init__(self, min_samples_split=2, max_depth=2):
        self.root = None
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        
    def build_tree(self, dataset, curr_depth=0):
        X, Y = dataset[:,:-1], dataset[:,-1]
        num_samples, num_features = np.shape(X)
        
        if num_samples>=self.min_samples_split and curr_depth<=self.max_depth:
            # find the best split
            best_split = self.get_best_split(dataset, num_samples, num_features)
            # check if information gain is positive
            if best_split["info_gain"]>0:
                # recur left
                left_subtree = self.build_tree(best_split["dataset_left"], curr_depth+1)
                # recur right
                right_subtree = self.build_tree(best_split["dataset_right"], curr_depth+1)
                # return decision node
                return Node(best_split["feature_index"], best_split["threshold"], 
                            left_subtree, right_subtree, best_split["info_gain"])
        
        # compute leaf node
        leaf_value = self.calculate_leaf_value(Y)
        # return leaf node
        return Node(value=leaf_value)
    
    def get_best_split(self, dataset, num_samples, num_features):
        ''' function to find the best split '''
        
        # dictionary to store the best split
        best_split = {}
        max_info_gain = -float("inf")
        
        # loop over all the features
        for feature_index in range(num_features):
            feature_values = dataset[:, feature_index]
            possible_thresholds = np.unique(feature_values)
            # loop over all the feature values present in the data
            for threshold in possible_thresholds:
                # get current split
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
                # check if childs are not null
                if len(dataset_left)>0 and len(dataset_right)>0:
                    y, left_y, right_y = dataset[:, -1], dataset_left[:, -1], dataset_right[:, -1]
                    # compute information gain
                    curr_info_gain = self.information_gain(y, left_y, right_y)
                    # update the best split if needed
                    if curr_info_gain>max_info_gain:
                        best_split["feature_index"] = feature_index
                        best_split["threshold"] = threshold
                        best_split["dataset_left"] = dataset_left
                        best_split["dataset_right"] = dataset_right
                        best_split["info_gain"] = curr_info_gain
                        max_info_gain = curr_info_gain
                        
        # return best split
        return best_split
    
    def split(self, dataset, feature_index, threshold):
        ''' function to split the data '''
        
        dataset_left = np.array([row for row in dataset if row[feature_index]<=threshold])
        dataset_right = np.array([row for row in dataset if row[feature_index]>threshold])
        return dataset_left, dataset_right
    
    def information_gain(self, parent, l_child, r_child):
        ''' function to compute information gain '''
        
        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent)
        
        gain = self.entropy(parent) - (weight_l*self.entropy(l_child) + weight_r*self.entropy(r_child))
        return gain
    
    def entropy(self, y):
        ''' function to compute entropy '''
        
        class_labels = np.unique(y)
        entropy = 0
        for cls in class_labels:
            p_cls = len(y[y == cls]) / len(y)
            entropy += -p_cls * np.log2(p_cls)
        return entropy
    
    def gini_index(self, y):
        ''' function to compute gini index '''
        
        class_labels = np.unique(y)
        gini = 0
        for cls in class_labels:
            p_cls = len(y[y == cls]) / len(y)
            gini += p_cls**2
        return 1 - gini
        
    def calculate_leaf_value(self, Y):
        ''' function to compute leaf node '''
        
        Y = list(Y)
        return max(Y, key=Y.count)
    
    def print_tree(self, tree=None, indent=" "):
        ''' function to print the tree '''
        
        if not tree:
            tree = self.root

        if tree.value is not None:
            print(tree.value)

        else:
            print("X_"+str(tree.feature_index), "<=", tree.threshold, "?", tree.info_gain)
            print("%sleft:" % (indent), end="")
            self.print_tree(tree.left, indent + indent)
            print("%sright:" % (indent), end="")
            self.print_tree(tree.right, indent + indent)
    
    def fit(self, X, Y):
        ''' function to train the tree '''
        
        dataset = np.concatenate((X, Y), axis=1)
        self.root = self.build_tree(dataset)
    
    def predict(self, X):
        ''' function to predict new dataset '''
        
        preditions = [self.make_prediction(x, self.root) for x in X]
        return preditions
    
    def make_prediction(self, x, tree):
        ''' function to predict a single data point '''
        
        if tree.value!=None: return tree.value
        feature_val = x[tree.feature_index]
        if feature_val<=tree.threshold:
            return self.make_prediction(x, tree.left)
        else:
            return self.make_prediction(x, tree.right)

In [6]:
class DecisionTreeRegressor():
    def __init__(self, min_samples_split=2, max_depth=2):
        ''' constructor '''
        
        # initialize the root of the tree 
        self.root = None
        
        # stopping conditions
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        
    def build_tree(self, dataset, curr_depth=0):
        ''' recursive function to build the tree '''
        
        X, Y = dataset[:,:-1], dataset[:,-1]
        num_samples, num_features = np.shape(X)
        best_split = {}
        # split until stopping conditions are met
        if num_samples>=self.min_samples_split and curr_depth<=self.max_depth:
            # find the best split
            best_split = self.get_best_split(dataset, num_samples, num_features)
            # check if information gain is positive
            if best_split["var_red"]>0:
                # recur left
                left_subtree = self.build_tree(best_split["dataset_left"], curr_depth+1)
                # recur right
                right_subtree = self.build_tree(best_split["dataset_right"], curr_depth+1)
                # return decision node
                return Node_Reg(best_split["feature_index"], best_split["threshold"], 
                            left_subtree, right_subtree, best_split["var_red"])
        
        # compute leaf node
        leaf_value = self.calculate_leaf_value(Y)
        # return leaf node
        return Node_Reg(value=leaf_value)
    
    def get_best_split(self, dataset, num_samples, num_features):
        ''' function to find the best split '''
        
        # dictionary to store the best split
        best_split = {}
        max_var_red = -float("inf")
        # loop over all the features
        for feature_index in range(num_features):
            feature_values = dataset[:, feature_index]
            possible_thresholds = np.unique(feature_values)
            # loop over all the feature values present in the data
            for threshold in possible_thresholds:
                # get current split
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
                # check if childs are not null
                if len(dataset_left)>0 and len(dataset_right)>0:
                    y, left_y, right_y = dataset[:, -1], dataset_left[:, -1], dataset_right[:, -1]
                    # compute information gain
                    curr_var_red = self.variance_reduction(y, left_y, right_y)
                    # update the best split if needed
                    if curr_var_red>max_var_red:
                        best_split["feature_index"] = feature_index
                        best_split["threshold"] = threshold
                        best_split["dataset_left"] = dataset_left
                        best_split["dataset_right"] = dataset_right
                        best_split["var_red"] = curr_var_red
                        max_var_red = curr_var_red
                        
        # return best split
        return best_split
    
    def split(self, dataset, feature_index, threshold):
        ''' function to split the data '''
        
        dataset_left = np.array([row for row in dataset if row[feature_index]<=threshold])
        dataset_right = np.array([row for row in dataset if row[feature_index]>threshold])
        return dataset_left, dataset_right
    
    def variance_reduction(self, parent, l_child, r_child):
        ''' function to compute variance reduction '''
        
        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent)
        reduction = np.var(parent) - (weight_l * np.var(l_child) + weight_r * np.var(r_child))
        return reduction
    
    def calculate_leaf_value(self, Y):
        ''' function to compute leaf node '''
        
        val = np.mean(Y)
        return val
                
    def print_tree(self, tree=None, indent=" "):
        ''' function to print the tree '''
        
        if not tree:
            tree = self.root

        if tree.value is not None:
            print(tree.value)

        else:
            print("X_"+str(tree.feature_index), "<=", tree.threshold, "?", tree.var_red)
            print("%sleft:" % (indent), end="")
            self.print_tree(tree.left, indent + indent)
            print("%sright:" % (indent), end="")
            self.print_tree(tree.right, indent + indent)
    
    def fit(self, X, Y):
        ''' function to train the tree '''
        
        dataset = np.concatenate((X, Y), axis=1)
        self.root = self.build_tree(dataset)
        
    def make_prediction(self, x, tree):
        ''' function to predict new dataset '''
        
        if tree.value!=None: return tree.value
        feature_val = x[tree.feature_index]
        if feature_val<=tree.threshold:
            return self.make_prediction(x, tree.left)
        else:
            return self.make_prediction(x, tree.right)
    
    def predict(self, X):
        ''' function to predict a single data point '''
        
        preditions = [self.make_prediction(x, self.root) for x in X]
        return preditions

## Separação Treino/Teste

In [7]:
data_class = np.array(data_class)
np.random.shuffle(data_class)
train_size = round(data_class.shape[0] * 0.85)
data_train_class = data_class[0:train_size]
data_test_class = data_class[train_size:data_class.shape[0]]
X_train_class = data_train_class[:, :-1]
Y_train_class = data_train_class[:, -1]
X_test_class = data_test_class[:, :-1]
Y_test_class = data_test_class[:, -1]

In [8]:
data_reg = np.array(data_reg)
np.random.shuffle(data_reg)
train_size = round(data_class.shape[0] * 0.85)
data_train_reg = data_reg[0:train_size]
data_test_reg = data_reg[train_size:data_reg.shape[0]]
X_train_reg = data_train_reg[:, :-1]
Y_train_reg = data_train_reg[:, -1]
X_test_reg = data_test_reg[:, :-1]
Y_test_reg = data_test_reg[:, -1]

In [9]:
Y_train_reg = [[x] for x in Y_train_reg]

In [10]:
Y_train_class = Y_train_class.reshape(-1,1)

## Construção da Árvore de Classificação

In [11]:
classifier = DecisionTreeClassifier(min_samples_split=3, max_depth=3)
classifier.fit(X_train_class,Y_train_class)
classifier.print_tree()

X_2 <= 11.082202 ? 0.18326255684655912
 left:X_0 <= 4.691179 ? 0.2550161383116625
  left:X_0 <= -4.734839 ? 0.2369757604244681
    left:X_1 <= 153.648972 ? 0.2795008263148364
        left:2.0
        right:1.0
    right:X_1 <= 121.307449 ? 0.13550709862307853
        left:2.0
        right:2.0
  right:X_1 <= 134.230207 ? 0.2644227348501903
    left:X_1 <= 39.255383 ? 0.6380318291655236
        left:1.0
        right:2.0
    right:X_2 <= 10.489267 ? 0.24127645118549612
        left:1.0
        right:2.0
 right:X_0 <= 4.395155 ? 0.1719247782017208
  left:X_0 <= -4.333333 ? 0.12966238837160304
    left:X_1 <= 41.704475 ? 0.1790185253649803
        left:2.0
        right:3.0
    right:X_2 <= 18.943576 ? 0.1836370760151278
        left:3.0
        right:2.0
  right:X_2 <= 19.494703 ? 0.2369350818025575
    left:X_1 <= 127.037276 ? 0.28421811551399745
        left:3.0
        right:2.0
    right:X_0 <= 5.204478 ? 0.29502064467694833
        left:2.0
        right:1.0


## Construção da Árvore de Regressão

In [12]:
regressor = DecisionTreeRegressor(min_samples_split=3, max_depth=3)
regressor.fit(X_train_reg,Y_train_reg)
regressor.print_tree()

X_2 <= 11.190504 ? 76.17266019016307
 left:X_0 <= 5.470382 ? 39.33693589097675
  left:X_1 <= 128.873292 ? 26.317969098391004
    left:X_1 <= 38.005095 ? 64.94030815872512
        left:28.46971629787234
        right:45.89030114832535
    right:X_0 <= -5.01904 ? 17.025826769195554
        left:15.438529416666666
        right:30.832123514084504
  right:X_1 <= 126.915897 ? 42.12795725871818
    left:X_1 <= 41.651678 ? 96.00329956175275
        left:15.136923604651162
        right:35.496101053333334
    right:X_2 <= 10.100121 ? 7.847015079523082
        left:13.903836818181816
        right:24.8523116
 right:X_0 <= 3.572722 ? 51.777148514889234
  left:X_2 <= 17.838316 ? 34.8885336591853
    left:X_2 <= 13.054414 ? 33.0791048098302
        left:52.994868553571436
        right:65.72623193571428
    right:X_1 <= 132.346989 ? 66.63970112498393
        left:55.30716079411764
        right:37.902063000000005
  right:X_2 <= 19.457949 ? 54.11447143494226
    left:X_1 <= 127.708274 ? 25.55683881

## Predição da Base de Treino

In [13]:
Y_pred_class = classifier.predict(X_test_class) 

In [14]:
Y_pred_reg = regressor.predict(X_test_reg) 

## Funções para a verificação do resultado

In [15]:
def acurracy(Y1,Y2):
    total = 0
    for i in range(len(Y1)):
        if(Y1[i] == Y2[i]):
            total += 1
    return total/len(Y1)

def precision(Y1,Y2):
    vp = 0
    tp = 0
    for i in range(len(Y1)):
        if (Y1[i] == 1 or Y1[i] == 2):
            tp += 1
            if(Y2[i] == 1 or Y1[i] == 2):
                vp += 1
    return vp/tp

def recall(Y1, Y2):
    vp = 0
    t = 0
    for i in range(len(Y1)):
        if(Y2[i] == 1 or Y2[i] == 2):
            t += 1
            if(Y1[i] == 1 or Y1[i] == 2):
                vp += 1
    return vp/t   
        
def f_measure(Y1,Y2):
    _precision = precision(Y1,Y2)
    _recall = recall(Y1,Y2)
    return 2*_precision*_recall/(_precision+_recall)

def rmse(Y1,Y2):
  t = 0
  for i in range(Y1.shape[0]):
    t += (Y1[i] - Y2[i])**2
  return math.sqrt(t/Y1.shape[0])

A acuracidade mede a proporção de predições corretas (tanto positivas quanto negativas) em relação ao total de predições.

In [16]:
acurracy(Y_pred_class,Y_test_class)

0.76

A precisão mede a proporção de predições positivas corretas em relação ao total de predições positivas feitas pelo modelo.
Alta precisão significa que, quando o modelo prevê uma classe positiva, ele está frequentemente certo.

In [17]:
precision(Y_pred_class,Y_test_class)

0.9735099337748344

O recall mede a proporção de verdadeiros positivos que foram corretamente identificados pelo modelo.
Alto recall indica que o modelo está identificando a maioria dos verdadeiros positivos.

In [18]:
recall(Y_pred_class,Y_test_class)

0.8292682926829268

O f_measure mede o equilíbrio entre precisão e recall. É importante em situações em que haja algum tipo de trade-off entre as duas medidas.

In [19]:
f_measure(Y_pred_class,Y_test_class)

0.8956186721619926

RMSE é uma medida que calcula a diferença média entre os valores preditos pelo modelo e os valores reais do conjunto de dados. É utilizada para problemas de regressão. Menores valores significam que o modelo está mais próximo do resultado esperado.

In [20]:
rmse(Y_test_reg,Y_pred_reg)

9.113428855159787