# Parte 1: Árvore de Decisão

## Parâmetros editáveis

In [None]:
# Path of the CSV file that holds the dataset
file_name = './accident_data.csv'
# Alternative input file for debugging (disabled):
#file_name = './debug.csv'

# Fraction of the rows sampled into the training set;
# the remaining rows form the test set
training_ratio = 0.8

Antes de executar este caderno, pode ser necessário instalar previamente as dependências em seu sistema. Isso pode ser feito com os seguintes comandos:
```bash
pip install --user numpy
pip install --user pandas
```

## Dependências


In [2]:
import numpy as np
import pandas as pd

## Importando os dados

In [3]:
# Load the comma-separated file into a DataFrame
dataset = pd.read_csv(file_name, sep = ",")

# Cleaning step (placeholder, currently disabled)
# dataset = dataset[]

# Bare expression: shows the loaded table as the notebook cell output
dataset

Unnamed: 0,Data,Countries,Local,Industry Sector,Accident Level,Potential Accident Level,Genre,Employee ou Terceiro,Risco Critico
0,2016-01-01 00:00:00,Country_01,Local_01,Mining,I,IV,Male,Third Party,Pressed
1,2016-01-02 00:00:00,Country_02,Local_02,Mining,I,IV,Male,Employee,Pressurized Systems
2,2016-01-06 00:00:00,Country_01,Local_03,Mining,I,III,Male,Third Party (Remote),Manual Tools
3,2016-01-08 00:00:00,Country_01,Local_04,Mining,I,I,Male,Third Party,Others
4,2016-01-10 00:00:00,Country_01,Local_04,Mining,IV,IV,Male,Third Party,Others
...,...,...,...,...,...,...,...,...,...
434,2017-07-04 00:00:00,Country_01,Local_04,Mining,I,III,Male,Third Party,Others
435,2017-07-04 00:00:00,Country_01,Local_03,Mining,I,II,Female,Employee,Others
436,2017-07-05 00:00:00,Country_02,Local_09,Metals,I,II,Male,Employee,Venomous Animals
437,2017-07-06 00:00:00,Country_02,Local_05,Metals,I,II,Male,Employee,Cut


## Pré-Processamento

In [4]:
def enumeracao(valores):
    """Assign consecutive integer codes to the distinct values of ``valores``.

    Codes start at 0 and follow the order in which each value first appears.

    Args:
        valores: Iterable of hashable values.

    Returns:
        A tuple ``(mapping, translating, count)``: ``mapping`` takes each
        distinct value to its code, ``translating`` takes each code back to
        its value, and ``count`` is the number of distinct values.
    """
    code_of = {}
    value_of = {}
    for item in valores:
        if item in code_of:
            continue
        # The next free code equals the number of values registered so far
        new_code = len(code_of)
        code_of[item] = new_code
        value_of[new_code] = item
    return code_of, value_of, len(code_of)

In [5]:
# Lookup tables filled while encoding, both keyed by column name:
#   translators[col] maps an integer code back to the original value
#   sizes[col] is the number of distinct values found in the column
translators = {}
sizes = {}

# Replace every column, in place, by the integer codes of its values
for col in dataset.columns:
    mapping, translators[col], sizes[col] = enumeracao(dataset[col])
    dataset[col] = dataset[col].map(mapping)

# Bare expression: shows the encoded table as the notebook cell output
dataset

Unnamed: 0,Data,Countries,Local,Industry Sector,Accident Level,Potential Accident Level,Genre,Employee ou Terceiro,Risco Critico
0,0,0,0,0,0,0,0,0,0
1,1,1,1,0,0,0,0,1,1
2,2,0,2,0,0,1,0,2,2
3,3,0,3,0,0,2,0,0,3
4,4,0,3,0,1,0,0,0,3
...,...,...,...,...,...,...,...,...,...
434,283,0,3,0,0,1,0,0,3
435,283,0,2,0,0,3,1,1,3
436,284,1,9,1,0,3,0,1,21
437,285,1,4,1,0,3,0,1,13


## Separando em treino e teste

In [6]:
# Training set: random sample holding `training_ratio` of the rows.
# Seeded (reproducible) variant, currently disabled:
#train = dataset.sample(frac=training_ratio, random_state=21)
train = dataset.sample(frac=training_ratio)
# Test set: every row that was not sampled into the training set
test = dataset.drop(train.index)

## Estrutura de Dados

In [7]:
from abc import ABC

class DesisionTree(ABC):
    """Base type for the elements of a decision tree."""

    def inferencia(self, value):
        """Return the label predicted for ``value``.

        The base implementation always raises ``NotImplementedError``;
        subclasses are expected to override it.
        """
        raise NotImplementedError

class Node(DesisionTree):
    """Inner node that routes an example according to one attribute."""

    def __init__(self, test):
        """Create a node that tests the attribute named ``test``."""
        # Name of the attribute inspected by this node
        self.test = test
        # One slot per encoded value of that attribute (sizes[test] slots),
        # to be filled through add_tree
        self.trees = np.empty(shape=(sizes[test],), dtype=object)
        # Number of add_tree calls performed so far
        self.num_trees = 0

    def add_tree(self, tree, idx):
        """Store ``tree`` as the subtree for attribute value ``idx``."""
        self.trees[idx] = tree
        self.num_trees = self.num_trees + 1

    def inferencia(self, value):
        """Delegate the prediction to the subtree selected by ``value``.

        ``value[self.test]`` is used as the index into ``self.trees``.
        """
        branch = self.trees[value[self.test]]
        return branch.inferencia(value)

class Leaf(DesisionTree):
    """Terminal node that always predicts the same label."""

    def __init__(self, label):
        """Store ``label`` as the prediction of this leaf."""
        self.label = label

    def inferencia(self, value):
        """Return the stored label; ``value`` is not inspected."""
        return self.label

## Funções Auxiliares

In [8]:
# Name of the target column (the class to be predicted)
class_name = 'Accident Level'
# Number of distinct values recorded for the target column
quant_classes = sizes[class_name]

def distribuicao(valores):
    """Count how many times each value occurs in ``valores``.

    Args:
        valores: Iterable of hashable values.

    Returns:
        Dict from each distinct value to its number of occurrences, with
        keys in order of first appearance.
    """
    counts = {}
    for item in valores:
        counts[item] = counts.get(item, 0) + 1
    return counts

def valor_maioria(dist_valores):
    """Return the key with the largest count in ``dist_valores``.

    Args:
        dist_valores: Dict from value to its count.

    Returns:
        The key holding the highest count. On ties the key that comes first
        in the dict's iteration order wins. An empty dict yields ``''``.
    """
    best_key, best_count = '', -1
    for key, count in dist_valores.items():
        # Strict comparison keeps the earliest key among equal counts
        if count > best_count:
            best_key, best_count = key, count
    return best_key

def calcula_entropia(exemplos):
    """Return the entropy (base-2 logarithm) of the class column of ``exemplos``.

    Args:
        exemplos: Table supporting ``len()`` and ``exemplos[class_name]``;
            the class column must hold integer codes usable as indices into
            an array of ``quant_classes`` entries.

    Returns:
        The sum of ``-p * log2(p)`` over all classes, where ``p`` is the
        relative frequency of the class. Terms that evaluate to NaN (classes
        that do not occur, or an empty table) count as 0.
    """
    counts = np.zeros((quant_classes))
    for label in exemplos[class_name]:
        counts[label] += 1

    # 0 * log2(0) and 0 / 0 both produce NaN plus a floating-point warning;
    # the warnings are silenced only inside this block and the NaNs are
    # turned into 0 right below.
    with np.errstate(all='ignore'):
        probs = counts / len(exemplos)
        terms = (-probs) * np.log2(probs)
    return np.nan_to_num(terms).sum()

def melhor_atributo(atributos, exemplos):
    """Choose the attribute with the highest information gain on ``exemplos``.

    The gain of an attribute is the entropy of ``exemplos`` minus the
    size-weighted entropies of the subsets obtained by fixing the attribute
    to each of its ``sizes[attribute]`` encoded values.

    Args:
        atributos: Iterable of candidate attribute (column) names.
        exemplos: Table of encoded examples.

    Returns:
        The name of the attribute with the largest gain. On ties the first
        one in iteration order wins. ``''`` if ``atributos`` is empty.
    """
    n_total = len(exemplos)
    base_entropy = calcula_entropia(exemplos)

    chosen, best_gain = '', -np.inf
    for atrib in atributos:
        gain = base_entropy
        for code in range(sizes[atrib]):
            subset = exemplos[exemplos[atrib] == code]
            weight = len(subset) / n_total
            gain -= weight * calcula_entropia(subset)
        if gain > best_gain:
            chosen, best_gain = atrib, gain
    return chosen

## Algoritmo de Treinamento

In [9]:
def treinamento(exemplos, atributos, padrao):
    """Recursively build a decision tree from encoded examples.

    Args:
        exemplos: Table of encoded examples, including the class column.
        atributos: Set of attribute names still available for splitting.
            It is not modified.
        padrao: Label used when ``exemplos`` is empty.

    Returns:
        A ``Leaf`` when there are no examples, when all examples share one
        class, or when no attribute is left (majority class). Otherwise a
        ``Node`` on the best attribute with one subtree per encoded value
        of that attribute.
    """
    if len(exemplos) == 0:
        return Leaf(padrao)

    dist = distribuicao(exemplos[class_name])
    if len(dist) == 1:
        # Pure subset: its only class is the answer
        return Leaf(next(iter(dist)))

    maioria = valor_maioria(dist)
    if len(atributos) == 0:
        return Leaf(maioria)

    melhor = melhor_atributo(atributos, exemplos)
    arvore = Node(melhor)

    # Every branch receives the same reduced attribute set; it is only read
    # further down, so one copy can be shared by all recursive calls.
    remaining = atributos.copy()
    remaining.discard(melhor)

    for val in range(sizes[melhor]):
        branch_examples = exemplos[exemplos[melhor] == val]
        # Empty branches fall back to the majority class of this node
        arvore.add_tree(treinamento(branch_examples, remaining, maioria), val)
    return arvore

In [10]:
# Candidate attributes for splitting: every column of the training set...
atributos = set(train.columns)

# ...except the target column itself
atributos.remove(class_name)

# ...and except 'Unnamed: 0', the unnamed first column of the CSV, which is
# just a row identifier (0, 1, 2, ... in the loaded table). A column whose
# value is distinct in every row splits the training set into pure
# single-row subsets, so it always reaches the maximum information gain and
# would be chosen as the root. Rows not seen during training would then land
# in empty branches and receive the majority label, i.e. the tree would
# learn nothing. discard() keeps this safe for files without that column.
atributos.discard('Unnamed: 0')

# Train the tree; '' is the default label, used only if `train` is empty
solution = treinamento(train, atributos, '')

## Visualizando algumas respostas

In [11]:
# Show a few predictions of the tree next to the true labels
num_views = 5
# Bounded by the test-set size so a small input file (fewer than
# `num_views` test rows) does not raise IndexError
for i in range(min(num_views, len(test))):
    example = test.iloc[i]
    real = example[class_name]
    predicted = solution.inferencia(example)
    print('---------------------------')
    # translators[class_name] turns the integer codes back into the labels
    print('Real = ', translators[class_name][real], '| Predicted = ', translators[class_name][predicted])

---------------------------
Real =  I | Predicted =  I
---------------------------
Real =  I | Predicted =  I
---------------------------
Real =  IV | Predicted =  I
---------------------------
Real =  I | Predicted =  I
---------------------------
Real =  I | Predicted =  I


## Calculando a taxa de acerto na fração do dataset reservada para teste

In [12]:
# Number of test rows whose predicted label equals the true label
count_acertos = sum(
    1
    for _, example in test.iterrows()
    if example[class_name] == solution.inferencia(example)
)

print('Porcentagem de acertos = ', count_acertos / len(test) * 100, '%')

Porcentagem de acertos =  62.5 %


---
# Parte 2: Classificador a priori

## Classificador por Média

In [13]:
# Classificador utilizando a média
media = 0
for i in range(len(train)):
    media += train.iloc[i][class_name]
class_media = round(media / len(train))

In [14]:
# Visualizando algumas respostas
num_views = 5
for i in range(num_views):
    real = test.iloc[i][class_name]
    predicted = class_media
    print('---------------------------')
    print('Real = ', translators[class_name][real], '| Predicted = ', translators[class_name][predicted])

---------------------------
Real =  I | Predicted =  IV
---------------------------
Real =  I | Predicted =  IV
---------------------------
Real =  IV | Predicted =  IV
---------------------------
Real =  I | Predicted =  IV
---------------------------
Real =  I | Predicted =  IV


### Desempenho

In [15]:
# Number of test rows whose true label equals the constant prediction
count_acertos = sum(
    1 for label in test[class_name].to_numpy() if label == class_media
)

print('Porcentagem de acertos = ', count_acertos / len(test) * 100, '%')

Porcentagem de acertos =  6.8181818181818175 %


## Classificador por Moda

In [16]:
# Classificador utilizando a moda
moda = {}
for i in range(len(train)):
    if train.iloc[i][class_name] in moda:
        moda[train.iloc[i][class_name]] += 1
    else:
        moda[train.iloc[i][class_name]] = 1

value_moda = -1
class_moda = -1
for key in moda:
    if moda[key] > value_moda:
        value_moda = moda[key]
        class_moda = key

### Desempenho

In [17]:
# Number of test rows whose true label equals the most frequent training label
count_acertos = sum(
    1 for label in test[class_name].to_numpy() if label == class_moda
)

print('Porcentagem de acertos = ', count_acertos / len(test) * 100, '%')

Porcentagem de acertos =  72.72727272727273 %
