# Árvore de Decisão

## Importação e Limpeza

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.tree import export_graphviz
import graphviz

In [3]:
# Read the insurance data from a CSV given by relative path, then display the frame
dataset = pd.read_csv(filepath_or_buffer='insurance.csv')
dataset

Unnamed: 0.1,Unnamed: 0,GoodStudent,Age,SocioEcon,RiskAversion,VehicleYear,ThisCarDam,RuggedAuto,Accident,MakeModel,...,HomeBase,AntiTheft,PropCost,OtherCarCost,OtherCar,MedCost,Cushioning,Airbag,ILiCost,DrivHist
0,1,False,Adult,Prole,Adventurous,Older,Moderate,EggShell,Mild,Economy,...,City,False,TenThou,Thousand,True,Thousand,Poor,False,Thousand,Many
1,2,False,Senior,Prole,Cautious,Current,,Football,,Economy,...,City,True,Thousand,Thousand,True,Thousand,Good,True,Thousand,Zero
2,3,False,Senior,UpperMiddle,Psychopath,Current,,Football,,FamilySedan,...,City,False,Thousand,Thousand,False,Thousand,Good,True,Thousand,One
3,4,False,Adolescent,Middle,Normal,Older,,EggShell,,Economy,...,Suburb,False,Thousand,Thousand,True,Thousand,Fair,False,Thousand,Zero
4,5,False,Adolescent,Prole,Normal,Older,Moderate,Football,Moderate,Economy,...,City,False,TenThou,Thousand,False,Thousand,Fair,False,Thousand,Many
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,19996,False,Adult,Prole,Adventurous,Older,Mild,Football,Mild,Economy,...,City,False,Thousand,Thousand,True,Thousand,Fair,False,Thousand,Many
19996,19997,False,Adult,Middle,Normal,Older,,Tank,,FamilySedan,...,Suburb,False,Thousand,Thousand,True,Thousand,Good,False,Thousand,Zero
19997,19998,False,Senior,UpperMiddle,Normal,Current,,Football,,Luxury,...,Secure,True,TenThou,Thousand,False,Thousand,Excellent,True,Thousand,Zero
19998,19999,False,Adult,Middle,Normal,Older,,Football,,FamilySedan,...,Suburb,False,Thousand,Thousand,True,Thousand,Good,True,Thousand,Zero


### Limpeza e Separação das Variáveis

In [4]:
# Keep only the rows that have no missing value in any column.
# NOTE(review): in the preview above ThisCarDam and Accident are blank on the
# same rows, and the 30% test split further down holds 1596 rows, so roughly
# three quarters of the ~20k loaded rows appear to be discarded here.
# Presumably the CSV spells the "no accident" level as the text None, which
# read_csv treats as missing by default -- TODO confirm against the raw file.
dataset = dataset.dropna(axis=0, how='any')

In [5]:
# Remove the 'Unnamed: 0' column, which in the preview above holds a 1-based
# row counter rather than a feature.
# errors='ignore' keeps the cell re-runnable: without it a second execution
# raises KeyError because the column has already been removed from `dataset`.
dataset = dataset.drop(columns=['Unnamed: 0'], errors='ignore')

In [6]:
# Column 7 of the cleaned frame is the class to predict; the predictors are
# the remaining positions 0-6 and 8-26.
indice_alvo = 7
indices_preditores = [i for i in range(27) if i != indice_alvo]

y = dataset.iloc[:, indice_alvo].values  # class (dependent variable)
X = dataset.iloc[:, indices_preditores].values  # independent variables

### Criando Label

In [7]:
# Encoder instance that the next cell refits on each feature column in turn
labelEncoder = LabelEncoder()

In [8]:
# Label-encode the feature matrix column by column, in place.
# X is a single NumPy array, so every column slice reports X.dtype: the dtype
# test cannot differ between columns and is therefore done once up front.
# A separate fitted encoder is kept per column position so the integer codes
# can be mapped back to the original categories (inverse_transform) or applied
# to new rows; refitting one shared encoder would keep only the last mapping.
# NOTE(review): scikit-learn documents LabelEncoder for target values;
# OrdinalEncoder is its counterpart for feature matrices.
codificadores_X = {}
if X.dtype == object:
    for i in range(X.shape[1]):
        codificadores_X[i] = LabelEncoder()
        X[:, i] = codificadores_X[i].fit_transform(X[:, i])

In [9]:
# Turn the class labels into integer codes when they are stored as objects;
# the fitted encoder stays available as `labelEncoder_y`.
if y.dtype == object:
    labelEncoder_y = LabelEncoder().fit(y)
    y = labelEncoder_y.transform(y)

### Dividindo os dados em Treino e Teste

In [10]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_treinamento, X_teste, y_treinamento, y_teste = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

## Criando o Modelo

In [11]:
# Baseline tree: default hyperparameters, fixed seed. fit() returns the
# estimator itself, so the trailing expression displays the fitted model.
modelo = DecisionTreeClassifier(random_state=1).fit(X_treinamento, y_treinamento)
modelo

Modificando/inserindo novos parâmetros no modelo: diferentemente do Naive Bayes, a árvore de decisão possui hiperparâmetros que podem ser ajustados.

- Gerando novo modelo e verificando se influencia a performance.

- max_depth é a profundidade máxima da árvore de decisão.

In [24]:
# Second version: same seed, tree depth capped at 8. This rebinds `modelo`.
modelo = DecisionTreeClassifier(max_depth=8, random_state=1).fit(X_treinamento, y_treinamento)
modelo

- Gerando uma terceira versão do modelo, inserindo o parâmetro max_leaf_nodes, que limita o número de folhas da árvore.

In [28]:
# Third version: depth capped at 8 and at most 8 leaves. This rebinds `modelo`.
modelo = DecisionTreeClassifier(
    max_leaf_nodes=8,
    max_depth=8,
    random_state=1,
).fit(X_treinamento, y_treinamento)
modelo

## Criando Previsões

In [12]:
# Predictions of the baseline tree (default hyperparameters).
# `modelo` is rebound by the two tuning cells placed above this one, so on a
# top-to-bottom run it already holds the max_leaf_nodes tree when this cell
# executes. Fitting the baseline under its own name ties these predictions to
# the intended model and leaves `modelo` untouched for the cells that follow.
modelo_base = DecisionTreeClassifier(random_state=1).fit(X_treinamento, y_treinamento)
previsoes = modelo_base.predict(X_teste)

### Calculando as métricas

In [13]:
# Fraction of test rows whose predicted class matches the true class
accuracy = accuracy_score(y_true=y_teste, y_pred=previsoes)

In [15]:
# Display the accuracy computed in the previous cell
accuracy

0.8145363408521303

In [16]:
# Per-class precision averaged with weights proportional to class support
precision = precision_score(y_true=y_teste, y_pred=previsoes, average='weighted')

In [17]:
# Display the weighted precision computed in the previous cell
precision

0.8131178791705445

In [18]:
# Per-class recall averaged with weights proportional to class support
recall = recall_score(y_true=y_teste, y_pred=previsoes, average='weighted')

In [19]:
# Display the weighted recall computed in the previous cell
recall

0.8145363408521303

In [20]:
# Per-class F1 averaged with weights proportional to class support
f1 = f1_score(y_true=y_teste, y_pred=previsoes, average='weighted')

In [21]:
# Display the weighted F1 score computed in the previous cell
f1

0.8136728500430954

Valores das métricas do modelo 1:
- Acurácia: 0.8145363408521303
- Precisão com peso: 0.8131178791705445
- Recall: 0.8145363408521303
- F1 Score: 0.8136728500430954


In [22]:
# Per-class precision / recall / F1 table for the current predictions
report = classification_report(y_true=y_teste, y_pred=previsoes)
print(report)

              precision    recall  f1-score   support

           0       0.86      0.90      0.88       423
           1       0.71      0.69      0.70       497
           2       0.86      0.85      0.86       676

    accuracy                           0.81      1596
   macro avg       0.81      0.81      0.81      1596
weighted avg       0.81      0.81      0.81      1596



- Criando as previsões para o novo modelo com o max_depth

In [25]:
# Predictions of the depth-limited tree (max_depth=8).
# On a top-to-bottom run `modelo` has already been rebound to the
# max_leaf_nodes tree by the time this cell executes, so the depth-limited
# tree is fitted here under its own name. `modelo` is left untouched because
# later cells still rely on it.
modelo_profundidade = DecisionTreeClassifier(random_state=1, max_depth=8).fit(X_treinamento, y_treinamento)
previsoes = modelo_profundidade.predict(X_teste)

In [26]:
# Score the current predictions: plain accuracy plus the three
# support-weighted metrics, then print them on one line.
accuracy = accuracy_score(y_teste, previsoes)
precision, recall, f1 = (
    metrica(y_teste, previsoes, average='weighted')
    for metrica in (precision_score, recall_score, f1_score)
)
print(f'Acuracia: {accuracy}, Precisão: {precision}, Recall {recall}, F1: {f1}')

Acuracia: 0.8540100250626567, Precisão: 0.8542274316892314, Recall 0.8540100250626567, F1: 0.8525408663115527


Valores das métricas do modelo 2:
- Acurácia: 0.8540100250626567
- Precisão com peso: 0.8542274316892314
- Recall: 0.8540100250626567
- F1 Score: 0.8525408663115527

Criando a previsão do terceiro modelo com max_leaf_nodes.

In [29]:
# Predict with `modelo`; in document order its latest binding is the tree
# fitted with max_depth=8 and max_leaf_nodes=8.
previsoes = modelo.predict(X_teste)

In [30]:
# Score the current predictions: plain accuracy plus the three
# support-weighted metrics, then print them on one line.
accuracy = accuracy_score(y_teste, previsoes)
precision, recall, f1 = (
    metrica(y_teste, previsoes, average='weighted')
    for metrica in (precision_score, recall_score, f1_score)
)
print(f'Acuracia: {accuracy}, Precisão: {precision}, Recall {recall}, F1: {f1}')

Acuracia: 0.8609022556390977, Precisão: 0.8639364702212439, Recall 0.8609022556390977, F1: 0.860829227507376


Valores das métricas do modelo 3:
- Acurácia: 0.8609022556390977
- Precisão com peso: 0.8639364702212439
- Recall: 0.8609022556390977
- F1 Score: 0.860829227507376


## Visualização da Árvore Criada

In [31]:
# Export the fitted tree to Graphviz DOT text and render it as a PNG file.
# X was built from every column of `dataset` except the target at position 7,
# so the labels must skip that same column. `dataset.columns[:-1]` drops the
# last column instead, which shifts the label of every feature from position 7
# onward by one.
nomes_das_features = list(dataset.columns.delete(7))
dot_data = export_graphviz(
    modelo,
    out_file=None,
    filled=True,
    feature_names=nomes_das_features,
    class_names=True,
    rounded=True,
)

graph = graphviz.Source(dot_data)
graph.render("decision_tree", format="png")

'decision_tree3.png'