# Notebook para treinamento do modelo Decision Tree

## Imports

In [228]:
import pandas as pd
import pydotplus
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

## Leitura dos dados

In [202]:
# Load the balanced, preprocessed dataset from disk.
dataset_path = "../data/processed/sim_2006_2017_balanced.parquet"
df = pd.read_parquet(dataset_path)

# Bare last expression: shows the (truncated) frame as the cell output.
df

Unnamed: 0,ESC,ESTCIV,IDADE,NATURAL,OCUP,RACACOR,MASCULINO,FEMININO,SUICIDIO
0,-0.862039,-0.611299,-0.690103,-1.796410,-0.349766,-0.820889,False,True,0
1,-0.862039,1.180033,-0.711939,-1.796410,-0.350851,-0.820889,True,False,0
2,-0.739565,-0.867204,-0.801467,0.549301,-0.350509,-0.820889,True,False,0
3,-0.862039,-0.355395,-0.694470,0.549301,-0.350380,-0.820889,False,True,0
4,-0.047954,-0.611299,-0.694470,0.549301,-0.349766,0.076121,False,True,0
...,...,...,...,...,...,...,...,...,...
272141,0.882637,-0.014159,0.458619,-0.623555,2.317550,1.574331,False,True,1
272142,0.882637,-0.014159,1.482439,-0.623555,-0.350724,1.574331,False,True,1
272143,1.423335,0.434921,1.041437,-0.623555,2.317550,-0.028007,True,False,1
272144,1.006996,0.434921,1.015247,-0.623555,-0.350724,1.574331,True,False,1


## Separação das variáveis para treinamento

In [203]:
# Columns used as model inputs, and the column holding the label to predict.
feature_columns = [
    "ESC",
    "ESTCIV",
    "IDADE",
    "NATURAL",
    "OCUP",
    "RACACOR",
    "MASCULINO",
    "FEMININO",
]
target_column = "SUICIDIO"

# Convert to plain arrays for scikit-learn.
X = df[feature_columns].values
y = df[target_column].values

In [204]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## Instanciação e Treinamento do modelo

In [205]:
# Seed the estimator: scikit-learn randomly permutes the candidate features at
# every split (even with the default "best" splitter), so without random_state
# the fitted tree, and therefore the metrics reported below, can change from
# one run to the next. 42 matches the seed used for the train/test split.
clf = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

## Predict e Avaliação

In [206]:
# Predict the class label of every sample in the held-out test set.
y_pred = clf.predict(X=X_test)

In [208]:
# Compute both evaluation artefacts first, then print them under their headings.
report_text = classification_report(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print("Relatório de Classificação:\n", report_text)
print("Matriz de Confusão:\n", confusion)

Relatório de Classificação:
               precision    recall  f1-score   support

           0       0.89      0.91      0.90     40570
           1       0.91      0.89      0.90     41074

    accuracy                           0.90     81644
   macro avg       0.90      0.90      0.90     81644
weighted avg       0.90      0.90      0.90     81644

Matriz de Confusão:
 [[36867  3703]
 [ 4483 36591]]


### Representação Gráfica da Árvore

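* O código abaixo exporta para o path '../docs/tree.pdf' a árvore de decisão gerada pelo treinamento. A árvore é treinada sem limite de profundidade; apenas a representação gráfica é limitada aos 5 primeiros níveis (`max_depth=5`).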

In [230]:
# Feature names, in the same column order that was used to build X. Listed
# explicitly (as a plain list) instead of slicing df.columns by position, so
# the labels cannot silently drift if the frame gains or reorders columns.
feature_names = [
    "ESC",
    "ESTCIV",
    "IDADE",
    "NATURAL",
    "OCUP",
    "RACACOR",
    "MASCULINO",
    "FEMININO",
]

# export_graphviz expects class names in ascending order of the class labels
# (i.e. the order of clf.classes_). The previous list had them inverted, so
# label 0 was drawn as "Suicídio". Assumes SUICIDIO == 1 marks suicide records.
class_labels = {0: "Não Suicídio", 1: "Suicídio"}
class_names = [class_labels[int(label)] for label in clf.classes_]

# Render only the first 5 levels of the fitted tree to keep the PDF readable.
dot_data = export_graphviz(
    clf,
    out_file=None,  # return the DOT source as a string instead of writing a file
    max_depth=5,
    feature_names=feature_names,
    class_names=class_names,
    filled=True,
    rounded=True,
    special_characters=True,
)

# Convert the DOT source to a graph and write it out as a PDF.
pydot_graph = pydotplus.graph_from_dot_data(dot_data)
pydot_graph.write_pdf('../docs/tree.pdf')

True