In [None]:
#Probabilidade da ocorrência de doenças em uma plantação de soja
#algoritimo decision tree
#autor: Fernando Schmitt

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
import graphviz
from sklearn.tree import export_graphviz

In [2]:
#carregando a base de dados a ser trabalhada
dados = pd.read_csv("soybean.csv", sep = ',')
dados.head()

Unnamed: 0,date,plant-stand,precip,temp,hail,crop-hist,area-damaged,severity,seed-tmt,germination,...,sclerotia,fruit-pods,fruit-spots,seed,mold-growth,seed-discolor,seed-size,shriveling,roots,class
0,october,normal,gt-norm,norm,yes,same-lst-yr,low-areas,pot-severe,none,90-100,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
1,august,normal,gt-norm,norm,yes,same-lst-two-yrs,scattered,severe,fungicide,80-89,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
2,july,normal,gt-norm,norm,yes,same-lst-yr,scattered,severe,fungicide,lt-80,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
3,july,normal,gt-norm,norm,yes,same-lst-yr,scattered,severe,none,80-89,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
4,october,normal,gt-norm,norm,yes,same-lst-two-yrs,scattered,pot-severe,none,lt-80,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker


In [5]:
#convertendo os dados para arrays
previsor = dados.iloc[:, 0:35].values
classe = dados.iloc[: , 35].values
previsor

array([['october', 'normal', 'gt-norm', ..., 'norm', 'absent', 'norm'],
       ['august', 'normal', 'gt-norm', ..., 'norm', 'absent', 'norm'],
       ['july', 'normal', 'gt-norm', ..., 'norm', 'absent', 'norm'],
       ...,
       ['june', 'lt-normal', '?', ..., '?', '?', 'rotted'],
       ['april', 'lt-normal', '?', ..., '?', '?', 'rotted'],
       ['june', 'lt-normal', '?', ..., '?', '?', 'rotted']], dtype=object)

In [6]:
classe

array(['diaporthe-stem-canker', 'diaporthe-stem-canker',
       'diaporthe-stem-canker', 'diaporthe-stem-canker',
       'diaporthe-stem-canker', 'diaporthe-stem-canker',
       'diaporthe-stem-canker', 'diaporthe-stem-canker',
       'diaporthe-stem-canker', 'diaporthe-stem-canker', 'charcoal-rot',
       'charcoal-rot', 'charcoal-rot', 'charcoal-rot', 'charcoal-rot',
       'charcoal-rot', 'charcoal-rot', 'charcoal-rot', 'charcoal-rot',
       'charcoal-rot', 'rhizoctonia-root-rot', 'rhizoctonia-root-rot',
       'rhizoctonia-root-rot', 'rhizoctonia-root-rot',
       'rhizoctonia-root-rot', 'rhizoctonia-root-rot',
       'rhizoctonia-root-rot', 'rhizoctonia-root-rot',
       'rhizoctonia-root-rot', 'rhizoctonia-root-rot', 'phytophthora-rot',
       'phytophthora-rot', 'phytophthora-rot', 'phytophthora-rot',
       'phytophthora-rot', 'phytophthora-rot', 'phytophthora-rot',
       'phytophthora-rot', 'phytophthora-rot', 'phytophthora-rot',
       'phytophthora-rot', 'phytophthora-rot'

In [7]:
#transformando valores semânticos da variavel previsor em numerais
encoder = LabelEncoder()

for n in range(35):
    previsor[: , n] = encoder.fit_transform(previsor[:, n])
    
print(previsor)    

[[6 2 1 ... 2 1 2]
 [2 2 1 ... 2 1 2]
 [3 2 1 ... 2 1 2]
 ...
 [4 1 0 ... 0 0 3]
 [1 1 0 ... 0 0 3]
 [4 1 0 ... 0 0 3]]


In [28]:
#separando o conjunto em treino e teste
X_treino, x_teste, Y_treino, y_teste = train_test_split(previsor, classe, test_size = 0.3 , random_state = 0)
print(len(X_treino), len(x_teste), len(Y_treino), len(y_teste))

478 205 478 205


In [29]:
#Criando e treinando o classificador da árvore de decisão
tree = DecisionTreeClassifier()
tree.fit(X_treino, Y_treino)

DecisionTreeClassifier()

In [18]:
#exportando o arquivo para visualização
export_graphviz(tree, out_file = 'arvore.dot')

In [30]:
#obtendo previsões com o modelo treinado
prev = tree.predict(x_teste)

In [31]:
print(prev)

['brown-spot' 'phytophthora-rot' 'herbicide-injury' 'purple-seed-stain'
 'cyst-nematode' 'bacterial-blight' 'brown-stem-rot' 'bacterial-pustule'
 'brown-spot' 'alternarialeaf-spot' 'charcoal-rot' 'bacterial-blight'
 'anthracnose' 'anthracnose' 'alternarialeaf-spot' 'brown-spot'
 'alternarialeaf-spot' 'alternarialeaf-spot' 'rhizoctonia-root-rot'
 'phytophthora-rot' 'anthracnose' 'brown-spot' 'phyllosticta-leaf-spot'
 'phyllosticta-leaf-spot' 'diaporthe-stem-canker' 'brown-spot'
 'anthracnose' 'diaporthe-stem-canker' 'bacterial-blight'
 'phytophthora-rot' 'frog-eye-leaf-spot' 'alternarialeaf-spot'
 'phyllosticta-leaf-spot' 'brown-spot' 'brown-spot' 'bacterial-pustule'
 'brown-stem-rot' 'charcoal-rot' 'brown-spot' 'brown-spot'
 'brown-stem-rot' 'alternarialeaf-spot' 'phytophthora-rot' 'anthracnose'
 'bacterial-pustule' '2-4-d-injury' 'frog-eye-leaf-spot'
 'phyllosticta-leaf-spot' 'phyllosticta-leaf-spot' 'alternarialeaf-spot'
 'phyllosticta-leaf-spot' 'brown-stem-rot' 'phytophthora-rot'
 

In [32]:
#matriz de confusão 
mtx = confusion_matrix(y_teste, prev)
mtx

array([[ 3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0, 25,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2,  0,  0,  0,
         0,  0,  0],
       [ 0,  0, 14,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0,  0,  0,  4,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  2,  0],
       [ 0,  0,  0,  1,  6,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0,  0,  0,  0,  0, 20,  0,  0,  0,  0,  0,  0,  1,  0,  2,  0,
         0,  0,  0],
       [ 0,  0,  0,  0,  0,  1, 12,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  9,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  5,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  6,  0,  0,  0,  0,  0,  0,
         0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  8,  0,  0,  0,  0,  0

In [33]:
#taxa de acerto do modelo
tx_acerto = accuracy_score(y_teste, prev)
tx_acerto

0.8975609756097561

In [34]:
#taxa erro
tx_erro = 1 - tx_acerto
tx_erro

0.10243902439024388