In [150]:
# Exercício de Análise de doenças de soja

In [151]:
# Import de bibliotecas
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Dropout
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import ExtraTreesClassifier

In [152]:
# Carregamento da base de dados
base = pd.read_csv('./dados/soybean.csv')
base.head()

Unnamed: 0,date,plant-stand,precip,temp,hail,crop-hist,area-damaged,severity,seed-tmt,germination,...,sclerotia,fruit-pods,fruit-spots,seed,mold-growth,seed-discolor,seed-size,shriveling,roots,class
0,october,normal,gt-norm,norm,yes,same-lst-yr,low-areas,pot-severe,none,90-100,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
1,august,normal,gt-norm,norm,yes,same-lst-two-yrs,scattered,severe,fungicide,80-89,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
2,july,normal,gt-norm,norm,yes,same-lst-yr,scattered,severe,fungicide,lt-80,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
3,july,normal,gt-norm,norm,yes,same-lst-yr,scattered,severe,none,80-89,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker
4,october,normal,gt-norm,norm,yes,same-lst-two-yrs,scattered,pot-severe,none,lt-80,...,absent,norm,dna,norm,absent,absent,norm,absent,norm,diaporthe-stem-canker


In [153]:
# Campos onde tem valor nulo(?)
base.eq('?').sum()

date                 1
plant-stand         36
precip              38
temp                30
hail               121
crop-hist           16
area-damaged         1
severity           121
seed-tmt           121
germination        112
plant-growth        16
leaves               0
leafspots-halo      84
leafspots-marg      84
leafspot-size       84
leaf-shread        100
leaf-malf           84
leaf-mild          108
stem                16
lodging            121
stem-cankers        38
canker-lesion       38
fruiting-bodies    106
external-decay      38
mycelium            38
int-discolor        38
sclerotia           38
fruit-pods          84
fruit-spots        106
seed                92
mold-growth         92
seed-discolor      106
seed-size           92
shriveling         106
roots               31
class                0
dtype: int64

In [154]:
# limpar a base onde severity = '?'
base = base.loc[base['severity'] != '?']

In [155]:
# Verificar se ainda há outros campos marcados como nulo
base.eq('?').sum()

date               0
plant-stand        0
precip             0
temp               0
hail               0
crop-hist          0
area-damaged       0
severity           0
seed-tmt           0
germination        0
plant-growth       0
leaves             0
leafspots-halo     0
leafspots-marg     0
leafspot-size      0
leaf-shread        0
leaf-malf          0
leaf-mild          0
stem               0
lodging            0
stem-cankers       0
canker-lesion      0
fruiting-bodies    0
external-decay     0
mycelium           0
int-discolor       0
sclerotia          0
fruit-pods         0
fruit-spots        0
seed               0
mold-growth        0
seed-discolor      0
seed-size          0
shriveling         0
roots              0
class              0
dtype: int64

In [156]:
# Verificar quantas instâncias permaneceram na base
base.shape

(562, 36)

In [157]:
# verificar novamente nome dos atributos
base.columns

Index(['date', 'plant-stand', 'precip', 'temp', 'hail', 'crop-hist',
       'area-damaged', 'severity', 'seed-tmt', 'germination', 'plant-growth',
       'leaves', 'leafspots-halo', 'leafspots-marg', 'leafspot-size',
       'leaf-shread', 'leaf-malf', 'leaf-mild', 'stem', 'lodging',
       'stem-cankers', 'canker-lesion', 'fruiting-bodies', 'external-decay',
       'mycelium', 'int-discolor', 'sclerotia', 'fruit-pods', 'fruit-spots',
       'seed', 'mold-growth', 'seed-discolor', 'seed-size', 'shriveling',
       'roots', 'class'],
      dtype='object')

In [158]:
# Agrupado por mês
agrupado = base.groupby(['date']).size()
agrupado

date
april         19
august       118
july          86
june          66
may           51
october       82
september    140
dtype: int64

In [159]:
# Agrupado por padrão de plantas
agrupado2 = base.groupby(['plant-stand']).size()
agrupado2

plant-stand
lt-normal    215
normal       347
dtype: int64

In [160]:
# Agrupado por padrão de plantas
agrupado3 = base.groupby(['precip']).size()
agrupado3

precip
gt-norm    404
lt-norm     74
norm        84
dtype: int64

In [161]:
# Formato de matriz
previsores = base.iloc[:, 0:35].values
classe = base.iloc[:, 35].values
previsores
#classe

array([['october', 'normal', 'gt-norm', ..., 'norm', 'absent', 'norm'],
       ['august', 'normal', 'gt-norm', ..., 'norm', 'absent', 'norm'],
       ['july', 'normal', 'gt-norm', ..., 'norm', 'absent', 'norm'],
       ...,
       ['october', 'lt-normal', 'gt-norm', ..., 'norm', 'absent', 'norm'],
       ['august', 'normal', 'gt-norm', ..., 'norm', 'absent', 'norm'],
       ['september', 'normal', 'gt-norm', ..., 'norm', 'absent', 'norm']],
      dtype=object)

In [162]:
previsores[1,31:36]

array(['absent', 'norm', 'absent', 'norm'], dtype=object)

In [163]:
# processo de label enconder das colunas categóricas
labelencoder0 = LabelEncoder()
previsores[:, 0] = labelencoder0.fit_transform(previsores[:, 0])
labelencoder1 = LabelEncoder()
previsores[:, 1] = labelencoder1.fit_transform(previsores[:, 1])
labelencoder2 = LabelEncoder()
previsores[:, 2] = labelencoder2.fit_transform(previsores[:, 2])
labelencoder3 = LabelEncoder()
previsores[:, 3] = labelencoder3.fit_transform(previsores[:, 3])
labelencoder4 = LabelEncoder()
previsores[:, 4] = labelencoder4.fit_transform(previsores[:, 4])
labelencoder5 = LabelEncoder()
previsores[:, 5] = labelencoder5.fit_transform(previsores[:, 5])
labelencoder6 = LabelEncoder()
previsores[:, 6] = labelencoder6.fit_transform(previsores[:, 6])
labelencoder7 = LabelEncoder()
previsores[:, 7] = labelencoder7.fit_transform(previsores[:, 7])
labelencoder8 = LabelEncoder()
previsores[:, 8] = labelencoder8.fit_transform(previsores[:, 8])
labelencoder9 = LabelEncoder()
previsores[:, 9] = labelencoder9.fit_transform(previsores[:, 9])
labelencoder10 = LabelEncoder()
previsores[:, 10] = labelencoder10.fit_transform(previsores[:, 10])
labelencoder11 = LabelEncoder()
previsores[:, 11] = labelencoder11.fit_transform(previsores[:, 11])
labelencoder12 = LabelEncoder()
previsores[:, 12] = labelencoder12.fit_transform(previsores[:, 12])
labelencoder13 = LabelEncoder()
previsores[:, 13] = labelencoder13.fit_transform(previsores[:, 13])
labelencoder14 = LabelEncoder()
previsores[:, 14] = labelencoder14.fit_transform(previsores[:, 14])
labelencoder15 = LabelEncoder()
previsores[:, 15] = labelencoder15.fit_transform(previsores[:, 15])
labelencoder16 = LabelEncoder()
previsores[:, 16] = labelencoder16.fit_transform(previsores[:, 16])
labelencoder17 = LabelEncoder()
previsores[:, 17] = labelencoder17.fit_transform(previsores[:, 17])
labelencoder18 = LabelEncoder()
previsores[:, 18] = labelencoder18.fit_transform(previsores[:, 18])
labelencoder19 = LabelEncoder()
previsores[:, 19] = labelencoder19.fit_transform(previsores[:, 19])
labelencoder20 = LabelEncoder()
previsores[:, 20] = labelencoder20.fit_transform(previsores[:, 20])
labelencoder21 = LabelEncoder()
previsores[:, 21] = labelencoder21.fit_transform(previsores[:, 21])
labelencoder22 = LabelEncoder()
previsores[:, 22] = labelencoder22.fit_transform(previsores[:, 22])
labelencoder23 = LabelEncoder()
previsores[:, 23] = labelencoder23.fit_transform(previsores[:, 23])
labelencoder24 = LabelEncoder()
previsores[:, 24] = labelencoder24.fit_transform(previsores[:, 24])
labelencoder25 = LabelEncoder()
previsores[:, 25] = labelencoder25.fit_transform(previsores[:, 25])
labelencoder26 = LabelEncoder()
previsores[:, 26] = labelencoder26.fit_transform(previsores[:, 26])
labelencoder27 = LabelEncoder()
previsores[:, 27] = labelencoder27.fit_transform(previsores[:, 27])
labelencoder28 = LabelEncoder()
previsores[:, 28] = labelencoder28.fit_transform(previsores[:, 28])
labelencoder29 = LabelEncoder()
previsores[:, 29] = labelencoder29.fit_transform(previsores[:, 29])
labelencoder30 = LabelEncoder()
previsores[:, 30] = labelencoder30.fit_transform(previsores[:, 30])
labelencoder31 = LabelEncoder()
previsores[:, 31] = labelencoder31.fit_transform(previsores[:, 31])
labelencoder32 = LabelEncoder()
previsores[:, 32] = labelencoder32.fit_transform(previsores[:, 32])
labelencoder33 = LabelEncoder()
previsores[:, 33] = labelencoder33.fit_transform(previsores[:, 33])
labelencoder34 = LabelEncoder()
previsores[:, 34] = labelencoder34.fit_transform(previsores[:, 34])

In [164]:
previsores

array([[5, 1, 0, ..., 1, 0, 1],
       [1, 1, 0, ..., 1, 0, 1],
       [2, 1, 0, ..., 1, 0, 1],
       ...,
       [5, 0, 0, ..., 1, 0, 1],
       [1, 1, 0, ..., 1, 0, 1],
       [6, 1, 0, ..., 1, 0, 1]], dtype=object)

In [165]:
# separa a base em treino e teste
x_treino, x_teste, y_treino, y_teste = train_test_split(previsores, classe, test_size = 0.3, random_state = 0)
print(len(x_treino))

393


In [166]:
# Aplicando o método naive_bayes
naive_bayes = GaussianNB()
naive_bayes.fit(x_treino, y_treino)

GaussianNB()

In [167]:
# Previsões utilizando os registros de teste
previsoes = naive_bayes.predict(x_teste)

In [168]:
# Matriz confusao
confusao = confusion_matrix(y_teste, previsoes)
confusao

array([[24,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 13,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  5,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [13,  0,  0,  0,  8,  0,  0,  0,  0,  0,  4,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0, 14,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  9,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  1,  0,  0,  0,  0,  0,  5,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  7,  0,  0,  0,  0,  0,  0],
       [ 7,  0,  0,  0,  0,  0,  0,  0,  0, 22,  2,  0,  0,  0,  0],
       [ 2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  6,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  5,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  6,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  5,  0],
       [ 0,  0,  0,  0,  0,  0,  0

In [169]:
# Taxa de acertos
tx_acertos = accuracy_score(y_teste, previsoes)
tx_acertos

0.8284023668639053

In [170]:
# Nesse momento temos uma taxa de acertos de 82% É uma taxa baixa, podemos diminuir numero de previsores e trocar modelos de previsao.

In [171]:
# Utilização da ExtraTreeClassifier para verificar as colunas mais importantes para a estatistica
forest = ExtraTreesClassifier()
forest.fit(x_treino, y_treino)
importantes = forest.feature_importances_
importantes

array([0.05243748, 0.01958412, 0.03337937, 0.03066045, 0.02043759,
       0.01733141, 0.02985464, 0.02880636, 0.01842099, 0.01794593,
       0.02795327, 0.02897107, 0.03942333, 0.05087275, 0.06746893,
       0.03148089, 0.01186024, 0.02949449, 0.03834432, 0.0063961 ,
       0.04051226, 0.0534493 , 0.03729856, 0.04492674, 0.00093516,
       0.03781513, 0.01356636, 0.0509411 , 0.04741849, 0.02233714,
       0.02227556, 0.01271498, 0.00451652, 0.00263882, 0.00753015])

In [249]:
# criação da nova base de dados de treino e teste
x_treino2 = x_treino[:, [13, 14, 21, 27]]
x_teste2 = x_teste[:, [13, 14, 21, 27]]

In [250]:
# Treinar novamente o modelo naive-bayes com as cprincipais categorias
naive_bayes.fit(x_treino2, y_treino)

GaussianNB()

In [251]:
# Previsão do modelo refatorado
previsao2 = naive_bayes.predict(x_teste2)

In [252]:
# Taxa de acerto do modelo refatorado
tx_acerto2 = accuracy_score(y_teste, previsao2)
tx_acerto2

0.6331360946745562

In [None]:
# Não houve melhora nos índices com a troca de categorias
# Trocar por random forest