In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.svm import SVC
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
############## SPLITTING INTO TRAIN AND TEST DATA ##############

# Read only the first two columns (key, nome_arquivo) of LBP.csv.
df = pd.read_csv(
    './data/caracteristicas/LBP.csv',
    usecols=[0, 1],
)

df.head()


Unnamed: 0,key,nome_arquivo
0,abethr1,XC128013
1,abethr1,XC128013
2,abethr1,XC128013
3,abethr1,XC128013
4,abethr1,XC128013


In [4]:
# Keep only the first row seen for each nome_arquivo value.
linha_repetida = df.duplicated(subset=['nome_arquivo'])
df = df[~linha_repetida]

df.head()

Unnamed: 0,key,nome_arquivo
0,abethr1,XC128013
5,abethr1,XC363501
7,abethr1,XC363502
12,abethr1,XC363503
16,abethr1,XC363504


In [5]:
# Drop every row whose class (key) occurs only once in df.
ocorrencias_por_classe = df['key'].map(df['key'].value_counts())
df = df[ocorrencias_por_classe > 1]

df.head()

Unnamed: 0,key,nome_arquivo
0,abethr1,XC128013
5,abethr1,XC363501
7,abethr1,XC363502
12,abethr1,XC363503
16,abethr1,XC363504


In [6]:
# Number of rows left in df after the filtering steps.
df.shape[0]

16932

In [7]:
# Class labels (key column) and file names (nome_arquivo column) as plain lists.
classes = df['key'].to_list()
arquivos = df['nome_arquivo'].to_list()

# Decide which files go to training and which go to testing:
# 80/20 split, stratified on the class label, with a fixed seed.
arquivos_treino, arquivos_teste, classes_treino, classes_teste = train_test_split(
    arquivos,
    classes,
    test_size=0.2,
    random_state=42,
    stratify=classes,
)

In [9]:
# Total number of files across the train and test partitions.
sum(map(len, (arquivos_treino, arquivos_teste)))

16932

In [11]:
# Load the complete LBP.csv file.
df_lbp = pd.read_csv('./data/caracteristicas/LBP.csv')

# Boolean masks marking rows whose file was assigned to train / test.
eh_treino = df_lbp['nome_arquivo'].isin(arquivos_treino)
eh_teste = df_lbp['nome_arquivo'].isin(arquivos_teste)

# Rows belonging to training files.
df_lbp_treino = df_lbp.loc[eh_treino]

# Rows belonging to test files.
df_lbp_teste = df_lbp.loc[eh_teste]

# Rows kept by the two partitions versus rows in the full file.
len(df_lbp_treino) + len(df_lbp_teste), len(df_lbp)

(80300, 80322)

In [28]:
# Split the training rows into labels (y_treino) and features (X_treino).
# The labels are taken from the 'key' column.
y_treino = df_lbp_treino['key'].values

# Show how many training labels there are instead of dumping the whole array;
# this mirrors the len(y_teste) check done for the test labels.
len(y_treino)

63788

In [15]:
# Training feature matrix: every column except the label and the file name.
colunas_nao_numericas = ['key', 'nome_arquivo']
X_treino = df_lbp_treino.drop(columns=colunas_nao_numericas).values

print(X_treino)

[[0.11723252 0.11485854 0.03152195 ... 0.09159675 0.21038049 0.32519675]
 [0.11129431 0.10819512 0.03043252 ... 0.08825691 0.24745691 0.30971057]
 [0.10089431 0.10170407 0.02962602 ... 0.05458537 0.3572813  0.24721626]
 ...
 [0.04642276 0.0441626  0.01126179 ... 0.03369431 0.70652683 0.11724878]
 [0.10594146 0.10657561 0.03517724 ... 0.06454309 0.29874472 0.26466341]
 [0.03505691 0.03564553 0.01148943 ... 0.0203187  0.76922927 0.08722602]]


In [29]:
# Split the test rows into labels (y_teste) and features (X_teste).
# The labels are taken from the 'key' column.
y_teste = df_lbp_teste.loc[:, 'key'].values

# Number of test labels.
len(y_teste)

16512

In [17]:
# Test feature matrix: every column except the label and the file name.
colunas_nao_numericas = ['key', 'nome_arquivo']
X_teste = df_lbp_teste.drop(columns=colunas_nao_numericas).values

print(X_teste)

[[0.12478699 0.12187642 0.03265366 ... 0.0937561  0.18260813 0.33754472]
 [0.12578211 0.12265691 0.0329561  ... 0.09378537 0.18322927 0.33961626]
 [0.12601626 0.12188943 0.03275447 ... 0.09371382 0.18407805 0.33842276]
 ...
 [0.11706667 0.12113171 0.03398699 ... 0.09169106 0.18202927 0.32663089]
 [0.12290732 0.12068293 0.03298862 ... 0.09253984 0.18018862 0.33395447]
 [0.03149268 0.03250081 0.0094374  ... 0.02364228 0.78240976 0.086     ]]


In [22]:
############## STANDARDIZING THE DATA ##############

# Fit the scaler on the training features only ...
scaler = StandardScaler().fit(X_treino)

# ... and then replace the training features with their standardized version.
X_treino = scaler.transform(X_treino)

print(X_treino)



[[ 0.61672194  0.63756332  0.56510034 ...  1.01366183 -0.80137717
   0.88745148]
 [ 0.45345604  0.44522274  0.4545512  ...  0.90160863 -0.64298061
   0.72965294]
 [ 0.16751719  0.25785729  0.37271183 ... -0.22808841 -0.17379279
   0.09285859]
 ...
 [-1.33013002 -1.40308671 -1.49078372 ... -0.92899374  1.31824141
  -1.23146291]
 [ 0.30628426  0.39847524  0.93601746 ...  0.10599816 -0.42387069
   0.27063876]
 [-1.64262416 -1.64893345 -1.4676839  ... -1.37775209  1.58611652
  -1.53738399]]


In [23]:
# Standardize the test features with the scaler that was fitted on the
# training features (transform only, no re-fitting).
X_teste = scaler.transform(X_teste)

print(X_teste)

[[ 0.82442548  0.84013578  0.67993945 ...  1.08610908 -0.92002524
   1.01327279]
 [ 0.85178549  0.86266469  0.71062921 ...  1.08709105 -0.91737163
   1.03438108]
 [ 0.85822314  0.84051126  0.69016937 ...  1.08469069 -0.9137455
   1.02221979]
 ...
 [ 0.61216193  0.81863944  0.8152384  ...  1.01682595 -0.92249823
   0.90206491]
 [ 0.77274548  0.80568532  0.71392919 ...  1.04530295 -0.93036179
   0.97668951]
 [-1.74061947 -1.73970619 -1.67591229 ... -1.26624442  1.64242573
  -1.54987666]]


In [24]:
############## HANDLING NaN VALUES ##############

# Replace NaN entries with 0 in both feature matrices.
X_treino, X_teste = (np.nan_to_num(matriz) for matriz in (X_treino, X_teste))

In [25]:
############## TRAINING THE MODEL ##############

# Build a linear-kernel SVM classifier (C=1, fixed seed) and train it on the
# training features and labels; fit() returns the fitted estimator itself.
svm = SVC(C=1, kernel='linear', random_state=42).fit(X_treino, y_treino)

# Predict one label per row of the test features.
y_pred = svm.predict(X_teste)

In [27]:
# Number of predictions produced for the test rows.
y_pred.shape[0]

16512

In [30]:
# Row-level accuracy: predicted label versus true label for every test row.
acuracia = accuracy_score(y_teste, y_pred)
print('Acurácia: ', acuracia)

Acurácia:  0.13680959302325582


In [51]:
############## VOTING ##############

# Work on an independent copy of the test rows so that the 'predicao' column
# can be added later without touching df_lbp_teste.
df_lbp_teste_votacao = df_lbp_teste.copy(deep=True)

df_lbp_teste_votacao

Unnamed: 0,key,nome_arquivo,LBP_0,LBP_1,LBP_2,LBP_3,LBP_4,LBP_5,LBP_6,LBP_7,LBP_8,LBP_9
0,abethr1,XC128013,0.124787,0.121876,0.032654,0.025063,0.020810,0.029766,0.031135,0.093756,0.182608,0.337545
1,abethr1,XC128013,0.125782,0.122657,0.032956,0.024533,0.020517,0.025828,0.031096,0.093785,0.183229,0.339616
2,abethr1,XC128013,0.126016,0.121889,0.032754,0.024826,0.021333,0.026485,0.030481,0.093714,0.184078,0.338423
3,abethr1,XC128013,0.128150,0.122937,0.031665,0.023587,0.020117,0.026010,0.030878,0.095041,0.181668,0.339948
4,abethr1,XC128013,0.094976,0.091967,0.023980,0.018377,0.015681,0.020667,0.022722,0.069434,0.387301,0.254894
...,...,...,...,...,...,...,...,...,...,...,...,...
80300,yewgre1,XC703472,0.079070,0.094224,0.030881,0.035272,0.033135,0.036820,0.020771,0.044982,0.427220,0.197626
80301,yewgre1,XC703472,0.014940,0.016833,0.005405,0.005610,0.004911,0.010231,0.003545,0.007213,0.897837,0.033476
80302,yewgre1,XC703485,0.117067,0.121132,0.033987,0.029193,0.027024,0.035590,0.035655,0.091691,0.182029,0.326631
80303,yewgre1,XC703485,0.122907,0.120683,0.032989,0.026894,0.024403,0.030485,0.034956,0.092540,0.180189,0.333954


In [52]:
# Attach the row-level predictions as a new 'predicao' column.
df_lbp_teste_votacao = df_lbp_teste_votacao.assign(predicao=y_pred)

df_lbp_teste_votacao

Unnamed: 0,key,nome_arquivo,LBP_0,LBP_1,LBP_2,LBP_3,LBP_4,LBP_5,LBP_6,LBP_7,LBP_8,LBP_9,predicao
0,abethr1,XC128013,0.124787,0.121876,0.032654,0.025063,0.020810,0.029766,0.031135,0.093756,0.182608,0.337545,eubeat1
1,abethr1,XC128013,0.125782,0.122657,0.032956,0.024533,0.020517,0.025828,0.031096,0.093785,0.183229,0.339616,hoopoe
2,abethr1,XC128013,0.126016,0.121889,0.032754,0.024826,0.021333,0.026485,0.030481,0.093714,0.184078,0.338423,hoopoe
3,abethr1,XC128013,0.128150,0.122937,0.031665,0.023587,0.020117,0.026010,0.030878,0.095041,0.181668,0.339948,eubeat1
4,abethr1,XC128013,0.094976,0.091967,0.023980,0.018377,0.015681,0.020667,0.022722,0.069434,0.387301,0.254894,eubeat1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
80300,yewgre1,XC703472,0.079070,0.094224,0.030881,0.035272,0.033135,0.036820,0.020771,0.044982,0.427220,0.197626,thrnig1
80301,yewgre1,XC703472,0.014940,0.016833,0.005405,0.005610,0.004911,0.010231,0.003545,0.007213,0.897837,0.033476,thrnig1
80302,yewgre1,XC703485,0.117067,0.121132,0.033987,0.029193,0.027024,0.035590,0.035655,0.091691,0.182029,0.326631,eubeat1
80303,yewgre1,XC703485,0.122907,0.120683,0.032989,0.026894,0.024403,0.030485,0.034956,0.092540,0.180189,0.333954,eubeat1


In [53]:
def _classe_mais_votada(predicoes):
    """Return the label that heads the value_counts() ranking of `predicoes`."""
    contagem = predicoes.value_counts()
    return contagem.index[0]


# Vote per file: group the row-level predictions by nome_arquivo and keep the
# most frequent predicted label of each group.
predicoes_por_arquivo = df_lbp_teste_votacao.groupby('nome_arquivo')['predicao']
df_lbp_teste_votacao = predicoes_por_arquivo.agg(_classe_mais_votada).reset_index()

df_lbp_teste_votacao

Unnamed: 0,nome_arquivo,predicao
0,XC109029,thrnig1
1,XC109030,cohmar1
2,XC113071,comsan
3,XC113256,hoopoe
4,XC113284,combuz1
...,...,...
3382,XC756890,hoopoe
3383,XC756903,cohmar1
3384,XC757503,thrnig1
3385,XC757553,thrnig1


In [56]:
# Build a frame with the true classes: keep the first row of every test file.
primeira_linha_do_arquivo = ~df_lbp_teste.duplicated(subset=['nome_arquivo'], keep='first')
df_lbp_teste_real = df_lbp_teste[primeira_linha_do_arquivo]

df_lbp_teste_real

Unnamed: 0,key,nome_arquivo,LBP_0,LBP_1,LBP_2,LBP_3,LBP_4,LBP_5,LBP_6,LBP_7,LBP_8,LBP_9
0,abethr1,XC128013,0.124787,0.121876,0.032654,0.025063,0.020810,0.029766,0.031135,0.093756,0.182608,0.337545
42,abethr1,XC616997,0.049707,0.048849,0.016289,0.014420,0.012728,0.024511,0.011811,0.027902,0.682735,0.111047
51,abethr1,XC756300,0.031353,0.029226,0.008124,0.005584,0.006120,0.014527,0.007151,0.021769,0.796250,0.079896
70,abhori1,XC128202,0.120130,0.111831,0.028146,0.025070,0.021015,0.027424,0.014348,0.033958,0.378208,0.239870
77,abhori1,XC138433,0.127724,0.120172,0.032881,0.022953,0.019447,0.030459,0.031600,0.091714,0.187665,0.335385
...,...,...,...,...,...,...,...,...,...,...,...,...
80228,yewgre1,XC479061,0.110680,0.115652,0.040602,0.034728,0.029743,0.040771,0.036007,0.091463,0.189535,0.310820
80292,yewgre1,XC700545,0.064078,0.049584,0.013340,0.010901,0.008244,0.019727,0.006351,0.014289,0.727928,0.085558
80294,yewgre1,XC700615,0.120800,0.118836,0.034156,0.029639,0.027054,0.034859,0.034459,0.092511,0.181486,0.326202
80299,yewgre1,XC703472,0.076263,0.093239,0.031063,0.037538,0.037128,0.040780,0.021672,0.046888,0.417720,0.197707


In [57]:
# Join the true classes with the voted predictions, one row per file.
# This cell overwrites its own input, so a second run would otherwise merge a
# frame that already has 'predicao' and produce 'predicao_x'/'predicao_y'.
# Dropping any existing 'predicao' column first keeps the cell re-runnable.
# validate='one_to_one' makes pandas raise if nome_arquivo is not unique on
# either side (for example, if the voting cell has not been run yet).
df_lbp_teste_real = (
    df_lbp_teste_real
    .drop(columns=['predicao'], errors='ignore')
    .merge(df_lbp_teste_votacao, on='nome_arquivo', validate='one_to_one')
)

df_lbp_teste_real

Unnamed: 0,key,nome_arquivo,LBP_0,LBP_1,LBP_2,LBP_3,LBP_4,LBP_5,LBP_6,LBP_7,LBP_8,LBP_9,predicao
0,abethr1,XC128013,0.124787,0.121876,0.032654,0.025063,0.020810,0.029766,0.031135,0.093756,0.182608,0.337545,eubeat1
1,abethr1,XC616997,0.049707,0.048849,0.016289,0.014420,0.012728,0.024511,0.011811,0.027902,0.682735,0.111047,thrnig1
2,abethr1,XC756300,0.031353,0.029226,0.008124,0.005584,0.006120,0.014527,0.007151,0.021769,0.796250,0.079896,combuz1
3,abhori1,XC128202,0.120130,0.111831,0.028146,0.025070,0.021015,0.027424,0.014348,0.033958,0.378208,0.239870,thrnig1
4,abhori1,XC138433,0.127724,0.120172,0.032881,0.022953,0.019447,0.030459,0.031600,0.091714,0.187665,0.335385,combuz1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3382,yewgre1,XC479061,0.110680,0.115652,0.040602,0.034728,0.029743,0.040771,0.036007,0.091463,0.189535,0.310820,hoopoe
3383,yewgre1,XC700545,0.064078,0.049584,0.013340,0.010901,0.008244,0.019727,0.006351,0.014289,0.727928,0.085558,thrnig1
3384,yewgre1,XC700615,0.120800,0.118836,0.034156,0.029639,0.027054,0.034859,0.034459,0.092511,0.181486,0.326202,cohmar1
3385,yewgre1,XC703472,0.076263,0.093239,0.031063,0.037538,0.037128,0.040780,0.021672,0.046888,0.417720,0.197707,thrnig1


In [58]:
############## EVALUATING THE MODEL ##############

# Print the per-class report at file level: true class ('key') versus the
# voted prediction ('predicao').
# zero_division=0 still reports 0.0 for classes that are never predicted, but
# does so without raising an undefined-metric warning for each of them.
print(classification_report(
    df_lbp_teste_real['key'],
    df_lbp_teste_real['predicao'],
    zero_division=0,
))


              precision    recall  f1-score   support

     abethr1       0.00      0.00      0.00         3
     abhori1       0.00      0.00      0.00        25
     abythr1       0.00      0.00      0.00         6
     afbfly1       0.00      0.00      0.00         4
     afdfly1       0.00      0.00      0.00         6
     afecuc1       0.00      0.00      0.00        18
     affeag1       0.00      0.00      0.00        10
     afgfly1       0.00      0.00      0.00         2
     afghor1       0.00      0.00      0.00        14
     afmdov1       0.00      0.00      0.00         7
     afpfly1       0.00      0.00      0.00        21
     afpwag1       0.00      0.00      0.00        16
     afrgos1       0.00      0.00      0.00        11
     afrgrp1       0.00      0.00      0.00         5
     afrjac1       0.00      0.00      0.00         6
     afrthr1       0.00      0.00      0.00         9
     amesun2       0.00      0.00      0.00         9
     augbuz1       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
