In [1]:
import pandas as pd
import math
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# Apply seaborn's 'darkgrid' style to the figures drawn in this notebook.
sns.set_style('darkgrid')

# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

<h3>Leitura dos dados: Verifica 1 e Verifica 3</h3>

In [2]:
# Load the full table and split it by the `verifica` flag.
df = pd.read_csv('conhecimento.csv')

df1 = df[df.verifica == 1].copy().reset_index(drop=True)
df3_full = df[df.verifica == 3].copy().reset_index(drop=True)

# Draw a verifica == 3 sub-sample with as many ids as there are verifica == 1
# rows. A fixed seed keeps the sub-sample (and every statistic derived from
# it) identical across "Restart & Run All".
# NOTE: choice(..., replace=False) raises ValueError when df3_full has fewer
# rows than df1.
SEED = 42
rng = np.random.RandomState(SEED)
ids_3 = rng.choice(df3_full.id, len(df1), replace=False)

# Filter df3_full rather than df: the ids were drawn from the verifica == 3
# rows, and filtering the whole table would also pick up any row of another
# `verifica` value that happens to share one of those ids.
df3 = df3_full[df3_full.id.isin(ids_3)].copy().reset_index(drop=True)

<h3>Definição das funções usadas</h3>

In [7]:
def normaliza(dado):
    """Min-max scale ``dado`` so its values fall in the [0, 1] interval.

    Parameters
    ----------
    dado : object exposing ``.min()``, ``.max()`` and arithmetic
        In this notebook a ``pandas.Series`` (one DataFrame column).

    Returns
    -------
    Same kind of object as ``dado`` with ``(dado - min) / (max - min)``
    applied. When the input is constant (``max == min``) the result is all
    zeros instead of the NaN produced by a 0/0 division.
    """
    mi = dado.min()
    ma = dado.max()
    amplitude = ma - mi
    if np.ndim(amplitude) == 0:
        # Scalar range (Series / ndarray input): a zero range means every
        # value equals the minimum, so dividing by 1 yields zeros.
        if amplitude == 0:
            amplitude = 1
    else:
        # Per-column ranges (e.g. DataFrame input): patch only the
        # zero-range entries and leave the others untouched.
        amplitude = amplitude.where(amplitude != 0, 1)
    return (dado - mi) / amplitude
def gera_sumario(df, normalizar=False, colunas=None):
    """Build a per-column summary table ordered by standard deviation.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing every column to summarise.
    normalizar : bool, default False
        When True each column is passed through ``normaliza`` before the
        statistics are computed.
    colunas : iterable of str, optional
        Columns to summarise. Defaults to the module-level ``cols`` list, so
        existing calls keep working unchanged.

    Returns
    -------
    pandas.DataFrame
        One row per column with ``coluna``, ``media``, ``mediana``,
        ``variancia`` and ``std`` (pandas defaults, i.e. sample statistics),
        sorted by ``std`` in descending order with a fresh 0..n-1 index.
    """
    if colunas is None:
        # Looked up at call time, so `cols` may be defined after this function.
        colunas = cols

    dados = []
    for c in colunas:
        d = normaliza(df[c]) if normalizar else df[c]
        dados.append([
            c,           # column name
            d.mean(),    # mean
            d.median(),  # median
            d.var(),     # variance (computed directly, not as std ** 2)
            d.std(),     # standard deviation
        ])

    sumario = pd.DataFrame(
        dados, columns=['coluna', 'media', 'mediana', 'variancia', 'std'])
    # sort_values already returns a new frame, so no extra copy is needed.
    return sumario.sort_values('std', ascending=False).reset_index(drop=True)
# Columns summarised by gera_sumario: four index columns followed by the
# medianb2 .. medianb7 columns.
cols = ['ndvi', 'nbrl', 'dif_ndvi', 'dif_dnbrl'] + [
    'medianb{}'.format(banda) for banda in range(2, 8)
]

<h3>Estatística básica, ordenada pelo desvio padrão (std) - Sem normalização</h3>

In [12]:
# Raw (non-normalised) summary of the verifica == 1 rows; the bare name on
# the last line makes the notebook display the table.
sumario1 = gera_sumario(df1, normalizar=False)
sumario1

Unnamed: 0,coluna,media,mediana,variancia,std
0,medianb6,0.621125,0.233781,11.797833,3.434797
1,dif_dnbrl,0.248001,0.2214,0.014324,0.119682
2,dif_ndvi,0.173211,0.155019,0.008907,0.094378
3,nbrl,0.011152,0.009293,0.006253,0.079079
4,ndvi,0.226356,0.224413,0.003595,0.059957
5,medianb7,0.185466,0.183199,0.003146,0.056094
6,medianb5,0.188681,0.188654,0.002306,0.048024
7,medianb4,0.117826,0.114774,0.001087,0.032972
8,medianb3,0.099383,0.096094,0.000519,0.022776
9,medianb2,0.109385,0.106015,0.000305,0.017468


In [13]:
# Raw (non-normalised) summary of the verifica == 3 sub-sample; the bare name
# on the last line makes the notebook display the table.
sumario3 = gera_sumario(df3, normalizar=False)
sumario3

Unnamed: 0,coluna,media,mediana,variancia,std
0,medianb6,0.421727,0.296105,2.316084,1.521869
1,dif_dnbrl,0.290165,0.280261,0.012122,0.110098
2,dif_ndvi,0.233508,0.223623,0.00924,0.096127
3,nbrl,0.174906,0.170084,0.009175,0.095788
4,ndvi,0.314744,0.310104,0.007432,0.086211
5,medianb5,0.295035,0.290636,0.00436,0.066032
6,medianb7,0.209129,0.202026,0.003427,0.05854
7,medianb4,0.156294,0.150948,0.002558,0.050578
8,medianb3,0.143803,0.138668,0.0021,0.045823
9,medianb2,0.146046,0.138799,0.001768,0.042044


<h3>Estatística básica, ordenada pelo desvio padrão (std) - Com normalização</h3>

In [14]:
# Same summary for the verifica == 1 rows, with every column min-max
# normalised first.
sumario1n = gera_sumario(df1, normalizar=True)
sumario1n

Unnamed: 0,coluna,media,mediana,variancia,std
0,dif_ndvi,0.280158,0.258825,0.012247,0.110667
1,dif_dnbrl,0.205654,0.183596,0.00985,0.099246
2,medianb6,0.01595,0.005909,0.007928,0.089039
3,nbrl,0.645609,0.644041,0.004452,0.066726
4,ndvi,0.698784,0.696933,0.003262,0.057115
5,medianb5,0.190471,0.190441,0.002814,0.053049
6,medianb4,0.11206,0.108046,0.001881,0.043365
7,medianb7,0.115584,0.1141,0.001347,0.036707
8,medianb3,0.077287,0.072574,0.001065,0.032636
9,medianb2,0.059858,0.055168,0.000591,0.024312


In [15]:
# Same summary for the verifica == 3 sub-sample, with every column min-max
# normalised first.
sumario3n = gera_sumario(df3, normalizar=True)
sumario3n

Unnamed: 0,coluna,media,mediana,variancia,std
0,dif_dnbrl,0.348012,0.336132,0.017436,0.132047
1,dif_ndvi,0.308578,0.295515,0.016137,0.127031
2,nbrl,0.574907,0.569881,0.009971,0.099853
3,medianb7,0.342445,0.330533,0.009639,0.098178
4,medianb5,0.338129,0.332832,0.006322,0.079512
5,ndvi,0.74269,0.73901,0.004677,0.068385
6,medianb4,0.167999,0.160841,0.004585,0.067715
7,medianb3,0.136109,0.128774,0.004285,0.065461
8,medianb2,0.109543,0.09917,0.003622,0.060184
9,medianb6,0.012966,0.009046,0.002255,0.047491


<h3>Comparação com e sem normalização</h3>

In [43]:
# Column names of each summary, already ordered by descending std
# (first entry = highest priority).
s1, s3 = (tabela.coluna.values for tabela in (sumario1, sumario3))
s1n, s3n = (tabela.coluna.values for tabela in (sumario1n, sumario3n))

# One output line per rank: position, name without normalisation, name with.
linha = '{}º - {} - {}'
n_linhas = len(s1)

print('Normalização')
print('------------------ Verifica 1 ----------------------')
print('Prioridade - SEM - COM ')
for i in range(n_linhas):
    print(linha.format(i + 1, s1[i], s1n[i]))
print('------------------ Verifica 3 ----------------------')
for i in range(n_linhas):
    print(linha.format(i + 1, s3[i], s3n[i]))

Normalização
------------------ Verifica 1 ----------------------
Prioridade - SEM - COM 
1º - medianb6 - dif_ndvi
2º - dif_dnbrl - dif_dnbrl
3º - dif_ndvi - medianb6
4º - nbrl - nbrl
5º - ndvi - ndvi
6º - medianb7 - medianb5
7º - medianb5 - medianb4
8º - medianb4 - medianb7
9º - medianb3 - medianb3
10º - medianb2 - medianb2
------------------ Verifica 3 ----------------------
1º - medianb6 - dif_dnbrl
2º - dif_dnbrl - dif_ndvi
3º - dif_ndvi - nbrl
4º - nbrl - medianb7
5º - ndvi - medianb5
6º - medianb5 - ndvi
7º - medianb7 - medianb4
8º - medianb4 - medianb3
9º - medianb3 - medianb2
10º - medianb2 - medianb6


<h3>Conclusão</h3>
<pre>
- Após normalizar, a ordem de prioridade dos atributos mudou, como esperado
- Com ou sem normalização, a ordem de prioridade não apresentou o mesmo 
  resultado dos gráficos.
</pre>