<a href="https://colab.research.google.com/github/erlonL/testes-inferencia/blob/main/Teste_KolmogorovSmirnov.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from scipy import stats

## Utilizando biblioteca padrão

In [None]:
rng = np.random.default_rng()

In [None]:
x = stats.norm.rvs(size=1000, random_state=rng)

In [None]:
x

In [None]:
ks_stat, ks_pvalue = stats.kstest(x, stats.norm.cdf)
print(ks_stat)
print(ks_pvalue)

0.033868464490305894
0.19698833897893553


In [None]:
def kolmogorov_smirnov_critico(n):
    # table of critical values for the kolmogorov-smirnov test - 95% confidence
    # Source: https://www.soest.hawaii.edu/GG/FACULTY/ITO/GG413/K_S_Table_one_Sample.pdf
    # Source: http://www.real-statistics.com/statistics-tables/kolmogorov-smirnov-table/
    # alpha = 0.05 (95% confidential level)

    if n <= 40:
        # valores entre 1 e 40
        kolmogorov_critico = [0.97500, 0.84189, 0.70760, 0.62394, 0.56328, 0.51926, 0.48342, 0.45427, 0.43001, 0.40925,
                      0.39122, 0.37543, 0.36143, 0.34890, 0.33760, 0.32733, 0.31796, 0.30936, 0.30143, 0.29408,
                      0.28724, 0.28087, 0.27490, 0.26931, 0.26404, 0.25907, 0.25438, 0.24993, 0.24571, 0.24170,
                      0.23788, 0.23424, 0.23076, 0.22743, 0.22425, 0.22119, 0.21826, 0.21544, 0.21273, 0.21012]
        ks_critico = kolmogorov_critico[n - 1]
    elif n > 40:
        # valores acima de 40:
        kolmogorov_critico = 1.36/(np.sqrt(n))
        ks_critico = kolmogorov_critico
    else:
        pass

    return ks_critico

In [None]:
ks_critico = kolmogorov_smirnov_critico(len(x))
ks_critico

0.04300697617828996

In [None]:
if ks_critico >= ks_stat:
  print("Com 95% de confianca, não temos evidências para rejeitar a hipótese de normalidade dos dados, segundo o teste de Kolmogorov-Smirnov")
else:
  print("Com 95% de confianca, temos evidências para rejeitar a hipótese de normalidade dos dados, segundo o teste de Kolmogorov-Smirnov")

Com 95% de confianca, não temos evidências para rejeitar a hipótese de normalidade dos dados, segundo o teste de Kolmogorov-Smirnov


## Implementação

In [None]:
'''
Xi | Fabs | Fac | Fobs | Zi | Fesp | |Fesp(Xi) - Fobs(Xi)| | |Fesp(Xi) - Fobs(Xi-1)|

Xi: Os valores únicos do conjunto utilizado
set(list(x))

Fabs: Quantidade de vezes que esse valor se repete na amostra
list(x).count(i)

Fac: Frequencia acumulada da frequencia absoluta
tabela['Fac'] = tabela['Fabs'].cumsum()

Fobs: Frequencia acumulada dividido pela quantidade de valores
tabela['Fobs'] = tabela['Fac'] / len(amostra)

Zi: valor x seguindo a distribuição normal
(Xi - media)/var**(1/2)

Fesp: Valor de Zi na tabela normal

'''

"\nXi | Fabs | Fac | Fobs | Zi | Fesp | |Fesp(Xi) - Fobs(Xi)| | |Fesp(Xi) - Fobs(Xi-1)|\n\nXi: Os valores únicos do conjunto utilizado\nset(list(x))\n\nFabs: Quantidade de vezes que esse valor se repete na amostra\nlist(x).count(i)\n\nFac: Frequencia acumulada da frequencia absoluta\ntabela['Fac'] = tabela['Fabs'].cumsum()\n\nFobs: Frequencia acumulada dividido pela quantidade de valores\ntabela['Fobs'] = tabela['Fac'] / len(amostra)\n\nZi: valor x seguindo a distribuição normal\n(Xi - media)/var**(1/2)\n\nFesp: Valor de Zi na tabela normal\n\n"

In [None]:
# Gerando os valores aleatórios
valores = np.random.normal(10, 25, size=(100)).astype(int)

In [None]:
valores

array([-12,  35, -12,   9,  25,  10,  29,  43,  33,  -7, -25,  -1,  25,
        51,  -8, -12,  34, -30,  20,  -7,  33,  34, -11, -16,  30,  42,
        11,  23,  26,  11,  -7,   4,  -3,   2,  20, -20,   0,  41,  17,
         9,  11, -35,  -3, -19,   0,  15, -24,  15,  21,  32,  18,   9,
        58, -26,  32,  39,  -4, -22,  50,   3,  39, -19,  61,  19,  13,
       -25, -12,  -8,  43,  23, -17,  18,   8,   0,  16,  26,  38,  25,
       -20,  64,  -9,  -6,  31,  49,   1,  24,  18,   0,  58,   1,  -6,
        15,   4,  40, -20,  63, -41,   5,  12,  26])

In [None]:
len(set(valores))

63

In [None]:
len(valores)

100

In [None]:
tabela = pd.DataFrame()

In [None]:
colunas = ['Xi',
           'Fabs',
           'Fac',
           'Fobs',
           'Zi',
           'Fesp',
           '|Fesp(Xi) - Fobs(Xi)|',
           '|Fesp(Xi) - Fobs(Xi-1)|']

In [None]:
tabela = pd.DataFrame(columns=colunas)

In [None]:
xi = list(set(valores))

In [None]:
# Informações importantes sobre o conjunto de valores
media = np.mean(valores)
var = np.var(valores)
dp = np.std(valores)
cont = len(valores)
cont_unicos = len(xi)

print(' Média: ', media, '\n', 'Variância: ', var, '\n', 'Desvio Padrão: ', dp, '\n', 'Contagem: ', cont, '\n', 'Contagem de valores unicos: ', cont_unicos)

 Média:  11.73 
 Variância:  568.5570999999999 
 Desvio Padrão:  23.844435409545763 
 Contagem:  100 
 Contagem de valores unicos:  63


In [None]:
tabela['Xi'] = xi

In [None]:
fabs = pd.value_counts(valores)

In [None]:
fabs

-12    4
 0     4
-7     3
 26    3
-20    3
      ..
-1     1
-26    1
-4     1
 30    1
 12    1
Length: 63, dtype: int64

In [None]:
fabs.index

Int64Index([-12,   0,  -7,  26, -20,  15,  11,  18,  25,   9,  58,  39,  32,
             -6,   1,  -3,   4,  23, -19,  20,  43,  34,  -8,  33, -25,  42,
             -9,  13, -17,   8,  16,  38,  64,  31,  29,  61,  49,  10,  24,
             40,  63, -41,   5,  19, -22,   3,  50, -16, -11, -30,   2,  41,
             17, -35,  35,  51, -24,  21,  -1, -26,  -4,  30,  12],
           dtype='int64')

In [None]:
fabs.values

array([4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [None]:
for i in range(cont_unicos):
  for j in range(cont_unicos):
    if tabela['Xi'][i] == fabs.index[j]:
      tabela['Fabs'][i] = fabs.values[j]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tabela['Fabs'][i] = fabs.values[j]


In [None]:
tabela['Fac'] = tabela['Fabs'].cumsum()

In [None]:
tabela['Fobs'] = tabela['Fac'] / cont

In [None]:
tabela['Zi'] = (tabela['Xi'] - media) / dp

In [None]:
tabela['Fesp'] = stats.norm.cdf(tabela['Zi'])

In [None]:
tabela['|Fesp(Xi) - Fobs(Xi)|'] = abs(tabela['Fesp'] - tabela['Fobs'])

In [None]:
abs(tabela['Fesp'][0])

0.31138134777396215

In [None]:
for i in range(0, cont_unicos):
  if i == 0:
    tabela['|Fesp(Xi) - Fobs(Xi-1)|'][i] = abs(tabela['Fesp'][i])
  else:
    tabela['|Fesp(Xi) - Fobs(Xi-1)|'][i] = abs(tabela['Fesp'][i] - tabela['Fobs'][i-1])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tabela['|Fesp(Xi) - Fobs(Xi-1)|'][i] = abs(tabela['Fesp'][i])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tabela['|Fesp(Xi) - Fobs(Xi-1)|'][i] = abs(tabela['Fesp'][i] - tabela['Fobs'][i-1])


In [None]:
tabela

Unnamed: 0,Xi,Fabs,Fac,Fobs,Zi,Fesp,|Fesp(Xi) - Fobs(Xi)|,|Fesp(Xi) - Fobs(Xi-1)|
0,0,4,4,0.04,-0.491939,0.311381,0.271381,0.311381
1,1,2,6,0.06,-0.450000,0.326355,0.266355,0.286355
2,2,1,7,0.07,-0.408062,0.341614,0.271614,0.281614
3,3,1,8,0.08,-0.366123,0.357137,0.277137,0.287137
4,4,2,10,0.1,-0.324185,0.372899,0.272899,0.292899
...,...,...,...,...,...,...,...,...
58,-7,3,94,0.94,-0.785508,0.216078,0.723922,0.693922
59,-6,2,96,0.96,-0.743570,0.228568,0.731432,0.711432
60,-4,1,97,0.97,-0.659693,0.254726,0.715274,0.705274
61,-3,2,99,0.99,-0.617754,0.268369,0.721631,0.701631


In [None]:
Dcalc = max([max(tabela['|Fesp(Xi) - Fobs(Xi-1)|']), max(tabela['|Fesp(Xi) - Fobs(Xi)|'])])
Dcalc

0.7314315841067982

In [None]:
def kolmogorov_smirnov_critico(n):
    # table of critical values for the kolmogorov-smirnov test - 95% confidence
    # Source: https://www.soest.hawaii.edu/GG/FACULTY/ITO/GG413/K_S_Table_one_Sample.pdf
    # Source: http://www.real-statistics.com/statistics-tables/kolmogorov-smirnov-table/
    # alpha = 0.05 (95% confidential level)

    if n <= 40:
        # valores entre 1 e 40
        kolmogorov_critico = [0.97500, 0.84189, 0.70760, 0.62394, 0.56328, 0.51926, 0.48342, 0.45427, 0.43001, 0.40925,
                      0.39122, 0.37543, 0.36143, 0.34890, 0.33760, 0.32733, 0.31796, 0.30936, 0.30143, 0.29408,
                      0.28724, 0.28087, 0.27490, 0.26931, 0.26404, 0.25907, 0.25438, 0.24993, 0.24571, 0.24170,
                      0.23788, 0.23424, 0.23076, 0.22743, 0.22425, 0.22119, 0.21826, 0.21544, 0.21273, 0.21012]
        ks_critico = kolmogorov_critico[n - 1]
    elif n > 40:
        # valores acima de 40:
        kolmogorov_critico = 1.36/(np.sqrt(n))
        ks_critico = kolmogorov_critico
    else:
        pass

    return ks_critico

In [None]:
Dc = kolmogorov_smirnov_critico(cont)
Dc

0.136

In [None]:
if Dc >= Dcalc:
  print("Com 95% de confianca, não temos evidências para rejeitar a hipótese de normalidade dos dados, segundo o teste de Kolmogorov-Smirnov")
else:
  print("Com 95% de confianca, temos evidências para rejeitar a hipótese de normalidade dos dados, segundo o teste de Kolmogorov-Smirnov")

Com 95% de confianca, temos evidências para rejeitar a hipótese de normalidade dos dados, segundo o teste de Kolmogorov-Smirnov


In [None]:
ks_stat, ks_pvalue = stats.kstest(valores, stats.norm.cdf)
print(ks_stat)
print(ks_pvalue)

0.6086501019683699
4.4633757039791465e-36
