In [1]:
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import pandas as pd
# additional packages
from statsmodels.stats.diagnostic import lillifors

# Badanie założenia o normalności rozkładu

Wylosujmy próbki z rozkładu normalnego, jednostajnego i t-Studenta, a następnie sprawdźmy jak zadziałają testy na tych próbkach.

In [3]:
n=1000
ud=stats.uniform(-1,2) # jednostajny
nd=stats.norm()  # normalny 
td=stats.t(7) # t-studenta

# losowanie próbki
data_u=ud.rvs(n)
data_n=nd.rvs(n)
data_t=td.rvs(n)

# podzbiory próbek
fewData_u = data_u[:100]
fewData_n = data_n[:100]
fewData_t = data_t[:100]

In [7]:
# Dane z próbki rozkładu jednostajnego
data = data_u
fewData = fewData_u

pVals = pd.Series()
pFewVals = pd.Series()
# The scipy normaltest is based on D-Agostino and Pearsons test that
# combines skew and kurtosis to produce an omnibus test of normality.
_, pVals['Omnibus']    = stats.normaltest(data)
_, pFewVals['Omnibus'] = stats.normaltest(fewData)

# Shapiro-Wilk test
_, pVals['Shapiro-Wilk']    = stats.shapiro(data)
_, pFewVals['Shapiro-Wilk'] = stats.shapiro(fewData)

# Or you can check for normality with Lilliefors-test
_, pVals['Lilliefors']    = lillifors(data)
_, pFewVals['Lilliefors'] = lillifors(fewData)

# Alternatively with original Kolmogorov-Smirnov test
_, pVals['Kolmogorov-Smirnov']    = stats.kstest((data-np.mean(data))/np.std(data,ddof=1), 'norm')
_, pFewVals['Kolmogorov-Smirnov'] = stats.kstest((fewData-np.mean(fewData))/np.std(fewData,ddof=1), 'norm')

print('p-values for all {0} data points: ----------------'.format(len(data)))
print(pVals)
print('p-values for the first 100 data points: ----------------')
print(pFewVals)

if pVals['Omnibus'] > 0.05:
    print('Data are normally distributed')

p-values for all 1000 data points: ----------------
Omnibus               2.415306e-147
Shapiro-Wilk           3.469788e-17
Lilliefors             2.595992e-10
Kolmogorov-Smirnov     5.490735e-04
dtype: float64
p-values for the first 100 data points: ----------------
Omnibus               1.387371e-15
Shapiro-Wilk          6.157946e-05
Lilliefors            2.964189e-02
Kolmogorov-Smirnov    3.213209e-01
dtype: float64


Use lilliefors, lillifors will be removed in 0.9 
(Note: misspelling missing 'e')
Use lilliefors, lillifors will be removed in 0.9 
(Note: misspelling missing 'e')


In [5]:
data = data_t
fewData = fewData_t

pVals = pd.Series()
pFewVals = pd.Series()
# The scipy normaltest is based on D-Agostino and Pearsons test that
# combines skew and kurtosis to produce an omnibus test of normality.
_, pVals['Omnibus']    = stats.normaltest(data)
_, pFewVals['Omnibus'] = stats.normaltest(fewData)

# Shapiro-Wilk test
_, pVals['Shapiro-Wilk']    = stats.shapiro(data)
_, pFewVals['Shapiro-Wilk'] = stats.shapiro(fewData)

# Or you can check for normality with Lilliefors-test
_, pVals['Lilliefors']    = lillifors(data)
_, pFewVals['Lilliefors'] = lillifors(fewData)

# Alternatively with original Kolmogorov-Smirnov test
_, pVals['Kolmogorov-Smirnov']    = stats.kstest((data-np.mean(data))/np.std(data,ddof=1), 'norm')
_, pFewVals['Kolmogorov-Smirnov'] = stats.kstest((fewData-np.mean(fewData))/np.std(fewData,ddof=1), 'norm')

print('p-values for all {0} data points: ----------------'.format(len(data)))
print(pVals)
print('p-values for the first 100 data points: ----------------')
print(pFewVals)

if pVals['Omnibus'] > 0.05:
    print('Data are normally distributed')

p-values for all 1000 data points: ----------------
Omnibus               0.000009
Shapiro-Wilk          0.000060
Lilliefors            0.001863
Kolmogorov-Smirnov    0.112285
dtype: float64
p-values for the first 100 data points: ----------------
Omnibus               0.002234
Shapiro-Wilk          0.006214
Lilliefors            0.042109
Kolmogorov-Smirnov    0.365471
dtype: float64


Use lilliefors, lillifors will be removed in 0.9 
(Note: misspelling missing 'e')
  app.launch_new_instance()
Use lilliefors, lillifors will be removed in 0.9 
(Note: misspelling missing 'e')


In [6]:
data = data_n
fewData = fewData_n

pVals = pd.Series()
pFewVals = pd.Series()
# The scipy normaltest is based on D-Agostino and Pearsons test that
# combines skew and kurtosis to produce an omnibus test of normality.
_, pVals['Omnibus']    = stats.normaltest(data)
_, pFewVals['Omnibus'] = stats.normaltest(fewData)

# Shapiro-Wilk test
_, pVals['Shapiro-Wilk']    = stats.shapiro(data)
_, pFewVals['Shapiro-Wilk'] = stats.shapiro(fewData)

# Or you can check for normality with Lilliefors-test
_, pVals['Lilliefors']    = lillifors(data)
_, pFewVals['Lilliefors'] = lillifors(fewData)

# Alternatively with original Kolmogorov-Smirnov test
_, pVals['Kolmogorov-Smirnov']    = stats.kstest((data-np.mean(data))/np.std(data,ddof=1), 'norm')
_, pFewVals['Kolmogorov-Smirnov'] = stats.kstest((fewData-np.mean(fewData))/np.std(fewData,ddof=1), 'norm')

print('p-values for all {0} data points: ----------------'.format(len(data)))
print(pVals)
print('p-values for the first 100 data points: ----------------')
print(pFewVals)

if pVals['Omnibus'] > 0.05:
    print('Data are normally distributed')

p-values for all 1000 data points: ----------------
Omnibus               0.145034
Shapiro-Wilk          0.109491
Lilliefors            0.069782
Kolmogorov-Smirnov    0.427684
dtype: float64
p-values for the first 100 data points: ----------------
Omnibus               0.724417
Shapiro-Wilk          0.967619
Lilliefors            0.200000
Kolmogorov-Smirnov    0.799361
dtype: float64
Data are normally distributed


Use lilliefors, lillifors will be removed in 0.9 
(Note: misspelling missing 'e')
  app.launch_new_instance()
Use lilliefors, lillifors will be removed in 0.9 
(Note: misspelling missing 'e')


# Zadanie
Wylosujmy po 100 próbek z rozkładu normalnego, jednostajnego i t-Studenta zawierajacych 1000 punktów, a następnie sprawdżmy na ilu z nich testy normalności się pomylą.

Wykonaj analogiczne zadanie dla 100 próbek o liczności 100.

In [12]:
n=100
pVals = pd.Series()

ud=stats.uniform(-1,2)
nd=stats.norm()
td=stats.t(7)

count_u = 0
count_n = 0
count_t = 0

for i in range(100):

    data_u=ud.rvs(n)
    data_n=nd.rvs(n)
    data_t=td.rvs(n)

    # Alternatively with original Kolmogorov-Smirnov test
    _, pVals['u']    = stats.kstest((data_u-np.mean(data_u))/np.std(data_u,ddof=1), 'norm')
    _, pVals['n']    = stats.kstest((data_n-np.mean(data_n))/np.std(data_n,ddof=1), 'norm')
    _, pVals['t']    = stats.kstest((data_t-np.mean(data_t))/np.std(data_t,ddof=1), 'norm')

    if pVals['u']>0.05:
        count_u += 1
#         print("i="+ str(i) +" u>0.05")
        
    if pVals['n']>0.05:
        count_n += 1
#         print("i="+ str(i) +" n>0.05")

    if pVals['t']>0.05:
        count_t += 1
#         print("i="+ str(i) +" t>0.05")

print (count_n)
print (count_u)
print (count_t)

100
98
100


In [4]:
n=100
pVals = pd.Series()

ud=stats.uniform(-1,2)
nd=stats.norm()
td=stats.t(7)

count_u = 0
count_n = 0
count_t = 0

for i in range(100):

    data_u=ud.rvs(n)
    data_n=nd.rvs(n)
    data_t=td.rvs(n)

    # Alternatively with original Omnibus test
    _, pVals['u']    = stats.normaltest(data_u)
    _, pVals['n']    = stats.normaltest(data_n)
    _, pVals['t']    = stats.normaltest(data_t)

    if pVals['u']>0.05:
        count_u += 1
#         print("i="+ str(i) +" u>0.05")
        
    if pVals['n']>0.05:
        count_n += 1
#         print("i="+ str(i) +" n>0.05")

    if pVals['t']>0.05:
        count_t += 1
#         print("i="+ str(i) +" t>0.05")
#     print(pVals)

print (count_n)
print (count_u)
print (count_t)


u    0.008119
n    0.573080
t    0.041113
dtype: float64
u    1.660812e-08
n    9.349828e-01
t    2.438659e-02
dtype: float64
u    2.687257e-13
n    1.324183e-03
t    3.880160e-03
dtype: float64
u    0.000027
n    0.423540
t    0.022161
dtype: float64
u    0.000008
n    0.325094
t    0.974514
dtype: float64
u    7.379886e-12
n    2.770331e-01
t    7.592241e-01
dtype: float64
u    2.598870e-27
n    8.773381e-01
t    8.285454e-01
dtype: float64
u    0.000628
n    0.358368
t    0.000022
dtype: float64
u    1.809035e-07
n    6.687232e-01
t    2.185276e-01
dtype: float64
u    2.091881e-08
n    8.314195e-01
t    9.171473e-01
dtype: float64
u    1.890415e-09
n    3.474733e-01
t    1.790839e-04
dtype: float64
u    3.076710e-10
n    8.775849e-01
t    3.091736e-03
dtype: float64
u    7.386810e-10
n    9.286836e-01
t    5.661941e-02
dtype: float64
u    6.851368e-07
n    3.279011e-01
t    4.116624e-01
dtype: float64
u    0.001177
n    0.292827
t    0.000009
dtype: float64
u    8.008213e-13
n    9.

In [14]:
n=100
pVals = pd.Series()

ud=stats.uniform(-1,2)
nd=stats.norm()
td=stats.t(7)

count_u = 0
count_n = 0
count_t = 0

for i in range(100):

    data_u=ud.rvs(n)
    data_n=nd.rvs(n)
    data_t=td.rvs(n)

    # Alternatively with original Shapiro-Wilk test
    _, pVals['u']    = stats.shapiro(data_u)
    _, pVals['n']    = stats.shapiro(data_n)
    _, pVals['t']    = stats.shapiro(data_t)

    if pVals['u']>0.05:
        count_u += 1
#         print("i="+ str(i) +" u>0.05")
        
    if pVals['n']>0.05:
        count_n += 1
#         print("i="+ str(i) +" n>0.05")

    if pVals['t']>0.05:
        count_t += 1
#         print("i="+ str(i) +" t>0.05")

print (count_n)
print (count_u)
print (count_t)

96
0
73
