# Pruebas de Normalidad/Gaussianidad

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import pingouin as pg

from matplotlib import pyplot as plt

import ipywidgets as widgets
from ipywidgets import HBox, VBox
from IPython.display import display
%matplotlib inline

## Comparación entre dos grupos

### Diseño experimental
 - Casos-Controles
 - Longitudinal, dos condiciones

### Supuestos
  1. Variables numéricas
  2. Variables continuas
  3. Distribución Gaussiana (Normal) de AMBOS grupos
     1. Shapiro-Wilk
     2. Kolmogorov-Smirnov
   
| Grupo 1 | Grupo 2 | Prueba    |
|---------|---------|-----------|
| NG      | NG      |           |
| NG      | G       |           |
| G       | NG      |           |
| G       | G       | t-Student |

  4. Homogeneidad de la varianza
     1. Levene

In [5]:
def get_random_df(params, label, N=350):
    """Draw a random sample and return it as a labelled two-column frame.

    Parameters
    ----------
    params : sequence
        ``(kind, loc, scale)``. ``loc`` and ``scale`` are passed to
        ``np.random.normal`` as the mean and standard deviation.
        ``kind == "bimodal"`` draws a two-component mixture; any other
        value (including ``'normal'``) draws a single normal sample.
    label : object
        Value written to every row of the ``group`` column.
    N : int, default 350
        Total number of rows in the result.

    Returns
    -------
    pandas.DataFrame
        Columns ``metric`` (the random draws) and ``group`` (``label``),
        with a default 0..N-1 index.
    """
    kind, center, spread = params[0], params[1], params[2]
    if kind == "bimodal":
        # First N//2 points around `center`; the remaining points around
        # 3*center with half the spread.
        half = N // 2
        lower_mode = np.random.normal(center, spread, half)
        upper_mode = np.random.normal(3 * center, 0.5 * spread, N - half)
        draws = np.concatenate((lower_mode, upper_mode))
    else:
        # 'normal' and every unrecognised kind share the single-Gaussian draw.
        draws = np.random.normal(center, spread, N)
    frame = pd.DataFrame({'metric': draws})
    frame['group'] = label
    return frame
#
def gen_2_groups(params0, params1, N=350):
    """Build a long-format frame holding a 'control' and a 'case' sample.

    Parameters
    ----------
    params0 : sequence
        ``(kind, loc, scale)`` for the group labelled ``'control'``;
        passed unchanged to ``get_random_df``.
    params1 : sequence
        ``(kind, loc, scale)`` for the group labelled ``'case'``.
    N : int, default 350
        Number of rows drawn for EACH group, so the result has ``2 * N`` rows.

    Returns
    -------
    pandas.DataFrame
        Columns ``metric`` and ``group``; control rows first, then case rows,
        with a fresh 0..2N-1 index.
    """
    # Forward N explicitly: it used to be accepted but ignored, so both groups
    # always had get_random_df's default size regardless of the argument.
    control = get_random_df(params0, 'control', N=N)
    case = get_random_df(params1, 'case', N=N)
    # ignore_index avoids duplicated row labels (each group frame is indexed
    # 0..N-1), which make label-based lookups ambiguous downstream.
    return pd.concat((control, case), ignore_index=True)
#
def t_tests(data, group):
    """Print a two-sample t-test for every non-grouping column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per observation. Every column except ``group`` is tested.
    group : str
        Name of the grouping column. Only the first two distinct values
        (in order of appearance) are compared; further values are ignored.

    Returns
    -------
    None
        The combined result table is printed, not returned.
    """
    levels = data[group].unique()
    by_level = data.groupby(group)

    def _test_column(column):
        # Compare the first group against the second for a single column.
        sample_a = by_level.get_group(levels[0])[column]
        sample_b = by_level.get_group(levels[1])[column]
        result = pg.ttest(sample_a, sample_b)
        result['variable'] = column
        return result

    tested_columns = data.drop(columns=group).columns
    table = pd.concat([_test_column(column) for column in tested_columns])
    # 'variable' was appended last; move it to the front of the table.
    ordered = ['variable'] + [name for name in table.columns[:-1]]
    table = table[ordered]
    print(table)

In [10]:

@widgets.interact(
    m1=(-4, 4, 0.1),
    v1=(.1, 2, 0.1),
    m2=(-2, 2, 0.1),
    v2=(0.1, 2, 0.1),
    bins=[20, 25, 30, 35],
)
def _t_tests(m1, v1, m2, v2, bins):
    """Interactive demo: simulate two normal groups, plot them and test them.

    ``m1``/``m2`` are the group means. ``v1``/``v2`` end up as the ``scale``
    argument of ``np.random.normal`` (via ``gen_2_groups`` ->
    ``get_random_df``), i.e. a standard deviation. ``bins`` is the number of
    histogram bins. Each widget change redraws both figures and reprints the
    normality, homoscedasticity and t-test tables.
    """
    samples = gen_2_groups(('normal', m1, v1), ('normal', m2, v2))
    # Overlaid histograms of both groups.
    sns.histplot(data=samples, x='metric', hue='group',
                 stat='probability', bins=bins)
    # Open a second figure so the violins are not drawn over the histogram.
    plt.figure()
    sns.violinplot(data=samples, y='metric', hue='group', split=True)
    # Assumption checks first, each followed by a blank line, then the t-test.
    normality = pg.normality(samples, dv='metric', group='group')
    print(normality, end='\n\n')
    variance_check = pg.homoscedasticity(samples, dv='metric', group='group')
    print(variance_check, end='\n\n')
    t_tests(samples, 'group')


interactive(children=(FloatSlider(value=0.0, description='m1', max=4.0, min=-4.0), FloatSlider(value=1.0, desc…

## Ejemplo

In [11]:
# Load the example health dataset and preview its first rows.
# NOTE(review): this is an absolute path on one user's machine, so the cell
# only runs there — consider a path relative to a configurable data directory.
fname = "/home/omarpr/Dropbox/Brain/data/cursos/bioestadistica/dbs/health_data.csv"
data = pd.read_csv(fname)
#
data.head()

Unnamed: 0,id,age,heart_rate,breath_rate,glucose_blood_concentration,mexican_state_of_birth,scholarity,diagnostic
0,1,30,78,17,206,Durango,0,diabetes
1,2,26,78,15,74,Campeche,4,health
2,3,27,85,23,152,Sonora,2,diabetes
3,4,31,83,21,92,Puebla,3,health
4,5,30,83,12,165,Zacatecas,0,diabetes


In [12]:
# t_tests runs a t-test on every column other than the grouping one, so the
# state-of-birth column (text values such as 'Durango') is removed first.
df = data.drop(columns='mexican_state_of_birth')
# NOTE(review): `id` is not dropped, so it is tested as well (first row of the
# printed table) — presumably an identifier that should be excluded; confirm.
t_tests(df, 'diagnostic')

                           variable          T         dof alternative  \
T-test                           id   1.043705  195.566493   two-sided   
T-test                          age   0.339363  186.628210   two-sided   
T-test                   heart_rate  -2.658403  196.543983   two-sided   
T-test                  breath_rate  -1.730967  197.846736   two-sided   
T-test  glucose_blood_concentration  26.083967  129.227789   two-sided   
T-test                   scholarity  -0.311983  195.536293   two-sided   

               p-val           CI95%   cohen-d       BF10     power  
T-test  2.979105e-01  [-7.59, 24.66]  0.147486      0.257  0.178737  
T-test  7.347178e-01   [-0.72, 1.01]  0.048483      0.163  0.063351  
T-test  8.497860e-03  [-3.05, -0.45]  0.374895      4.059  0.748449  
T-test  8.501669e-02   [-2.14, 0.14]  0.241919      0.623  0.396369  
T-test  2.459024e-53  [63.64, 74.08]  3.460395  1.358e+62  1.000000  
T-test  7.553860e-01   [-0.47, 0.34]  0.044089      0.162  0.

## https://bit.ly/3SSgUBk