In [None]:
from scipy import stats
import numpy as np
import pandas as pd

from scipy.stats import norm

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Chi-square test

  - dados numéricos e discretos
  - comparação de 2 ou mais distribuições
    - p.ex. número de escamas de cobras fêmeas, de mesma espécie, em altitudes diferentes
    - são estatisticamente diferentes?
    - coletaremos 5 amostras a 1200 m e 5 amostras a 200 m

### Exemplo 01 - medicina

  - tabela de contingência

In [None]:
# Contingency table: diagnostic test result (rows) x patient condition (columns).
cases = np.transpose(np.array([[10, 4], [3, 29]]))
dfc = pd.DataFrame(
    cases,
    index=["negativo", "positivo"],
    columns=['são', 'doente'],
)
dfc

### Exemplo 02 - ecologia ~ cobras

In [None]:
# Chi-square test of independence on `dfc`, the medical table built for Exemplo 01.
# `dfc` is 2x2 (dof = 1), so SciPy applies Yates' continuity correction by default.
test_result = stats.chi2_contingency(dfc)
chi2, pvalue, dof, expected = test_result
chi2, pvalue, dof, expected

In [None]:
# scales = observed data, 5 rows x 2 columns
# (the columns are labelled alt1200 / alt200 when the DataFrame is built).
scales = np.column_stack((
    [16, 18, 25, 14, 12],
    [32, 24, 27, 35, 40],
))
scales

In [None]:
# Wrap the observed counts in a labelled table: index 32..36, one column per altitude.
df = pd.DataFrame(
    scales,
    index=[32, 33, 34, 35, 36],
    columns=['alt1200', 'alt200'],
)
df

## Valores observados x Valores esperados

### Pearson

In [None]:
# New frame with the row totals (sum across the altitude columns) as an extra column.
dfa = df.assign(tot_marg_escamas=df.sum(axis=1))
dfa

In [None]:
dfa.sum(axis = 0)

In [None]:
# Add the column totals as a final row labelled 'tot_marg_altitude'.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack the one-row totals frame under the table.
totals_row = dfa.sum(axis=0).to_frame().T
totals_row.index = ['tot_marg_altitude']
dfa = pd.concat([dfa, totals_row])
dfa

In [None]:
# The bottom-right cell of dfa (last row, last column) holds the grand total;
# dividing the whole table by it yields proportions.
nrow, ncol = dfa.shape
tot_animal = dfa.iloc[-1, -1]
print('tot animal', tot_animal)

dfp = dfa / tot_animal
dfp

### O produto da proporção marginal da linha (i) pela proporção marginal da coluna (j) é a proporção esperada da célula (i, j); multiplicada pelo total de animais, resulta no valor esperado

In [None]:
# Positions of the marginal-total column and marginal-total row inside dfp.
pos_tot_marg_escamas, pos_tot_marg_altitude = ncol - 1, nrow - 1
print(pos_tot_marg_escamas, pos_tot_marg_altitude, "\n")

# Cell (i, j) = row-marginal proportion i * column-marginal proportion j,
# i.e. the outer product of the two marginal vectors.
row_marginals = dfp.iloc[:, pos_tot_marg_escamas].to_numpy()
col_marginals = dfp.iloc[pos_tot_marg_altitude, :].to_numpy()

dfexp = pd.DataFrame(
    np.outer(row_marginals, col_marginals),
    index=dfp.index,
    columns=dfp.columns,
)
dfexp

In [None]:
# Expected proportions scaled back to counts, shown with two decimals.
dfexp2 = dfexp.copy()
dfexp2.mul(tot_animal).round(2)

In [None]:
# Expected counts rounded to whole numbers and stored as integers.
dfexp2 = dfexp.mul(tot_animal).round(0).astype(int)
dfexp2

In [None]:
df

In [None]:
# Expected counts for the observed cells only (marginal row/column dropped).
# Use the exact (unrounded) expected counts: taking them from the
# integer-rounded dfexp2 biases the chi-square statistic computed from dfexp3,
# so it would not match scipy.stats.chi2_contingency.
n_obs_rows, n_obs_cols = df.shape
dfexp3 = (dfexp * tot_animal).iloc[:n_obs_rows, :n_obs_cols]
dfexp3

In [None]:
df - dfexp3

In [None]:
(df - dfexp3)**2

In [None]:
(df - dfexp3)**2 / dfexp3

In [None]:
# Pearson statistic by hand: sum over all cells of (observed - expected)^2 / expected.
cell_terms = (df - dfexp3) ** 2 / dfexp3
chi2 = cell_terms.sum().sum()
chi2

### Dataframe == tabela de contingência

In [None]:
# Same test done by SciPy directly on the observed table `df`.
test_result = stats.chi2_contingency(df)
chi2, pvalue, dof, expected = test_result
chi2, pvalue, dof, expected

In [None]:
"estatística de chi-quadrado %.3f, p-value=%.3e, degree of freedom (dof) = %d"%(chi2, pvalue, dof)

In [None]:
df

In [None]:
expected.round(2)

In [None]:
scales - expected

### Se a distribuição observada (df) for similar à esperada, o erro fica próximo de zero

In [None]:
# Total squared deviation between the observed and the expected counts.
deviation = scales - expected
erro = (deviation ** 2).sum()
erro

### A definição da estatística de chi-quadrado
### $\chi^2 = \sum_{i,j} \frac{(O_{ij} - E_{ij})^2}{E_{ij}}$, com $O$ = observado e $E$ = esperado

In [None]:
# Chi-square statistic: squared deviations scaled by the expected counts, summed.
squared_deviation = (scales - expected) ** 2
chis_stat = (squared_deviation / expected).sum()
chis_stat

### Valores esperados se distanciam muito dos observados?

  - não: então as distribuições são similares (não se refuta H0) --> p-value >= 0.05
  - sim: então as distribuições NÃO são similares (adota-se Ha) --> p-value < 0.05

In [None]:
pvalue

In [None]:
# Decision at the 5% level: pick the message template first, then fill in
# the statistic and the p-value.
h0_template = "não se refuta H0: distribuições estatisticamente similares, statistic = %.3f, p-value = %.3e"
ha_template = "adota-se a Ha: distribuições estatisticamente diferentes, statistic = %.3f, p-value = %.3e"

stri = (h0_template if pvalue >= 0.05 else ha_template) % (chi2, pvalue)
stri

### A tabela de contingência é de 5 linhas x 2 colunas

In [None]:
scales.shape

### Graus de liberdade

dof = (nrow - 1) * (ncol -1)

In [None]:
# Degrees of freedom of an r x c contingency table: (r - 1) * (c - 1).
n_table_rows, n_table_cols = scales.shape
dof = (n_table_rows - 1) * (n_table_cols - 1)
dof

### Dados mais diferentes: distribuições distintas

In [None]:
# scales = observed data, 5 rows x 2 columns (second data set).
scales = np.column_stack((
    [16, 18, 16, 14, 12],
    [10, 24, 27, 35, 40],
))
scales

In [None]:
# Labelled version of the observed counts: index 32..36, one column per altitude.
df = pd.DataFrame(
    scales,
    index=[32, 33, 34, 35, 36],
    columns=['alt1200', 'alt200'],
)
df

In [None]:
# One-way chi-square over every cell of the table (axis=None flattens it).
# No expected frequencies are passed, so SciPy assumes all cells are equally likely.
stat = stats.chisquare(f_obs=scales, axis=None)
stat

In [None]:
def chi2_text(df, verbose=True, alpha=0.05):
    """Run a chi-square test of independence and describe the outcome in text.

    Parameters
    ----------
    df : pandas.DataFrame or array-like
        Contingency table of observed counts.
    verbose : bool, default True
        When True, the total squared error and the degrees of freedom are
        appended to the message and the message is printed.
    alpha : float, default 0.05
        Significance level; H0 is kept when ``p >= alpha``.

    Returns
    -------
    tuple
        ``(chi2, p, dof, expected, erro, stri)``. The first four items come
        from ``scipy.stats.chi2_contingency``; ``erro`` is the sum of squared
        differences between observed and expected counts; ``stri`` is the
        message (it carries the verbose suffix only when ``verbose`` is True).
    """
    # np.asarray accepts DataFrames as well as plain lists / ndarrays.
    observed = np.asarray(df)
    chi2, p, dof, expected = stats.chi2_contingency(observed)

    erro = np.sum((observed - expected) ** 2)

    if p >= alpha:
        stri = "não se refuta H0: distribuições estatisticamente similares, statistic = %.3f, p-value = %.3e"%(chi2, p)
    else:
        stri = "adota-se a Ha: distribuições estatisticamente diferentes, statistic = %.3f, p-value = %.3e"%(chi2, p)

    if verbose:
        # erro is a float; "%d" silently truncated it, so format it as a float.
        stri += "\nErro total = %.2f\nGraus de liberdade (dof) = %d"%(erro, dof)
        print(stri)

    return chi2, p, dof, expected, erro, stri

In [None]:
chi2, p, dof, expected, erro, stri = chi2_text(df, verbose=True)

### Simulando um afastamento ...

In [None]:
# scales = observed data; rebuilt from scratch so the simulation starts from a clean table.
scales = np.column_stack((
    [16, 18, 16, 14, 12],
    [10, 24, 27, 35, 40],
))
df = pd.DataFrame(
    scales,
    index=[32, 33, 34, 35, 36],
    columns=['alt1200', 'alt200'],
)
df

In [None]:
# Add one observation to every alt200 cell.
df['alt200'] = df['alt200'] + 1
df

### Simulação de ir diminuindo a coluna alt200 (1 unidade por iteração)

In [None]:
# scales = observed data
scales = np.array([[16, 18, 16, 14, 12], [10, 24, 27, 35, 40]]).T
df = pd.DataFrame(scales)
df.columns = ['alt1200', 'alt200']
df.index = [32, 33, 34, 35, 36]

# Each pass removes one observation from every alt200 cell and re-runs the test.
for i in range(5):
    # The original passed i as a second print argument, so the literal "%d)"
    # was printed; apply the format operator instead.
    print('%d)' % i)
    df.alt200 = df.alt200 - 1
    chi2, p, dof, expected, erro, stri = chi2_text(df, verbose=True)
    print('-----------')

In [None]:
df

### Se eu tiver 3 altitudes (0 m, 200 m, 1200 m)
### Grau de liberdade vai mudar
### Erro tende a aumentar

In [None]:
# scales = observed data for two altitudes (5 rows x 2 columns).
scales = np.column_stack((
    [16, 18, 16, 14, 12],
    [10, 24, 27, 35, 40],
))
print(scales)

df = pd.DataFrame(scales, columns=['alt1200', 'alt200'])

chi2, p, dof, expected, erro, stri = chi2_text(df, verbose=True)

In [None]:
# scales = observed data for three altitudes (5 rows x 3 columns).
scales = np.column_stack((
    [16, 15, 14, 12, 15],
    [16, 18, 16, 14, 12],
    [10, 24, 27, 35, 40],
))
print(scales)

df = pd.DataFrame(scales, columns=['alt0', 'alt1200', 'alt200'])

chi2, p, dof, expected, erro, stri = chi2_text(df, verbose=True)

### Distribuição chi-quadrado

In [None]:
# Three contingency tables with a growing number of columns (2, 3 and 5),
# each built from per-column count lists transposed into a 5-row array.

# 2 altitudes
scales = np.transpose([
    [16, 18, 16, 14, 12],
    [10, 24, 27, 35, 40],
])
df1 = pd.DataFrame(scales, columns=['alt1200', 'alt200'])

# 3 altitudes
scales = np.transpose([
    [16, 15, 14, 12, 15],
    [16, 18, 16, 14, 12],
    [10, 24, 27, 35, 40],
])
df2 = pd.DataFrame(scales, columns=['alt0', 'alt1200', 'alt200'])

# 5 altitudes
scales = np.transpose([
    [16, 18, 16, 14, 10],
    [17, 19, 16, 13, 15],
    [32, 24, 18, 11, 12],
    [16, 18, 12, 16, 12],
    [20, 18, 18, 16, 11],
])
df3 = pd.DataFrame(scales, columns=['alt0', 'alt1', 'alt2', 'alt1200', 'alt200'])


In [None]:
N = 300
colors = ['blue', 'green', 'red']

fig, ax = plt.subplots(figsize=(12, 6))

# One density curve per contingency table, drawn from random chi-square samples
# with that table's degrees of freedom, plus a marker at the observed statistic.
for i, (df, color) in enumerate(zip((df1, df2, df3), colors)):
    chi2, p, dof, expected, erro, stri = chi2_text(df, verbose=False)
    print(i, dof, chi2, color)

    # sns.distplot is deprecated; kdeplot draws the density-only curve that
    # distplot(hist=False) used to produce.
    sns.kdeplot(np.random.chisquare(df=dof, size=N), color=color, ax=ax)

    # Stagger the labels vertically (0.04 per curve) so they do not overlap.
    ax.annotate('chi2 stat dof=%d, pvalue=%.1e'%(dof, p), xy=(chi2+.21, 0.02), xytext=(chi2+.8, 0.04*(i+1)),
                color=color,
                arrowprops=dict(arrowstyle="->",
                                connectionstyle="angle3,angleA=0,angleB=-90", color=color))

    # Short vertical tick at the observed chi-square statistic.
    ax.axvline(x=chi2, ymin=0, ymax=.1, color=color)


### Adendo da distribuição de chi-quadrado

In [None]:
mu = 0
sdv = 1
N = 1000

# Squared standard-normal samples; the sum of two of them is compared below
# with the chi-square density for 2 degrees of freedom.
dist1 = np.random.normal(mu, sdv, N)**2
dist2 = np.random.normal(mu, sdv, N)**2

chi2_2 = dist1 + dist2

fig, ax = plt.subplots(figsize=(14, 7))

# sns.distplot is deprecated: draw the normalised histogram and the rug with
# histplot / rugplot instead (alpha=.4 keeps distplot's histogram transparency).
for sample, color in ((dist1, 'blue'), (chi2_2, 'red')):
    sns.histplot(x=sample, stat='density', color=color, alpha=.4, ax=ax)
    sns.rugplot(x=sample, color=color, alpha=.2, ax=ax)

# Overlay the theoretical chi-square pdf between the 1st and 99th percentiles.
# (`stats` comes from the notebook's import cell; the loop variable is not
# called `df`, so the DataFrame of that name is not clobbered.)
for n_dof, color in ((1, 'navy'), (2, 'darkred')):
    xseq = np.linspace(stats.chi2.ppf(0.01, n_dof), stats.chi2.ppf(0.99, n_dof), 100)
    ax.plot(xseq, stats.chi2.pdf(xseq, n_dof), color=color)

ax.set_ylim(0, 2)
ax.set_xlim(0, 9);

In [None]:
# A third squared standard-normal sample; adding it to the other two gives a
# sum compared below with the chi-square density for 3 degrees of freedom.
dist3 = np.random.normal(mu, sdv, N)**2

chi2_3 = dist1 + dist2 + dist3

fig, ax = plt.subplots(figsize=(12, 6))

# sns.distplot is deprecated: draw the normalised histogram and the rug with
# histplot / rugplot instead (alpha=.4 keeps distplot's histogram transparency).
for sample, color in ((dist2, 'blue'), (chi2_2, 'red'), (chi2_3, 'yellow')):
    sns.histplot(x=sample, stat='density', color=color, alpha=.4, ax=ax)
    sns.rugplot(x=sample, color=color, alpha=.2, ax=ax)

# Overlay the theoretical chi-square pdf between the 1st and 99th percentiles.
# (The loop variable is not called `df`, so the DataFrame of that name is not clobbered.)
for n_dof, color in ((1, 'navy'), (2, 'darkred'), (3, 'orange')):
    xseq = np.linspace(stats.chi2.ppf(0.01, n_dof), stats.chi2.ppf(0.99, n_dof), 100)
    ax.plot(xseq, stats.chi2.pdf(xseq, n_dof), color=color)

ax.set_ylim(0, 1)
ax.set_xlim(0, 9);