# Gala dataset

## Importando os dados

In [6]:
import pandas as pd


dados = pd.read_table('./datasets/gala.txt', comment="#")
dados.Island = dados.Island.astype(pd.CategoricalDtype())

dados.head()

Unnamed: 0,Island,Species,Endemics,Area,Elevation,Nearest,Scruz,Adjacent
0,Baltra,58,23,25.09,346,0.6,0.6,1.84
1,Bartolome,31,21,1.24,109,0.6,26.3,572.33
2,Caldwell,3,3,0.21,114,2.8,58.7,0.78
3,Champion,25,9,0.1,46,1.9,47.4,0.18
4,Coamano,2,1,0.05,77,1.9,1.9,903.82


## Ajustando modelo Poisson com offset

In [7]:
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.genmod.generalized_linear_model import GLMResults

offset = np.log(dados.Area)
formula = "Species ~ Endemics + Elevation + Nearest + Scruz + np.log(Adjacent)"

modelo = smf.glm(
    formula,
    data=dados,
    family=sm.families.Poisson(),
    offset=offset,
)

ajuste: GLMResults = modelo.fit()

ajuste.summary()

0,1,2,3
Dep. Variable:,Species,No. Observations:,30.0
Model:,GLM,Df Residuals:,24.0
Model Family:,Poisson,Df Model:,5.0
Link Function:,Log,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-808.82
Date:,"Thu, 10 Nov 2022",Deviance:,1456.8
Time:,21:43:29,Pearson chi2:,3590.0
No. Iterations:,7,Pseudo R-squ. (CS):,1.0
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,2.7204,0.075,36.116,0.000,2.573,2.868
Endemics,-0.0175,0.002,-11.161,0.000,-0.021,-0.014
Elevation,-0.0020,0.000,-17.512,0.000,-0.002,-0.002
Nearest,-0.0161,0.002,-10.460,0.000,-0.019,-0.013
Scruz,-0.0045,0.001,-5.775,0.000,-0.006,-0.003
np.log(Adjacent),-0.0479,0.012,-3.855,0.000,-0.072,-0.024


## Análise dos resíduos

In [15]:
residuos_originais = ajuste.resid_pearson.sort_values()
k_repeticoes = 99

matriz_residuos = np.ndarray(shape=(len(residuos_originais), k_repeticoes))

for k in range(k_repeticoes):
    y_preditos = ajuste.predict(dados)
    dados_simulados = dados.copy(deep=True)
    dados_simulados.Species = y_preditos

    ajuste_simuladas: GLMResults = smf.glm(
        formula,
        data=dados_simulados,
        family=sm.families.Poisson(),
        offset=offset,
    ).fit()

    matriz_residuos[:,k] = ajuste.resid_pearson.sort_values()

matriz_residuos.shape

(30, 99)

In [38]:
limite_inferior = np.quantile(matriz_residuos, .025)
limite_superior = np.quantile(matriz_residuos, .975)
limite_inferior

-11.974594722065893