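Reproduction of the statsmodels "GLM: Binomial response data" example (https://www.statsmodels.org/devel/examples/notebooks/generated/glm.html#GLM:-Binomial-response-data): each binomial GLM is fitted with statsmodels and then with `linearlab`, so that the log-likelihoods and coefficients can be compared.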

In [1]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
import sys
# Make the parent directory (relative to the kernel's working directory)
# importable; presumably this is where the local `linearlab` package lives.
# The membership guard keeps the cell idempotent: re-running it no longer
# appends a duplicate ".." entry to sys.path each time.
if ".." not in sys.path:
    sys.path.append("..")

In [3]:
import linearlab as ll

In [4]:
# Show the dataset's NOTE text (observation count and variable definitions).
# print() is used so the embedded newlines render instead of a one-line repr.
print(sm.datasets.star98.NOTE)

::

    Number of Observations - 303 (counties in California).

    Number of Variables - 13 and 8 interaction terms.

    Definition of variables names::

        NABOVE   - Total number of students above the national median for the
                   math section.
        NBELOW   - Total number of students below the national median for the
                   math section.
        LOWINC   - Percentage of low income students
        PERASIAN - Percentage of Asian student
        PERBLACK - Percentage of black students
        PERHISP  - Percentage of Hispanic students
        PERMINTE - Percentage of minority teachers
        AVYRSEXP - Sum of teachers' years in educational service divided by the
                number of teachers.
        AVSALK   - Total salary budget including benefits divided by the number
                   of full-time teachers (in thousands)
        PERSPENK - Per-pupil spending (in thousands)
        PTRATIO  - Pupil-teacher ratio.
        PCTAF    - Percenta

In [5]:
# Load the Star98 dataset bundled with statsmodels and keep its `.data`
# table; every model below is fitted against this object.
star98 = sm.datasets.star98.load().data

In [6]:
# Model formula shared by every fit in this notebook.
# Left-hand side: the two count columns (students above / below the median).
# Right-hand side: 12 main-effect columns followed by the 8 interaction
# columns that already exist in the dataset under these names.
formula = "NABOVE + NBELOW ~ " + " + ".join(
    [
        "LOWINC",
        "PERASIAN",
        "PERBLACK",
        "PERHISP",
        "PERMINTE",
        "AVYRSEXP",
        "AVSALK",
        "PERSPENK",
        "PTRATIO",
        "PCTAF",
        "PCTCHRT",
        "PCTYRRND",
        "PERMINTE_AVYRSEXP",
        "PERMINTE_AVSAL",
        "AVYRSEXP_AVSAL",
        "PERSPEN_PTRATIO",
        "PERSPEN_PCTAF",
        "PTRATIO_PCTAF",
        "PERMINTE_AVYRSEXP_AVSAL",
        "PERSPEN_PTRATIO_PCTAF",
    ]
)

In [7]:
# Reference model: statsmodels binomial GLM with the family's default link
# (reported as "Logit" in the summary output).
sm_logit_model = smf.glm(
    formula=formula,
    data=star98,
    family=sm.families.Binomial(),
)

In [8]:
# Fit the statsmodels logit model; the result object is summarised next.
sm_logit_fit = sm_logit_model.fit()

In [9]:
# Reference results (log-likelihood and coefficient table) for the logit link.
sm_logit_fit.summary()

0,1,2,3
Dep. Variable:,"['NABOVE', 'NBELOW']",No. Observations:,303.0
Model:,GLM,Df Residuals:,282.0
Model Family:,Binomial,Df Model:,20.0
Link Function:,Logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-2998.6
Date:,"Thu, 10 Aug 2023",Deviance:,4078.8
Time:,06:17:05,Pearson chi2:,4050.0
No. Iterations:,5,Pseudo R-squ. (CS):,1.0
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,2.9589,1.547,1.913,0.056,-0.073,5.990
LOWINC,-0.0168,0.000,-38.749,0.000,-0.018,-0.016
PERASIAN,0.0099,0.001,16.505,0.000,0.009,0.011
PERBLACK,-0.0187,0.001,-25.182,0.000,-0.020,-0.017
PERHISP,-0.0142,0.000,-32.818,0.000,-0.015,-0.013
PERMINTE,0.2545,0.030,8.498,0.000,0.196,0.313
AVYRSEXP,0.2407,0.057,4.212,0.000,0.129,0.353
AVSALK,0.0804,0.014,5.775,0.000,0.053,0.108
PERSPENK,-1.9522,0.317,-6.162,0.000,-2.573,-1.331


In [10]:
# Same model in linearlab. No link is passed to `binomial()`; the results
# below match the statsmodels logit fit, so the default is presumably logit
# (TODO confirm against linearlab).
ll_logit_model = ll.glm(star98, formula, lik=ll.lik.binomial())

In [11]:
# Fit the linearlab logit model.
ll_logit_fit = ll_logit_model.fit()

In [12]:
# Log-likelihood of the linearlab fit; compare with the "Log-Likelihood"
# entry of the statsmodels logit summary.
ll_logit_fit.loglik

-2998.6125589939766

In [13]:
# Coefficient estimates of the linearlab fit; compare with the `coef` column
# of the statsmodels logit summary.
ll_logit_fit.beta_grouped

p  Intercept                  2.958878
   LOWINC                    -0.016815
   PERASIAN                   0.009925
   PERBLACK                  -0.018724
   PERHISP                   -0.014239
   PERMINTE                   0.254487
   AVYRSEXP                   0.240694
   AVSALK                     0.080409
   PERSPENK                  -1.952160
   PTRATIO                   -0.334086
   PCTAF                     -0.169022
   PCTCHRT                    0.004917
   PCTYRRND                  -0.003580
   PERMINTE_AVYRSEXP         -0.014077
   PERMINTE_AVSAL            -0.004005
   AVYRSEXP_AVSAL            -0.003906
   PERSPEN_PTRATIO            0.091714
   PERSPEN_PCTAF              0.048990
   PTRATIO_PCTAF              0.008041
   PERMINTE_AVYRSEXP_AVSAL    0.000222
   PERSPEN_PTRATIO_PCTAF     -0.002249
dtype: float64

In [14]:
# Reference model: statsmodels binomial GLM with a probit link.
sm_probit_model = smf.glm(
    formula=formula,
    data=star98,
    family=sm.families.Binomial(link=sm.families.links.Probit()),
)

In [15]:
# Fit the statsmodels probit model; the result object is summarised next.
sm_probit_fit = sm_probit_model.fit()

In [16]:
# Reference results (log-likelihood and coefficient table) for the probit link.
sm_probit_fit.summary()

0,1,2,3
Dep. Variable:,"['NABOVE', 'NBELOW']",No. Observations:,303.0
Model:,GLM,Df Residuals:,282.0
Model Family:,Binomial,Df Model:,20.0
Link Function:,Probit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-3014.0
Date:,"Thu, 10 Aug 2023",Deviance:,4109.6
Time:,06:19:08,Pearson chi2:,4090.0
No. Iterations:,5,Pseudo R-squ. (CS):,1.0
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.5439,0.925,1.669,0.095,-0.269,3.357
LOWINC,-0.0103,0.000,-38.981,0.000,-0.011,-0.010
PERASIAN,0.0059,0.000,16.376,0.000,0.005,0.007
PERBLACK,-0.0118,0.000,-26.077,0.000,-0.013,-0.011
PERHISP,-0.0088,0.000,-33.542,0.000,-0.009,-0.008
PERMINTE,0.1477,0.018,8.220,0.000,0.112,0.183
AVYRSEXP,0.1361,0.035,3.885,0.000,0.067,0.205
AVSALK,0.0462,0.009,5.410,0.000,0.029,0.063
PERSPENK,-1.0953,0.188,-5.820,0.000,-1.464,-0.726


In [18]:
# Same model in linearlab, with the probit link passed to the binomial likelihood.
ll_probit_model = ll.glm(star98, formula, lik=ll.lik.binomial(ll.link.probit))

In [19]:
# Fit the linearlab probit model.
ll_probit_fit = ll_probit_model.fit()

In [21]:
# Log-likelihood of the linearlab fit; compare with the "Log-Likelihood"
# entry of the statsmodels probit summary.
ll_probit_fit.loglik

-3014.0409879316285

In [22]:
# Coefficient estimates of the linearlab fit; compare with the `coef` column
# of the statsmodels probit summary.
ll_probit_fit.beta_grouped

p  Intercept                  1.543917
   LOWINC                    -0.010326
   PERASIAN                   0.005927
   PERBLACK                  -0.011753
   PERHISP                   -0.008834
   PERMINTE                   0.147680
   AVYRSEXP                   0.136137
   AVSALK                     0.046177
   PERSPENK                  -1.095327
   PTRATIO                   -0.184516
   PCTAF                     -0.091851
   PCTCHRT                    0.002986
   PCTYRRND                  -0.002175
   PERMINTE_AVYRSEXP         -0.008076
   PERMINTE_AVSAL            -0.002308
   AVYRSEXP_AVSAL            -0.002188
   PERSPEN_PTRATIO            0.051564
   PERSPEN_PCTAF              0.027390
   PTRATIO_PCTAF              0.004403
   PERMINTE_AVYRSEXP_AVSAL    0.000127
   PERSPEN_PTRATIO_PCTAF     -0.001260
dtype: float64

In [23]:
# Reference model: statsmodels binomial GLM with a Cauchy (cauchit) link.
sm_cauchit_model = smf.glm(
    formula=formula,
    data=star98,
    family=sm.families.Binomial(link=sm.families.links.Cauchy()),
)

In [24]:
# Fit the statsmodels cauchit model; the result object is summarised next.
sm_cauchit_fit = sm_cauchit_model.fit()

In [25]:
# Reference results (log-likelihood and coefficient table) for the Cauchy link.
sm_cauchit_fit.summary()

0,1,2,3
Dep. Variable:,"['NABOVE', 'NBELOW']",No. Observations:,303.0
Model:,GLM,Df Residuals:,282.0
Model Family:,Binomial,Df Model:,20.0
Link Function:,Cauchy,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-2985.9
Date:,"Thu, 10 Aug 2023",Deviance:,4053.3
Time:,06:22:04,Pearson chi2:,4000.0
No. Iterations:,7,Pseudo R-squ. (CS):,1.0
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,6.3387,1.562,4.059,0.000,3.278,9.400
LOWINC,-0.0148,0.000,-37.254,0.000,-0.016,-0.014
PERASIAN,0.0104,0.001,17.489,0.000,0.009,0.012
PERBLACK,-0.0144,0.001,-20.430,0.000,-0.016,-0.013
PERHISP,-0.0116,0.000,-28.598,0.000,-0.012,-0.011
PERMINTE,0.2773,0.030,9.300,0.000,0.219,0.336
AVYRSEXP,0.2583,0.050,5.151,0.000,0.160,0.357
AVSALK,0.0850,0.012,6.927,0.000,0.061,0.109
PERSPENK,-2.7343,0.331,-8.252,0.000,-3.384,-2.085


In [26]:
# Same model in linearlab, with the cauchit link passed to the binomial likelihood.
ll_cauchit_model = ll.glm(star98, formula, lik=ll.lik.binomial(ll.link.cauchit))

In [27]:
# Fit the linearlab cauchit model.
ll_cauchit_fit = ll_cauchit_model.fit()

In [28]:
# Log-likelihood of the linearlab fit; compare with the "Log-Likelihood"
# entry of the statsmodels Cauchy-link summary.
ll_cauchit_fit.loglik

-2985.90195670526

In [29]:
# Coefficient estimates of the linearlab fit; compare with the `coef` column
# of the statsmodels Cauchy-link summary.
ll_cauchit_fit.beta_grouped

p  Intercept                  6.338691
   LOWINC                    -0.014766
   PERASIAN                   0.010363
   PERBLACK                  -0.014395
   PERHISP                   -0.011633
   PERMINTE                   0.277337
   AVYRSEXP                   0.258281
   AVSALK                     0.084957
   PERSPENK                  -2.734326
   PTRATIO                   -0.494774
   PCTAF                     -0.272836
   PCTCHRT                    0.004143
   PCTYRRND                  -0.003197
   PERMINTE_AVYRSEXP         -0.015877
   PERMINTE_AVSAL            -0.004444
   AVYRSEXP_AVSAL            -0.004270
   PERSPEN_PTRATIO            0.125752
   PERSPEN_PCTAF              0.070688
   PTRATIO_PCTAF              0.012541
   PERMINTE_AVYRSEXP_AVSAL    0.000251
   PERSPEN_PTRATIO_PCTAF     -0.003196
dtype: float64

In [30]:
# Reference model: statsmodels binomial GLM with a log-log link.
sm_loglog_model = smf.glm(
    formula=formula,
    data=star98,
    family=sm.families.Binomial(link=sm.families.links.LogLog()),
)

In [31]:
# Fit the statsmodels log-log model; the result object is summarised next.
sm_loglog_fit = sm_loglog_model.fit()

In [32]:
# Reference results (log-likelihood and coefficient table) for the log-log link.
sm_loglog_fit.summary()

0,1,2,3
Dep. Variable:,"['NABOVE', 'NBELOW']",No. Observations:,303.0
Model:,GLM,Df Residuals:,282.0
Model Family:,Binomial,Df Model:,20.0
Link Function:,LogLog,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-3270.8
Date:,"Thu, 10 Aug 2023",Deviance:,4623.2
Time:,06:24:28,Pearson chi2:,4600.0
No. Iterations:,6,Pseudo R-squ. (CS):,1.0
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,4.2550,1.056,4.029,0.000,2.185,6.325
LOWINC,-0.0111,0.000,-38.986,0.000,-0.012,-0.011
PERASIAN,0.0059,0.000,14.148,0.000,0.005,0.007
PERBLACK,-0.0132,0.000,-28.470,0.000,-0.014,-0.012
PERHISP,-0.0095,0.000,-34.479,0.000,-0.010,-0.009
PERMINTE,0.1183,0.018,6.496,0.000,0.083,0.154
AVYRSEXP,0.0734,0.039,1.903,0.057,-0.002,0.149
AVSALK,0.0317,0.009,3.389,0.001,0.013,0.050
PERSPENK,-1.4653,0.220,-6.672,0.000,-1.896,-1.035


In [33]:
# Same model in linearlab, with the log-log link passed to the binomial likelihood.
ll_loglog_model = ll.glm(star98, formula, lik=ll.lik.binomial(ll.link.loglog))

In [34]:
# Fit the linearlab log-log model.
ll_loglog_fit = ll_loglog_model.fit()

In [35]:
# Log-likelihood of the linearlab fit; compare with the "Log-Likelihood"
# entry of the statsmodels log-log summary.
ll_loglog_fit.loglik

-3270.8427515828516

In [36]:
# Coefficient estimates of the linearlab fit; compare with the `coef` column
# of the statsmodels log-log summary.
ll_loglog_fit.beta_grouped

p  Intercept                  4.254966
   LOWINC                    -0.011135
   PERASIAN                   0.005884
   PERBLACK                  -0.013154
   PERHISP                   -0.009474
   PERMINTE                   0.118271
   AVYRSEXP                   0.073423
   AVSALK                     0.031662
   PERSPENK                  -1.465295
   PTRATIO                   -0.247174
   PCTAF                     -0.142116
   PCTCHRT                    0.003142
   PCTYRRND                  -0.002273
   PERMINTE_AVYRSEXP         -0.006135
   PERMINTE_AVSAL            -0.001759
   AVYRSEXP_AVSAL            -0.001078
   PERSPEN_PTRATIO            0.067138
   PERSPEN_PCTAF              0.040162
   PTRATIO_PCTAF              0.006607
   PERMINTE_AVYRSEXP_AVSAL    0.000092
   PERSPEN_PTRATIO_PCTAF     -0.001816
dtype: float64

In [37]:
# Reference model: statsmodels binomial GLM with a complementary log-log link.
sm_cloglog_model = smf.glm(
    formula=formula,
    data=star98,
    family=sm.families.Binomial(link=sm.families.links.CLogLog()),
)

In [38]:
# Fit the statsmodels complementary log-log model; the result object is summarised next.
sm_cloglog_fit = sm_cloglog_model.fit()

In [39]:
# Reference results (log-likelihood and coefficient table) for the
# complementary log-log link.
sm_cloglog_fit.summary()

0,1,2,3
Dep. Variable:,"['NABOVE', 'NBELOW']",No. Observations:,303.0
Model:,GLM,Df Residuals:,282.0
Model Family:,Binomial,Df Model:,20.0
Link Function:,CLogLog,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-2885.1
Date:,"Thu, 10 Aug 2023",Deviance:,3851.8
Time:,06:26:08,Pearson chi2:,3820.0
No. Iterations:,6,Pseudo R-squ. (CS):,1.0
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.0839,1.059,-0.079,0.937,-2.159,1.991
LOWINC,-0.0121,0.000,-37.653,0.000,-0.013,-0.011
PERASIAN,0.0072,0.000,17.782,0.000,0.006,0.008
PERBLACK,-0.0138,0.001,-23.626,0.000,-0.015,-0.013
PERHISP,-0.0109,0.000,-32.644,0.000,-0.012,-0.010
PERMINTE,0.2213,0.023,9.496,0.000,0.176,0.267
AVYRSEXP,0.2180,0.041,5.264,0.000,0.137,0.299
AVSALK,0.0707,0.010,6.952,0.000,0.051,0.091
PERSPENK,-1.1468,0.211,-5.429,0.000,-1.561,-0.733


In [40]:
# Same model in linearlab, with the cloglog link passed to the binomial likelihood.
ll_cloglog_model = ll.glm(star98, formula, lik=ll.lik.binomial(ll.link.cloglog))

In [41]:
# Fit the linearlab complementary log-log model.
ll_cloglog_fit = ll_cloglog_model.fit()

In [42]:
# Log-likelihood of the linearlab fit; compare with the "Log-Likelihood"
# entry of the statsmodels complementary log-log summary.
ll_cloglog_fit.loglik

-2885.1361523091327

In [43]:
# Coefficient estimates of the linearlab fit; compare with the `coef` column
# of the statsmodels complementary log-log summary.
ll_cloglog_fit.beta_grouped

p  Intercept                 -0.083918
   LOWINC                    -0.012077
   PERASIAN                   0.007176
   PERBLACK                  -0.013755
   PERHISP                   -0.010855
   PERMINTE                   0.221333
   AVYRSEXP                   0.218000
   AVSALK                     0.070656
   PERSPENK                  -1.146776
   PTRATIO                   -0.192686
   PCTAF                     -0.074351
   PCTCHRT                    0.003526
   PCTYRRND                  -0.002703
   PERMINTE_AVYRSEXP         -0.011970
   PERMINTE_AVSAL            -0.003583
   AVYRSEXP_AVSAL            -0.003604
   PERSPEN_PTRATIO            0.055230
   PERSPEN_PCTAF              0.024148
   PTRATIO_PCTAF              0.003711
   PERMINTE_AVYRSEXP_AVSAL    0.000194
   PERSPEN_PTRATIO_PCTAF     -0.001134
dtype: float64