In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm
from scipy import stats

In [3]:
# Read the PFAS demo dataset from its relative path into a DataFrame.
in_file_full_name = '../../../data/pfas_demo.csv'
data_in_df = pd.read_csv(filepath_or_buffer=in_file_full_name)
# Preview the first five rows as the cell output.
data_in_df.head(n=5)

Unnamed: 0,disease,PFOS,age,gender,BMI
0,0,11.741246,36,0,23.557257
1,0,3.721879,61,0,21.576156
2,0,5.870561,53,1,24.71787
3,0,22.399261,31,0,23.274264
4,0,9.348789,43,0,27.621793


In [5]:
# Natural-log transform of the PFOS column, stored as a new column.
log_pfas_column_name = 'log_PFOS'
# np.log yields -inf for 0 and NaN for negative or missing values without
# raising, so stop here with a clear message instead of letting such values
# propagate into later cells.
if not (data_in_df['PFOS'] > 0).all():
    raise ValueError('PFOS must be strictly positive and non-missing before log-transforming')
data_in_df[log_pfas_column_name] = np.log(data_in_df['PFOS'])
data_in_df.head()

Unnamed: 0,disease,PFOS,age,gender,BMI,log_PFOS
0,0,11.741246,36,0,23.557257,2.463108
1,0,3.721879,61,0,21.576156,1.314229
2,0,5.870561,53,1,24.71787,1.76995
3,0,22.399261,31,0,23.274264,3.109028
4,0,9.348789,43,0,27.621793,2.235247


In [7]:
# Model 1: logistic regression of disease on log_PFOS plus an intercept.
covariate_columns_1 = sm.add_constant(data_in_df['log_PFOS'])  # adds the 'const' column
print(covariate_columns_1)
model_1 = sm.Logit(endog=data_in_df['disease'], exog=covariate_columns_1)
result_1 = model_1.fit()  # fit() also prints the optimizer's termination message
print(result_1.summary())

     const  log_PFOS
0      1.0  2.463108
1      1.0  1.314229
2      1.0  1.769950
3      1.0  3.109028
4      1.0  2.235247
..     ...       ...
295    1.0  1.495473
296    1.0  2.008583
297    1.0  1.770568
298    1.0  1.776932
299    1.0  2.095647

[300 rows x 2 columns]
Optimization terminated successfully.
         Current function value: 0.690454
         Iterations 3
                           Logit Regression Results                           
Dep. Variable:                disease   No. Observations:                  300
Model:                          Logit   Df Residuals:                      298
Method:                           MLE   Df Model:                            1
Date:                Tue, 09 Sep 2025   Pseudo R-squ.:               2.899e-06
Time:                        13:44:36   Log-Likelihood:                -207.14
converged:                       True   LL-Null:                       -207.14
Covariance Type:            nonrobust   LLR p-value:                 

In [37]:
# Print the fitted coefficients of model 1 (const and log_PFOS).
print(result_1.params)

const       0.134200
log_PFOS    0.006641
dtype: float64


In [8]:
# Model 2: logistic regression of disease on log_PFOS adjusted for age,
# gender and BMI, plus an intercept.
covariate_columns_2 = sm.add_constant(data_in_df[['log_PFOS', 'age', 'gender', 'BMI']])
# (A bare `covariate_columns_2.head()` used to sit here; a mid-cell expression
# is never displayed, so it was a no-op and has been dropped.)
model_2 = sm.Logit(data_in_df['disease'], covariate_columns_2)
result_2 = model_2.fit()  # fit() also prints the optimizer's termination message
print(result_2.summary())

Optimization terminated successfully.
         Current function value: 0.658576
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:                disease   No. Observations:                  300
Model:                          Logit   Df Residuals:                      295
Method:                           MLE   Df Model:                            4
Date:                Tue, 09 Sep 2025   Pseudo R-squ.:                 0.04617
Time:                        13:51:35   Log-Likelihood:                -197.57
converged:                       True   LL-Null:                       -207.14
Covariance Type:            nonrobust   LLR p-value:                 0.0007418
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -3.1764      0.953     -3.332      0.001      -5.045      -1.308
log_PFOS       0.0558      0.

In [39]:
# Print the fitted coefficients of model 2 (const, log_PFOS, age, gender, BMI).
print(result_2.params)

const      -3.176407
log_PFOS    0.055781
age         0.019553
gender      0.549666
BMI         0.072055
dtype: float64


In [40]:
# Percent change of the log_PFOS coefficient going from model 1 to model 2,
# expressed relative to the model 1 estimate.
beta_in_result_1 = result_1.params.loc[log_pfas_column_name]
beta_in_result_2 = result_2.params.loc[log_pfas_column_name]
percent_change = 100 * (beta_in_result_2 - beta_in_result_1) / beta_in_result_1
percent_change

739.9539684206452

In [42]:
# Likelihood-ratio statistic comparing model 1 (reduced) against model 2 (full):
# -2 * (log-likelihood of reduced - log-likelihood of full).
lr_stat = -2 * (result_1.llf - result_2.llf)
lr_stat

19.126635783429037

In [44]:
# Difference in model degrees of freedom between model 2 and model 1,
# used as the degrees of freedom of the likelihood-ratio test below.
df_diff = result_2.df_model - result_1.df_model
df_diff

3.0

In [49]:
# p-value of the likelihood-ratio test: upper-tail (survival function) of the
# chi-square distribution with df_diff degrees of freedom at lr_stat.
p_lrt = stats.chi2.sf(lr_stat, df=df_diff)
p_lrt

0.0002574002276777569