In [49]:
import pandas as pd
import numpy as np

import statsmodels.api as sm
from matplotlib import pyplot as plt
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

In [31]:
in_file_full_name = '../../../data/pfas_demo.csv'
data_in_df = pd.read_csv(in_file_full_name)
data_in_df.head()

Unnamed: 0,disease,PFOS,age,gender,BMI
0,0,11.741246,36,0,23.557257
1,0,3.721879,61,0,21.576156
2,0,5.870561,53,1,24.71787
3,0,22.399261,31,0,23.274264
4,0,9.348789,43,0,27.621793


In [32]:
# log transform PFAS
log_pfas_column_name = f'log_PFOS'
data_in_df[log_pfas_column_name] = np.log(data_in_df['PFOS'])
data_in_df.head()

Unnamed: 0,disease,PFOS,age,gender,BMI,log_PFOS
0,0,11.741246,36,0,23.557257,2.463108
1,0,3.721879,61,0,21.576156,1.314229
2,0,5.870561,53,1,24.71787,1.76995
3,0,22.399261,31,0,23.274264,3.109028
4,0,9.348789,43,0,27.621793,2.235247


In [19]:
# fit model 1 with only the log_PFOS as the variable
covariate_cols_1 = sm.add_constant(data_in_df['log_PFOS'])
print(covariate_cols_1.head())

   const  log_PFOS
0    1.0  2.463108
1    1.0  1.314229
2    1.0  1.769950
3    1.0  3.109028
4    1.0  2.235247


In [21]:
model_1 = sm.Logit(data_in_df['disease'], covariate_cols_1)
result_1 = model_1.fit()
print(result_1.summary())

Optimization terminated successfully.
         Current function value: 0.690454
         Iterations 3
                           Logit Regression Results                           
Dep. Variable:                disease   No. Observations:                  300
Model:                          Logit   Df Residuals:                      298
Method:                           MLE   Df Model:                            1
Date:                Wed, 24 Sep 2025   Pseudo R-squ.:               2.899e-06
Time:                        22:57:15   Log-Likelihood:                -207.14
converged:                       True   LL-Null:                       -207.14
Covariance Type:            nonrobust   LLR p-value:                    0.9724
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.1342      0.385      0.348      0.728      -0.621       0.889
log_PFOS       0.0066      0.

In [22]:
print(result_1.params)

const       0.134200
log_PFOS    0.006641
dtype: float64


In [23]:
# fit model 2: add covariates
covariate_cols_2 = sm.add_constant(data_in_df[['log_PFOS', 'age', 'gender', 'BMI']])
covariate_cols_2.head()
model_2 = sm.Logit(data_in_df['disease'], covariate_cols_2)
result_2 = model_2.fit()
print(result_2.summary())

Optimization terminated successfully.
         Current function value: 0.658576
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:                disease   No. Observations:                  300
Model:                          Logit   Df Residuals:                      295
Method:                           MLE   Df Model:                            4
Date:                Wed, 24 Sep 2025   Pseudo R-squ.:                 0.04617
Time:                        22:57:59   Log-Likelihood:                -197.57
converged:                       True   LL-Null:                       -207.14
Covariance Type:            nonrobust   LLR p-value:                 0.0007418
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -3.1764      0.953     -3.332      0.001      -5.045      -1.308
log_PFOS       0.0558      0.

In [None]:
print(result_2.params)

In [24]:
beta_in_result_1 = result_1.params[log_pfas_column_name]
beta_in_result_2 = result_2.params[log_pfas_column_name]
percent_change = 100 *(beta_in_result_2 - beta_in_result_1) / beta_in_result_1
percent_change

739.9539684206452

In [25]:
beta_in_result_1

0.006640945097852425

In [26]:
beta_in_result_2

0.05578088189004774

In [27]:
lr_stat = 2 * (result_2.llf - result_1.llf)
lr_stat

19.126635783429037

In [28]:
df_diff = result_2.df_model - result_1.df_model
df_diff

3.0

In [29]:
p_lrt = stats.chi2.sf(lr_stat, df_diff)
p_lrt

0.0002574002276777569

In [None]:
x = np.arange(0, 50, 1)
beta = 0.34
p = x * np.exp(beta) / (1 + x * np.exp(beta))
fig, ax = plt.subplots()
ax.scatter(x, p)

In [51]:
covariate_cols_1_scaled = StandardScaler().fit_transform(covariate_cols_1)
logistic_regression_obj = LogisticRegression()
logistic_regression_obj.fit(X=covariate_cols_1_scaled, y=data_in_df['disease'])

In [45]:
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)
clf = LogisticRegression(random_state=0).fit(X, y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [47]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
X, y = load_iris(return_X_y=True)
clf = LogisticRegression(random_state=0).fit(X, y)
clf.predict(X[:2, :])

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([0, 0])