In [1]:
import pandas as pd
import statsmodels.formula.api as smf

In [3]:
blood = pd.read_csv("data/medicaldata_blood_storage.csv")

In [5]:
blood["Recurrence"].value_counts()

0    262
1     54
Name: Recurrence, dtype: int64

In [6]:
model = smf.logit("Recurrence ~ Age", data=blood)

In [8]:
results = model.fit()

Optimization terminated successfully.
         Current function value: 0.456452
         Iterations 6


In [9]:
results.summary()

0,1,2,3
Dep. Variable:,Recurrence,No. Observations:,316.0
Model:,Logit,Df Residuals:,314.0
Method:,MLE,Df Model:,1.0
Date:,"Thu, 23 Sep 2021",Pseudo R-squ.:,0.001828
Time:,16:22:59,Log-Likelihood:,-144.24
converged:,True,LL-Null:,-144.5
Covariance Type:,nonrobust,LLR p-value:,0.4673

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-2.5107,1.302,-1.928,0.054,-5.062,0.041
Age,0.0152,0.021,0.723,0.470,-0.026,0.056


In [10]:
results.params

Intercept   -2.510677
Age          0.015164
dtype: float64

In [11]:
import numpy as np

In [12]:
np.exp(results.params)

Intercept    0.081213
Age          1.015279
dtype: float64

In [13]:
model = smf.logit("Recurrence ~ Age + PVol + PreopPSA", data=blood)
results = model.fit()
results.summary()

Optimization terminated successfully.
         Current function value: 0.421375
         Iterations 6


0,1,2,3
Dep. Variable:,Recurrence,No. Observations:,305.0
Model:,Logit,Df Residuals:,301.0
Method:,MLE,Df Model:,3.0
Date:,"Thu, 23 Sep 2021",Pseudo R-squ.:,0.0666
Time:,16:27:08,Log-Likelihood:,-128.52
converged:,True,LL-Null:,-137.69
Covariance Type:,nonrobust,LLR p-value:,0.0003744

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-3.7564,1.439,-2.611,0.009,-6.576,-0.937
Age,0.0281,0.024,1.194,0.232,-0.018,0.074
PVol,-0.0080,0.006,-1.368,0.171,-0.019,0.003
PreopPSA,0.0952,0.023,4.128,0.000,0.050,0.140


In [15]:
np.exp(results.params)

Intercept    0.023368
Age          1.028542
PVol         0.992061
PreopPSA     1.099839
dtype: float64

In [17]:
model = smf.logit("Recurrence ~ Age + PVol + PreopPSA + C(TVol)", data=blood)
results = model.fit()
results.summary()

Optimization terminated successfully.
         Current function value: 0.382324
         Iterations 8


0,1,2,3
Dep. Variable:,Recurrence,No. Observations:,300.0
Model:,Logit,Df Residuals:,294.0
Method:,MLE,Df Model:,5.0
Date:,"Thu, 23 Sep 2021",Pseudo R-squ.:,0.1411
Time:,16:29:57,Log-Likelihood:,-114.7
converged:,True,LL-Null:,-133.55
Covariance Type:,nonrobust,LLR p-value:,4.336e-07

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-6.1399,1.819,-3.375,0.001,-9.706,-2.574
C(TVol)[T.2.0],2.0730,1.061,1.953,0.051,-0.007,4.153
C(TVol)[T.3.0],3.1200,1.065,2.930,0.003,1.033,5.207
Age,0.0265,0.025,1.082,0.279,-0.022,0.075
PVol,-9.982e-06,0.006,-0.002,0.999,-0.013,0.013
PreopPSA,0.0586,0.024,2.422,0.015,0.011,0.106


In [18]:
np.exp(results.params)

Intercept          0.002155
C(TVol)[T.2.0]     7.948254
C(TVol)[T.3.0]    22.647154
Age                1.026896
PVol               0.999990
PreopPSA           1.060348
dtype: float64

In [19]:
from sklearn.linear_model import LogisticRegression

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
blood.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 316 entries, 0 to 315
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   RBC.Age.Group     316 non-null    int64  
 1   Median.RBC.Age    316 non-null    int64  
 2   Age               316 non-null    float64
 3   AA                316 non-null    int64  
 4   FamHx             316 non-null    int64  
 5   PVol              307 non-null    float64
 6   TVol              310 non-null    float64
 7   T.Stage           303 non-null    float64
 8   bGS               314 non-null    float64
 9   BN+               316 non-null    int64  
 10  OrganConfined     316 non-null    int64  
 11  PreopPSA          313 non-null    float64
 12  PreopTherapy      316 non-null    int64  
 13  Units             316 non-null    int64  
 14  sGS               316 non-null    int64  
 15  AnyAdjTherapy     316 non-null    int64  
 16  AdjRadTherapy     316 non-null    int64  
 1

In [22]:
blood_subset = blood[["Recurrence", "Age", "PVol", "PreopPSA", "TVol"]]

In [25]:
blood_subset["TVol"] = blood_subset["TVol"].astype("category")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  blood_subset["TVol"] = blood_subset["TVol"].astype("category")


In [26]:
blood_subset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 316 entries, 0 to 315
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   Recurrence  316 non-null    int64   
 1   Age         316 non-null    float64 
 2   PVol        307 non-null    float64 
 3   PreopPSA    313 non-null    float64 
 4   TVol        310 non-null    category
dtypes: category(1), float64(3), int64(1)
memory usage: 10.4 KB


In [28]:
pd.get_dummies(blood_subset)

Unnamed: 0,Recurrence,Age,PVol,PreopPSA,TVol_1.0,TVol_2.0,TVol_3.0
0,1,72.1,54.0,14.08,0,0,1
1,1,73.6,43.2,10.50,0,0,1
2,0,67.5,102.7,6.98,1,0,0
3,0,65.8,46.0,4.40,1,0,0
4,0,63.2,60.0,21.40,0,1,0
...,...,...,...,...,...,...,...
311,0,54.8,25.6,5.00,0,1,0
312,0,62.3,42.3,7.60,0,1,0
313,1,62.4,50.0,9.30,0,1,0
314,0,57.6,74.9,4.50,1,0,0


In [50]:
blood_sub_dummy = pd.get_dummies(blood_subset, drop_first=True)
blood_sub_dummy = blood_sub_dummy.dropna()

In [51]:
response = blood_sub_dummy["Recurrence"]
response

0      1
1      1
2      0
3      0
4      0
      ..
311    0
312    0
313    1
314    0
315    0
Name: Recurrence, Length: 305, dtype: int64

In [52]:
predictors = blood_sub_dummy.iloc[:, 1:]
predictors

Unnamed: 0,Age,PVol,PreopPSA,TVol_2.0,TVol_3.0
0,72.1,54.0,14.08,0,1
1,73.6,43.2,10.50,0,1
2,67.5,102.7,6.98,0,0
3,65.8,46.0,4.40,0,0
4,63.2,60.0,21.40,1,0
...,...,...,...,...,...
311,54.8,25.6,5.00,1,0
312,62.3,42.3,7.60,1,0
313,62.4,50.0,9.30,1,0
314,57.6,74.9,4.50,0,0


In [57]:
X_train, X_test, y_train, y_test = train_test_split(predictors, response)

In [58]:
from sklearn.linear_model import LogisticRegression

In [83]:
model = LogisticRegression(penalty="none")

In [84]:
model.fit(X_train, y_train)

LogisticRegression(penalty='none')

In [85]:
model.coef_

array([[ 0.03498819, -0.0201158 ,  0.07500502,  0.83120946,  1.7086887 ]])

In [86]:
model.intercept_

array([-4.54673264])

In [87]:
np.exp(model.coef_)

array([[1.03560748, 0.98008517, 1.07788957, 2.29609411, 5.52171612]])

In [80]:
model.predict(X_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

In [81]:
y_test.value_counts()

0    62
1    15
Name: Recurrence, dtype: int64

In [82]:
model.score(X_test, y_test)

0.7922077922077922