## Running A New Logistic Regression Model

### Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

import sklearn

import statsmodels.api as sm
import statsmodels.formula.api as smf
import scipy.stats as stats

%matplotlib inline
#sets the default autosave frequency in seconds
%autosave 60 
sns.set_style('dark')
sns.set(font_scale=1.2)

plt.rc('axes', labelsize=14)
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)

from sklearn.inspection import permutation_importance
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.tree import export_graphviz, plot_tree
from sklearn.metrics import confusion_matrix, classification_report, mean_absolute_error, mean_squared_error,r2_score
from sklearn.metrics import plot_confusion_matrix, plot_precision_recall_curve, plot_roc_curve, accuracy_score
from sklearn.metrics import auc, f1_score, precision_score, recall_score, roc_auc_score



import warnings
warnings.filterwarnings('ignore')


pd.set_option('display.max_columns',None)
#pd.set_option('display.max_rows',100)
pd.set_option('display.width', 1000)
pd.set_option('display.float_format','{:.4f}'.format)


random.seed(0)
np.random.seed(0)
np.set_printoptions(suppress=True)

Autosaving every 60 seconds


## Exploratory Data Analysis

In [2]:
df = pd.read_csv("diabetes.csv")

In [3]:
df

Unnamed: 0,id,chol,stab.glu,hdl,ratio,glyhb,location,age,gender,height,weight,frame,bp.1s,bp.1d,bp.2s,bp.2d,waist,hip,time.ppn,insurance,fh,smoking,dm
0,1000,203.0000,82,56.0000,3.6000,4.3100,Buckingham,46,female,62.0000,121.0000,medium,118.0000,59.0000,,,29.0000,38.0000,720.0000,1,0,3,no
1,1001,165.0000,97,24.0000,6.9000,4.4400,Buckingham,29,female,64.0000,218.0000,large,112.0000,68.0000,,,46.0000,48.0000,360.0000,0,0,2,no
2,1002,228.0000,92,37.0000,6.2000,4.6400,Buckingham,58,female,61.0000,256.0000,large,190.0000,92.0000,185.0000,92.0000,49.0000,57.0000,180.0000,2,0,2,no
3,1003,78.0000,93,12.0000,6.5000,4.6300,Buckingham,67,male,67.0000,119.0000,large,110.0000,50.0000,,,33.0000,38.0000,480.0000,1,0,3,no
4,1005,249.0000,90,28.0000,8.9000,7.7200,Buckingham,64,male,68.0000,183.0000,medium,138.0000,80.0000,,,44.0000,41.0000,300.0000,0,0,3,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
398,41506,296.0000,369,46.0000,6.4000,16.1100,Louisa,53,male,69.0000,173.0000,medium,138.0000,94.0000,130.0000,94.0000,35.0000,39.0000,210.0000,2,0,1,yes
399,41507,284.0000,89,54.0000,5.3000,4.3900,Louisa,51,female,63.0000,154.0000,medium,140.0000,100.0000,146.0000,102.0000,32.0000,43.0000,180.0000,2,0,3,no
400,41510,194.0000,269,38.0000,5.1000,13.6300,Louisa,29,female,69.0000,167.0000,small,120.0000,70.0000,,,33.0000,40.0000,20.0000,0,1,2,yes
401,41752,199.0000,76,52.0000,3.8000,4.4900,Louisa,41,female,63.0000,197.0000,medium,120.0000,78.0000,,,41.0000,48.0000,255.0000,2,0,2,no


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 403 entries, 0 to 402
Data columns (total 23 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         403 non-null    int64  
 1   chol       402 non-null    float64
 2   stab.glu   403 non-null    int64  
 3   hdl        402 non-null    float64
 4   ratio      402 non-null    float64
 5   glyhb      390 non-null    float64
 6   location   403 non-null    object 
 7   age        403 non-null    int64  
 8   gender     403 non-null    object 
 9   height     398 non-null    float64
 10  weight     402 non-null    float64
 11  frame      391 non-null    object 
 12  bp.1s      398 non-null    float64
 13  bp.1d      398 non-null    float64
 14  bp.2s      141 non-null    float64
 15  bp.2d      141 non-null    float64
 16  waist      401 non-null    float64
 17  hip        401 non-null    float64
 18  time.ppn   400 non-null    float64
 19  insurance  403 non-null    int64  
 20  fh        

In [5]:
df.describe(include='all')

Unnamed: 0,id,chol,stab.glu,hdl,ratio,glyhb,location,age,gender,height,weight,frame,bp.1s,bp.1d,bp.2s,bp.2d,waist,hip,time.ppn,insurance,fh,smoking,dm
count,403.0,402.0,403.0,402.0,402.0,390.0,403,403.0,403,398.0,402.0,391,398.0,398.0,141.0,141.0,401.0,401.0,400.0,403.0,403.0,403.0,390
unique,,,,,,,2,,2,,,3,,,,,,,,,,,2
top,,,,,,,Louisa,,female,,,medium,,,,,,,,,,,no
freq,,,,,,,203,,234,,,184,,,,,,,,,,,330
mean,15978.3102,207.8458,106.6725,50.4453,4.5216,5.5898,,46.8511,,66.0201,177.592,,136.9045,83.3216,152.383,92.5248,37.9002,43.0399,341.25,1.0546,0.1886,1.8462,
std,11881.1221,44.4456,53.0767,17.2626,1.7279,2.2426,,16.3123,,3.9185,40.3407,,22.741,13.5892,21.713,11.5552,5.7293,5.6567,309.541,0.7961,0.3917,0.6512,
min,1000.0,78.0,48.0,12.0,1.5,2.68,,19.0,,52.0,99.0,,90.0,48.0,110.0,60.0,26.0,30.0,5.0,0.0,0.0,1.0,
25%,4792.5,179.0,81.0,38.0,3.2,4.38,,34.0,,63.0,151.0,,121.25,75.0,138.0,84.0,33.0,39.0,90.0,0.0,0.0,1.0,
50%,15766.0,204.0,89.0,46.0,4.2,4.84,,45.0,,66.0,172.5,,136.0,82.0,149.0,92.0,37.0,42.0,240.0,1.0,0.0,2.0,
75%,20336.0,230.0,106.0,59.0,5.4,5.6,,60.0,,69.0,200.0,,146.75,90.0,161.0,100.0,41.0,46.0,517.5,2.0,0.0,2.0,


In [6]:
df.shape

(403, 23)

In [7]:
df.columns

Index(['id', 'chol', 'stab.glu', 'hdl', 'ratio', 'glyhb', 'location', 'age', 'gender', 'height', 'weight', 'frame', 'bp.1s', 'bp.1d', 'bp.2s', 'bp.2d', 'waist', 'hip', 'time.ppn', 'insurance', 'fh', 'smoking', 'dm'], dtype='object')

### Drop unwanted features

Using the same data set as before, now try another model with these predictor variables: age, cholesterol and insurance type. 

In [8]:
df2 = df[['age', 'chol', 'insurance','dm' ]]

In [9]:
df2.head()

Unnamed: 0,age,chol,insurance,dm
0,46,203.0,1,no
1,29,165.0,0,no
2,58,228.0,2,no
3,67,78.0,1,no
4,64,249.0,0,yes


### Treat Missing Values

In [10]:
df2.isnull().sum()

age           0
chol          1
insurance     0
dm           13
dtype: int64

In [11]:
df2.dropna(inplace=True)

In [12]:
df2.isnull().sum()

age          0
chol         0
insurance    0
dm           0
dtype: int64

In [13]:
encoder = LabelEncoder()

In [14]:
df2["dm"] = encoder.fit_transform(df2["dm"])

In [15]:
df2["insurance"] = encoder.fit_transform(df2["insurance"].astype("str"))

In [16]:
df2

Unnamed: 0,age,chol,insurance,dm
0,46,203.0000,1,0
1,29,165.0000,0,0
2,58,228.0000,2,0
3,67,78.0000,1,0
4,64,249.0000,0,1
...,...,...,...,...
397,89,301.0000,1,0
398,53,296.0000,2,1
399,51,284.0000,2,0
400,29,194.0000,0,1


In [17]:
df2.insurance.value_counts()

1    144
2    132
0    113
Name: insurance, dtype: int64

In [18]:
df2["insurance"] = df2["insurance"].astype("object")

In [19]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 389 entries, 0 to 401
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   age        389 non-null    int64  
 1   chol       389 non-null    float64
 2   insurance  389 non-null    object 
 3   dm         389 non-null    int32  
dtypes: float64(1), int32(1), int64(1), object(1)
memory usage: 13.7+ KB


### Age Predictor

In [20]:
model_age = smf.logit(formula="dm ~ + age", data=df2).fit()

Optimization terminated successfully.
         Current function value: 0.384479
         Iterations 7


In [21]:
model_age.summary()

0,1,2,3
Dep. Variable:,dm,No. Observations:,389.0
Model:,Logit,Df Residuals:,387.0
Method:,MLE,Df Model:,1.0
Date:,"Fri, 07 May 2021",Pseudo R-squ.:,0.1059
Time:,08:42:49,Log-Likelihood:,-149.56
converged:,True,LL-Null:,-167.27
Covariance Type:,nonrobust,LLR p-value:,2.667e-09

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-4.3967,0.542,-8.107,0.000,-5.460,-3.334
age,0.0524,0.009,5.584,0.000,0.034,0.071


In [22]:
model_age.params

Intercept   -4.3967
age          0.0524
dtype: float64

In [23]:
np.exp(model_age.params)

Intercept   0.0123
age         1.0538
dtype: float64

Enter the odds ratio for age to two decimal places:

In [24]:
model_odds = pd.DataFrame(np.exp(model_age.params), columns=["ODDS RATIO"])

In [25]:
model_odds["Z-Value"] = model_age.pvalues

In [26]:
model_odds[["2.5%","97.5%"]] = np.exp(model_age.conf_int())

In [27]:
model_odds

Unnamed: 0,ODDS RATIO,Z-Value,2.5%,97.5%
Intercept,0.0123,0.0,0.0043,0.0357
age,1.0538,0.0,1.0346,1.0733


### Cholesterol Predictor

In [28]:
model_chol = smf.logit(formula="dm ~ + chol", data=df2).fit()

Optimization terminated successfully.
         Current function value: 0.410820
         Iterations 6


In [29]:
model_chol.summary()

0,1,2,3
Dep. Variable:,dm,No. Observations:,389.0
Model:,Logit,Df Residuals:,387.0
Method:,MLE,Df Model:,1.0
Date:,"Fri, 07 May 2021",Pseudo R-squ.:,0.0446
Time:,08:42:58,Log-Likelihood:,-159.81
converged:,True,LL-Null:,-167.27
Covariance Type:,nonrobust,LLR p-value:,0.0001122

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-4.1700,0.683,-6.108,0.000,-5.508,-2.832
chol,0.0115,0.003,3.813,0.000,0.006,0.017


In [30]:
model_chol.params

Intercept   -4.1700
chol         0.0115
dtype: float64

In [31]:
np.exp(model_chol.params)

Intercept   0.0155
chol        1.0116
dtype: float64

Enter the odds ratio for cholesterol to two decimal places:

In [32]:
model_odds = pd.DataFrame(np.exp(model_chol.params), columns=["ODDS RATIO"])

In [33]:
model_odds["Z-Value"] = model_chol.pvalues

In [34]:
model_odds[["2.5%","97.5%"]] = np.exp(model_chol.conf_int())

In [35]:
model_odds

Unnamed: 0,ODDS RATIO,Z-Value,2.5%,97.5%
Intercept,0.0155,0.0,0.0041,0.0589
chol,1.0116,0.0001,1.0056,1.0175


### Insurance Predictor

In [36]:
df2.head()

Unnamed: 0,age,chol,insurance,dm
0,46,203.0,1,0
1,29,165.0,0,0
2,58,228.0,2,0
3,67,78.0,1,0
4,64,249.0,0,1


In [37]:
govtinsurance = df2[df2["insurance"] == 1]

In [38]:
govtinsurance

Unnamed: 0,age,chol,insurance,dm
0,46,203.0000,1,0
3,67,78.0000,1,0
5,34,248.0000,1,0
10,60,242.0000,1,0
16,50,255.0000,1,0
...,...,...,...,...
382,33,217.0000,1,0
383,37,209.0000,1,0
387,60,279.0000,1,1
392,78,210.0000,1,0


In [39]:
model_ins1 = smf.logit(formula="dm ~ + C(insurance)", data=govtinsurance).fit()

Optimization terminated successfully.
         Current function value: 0.415413
         Iterations 6


In [40]:
model_ins1.summary()

0,1,2,3
Dep. Variable:,dm,No. Observations:,144.0
Model:,Logit,Df Residuals:,143.0
Method:,MLE,Df Model:,0.0
Date:,"Fri, 07 May 2021",Pseudo R-squ.:,9.812e-11
Time:,08:44:34,Log-Likelihood:,-59.819
converged:,True,LL-Null:,-59.819
Covariance Type:,nonrobust,LLR p-value:,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-1.7677,0.236,-7.487,0.000,-2.230,-1.305


In [41]:
model_ins1.params

Intercept   -1.7677
dtype: float64

In [42]:
np.exp(model_ins1.params)

Intercept   0.1707
dtype: float64

Enter the odds ratio for insurance to two decimal places:

In [43]:
model_odds = pd.DataFrame(np.exp(model_ins1.params), columns=["ODDS RATIO"])

In [44]:
model_odds["Z-Value"] = model_ins1.pvalues

In [45]:
model_odds[["2.5%","97.5%"]] = np.exp(model_ins1.conf_int())

In [46]:
model_odds

Unnamed: 0,ODDS RATIO,Z-Value,2.5%,97.5%
Intercept,0.1707,0.0,0.1075,0.2712


In [48]:
#dir(model_ins1)

In [49]:
#model_ins.fittedvalues

In [50]:
privinsurance = df2[df2["insurance"] == 2]

In [51]:
privinsurance

Unnamed: 0,age,chol,insurance,dm
2,58,228.0000,2,0
6,30,195.0000,2,0
8,45,177.0000,2,0
9,55,263.0000,2,0
13,40,183.0000,2,0
...,...,...,...,...
390,52,196.0000,2,1
391,59,221.0000,2,0
398,53,296.0000,2,1
399,51,284.0000,2,0


In [52]:
model_ins2 = smf.logit(formula="dm ~ + C(insurance)", data=privinsurance).fit()

Optimization terminated successfully.
         Current function value: 0.384076
         Iterations 6


In [53]:
model_ins2.summary()

0,1,2,3
Dep. Variable:,dm,No. Observations:,132.0
Model:,Logit,Df Residuals:,131.0
Method:,MLE,Df Model:,0.0
Date:,"Fri, 07 May 2021",Pseudo R-squ.:,1.423e-10
Time:,08:47:17,Log-Likelihood:,-50.698
converged:,True,LL-Null:,-50.698
Covariance Type:,nonrobust,LLR p-value:,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-1.9117,0.260,-7.357,0.000,-2.421,-1.402


In [54]:
model_ins2.params

Intercept   -1.9117
dtype: float64

In [55]:
np.exp(model_ins2.params)

Intercept   0.1478
dtype: float64

Enter the odds ratio for insurance to two decimal places:

In [56]:
model_odds = pd.DataFrame(np.exp(model_ins2.params), columns=["ODDS RATIO"])

In [57]:
model_odds["Z-Value"] = model_ins2.pvalues

In [58]:
model_odds[["2.5%","97.5%"]] = np.exp(model_ins2.conf_int())

In [59]:
model_odds

Unnamed: 0,ODDS RATIO,Z-Value,2.5%,97.5%
Intercept,0.1478,0.0,0.0888,0.246


In [60]:
df2

Unnamed: 0,age,chol,insurance,dm
0,46,203.0000,1,0
1,29,165.0000,0,0
2,58,228.0000,2,0
3,67,78.0000,1,0
4,64,249.0000,0,1
...,...,...,...,...
397,89,301.0000,1,0
398,53,296.0000,2,1
399,51,284.0000,2,0
400,29,194.0000,0,1


In [62]:
df2.groupby('insurance')['dm'].count()

insurance
0    113
1    144
2    132
Name: dm, dtype: int64

In [64]:
govtinsurance.reset_index(inplace=True, drop=True)

In [65]:
govtinsurance

Unnamed: 0,age,chol,insurance,dm
0,46,203.0000,1,0
1,67,78.0000,1,0
2,34,248.0000,1,0
3,60,242.0000,1,0
4,50,255.0000,1,0
...,...,...,...,...
139,33,217.0000,1,0
140,37,209.0000,1,0
141,60,279.0000,1,1
142,78,210.0000,1,0


In [66]:
govtinsurance.groupby("dm").count()

Unnamed: 0_level_0,age,chol,insurance
dm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,123,123,123
1,21,21,21


In [67]:
privinsurance.reset_index(inplace=True, drop=True)

In [68]:
privinsurance

Unnamed: 0,age,chol,insurance,dm
0,58,228.0000,2,0
1,30,195.0000,2,0
2,45,177.0000,2,0
3,55,263.0000,2,0
4,40,183.0000,2,0
...,...,...,...,...
127,52,196.0000,2,1
128,59,221.0000,2,0
129,53,296.0000,2,1
130,51,284.0000,2,0


In [69]:
privinsurance.groupby("dm").count()

Unnamed: 0_level_0,age,chol,insurance
dm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,115,115,115
1,17,17,17


#### Python code done by Dennis Lam