# Build logistic classification model for predicting the occurrence of metabolic syndrome

In [96]:
#import statements
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm
from scipy.stats import shapiro
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

## Logistic classification model building

In [97]:
# load datasets for model building
df_clean_train = pd.read_csv('data/df_clean_train.csv')
df_clean_test = pd.read_csv('data/df_clean_test.csv')
df_impute_train = pd.read_csv('data/df_impute_train.csv')
df_impute_test = pd.read_csv('data/df_impute_test.csv')

In [98]:
# process loaded dataset
desired_columns = df_clean_train.columns[~df_clean_train.columns.isin(['Unnamed: 0','seqn'])]
df_clean_train = df_clean_train[desired_columns]
df_clean_test = df_clean_test[desired_columns]
df_impute_train = df_impute_train[desired_columns]
df_impute_test = df_impute_test[desired_columns]

In [99]:
# Extract features
features_columns = df_clean_train.columns[~df_clean_train.columns.isin(['MetabolicSyndrome'])]
dependent_column = 'MetabolicSyndrome'
X_clean_train = df_clean_train[features_columns]
X_impute_train = df_impute_train[features_columns]
y_clean_train = df_clean_train[dependent_column]
y_impute_train = df_impute_train[dependent_column]

In [100]:
# Process features
X_clean_train_encoded = pd.get_dummies(X_clean_train,drop_first=True)
X_impute_train_encoded = pd.get_dummies(X_impute_train,drop_first=True)

bool_cols = X_clean_train_encoded.select_dtypes(include=['bool']).columns
X_clean_train_encoded[bool_cols] = X_clean_train_encoded[bool_cols].astype(int)

bool_cols = X_impute_train_encoded.select_dtypes(include=['bool']).columns
X_impute_train_encoded[bool_cols] = X_impute_train_encoded[bool_cols].astype(int)

In [101]:
# view variables

In [102]:
X_clean_train_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1607 entries, 0 to 1606
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Age                1607 non-null   int64  
 1   Income             1607 non-null   float64
 2   WaistCirc          1607 non-null   float64
 3   BMI                1607 non-null   float64
 4   Albuminuria        1607 non-null   int64  
 5   UrAlbCr            1607 non-null   float64
 6   UricAcid           1607 non-null   float64
 7   BloodGlucose       1607 non-null   int64  
 8   HDL                1607 non-null   int64  
 9   Triglycerides      1607 non-null   int64  
 10  Sex_Male           1607 non-null   int32  
 11  Marital_Married    1607 non-null   int32  
 12  Marital_Separated  1607 non-null   int32  
 13  Marital_Single     1607 non-null   int32  
 14  Marital_Widowed    1607 non-null   int32  
 15  Race_Black         1607 non-null   int32  
 16  Race_Hispanic      1607 

In [103]:
X_impute_train_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1752 entries, 0 to 1751
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Age                1752 non-null   int64  
 1   Income             1752 non-null   float64
 2   WaistCirc          1752 non-null   float64
 3   BMI                1752 non-null   float64
 4   Albuminuria        1752 non-null   int64  
 5   UrAlbCr            1752 non-null   float64
 6   UricAcid           1752 non-null   float64
 7   BloodGlucose       1752 non-null   int64  
 8   HDL                1752 non-null   int64  
 9   Triglycerides      1752 non-null   int64  
 10  Sex_Male           1752 non-null   int32  
 11  Marital_Married    1752 non-null   int32  
 12  Marital_Separated  1752 non-null   int32  
 13  Marital_Single     1752 non-null   int32  
 14  Marital_Widowed    1752 non-null   int32  
 15  Race_Black         1752 non-null   int32  
 16  Race_Hispanic      1752 

In [104]:
y_clean_train.info()

<class 'pandas.core.series.Series'>
RangeIndex: 1607 entries, 0 to 1606
Series name: MetabolicSyndrome
Non-Null Count  Dtype
--------------  -----
1607 non-null   int64
dtypes: int64(1)
memory usage: 12.7 KB


In [105]:
y_impute_train.info()

<class 'pandas.core.series.Series'>
RangeIndex: 1752 entries, 0 to 1751
Series name: MetabolicSyndrome
Non-Null Count  Dtype
--------------  -----
1752 non-null   int64
dtypes: int64(1)
memory usage: 13.8 KB


In [106]:
# build model
model_clean = sm.Logit(y_clean_train,sm.add_constant(X_clean_train_encoded)).fit()
model_impute = sm.Logit(y_impute_train,sm.add_constant(X_impute_train_encoded)).fit()

Optimization terminated successfully.
         Current function value: 0.341391
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.361207
         Iterations 7


## Logistic classification model evaluation

### Evaluate model built from clean dataset

In [107]:
# model built from clean dataset
model_clean.summary()

0,1,2,3
Dep. Variable:,MetabolicSyndrome,No. Observations:,1607.0
Model:,Logit,Df Residuals:,1586.0
Method:,MLE,Df Model:,20.0
Date:,"Thu, 02 Nov 2023",Pseudo R-squ.:,0.4735
Time:,11:54:00,Log-Likelihood:,-548.62
converged:,True,LL-Null:,-1042.0
Covariance Type:,nonrobust,LLR p-value:,2.5459999999999997e-196

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-11.4718,1.006,-11.408,0.000,-13.443,-9.501
Age,0.0405,0.006,6.863,0.000,0.029,0.052
Income,-3.395e-05,2.86e-05,-1.189,0.235,-8.99e-05,2.2e-05
WaistCirc,0.0560,0.014,4.024,0.000,0.029,0.083
BMI,0.0337,0.032,1.038,0.299,-0.030,0.097
Albuminuria,0.3840,0.228,1.686,0.092,-0.062,0.830
UrAlbCr,-0.0004,0.000,-0.958,0.338,-0.001,0.000
UricAcid,0.1527,0.064,2.368,0.018,0.026,0.279
BloodGlucose,0.0225,0.003,6.720,0.000,0.016,0.029


Prediction with trained dataset

In [108]:
conf_matrix = model_clean.pred_table()
df_cm = pd.DataFrame(conf_matrix, index=['Actual Negative', 'Actual Positive'], 
                     columns=['Predicted Negative', 'Predicted Positive'])
df_cm

Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,946.0,96.0
Actual Positive,146.0,419.0


Prediction with test dataset

In [109]:
# process test dataset
X_clean_test = df_clean_test[features_columns]
X_impute_test = df_impute_test[features_columns]
y_clean_test = df_clean_test[dependent_column]
y_impute_test = df_impute_test[dependent_column]

X_clean_test_encoded = pd.get_dummies(X_clean_test,drop_first=True)
X_impute_test_encoded = pd.get_dummies(X_impute_test,drop_first=True)

bool_cols = X_clean_test_encoded.select_dtypes(include=['bool']).columns
X_clean_test_encoded[bool_cols] = X_clean_test_encoded[bool_cols].astype(int)

bool_cols = X_impute_test_encoded.select_dtypes(include=['bool']).columns
X_impute_test_encoded[bool_cols] = X_impute_test_encoded[bool_cols].astype(int)

In [110]:
X_clean_test_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 402 entries, 0 to 401
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Age                402 non-null    int64  
 1   Income             402 non-null    float64
 2   WaistCirc          402 non-null    float64
 3   BMI                402 non-null    float64
 4   Albuminuria        402 non-null    int64  
 5   UrAlbCr            402 non-null    float64
 6   UricAcid           402 non-null    float64
 7   BloodGlucose       402 non-null    int64  
 8   HDL                402 non-null    int64  
 9   Triglycerides      402 non-null    int64  
 10  Sex_Male           402 non-null    int32  
 11  Marital_Married    402 non-null    int32  
 12  Marital_Separated  402 non-null    int32  
 13  Marital_Single     402 non-null    int32  
 14  Marital_Widowed    402 non-null    int32  
 15  Race_Black         402 non-null    int32  
 16  Race_Hispanic      402 non

In [111]:
X_clean_train_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1607 entries, 0 to 1606
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Age                1607 non-null   int64  
 1   Income             1607 non-null   float64
 2   WaistCirc          1607 non-null   float64
 3   BMI                1607 non-null   float64
 4   Albuminuria        1607 non-null   int64  
 5   UrAlbCr            1607 non-null   float64
 6   UricAcid           1607 non-null   float64
 7   BloodGlucose       1607 non-null   int64  
 8   HDL                1607 non-null   int64  
 9   Triglycerides      1607 non-null   int64  
 10  Sex_Male           1607 non-null   int32  
 11  Marital_Married    1607 non-null   int32  
 12  Marital_Separated  1607 non-null   int32  
 13  Marital_Single     1607 non-null   int32  
 14  Marital_Widowed    1607 non-null   int32  
 15  Race_Black         1607 non-null   int32  
 16  Race_Hispanic      1607 

In [112]:
# get predictions for non-training data
y_pred = model_clean.predict(sm.add_constant(X_clean_test_encoded))
y_pred_class = (y_pred > 0.5).astype(int)

In [113]:
# generate confusion matrix 
conf_matrix = confusion_matrix(y_clean_test, y_pred_class)
df_cm = pd.DataFrame(conf_matrix, index=['Actual Negative', 'Actual Positive'], 
                     columns=['Predicted Negative', 'Predicted Positive'])
df_cm

Unnamed: 0,Predicted Negative,Predicted Positive
Actual Negative,218,37
Actual Positive,39,108


### Evaluate model built from imputed dataset

In [114]:
# evaluate performance of a given model
def model_perf_evaluation(model,X_test,y_test):
    # generate confusion matrix for training dataset
    cm = model.pred_table()
    
    # Extract values from the confusion matrix
    true_negative, false_positive, false_negative, true_positive = cm.flatten()
    accuracy_train = (true_positive + true_negative) / np.sum(cm)
    precision_train = true_positive / (true_positive + false_positive)
    
    X_test_encoded = pd.get_dummies(X_test,drop_first=True)
    bool_cols = X_test_encoded.select_dtypes(include=['bool']).columns
    X_test_encoded[bool_cols] = X_test_encoded[bool_cols].astype(int)
    # get predictions for non-training data
    y_pred = model.predict(sm.add_constant(X_test_encoded))
    y_pred_class = (y_pred > 0.5).astype(int)
    
    # generate confusion matrix for test dataset
    conf_matrix = confusion_matrix(y_test, y_pred_class)
    
    
    # the proportion of total cases correctly classified
    accuracy_test = accuracy_score(y_test, y_pred_class)
    # the proportion of positive identifications that were actually correct
    precision_test = precision_score(y_test, y_pred_class)
    
    data = {'pseudo_r_squared' : model.prsquared,
            'predicted_neg_train' : f'{cm[0,0]}/{cm[0,0]+cm[0,1]}',
            'predicted_pos_train' : f'{cm[1,1]}/{cm[1,1]+cm[1,0]}',
            'accuracy_train' : accuracy_train,
            'precisioin_train': precision_train,
            'predicted_neg_test' : f'{conf_matrix[0,0]}/{conf_matrix[0,0]+conf_matrix[0,1]}',
            'predicted_pos_test' : f'{conf_matrix[1,1]}/{conf_matrix[1,1]+conf_matrix[1,0]}',
            'accuracy_test' : accuracy_test,
            'precision_test':precision_test,
           }
    print(pd.Series(data))

In [115]:
model_perf_evaluation(model_impute, X_impute_test,y_impute_test)

pseudo_r_squared            0.439555
predicted_neg_train    1044.0/1147.0
predicted_pos_train      429.0/605.0
accuracy_train              0.840753
precisioin_train            0.806391
predicted_neg_test           249/284
predicted_pos_test           115/155
accuracy_test               0.829157
precision_test              0.766667
dtype: object


In [116]:
# for comparisons, formatted performance output from model built from clean dataset
model_perf_evaluation(model_clean,X_clean_test,y_clean_test)

pseudo_r_squared           0.473506
predicted_neg_train    946.0/1042.0
predicted_pos_train     419.0/565.0
accuracy_train             0.849409
precisioin_train           0.813592
predicted_neg_test          218/255
predicted_pos_test          108/147
accuracy_test              0.810945
precision_test             0.744828
dtype: object


* Both models perform relatively well for both training and test dataset. 
* The model built from imputed data seem to perform better for non-training dataset. This is most likely due to the fact it was trained with a larger dataset

## Can we improve model performance even further?

* By reducing features to only those with statistically significant coefficients
* By normalzing numeric columns

[* By optionally removing outliers]

In [117]:
# Normalize training numerical values
numeric_cols=df_impute_train.select_dtypes(include=['number']).columns
df_impute_train[numeric_cols] = (df_impute_train[numeric_cols] - df_impute_train[numeric_cols].min()) / (df_impute_train[numeric_cols].max() - df_impute_train[numeric_cols].min())

In [118]:
# Normalize test numerical values
numeric_cols=df_impute_test.select_dtypes(include=['number']).columns
df_impute_test[numeric_cols] = (df_impute_test[numeric_cols] - df_impute_test[numeric_cols].min()) / (df_impute_test[numeric_cols].max() - df_impute_test[numeric_cols].min())


In [119]:
# Extract features
features_columns = ['Age','WaistCirc','UricAcid','BloodGlucose','HDL','Triglycerides','Sex']
#features_columns = df_clean_train.columns[~df_clean_train.columns.isin(['MetabolicSyndrome'])]
dependent_column = 'MetabolicSyndrome'
X_impute_train = df_impute_train[features_columns]
y_impute_train = df_impute_train[dependent_column]
X_impute_test = df_impute_test[features_columns]
y_impute_test = df_impute_test[dependent_column]

In [120]:
X_impute_train_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1752 entries, 0 to 1751
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Age                1752 non-null   int64  
 1   Income             1752 non-null   float64
 2   WaistCirc          1752 non-null   float64
 3   BMI                1752 non-null   float64
 4   Albuminuria        1752 non-null   int64  
 5   UrAlbCr            1752 non-null   float64
 6   UricAcid           1752 non-null   float64
 7   BloodGlucose       1752 non-null   int64  
 8   HDL                1752 non-null   int64  
 9   Triglycerides      1752 non-null   int64  
 10  Sex_Male           1752 non-null   int32  
 11  Marital_Married    1752 non-null   int32  
 12  Marital_Separated  1752 non-null   int32  
 13  Marital_Single     1752 non-null   int32  
 14  Marital_Widowed    1752 non-null   int32  
 15  Race_Black         1752 non-null   int32  
 16  Race_Hispanic      1752 

In [121]:
# Process features
X_impute_train_encoded = pd.get_dummies(X_impute_train,drop_first=True)
bool_cols = X_impute_train_encoded.select_dtypes(include=['bool']).columns
X_impute_train_encoded[bool_cols] = X_impute_train_encoded[bool_cols].astype(int)

In [122]:
# build model
model_impute = sm.Logit(y_impute_train,sm.add_constant(X_impute_train_encoded)).fit()

Optimization terminated successfully.
         Current function value: 0.364796
         Iterations 7


In [124]:
model_impute.summary()

0,1,2,3
Dep. Variable:,MetabolicSyndrome,No. Observations:,1752.0
Model:,Logit,Df Residuals:,1744.0
Method:,MLE,Df Model:,7.0
Date:,"Thu, 02 Nov 2023",Pseudo R-squ.:,0.434
Time:,11:56:39,Log-Likelihood:,-639.12
converged:,True,LL-Null:,-1129.2
Covariance Type:,nonrobust,LLR p-value:,2.415e-207

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-5.9037,0.521,-11.333,0.000,-6.925,-4.883
Age,2.0416,0.270,7.550,0.000,1.512,2.572
WaistCirc,8.0753,0.656,12.308,0.000,6.789,9.361
UricAcid,0.9286,0.523,1.776,0.076,-0.096,1.953
BloodGlucose,7.7980,1.066,7.313,0.000,5.708,9.888
HDL,-6.4689,0.979,-6.605,0.000,-8.388,-4.549
Triglycerides,16.5877,1.591,10.426,0.000,13.469,19.706
Sex_Male,-1.0793,0.166,-6.491,0.000,-1.405,-0.753


In [123]:
model_perf_evaluation(model_impute, X_impute_test,y_impute_test)

pseudo_r_squared            0.433987
predicted_neg_train    1043.0/1147.0
predicted_pos_train      419.0/605.0
accuracy_train              0.834475
precisioin_train            0.801147
predicted_neg_test           255/284
predicted_pos_test           110/155
accuracy_test               0.831435
precision_test              0.791367
dtype: object


We get some marginal boost in performance