In [119]:
#import dependencies
import pandas as pd
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
import numpy as np


from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [120]:
# Read in csv file of carido data
cardio_df = pd.read_csv("./Resources/cardio_train.csv", sep = ";", index_col=0)
cardio_df.head()

Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [121]:
cardio_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 70000 entries, 0 to 99999
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          70000 non-null  int64  
 1   gender       70000 non-null  int64  
 2   height       70000 non-null  int64  
 3   weight       70000 non-null  float64
 4   ap_hi        70000 non-null  int64  
 5   ap_lo        70000 non-null  int64  
 6   cholesterol  70000 non-null  int64  
 7   gluc         70000 non-null  int64  
 8   smoke        70000 non-null  int64  
 9   alco         70000 non-null  int64  
 10  active       70000 non-null  int64  
 11  cardio       70000 non-null  int64  
dtypes: float64(1), int64(11)
memory usage: 6.9 MB


01 - Preprocessing

In [122]:
# convert naming of gluc
cardio_df.loc[cardio_df["gluc"] == 1, "gluc"] = "Glucose Norm"
cardio_df.loc[cardio_df["gluc"] == 2, "gluc"] = "Glucose Above Norm"
cardio_df.loc[cardio_df["gluc"] == 3, "gluc"] = "Glucose Well Above Norm"

# convert naming of cholesterol
cardio_df.loc[cardio_df["cholesterol"] == 1, "cholesterol"] = "Cholesterol Norm"
cardio_df.loc[cardio_df["cholesterol"] == 2, "cholesterol"] = "Cholesterol Above Norm"
cardio_df.loc[cardio_df["cholesterol"] == 3, "cholesterol"] = "Cholesterol Well Above Norm"

cardio_df.head()

Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,18393,2,168,62.0,110,80,Cholesterol Norm,Glucose Norm,0,0,1,0
1,20228,1,156,85.0,140,90,Cholesterol Well Above Norm,Glucose Norm,0,0,1,1
2,18857,1,165,64.0,130,70,Cholesterol Well Above Norm,Glucose Norm,0,0,0,1
3,17623,2,169,82.0,150,100,Cholesterol Norm,Glucose Norm,0,0,1,1
4,17474,1,156,56.0,100,60,Cholesterol Norm,Glucose Norm,0,0,0,0


In [123]:
# Convert categorical n > 2 variable to encoding.
categorical_dummies = pd.get_dummies(cardio_df[["cholesterol", "gluc"]], dtype=int)
categorical_dummies.columns = categorical_dummies.columns.str.replace("cholesterol_", "")
categorical_dummies.columns = categorical_dummies.columns.str.replace("gluc_", "")

cardio_df = pd.concat([cardio_df, categorical_dummies], axis=1)
cardio_df.drop(columns=["cholesterol", "gluc"], inplace=True)

cardio_df.head()

Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,smoke,alco,active,cardio,Cholesterol Above Norm,Cholesterol Norm,Cholesterol Well Above Norm,Glucose Above Norm,Glucose Norm,Glucose Well Above Norm
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,18393,2,168,62.0,110,80,0,0,1,0,0,1,0,0,1,0
1,20228,1,156,85.0,140,90,0,0,1,1,0,0,1,0,1,0
2,18857,1,165,64.0,130,70,0,0,0,1,0,0,1,0,1,0
3,17623,2,169,82.0,150,100,0,0,1,1,0,1,0,0,1,0
4,17474,1,156,56.0,100,60,0,0,0,0,0,1,0,0,1,0


In [124]:
#reorder the columns in dataframe
cardio_df = cardio_df[['age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', 'smoke', 'alco',
       'active', 'Cholesterol Above Norm', 'Cholesterol Norm',
       'Cholesterol Well Above Norm', 'Glucose Above Norm',
       'Glucose Norm', 'Glucose Well Above Norm', 'cardio']]

cardio_df.head()

Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,smoke,alco,active,Cholesterol Above Norm,Cholesterol Norm,Cholesterol Well Above Norm,Glucose Above Norm,Glucose Norm,Glucose Well Above Norm,cardio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,18393,2,168,62.0,110,80,0,0,1,0,1,0,0,1,0,0
1,20228,1,156,85.0,140,90,0,0,1,0,0,1,0,1,0,1
2,18857,1,165,64.0,130,70,0,0,0,0,0,1,0,1,0,1
3,17623,2,169,82.0,150,100,0,0,1,0,1,0,0,1,0,1
4,17474,1,156,56.0,100,60,0,0,0,0,1,0,0,1,0,0


In [125]:
cardio_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 70000 entries, 0 to 99999
Data columns (total 16 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   age                          70000 non-null  int64  
 1   gender                       70000 non-null  int64  
 2   height                       70000 non-null  int64  
 3   weight                       70000 non-null  float64
 4   ap_hi                        70000 non-null  int64  
 5   ap_lo                        70000 non-null  int64  
 6   smoke                        70000 non-null  int64  
 7   alco                         70000 non-null  int64  
 8   active                       70000 non-null  int64  
 9   Cholesterol Above Norm       70000 non-null  int64  
 10  Cholesterol Norm             70000 non-null  int64  
 11  Cholesterol Well Above Norm  70000 non-null  int64  
 12  Glucose Above Norm           70000 non-null  int64  
 13  Glucose Norm         

In [127]:
# split data with train test split
X = cardio_df[cardio_df.columns[:-1]]
y = cardio_df["cardio"]

X_train, X_test, y_train, y_test = train_test_split(X, y)

# Standardize the data
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

02 - ML Models

In [130]:
#logistical regression considering all features

# logistic regression model
clf = LogisticRegression(solver="lbfgs")

# fit the model
clf.fit(X_train_scaled, y_train)
logistic_predictions = clf.predict(X_test_scaled)

# calculate the accuracy score
logistic_accuracy = accuracy_score(y_test, logistic_predictions)
logistic_accuracy


0.7177142857142857

In [131]:
# decision tree considering all features

# create classifier
from sklearn import tree
tree_clf = tree.DecisionTreeClassifier()

# fit data to make prediction
tree_clf.fit(X_train_scaled, y_train)
dt_predictions = tree_clf.predict(X_test_scaled)

print(f"Decision Tree Accuracy = {accuracy_score(y_test, dt_predictions)}")


Decision Tree Accuracy = 0.6358857142857143


In [132]:
# determine feature importance

print(tree_clf.feature_importances_)

[0.31718113 0.01785486 0.13824305 0.16392311 0.23083776 0.04329505
 0.00886655 0.00869281 0.01381987 0.00739219 0.00729196 0.02161274
 0.00666221 0.00838895 0.00593777]


In [133]:
# random forest with all features considered
# create a random forest classifier

from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=200)

# fit the data to the model
rf_clf.fit(X_train_scaled, y_train)

# compute the accuracy
rf_predictions = rf_clf.predict(X_test_scaled)

print(f"Random Forest Accuracy is {accuracy_score(y_test, rf_predictions)}")


Random Forest Accuracy is 0.7157714285714286


In [135]:
# determine importance
print(rf_clf.feature_importances_)

[0.30982245 0.0122051  0.15861254 0.17757644 0.17666656 0.0901385
 0.00726862 0.00635885 0.01167297 0.00464957 0.01747627 0.01511217
 0.00335061 0.00527347 0.00381589]


In [136]:
# Check the data type of y_train
print(y_train.dtype)

int64


In [139]:
cardio_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 70000 entries, 0 to 99999
Data columns (total 16 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   age                          70000 non-null  int64  
 1   gender                       70000 non-null  int64  
 2   height                       70000 non-null  int64  
 3   weight                       70000 non-null  float64
 4   ap_hi                        70000 non-null  int64  
 5   ap_lo                        70000 non-null  int64  
 6   smoke                        70000 non-null  int64  
 7   alco                         70000 non-null  int64  
 8   active                       70000 non-null  int64  
 9   Cholesterol Above Norm       70000 non-null  int64  
 10  Cholesterol Norm             70000 non-null  int64  
 11  Cholesterol Well Above Norm  70000 non-null  int64  
 12  Glucose Above Norm           70000 non-null  int64  
 13  Glucose Norm         

In [143]:
# Check the data type of y_train
y_train.head()

id
17100    0
17777    1
85832    0
36505    1
12547    1
Name: cardio, dtype: int64

In [144]:
# MLP model with all features considered

# MLP Model
# model definition
mlp = tf.keras.models.Sequential()

# adding basic layers
mlp.add(tf.keras.layers.Dense(units = 6, activation = "relu", input_dim = 15))
mlp.add(tf.keras.layers.Dense(units = 6, activation = "relu"))
mlp.add(tf.keras.layers.Dense(units = 1, activation = "sigmoid"))

# compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.01)
mlp.compile(loss = tf.keras.losses.BinaryCrossentropy, optimizer = optimizer, metrics = ["accuracy"])

# fit the mode
mlp.fit(X_train_scaled,y_train, epochs = 50)

Epoch 1/50


TypeError: in user code:

    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1384, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1373, in run_step  **
        outputs = model.train_step(data)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1151, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1209, in compute_loss
        return self.compiled_loss(
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/compile_utils.py", line 277, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/losses.py", line 161, in __call__
        return losses_utils.compute_weighted_loss(
    File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/losses_utils.py", line 328, in compute_weighted_loss
        losses = tf.convert_to_tensor(losses)

    TypeError: Failed to convert elements of <keras.src.losses.BinaryCrossentropy object at 0x78838b6236a0> to Tensor. Consider casting elements to a supported type. See https://www.tensorflow.org/api_docs/python/tf/dtypes for supported TF dtypes.


In [17]:
# evaluate the model
print(f"Accuracy: {mlp.evaluate(X_test_scaled, y_test)}")

TypeError: in user code:

    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 2066, in test_function  *
        return step_function(self, iterator)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 2049, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 2037, in run_step  **
        outputs = model.test_step(data)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1919, in test_step
        self.compute_loss(x, y, y_pred, sample_weight)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 1209, in compute_loss
        return self.compiled_loss(
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/compile_utils.py", line 277, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/losses.py", line 161, in __call__
        return losses_utils.compute_weighted_loss(
    File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/losses_utils.py", line 328, in compute_weighted_loss
        losses = tf.convert_to_tensor(losses)

    TypeError: Failed to convert elements of <keras.src.losses.BinaryCrossentropy object at 0x7883aff717e0> to Tensor. Consider casting elements to a supported type. See https://www.tensorflow.org/api_docs/python/tf/dtypes for supported TF dtypes.


In [148]:
# Now will try model with specific and more relevant features

# split the data with train test split
X = cardio_df.iloc[:, 0:7]
y = cardio_df["cardio"]

X_train, X_test, y_train, y_test = train_test_split(X, y)

# Standardize the data
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [149]:
#logistic regression

logistic_clf = LogisticRegression(solver="lbfgs")

logistic_clf.fit(X_train_scaled, y_train)
logistic_predictions = logistic_clf.predict(X_test_scaled)

print(f"The accuracy of the logistic regression is: {accuracy_score(y_test, logistic_predictions)}")

The accuracy of the logistic regression is: 0.7070857142857143


In [150]:
# decision tree with specific features

dt_clf = tree.DecisionTreeClassifier()
dt_clf.fit(X_train_scaled, y_train)
dt_predictions = dt_clf.predict(X_test_scaled)

print(f"The accuracy of the decision tree is: {accuracy_score(y_test, dt_predictions)}")

The accuracy of the decision tree is: 0.6207428571428572


In [151]:
# random forest classifier
# create  random forest classifier
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=200)

# fit the data to the model
rf_clf.fit(X_train_scaled, y_train)

# compute the accuracy
rf_predictions = rf_clf.predict(X_test_scaled)

print(f"Random Forest Accuracy is: {accuracy_score(y_test, rf_predictions)}")

Random Forest Accuracy is: 0.6974285714285714


In [22]:
#MLP model with specific features

# MLP Model
# model definition
mlp = tf.keras.models.Sequential()

# adding basic layers and drop out
mlp.add(tf.keras.layers.Dense(units = 10, activation = "relu", input_dim = 6))
mlp.add(tf.keras.layers.Dropout(0.5))
mlp.add(tf.keras.layers.Dense(units = 10, activation = tf.keras.layers.LeakyReLU(negative_slope=0.3)))
mlp.add(tf.keras.layers.Dropout(0.5))
mlp.add(tf.keras.layers.Dense(units = 10, activation = tf.keras.layers.LeakyReLU(negative_slope=0.3)))
mlp.add(tf.keras.layers.Dropout(0.5))
mlp.add(tf.keras.layers.Dense(units = 1, activation = "sigmoid"))

# compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001)
mlp.compile(loss = tf.keras.losses.BinaryCrossentropy, optimizer = optimizer, metrics = ["accuracy"])

# fit the mode
mlp.fit(X_train_scaled, y_train, epochs = 50)

TypeError: ('Keyword argument not understood:', 'negative_slope')

In [23]:
# evaluate the model
print(f"Accuracy: {mlp.evaluate(X_test_scaled, y_test)}")

RuntimeError: You must compile your model before training/testing. Use `model.compile(optimizer, loss)`.

In [24]:
# confusion matrix
from sklearn.metrics import confusion_matrix, classification_report

predictions = mlp.predict(X_test_scaled)
y_pred = (predictions > 0.5).astype(int)

print(classification_report(y_test, y_pred))



ValueError: Classification metrics can't handle a mix of binary and multilabel-indicator targets