In [12]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [13]:
df = pd.read_csv("./Resources/cardio_train.csv", sep = ";", index_col=0)
df.head()

Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [14]:
# get small data set
df_sample = df.iloc[0:10, :]
df_sample.to_csv("./Resources/cardio_train_sample.csv")

<h3>1. Preprocessing</h3>

In [15]:
# BMI
height_meter = df["height"]/100
df["BMI"] = df["weight"]/height_meter/height_meter

# convert naming of gluc
df.loc[df["gluc"] == 1, "gluc"] = "Glucose Normal"
df.loc[df["gluc"] == 2, "gluc"] = "Glucose Above Normal"
df.loc[df["gluc"] == 3, "gluc"] = "Glucose Well Above Normal"

# convert naming of cholesterol
df.loc[df["cholesterol"] == 1, "cholesterol"] = "Cholesterol Normal"
df.loc[df["cholesterol"] == 2, "cholesterol"] = "Cholesterol Above Normal"
df.loc[df["cholesterol"] == 3, "cholesterol"] = "Cholesterol Well Above Normal"


In [16]:
df.head()

Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,BMI
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,18393,2,168,62.0,110,80,Cholesterol Normal,Glucose Normal,0,0,1,0,21.96712
1,20228,1,156,85.0,140,90,Cholesterol Well Above Normal,Glucose Normal,0,0,1,1,34.927679
2,18857,1,165,64.0,130,70,Cholesterol Well Above Normal,Glucose Normal,0,0,0,1,23.507805
3,17623,2,169,82.0,150,100,Cholesterol Normal,Glucose Normal,0,0,1,1,28.710479
4,17474,1,156,56.0,100,60,Cholesterol Normal,Glucose Normal,0,0,0,0,23.011177


In [18]:
# Convert categorical n > 2 variable to one hot encoding.
categorical_dummies = pd.get_dummies(df[["cholesterol", "gluc"]], dtype=int)
categorical_dummies.columns = categorical_dummies.columns.str.replace("cholesterol_", "")
categorical_dummies.columns = categorical_dummies.columns.str.replace("gluc_", "")

df = pd.concat([df, categorical_dummies], axis=1)
df.drop(columns=["cholesterol", "gluc"], inplace=True)

df.head()


Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,smoke,alco,active,cardio,BMI,Cholesterol Above Normal,Cholesterol Normal,Cholesterol Well Above Normal,Glucose Above Normal,Glucose Normal,Glucose Well Above Normal
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,18393,2,168,62.0,110,80,0,0,1,0,21.96712,0,1,0,0,1,0
1,20228,1,156,85.0,140,90,0,0,1,1,34.927679,0,0,1,0,1,0
2,18857,1,165,64.0,130,70,0,0,0,1,23.507805,0,0,1,0,1,0
3,17623,2,169,82.0,150,100,0,0,1,1,28.710479,0,1,0,0,1,0
4,17474,1,156,56.0,100,60,0,0,0,0,23.011177,0,1,0,0,1,0


In [20]:
#reorder the columns
df = df[['age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', 'smoke', 'alco',
       'active', 'Cholesterol Above Normal', 'Cholesterol Normal',
       'Cholesterol Well Above Normal', 'Glucose Above Normal',
       'Glucose Normal', 'Glucose Well Above Normal', "BMI", 'cardio']]

df.head()

Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,smoke,alco,active,Cholesterol Above Normal,Cholesterol Normal,Cholesterol Well Above Normal,Glucose Above Normal,Glucose Normal,Glucose Well Above Normal,BMI,cardio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,18393,2,168,62.0,110,80,0,0,1,0,1,0,0,1,0,21.96712,0
1,20228,1,156,85.0,140,90,0,0,1,0,0,1,0,1,0,34.927679,1
2,18857,1,165,64.0,130,70,0,0,0,0,0,1,0,1,0,23.507805,1
3,17623,2,169,82.0,150,100,0,0,1,0,1,0,0,1,0,28.710479,1
4,17474,1,156,56.0,100,60,0,0,0,0,1,0,0,1,0,23.011177,0


In [21]:
# split the data with train test split
X = df[df.columns[:-1]]
y = df["cardio"]

X_train, X_test, y_train, y_test = train_test_split(X, y)

# Standardize the data
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

<h3>2. ML Models Initial Attempt</h3>

<h4>a. Logistic Regression with All Features</h4>

In [22]:
# try logistic regression model 
# define the model
clf = LogisticRegression(solver="lbfgs")

# fit the model
clf.fit(X_train_scaled, y_train)
logistic_predictions = clf.predict(X_test_scaled)

# calculate the accuracy score
logistic_accuracy = accuracy_score(y_test, logistic_predictions)
logistic_accuracy


0.7199428571428571

<h4> b. Decision Tree with All Features </h4>

In [23]:
# create a decision feree classifier
from sklearn import tree
tree_clf = tree.DecisionTreeClassifier()

# fit the data and make prediction
tree_clf.fit(X_train_scaled, y_train)
dt_predictions = tree_clf.predict(X_test_scaled)

print(f"Decision Tree Accuracy is {accuracy_score(y_test, dt_predictions)}")

Decision Tree Accuracy is 0.6396571428571428


In [36]:
# see feature importance
print(tree_clf.feature_importances_)

[0.32441875 0.01667178 0.14468933 0.15804474 0.23184334 0.0424772
 0.0074589  0.00666206 0.0135507  0.00644428 0.00611565 0.02246513
 0.00724082 0.00634548 0.00557186]


<h4> c. Random Forest with All Features </h4>

In [24]:
# create a random forest classifier
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=200)

# fit the data to the model
rf_clf.fit(X_train_scaled, y_train)

# compute the accuracy
rf_predictions = rf_clf.predict(X_test_scaled)

print(f"Random Forest Accuracy is {accuracy_score(y_test, rf_predictions)}")


Random Forest Accuracy is 0.7139428571428571


In [38]:
# see feature importance
print(rf_clf.feature_importances_)

[0.31060896 0.01211528 0.15860784 0.17707693 0.17644332 0.09102195
 0.00739419 0.00651087 0.0113216  0.00469818 0.01542913 0.01637769
 0.00347095 0.00518745 0.00373567]


<h4> d. Feed Forward Network with All Features</h4>

In [28]:
# MLP Model
# model definition
mlp = tf.keras.models.Sequential()

# adding basic layers
mlp.add(tf.keras.layers.Dense(units = 10, activation = "relu", input_dim = 16))
mlp.add(tf.keras.layers.Dense(units = 10, activation = "relu"))
mlp.add(tf.keras.layers.Dense(units = 10, activation = "relu"))
mlp.add(tf.keras.layers.Dense(units = 1, activation = "sigmoid"))

# compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.01)
mlp.compile(loss = tf.keras.losses.BinaryCrossentropy, optimizer = optimizer, metrics = ["accuracy"])

# fit the mode
mlp.fit(X_train_scaled, y_train, epochs = 30)


Epoch 1/30
[1m1641/1641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 729us/step - accuracy: 0.6820 - loss: 0.5993
Epoch 2/30
[1m1641/1641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 703us/step - accuracy: 0.7250 - loss: 0.5543
Epoch 3/30
[1m1641/1641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 751us/step - accuracy: 0.7287 - loss: 0.5504
Epoch 4/30
[1m1641/1641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 701us/step - accuracy: 0.7292 - loss: 0.5503
Epoch 5/30
[1m1641/1641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 700us/step - accuracy: 0.7324 - loss: 0.5451
Epoch 6/30
[1m1641/1641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 728us/step - accuracy: 0.7305 - loss: 0.5473
Epoch 7/30
[1m1641/1641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 762us/step - accuracy: 0.7339 - loss: 0.5450
Epoch 8/30
[1m1641/1641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 693us/step - accuracy: 0.7340 - loss: 0.5448
Epoch 9/

<keras.src.callbacks.history.History at 0x26290d82440>

In [33]:
# evaluate the model, very bad, overfitted.
print(f"Accuracy: {mlp.evaluate(X_test_scaled, y_test)}")

[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 592us/step - accuracy: 0.7135 - loss: 0.5776
Accuracy: [0.5807191729545593, 0.7105714082717896]


<h3>3. ML Models With More Relevant Features</h3>

In [30]:
# split the data with train test split
X = df.iloc[:, 0:6]
y = df["cardio"]

X_train, X_test, y_train, y_test = train_test_split(X, y)

# Standardize the data
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

<h4>a. Logistic Regression</h4>

In [34]:
logistic_clf = LogisticRegression(solver="lbfgs")

logistic_clf.fit(X_train_scaled, y_train)
logistic_predictions = logistic_clf.predict(X_test_scaled)

print(f"The accuracy of this logistic regression is {accuracy_score(y_test, logistic_predictions)}")

The accuracy of this logistic regression is 0.7080571428571428


<h4> b. Decision Tree </h4>

In [43]:
dt_clf = tree.DecisionTreeClassifier()
dt_clf.fit(X_train_scaled, y_train)
dt_predictions = dt_clf.predict(X_test_scaled)

print(f"The accuracy of this decision tree is {accuracy_score(y_test, dt_predictions)}")

The accuracy of this decision tree is 0.6172


<h4>c. Random Forest Classifier</h4>

In [44]:
# create a random forest classifier
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=200)

# fit the data to the model
rf_clf.fit(X_train_scaled, y_train)

# compute the accuracy
rf_predictions = rf_clf.predict(X_test_scaled)

print(f"Random Forest Accuracy is {accuracy_score(y_test, rf_predictions)}")

Random Forest Accuracy is 0.688


<h4> d. Feed Forward Network </h4>

In [32]:
# MLP Model
# model definition
mlp = tf.keras.models.Sequential()

# adding basic layers and drop out
mlp.add(tf.keras.layers.Dense(units = 10, activation = "relu", input_dim = 6))
mlp.add(tf.keras.layers.Dropout(0.5))
mlp.add(tf.keras.layers.Dense(units = 10, activation = tf.keras.layers.LeakyReLU(negative_slope=0.3)))
mlp.add(tf.keras.layers.Dropout(0.5))
mlp.add(tf.keras.layers.Dense(units = 10, activation = tf.keras.layers.LeakyReLU(negative_slope=0.3)))
mlp.add(tf.keras.layers.Dropout(0.5))
mlp.add(tf.keras.layers.Dense(units = 1, activation = "sigmoid"))

# compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.01)
mlp.compile(loss = tf.keras.losses.BinaryCrossentropy, optimizer = optimizer, metrics = ["accuracy"])

# fit the mode
mlp.fit(X_train_scaled, y_train, epochs = 30)

Epoch 1/30
[1m1641/1641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 800us/step - accuracy: 0.6148 - loss: 0.6641
Epoch 2/30
[1m1641/1641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 793us/step - accuracy: 0.6811 - loss: 0.6654
Epoch 3/30
[1m1641/1641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 841us/step - accuracy: 0.6888 - loss: 0.6219
Epoch 4/30
[1m1641/1641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 746us/step - accuracy: 0.6940 - loss: 0.6190
Epoch 5/30
[1m1641/1641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 740us/step - accuracy: 0.6761 - loss: 0.6433
Epoch 6/30
[1m1641/1641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 743us/step - accuracy: 0.6935 - loss: 0.6179
Epoch 7/30
[1m1641/1641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 756us/step - accuracy: 0.6891 - loss: 0.6192
Epoch 8/30
[1m1641/1641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 778us/step - accuracy: 0.6911 - loss: 0.6201
Epoch 9/

<keras.src.callbacks.history.History at 0x262b85ef0d0>

In [14]:
# evaluate the model, very bad, overfitted.
print(f"Accuracy: {mlp.evaluate(X_test_scaled, y_test)}")

[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 574us/step - accuracy: 0.7155 - loss: 0.5748
Accuracy: [0.5737436413764954, 0.7157714366912842]


In [22]:
# confusion matrix
from sklearn.metrics import confusion_matrix, classification_report

predictions = mlp.predict(X_test_scaled)
y_pred = (predictions > 0.5).astype(int)

print(classification_report(y_test, y_pred))

[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 560us/step
              precision    recall  f1-score   support

           0       0.70      0.76      0.73      8782
           1       0.74      0.67      0.70      8718

    accuracy                           0.72     17500
   macro avg       0.72      0.72      0.72     17500
weighted avg       0.72      0.72      0.72     17500



In [23]:
confusion_matrix(y_test, y_pred)

array([[6689, 2093],
       [2881, 5837]], dtype=int64)

: 

In [38]:
# hyperparameter tuning

%pip install keras_tuner
from keras_tuner import HyperModel, Hyperband
import tensorflow as tf

class MyHyperModel(HyperModel):
    def __init__(self, input_shape):
        self.input_shape = input_shape

    def build(self, hp):
        model = tf.keras.models.Sequential()

        # Hyperparameter for the number of units in the first layer
        model.add(tf.keras.layers.Dense(
            units=hp.Int('units', min_value=32, max_value=512, step=32),
            activation='relu',
            input_shape=self.input_shape
        ))

        num_layers = hp.Int('num_layers', min_value=1, max_value=5)
        # Tune the number of hidden layers
        for i in range(1, num_layers):
            model.add(tf.keras.layers.Dropout(0.5))
            model.add(tf.keras.layers.Dense(
                units=hp.Int(f'units_{i}', min_value=32, max_value=512, step=32),
                activation=hp.Choice(f'activation_{i}', values=['relu', 'sigmoid', 'tanh'])
            ))

        model.add(tf.keras.layers.Dropout(0.5))
        model.add(tf.keras.layers.Dense(units = 1, activation = "sigmoid"))
        lr_choice = hp.Choice("lr", values=['0.01', '0.001', '0.0001'])

        optimizer = tf.keras.optimizers.Adam(learning_rate=float(lr_choice))

        model.compile(optimizer='adam', loss = tf.keras.losses.BinaryCrossentropy, metrics=['accuracy'])
        return model

hypermodel = MyHyperModel(input_shape=(6,))
tuner = Hyperband(
    hypermodel,
    max_epochs=50,
    objective='val_accuracy',
    hyperband_iterations=2,
    tuner_id='my_tuner_name'
)

tuner.search(X_train_scaled, y_train, epochs=30, validation_data=(X_test_scaled, y_test))

Trial 2 Complete [00h 00m 12s]
val_accuracy: 0.7226285934448242

Best val_accuracy So Far: 0.7226285934448242
Total elapsed time: 00h 00m 14s

Search: Running Trial #3

Value             |Best Value So Far |Hyperparameter
320               |416               |units
5                 |4                 |num_layers
0.01              |0.001             |lr
32                |448               |units_1
sigmoid           |sigmoid           |activation_1
352               |160               |units_2
relu              |sigmoid           |activation_2
512               |448               |units_3
tanh              |sigmoid           |activation_3
2                 |2                 |tuner/epochs
0                 |0                 |tuner/initial_epoch
3                 |3                 |tuner/bracket
0                 |0                 |tuner/round

Epoch 1/2
[1m1352/1641[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 2ms/step - accuracy: 0.5500 - loss: 0.2437

KeyboardInterrupt: 

In [4]:
!tensorflowjs_converter --input_format=tf_saved_model saved_model/my_model/ tfjs_model

2024-06-07 13:24:16.406623: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-06-07 13:24:20.499262: I tensorflow/core/grappler/devices.cc:75] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0 (Note: TensorFlow was not compiled with CUDA or ROCm support)
