In [1]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv("./Resources/cardio_train.csv", sep = ";", index_col=0)
df.head()

Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


<h3>1. Preprocessing</h3>

In [3]:
# convert naming of gluc
df.loc[df["gluc"] == 1, "gluc"] = "Glucose Normal"
df.loc[df["gluc"] == 2, "gluc"] = "Glucose Above Normal"
df.loc[df["gluc"] == 3, "gluc"] = "Glucose Well Above Normal"

# convert naming of cholesterol
df.loc[df["cholesterol"] == 1, "cholesterol"] = "Cholesterol Normal"
df.loc[df["cholesterol"] == 2, "cholesterol"] = "Cholesterol Above Normal"
df.loc[df["cholesterol"] == 3, "cholesterol"] = "Cholesterol Well Above Normal"

In [4]:
# Convert categorical n > 2 variable to one hot encoding.
categorical_dummies = pd.get_dummies(df[["cholesterol", "gluc"]], dtype=int)
categorical_dummies.columns = categorical_dummies.columns.str.replace("cholesterol_", "")
categorical_dummies.columns = categorical_dummies.columns.str.replace("gluc_", "")

df = pd.concat([df, categorical_dummies], axis=1)
df.drop(columns=["cholesterol", "gluc"], inplace=True)

df.head()


Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,smoke,alco,active,cardio,Cholesterol Above Normal,Cholesterol Normal,Cholesterol Well Above Normal,Glucose Above Normal,Glucose Normal,Glucose Well Above Normal
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,18393,2,168,62.0,110,80,0,0,1,0,0,1,0,0,1,0
1,20228,1,156,85.0,140,90,0,0,1,1,0,0,1,0,1,0
2,18857,1,165,64.0,130,70,0,0,0,1,0,0,1,0,1,0
3,17623,2,169,82.0,150,100,0,0,1,1,0,1,0,0,1,0
4,17474,1,156,56.0,100,60,0,0,0,0,0,1,0,0,1,0


In [5]:
#reorder the columns
df = df[['age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', 'smoke', 'alco',
       'active', 'Cholesterol Above Normal', 'Cholesterol Normal',
       'Cholesterol Well Above Normal', 'Glucose Above Normal',
       'Glucose Normal', 'Glucose Well Above Normal', 'cardio']]

df.head()

Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,smoke,alco,active,Cholesterol Above Normal,Cholesterol Normal,Cholesterol Well Above Normal,Glucose Above Normal,Glucose Normal,Glucose Well Above Normal,cardio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0,18393,2,168,62.0,110,80,0,0,1,0,1,0,0,1,0,0
1,20228,1,156,85.0,140,90,0,0,1,0,0,1,0,1,0,1
2,18857,1,165,64.0,130,70,0,0,0,0,0,1,0,1,0,1
3,17623,2,169,82.0,150,100,0,0,1,0,1,0,0,1,0,1
4,17474,1,156,56.0,100,60,0,0,0,0,1,0,0,1,0,0


In [6]:
# split the data with train test split
X = df[df.columns[:-1]]
y = df["cardio"]

X_train, X_test, y_train, y_test = train_test_split(X, y)

# Standardize the data
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

<h3>2. ML Models</h3>

<h4>a. Logistic Regression</h4>

In [9]:
# try logistic regression model 
# define the model
clf = LogisticRegression(solver="lbfgs")

# fit the model
clf.fit(X_train_scaled, y_train)
logistic_predictions = clf.predict(X_test_scaled)

# calculate the accuracy score
logistic_accuracy = accuracy_score(y_test, logistic_predictions)
logistic_accuracy


0.7188571428571429

<h4> b. Decision Tree </h4>

In [21]:
# create a decision feree classifier
from sklearn import tree
tree_clf = tree.DecisionTreeClassifier()

# fit the data and make prediction
tree_clf.fit(X_train_scaled, y_train)
dt_predictions = tree_clf.predict(X_test_scaled)

print(f"Decision Tree Accuracy is {accuracy_score(y_test, dt_predictions)}")

Accuracy is 0.6376


<h4> c. Random Forest </h4>

In [23]:
# create a random forest classifier
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_estimators=200)

# fit the data to the model
rf_clf.fit(X_train_scaled, y_train)

# compute the accuracy
rf_predictions = rf_clf.predict(X_test_scaled)

print(f"Random Forest Accuracy is {accuracy_score(y_test, rf_predictions)}")


Random Forest Accuracy is 0.7171428571428572


<h4> d. Feed Forward Network </h4>

In [16]:
# MLP Model
# model definition
mlp = tf.keras.models.Sequential()

# adding basic layers
mlp.add(tf.keras.layers.Dense(units = 10, activation = "relu", input_dim = 15))
mlp.add(tf.keras.layers.Dense(units = 10, activation = "relu"))
mlp.add(tf.keras.layers.Dense(units = 10, activation = "relu"))
mlp.add(tf.keras.layers.Dense(units = 1, activation = "sigmoid"))

# compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate = 0.01)
mlp.compile(loss = tf.keras.losses.BinaryCrossentropy, optimizer = optimizer, metrics = ["accuracy"])

# fit the mode
mlp.fit(X_train_scaled, y_train, epochs = 30)


Epoch 1/30


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1641/1641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 761us/step - accuracy: 0.6783 - loss: 0.6094
Epoch 2/30
[1m1641/1641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 752us/step - accuracy: 0.7279 - loss: 0.5586
Epoch 3/30
[1m1641/1641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 734us/step - accuracy: 0.7301 - loss: 0.5546
Epoch 4/30
[1m1641/1641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 784us/step - accuracy: 0.7324 - loss: 0.5454
Epoch 5/30
[1m1641/1641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 698us/step - accuracy: 0.7294 - loss: 0.5508
Epoch 6/30
[1m1641/1641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 750us/step - accuracy: 0.7319 - loss: 0.5474
Epoch 7/30
[1m1641/1641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 727us/step - accuracy: 0.7348 - loss: 0.5445
Epoch 8/30
[1m1641/1641[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 714us/step - accuracy: 0.7307 - loss: 0.5462
Epoch 9/30
[1m1641

In [17]:
# evaluate the model, very bad, overfitted.
print(f"Accuracy: {mlp.evaluate(X_test, y_test)}")

[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 542us/step - accuracy: 0.4997 - loss: 9698.2363
Accuracy: [9684.9375, 0.5004000067710876]
