In [107]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
import pickle
from sklearn.compose import ColumnTransformer

In [108]:
## load the dataset
df = pd.read_csv("heart_2020_cleaned.csv")
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [109]:
#figure out all the values in the categorical columns
df['GenHealth'].unique()
df['AgeCategory'].unique()
df['Diabetic'].unique()
df['Race'].unique()
df['Sex'].unique()
df['PhysicalHealth'].unique()

array([ 3.,  0., 20., 28.,  6., 15.,  5., 30.,  7.,  1.,  2., 21.,  4.,
       10., 14., 18.,  8., 25., 16., 29., 27., 17., 24., 12., 23., 26.,
       22., 19.,  9., 13., 11.])

In [110]:
df['Diabetic'] = df['Diabetic'].replace({
    'No': 'No',
    'Yes': 'Yes',
    'No, borderline diabetes': 'Borderline',
    'Yes (during pregnancy)': 'DuringPregnancy'
})
df

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.60,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,Yes,27.41,Yes,No,No,7.0,0.0,Yes,Male,60-64,Hispanic,Yes,No,Fair,6.0,Yes,No,No
319791,No,29.84,Yes,No,No,0.0,0.0,No,Male,35-39,Hispanic,No,Yes,Very good,5.0,Yes,No,No
319792,No,24.24,No,No,No,0.0,0.0,No,Female,45-49,Hispanic,No,Yes,Good,6.0,No,No,No
319793,No,32.81,No,No,No,0.0,0.0,No,Female,25-29,Hispanic,No,No,Good,12.0,No,No,No


In [111]:
#clean the binary columns
binary_cols = ["HeartDisease", "Smoking", "AlcoholDrinking", "Stroke", 
               "DiffWalking", "Asthma", "KidneyDisease", "SkinCancer"]
df[binary_cols] = df[binary_cols].apply(lambda x: x.map({"Yes": 1, "No": 0}))

df


Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,16.60,1,0,0,3.0,30.0,0,Female,55-59,White,Yes,Yes,Very good,5.0,1,0,1
1,0,20.34,0,0,1,0.0,0.0,0,Female,80 or older,White,No,Yes,Very good,7.0,0,0,0
2,0,26.58,1,0,0,20.0,30.0,0,Male,65-69,White,Yes,Yes,Fair,8.0,1,0,0
3,0,24.21,0,0,0,0.0,0.0,0,Female,75-79,White,No,No,Good,6.0,0,0,1
4,0,23.71,0,0,0,28.0,0.0,1,Female,40-44,White,No,Yes,Very good,8.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,1,27.41,1,0,0,7.0,0.0,1,Male,60-64,Hispanic,Yes,No,Fair,6.0,1,0,0
319791,0,29.84,1,0,0,0.0,0.0,0,Male,35-39,Hispanic,No,Yes,Very good,5.0,1,0,0
319792,0,24.24,0,0,0,0.0,0.0,0,Female,45-49,Hispanic,No,Yes,Good,6.0,0,0,0
319793,0,32.81,0,0,0,0.0,0.0,0,Female,25-29,Hispanic,No,No,Good,12.0,0,0,0


In [112]:
#one hot encoding the nomial columns
nominal_cols = ['Sex', 'Race', 'Diabetic']

In [113]:
#ordinal columns cat

age_order = ['18-24','25-29','30-34','35-39','40-44','45-49','50-54','55-59',
             '60-64','65-69','70-74','75-79','80 or older']
gen_health_order = ['Poor', 'Fair', 'Good', 'Very good', 'Excellent']

ordinal_cols = ['AgeCategory', 'GenHealth']

In [114]:
num_cols = ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']

#pipelihne time
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('nom', OneHotEncoder(), nominal_cols),
        ('ord', OrdinalEncoder(categories=[age_order, gen_health_order]), ordinal_cols)
    ],
)
X_processed = preprocessor.fit_transform(df.drop(columns=['HeartDisease']))
y = df['HeartDisease']

In [115]:
with open('preprocessor.pkl', 'wb') as file:
    pickle.dump(preprocessor, file)


In [116]:

# #just check the results
# numeric_features = preprocessor.named_transformers_['num'].get_feature_names_out(num_cols)
# onehot_features = preprocessor.named_transformers_['nom'].get_feature_names_out(nominal_cols)
# ordinal_features = preprocessor.named_transformers_['ord'].get_feature_names_out(ordinal_cols)

# # Combine all feature names
# all_features = list(numeric_features) + list(onehot_features) + list(ordinal_features)

# # Convert to DataFrame (handle sparse/dense cases)
# X_encoded_df = pd.DataFrame(
#     X_processed.toarray() if hasattr(X_processed, "toarray") else X_processed,
#     columns=all_features
# )

# # Save to CSV
# X_encoded_df.to_csv("encoded_health_data.csv", index=False)
# print(X_encoded_df.head())

In [117]:
X_train, X_test, y_train, y_test = train_test_split(X_processed, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [118]:
X_train 

array([[-0.78598116, -0.42351474, -0.48991929, ..., -0.38279496,
         1.2592872 ,  0.38802615],
       [-0.13694972,  3.3547987 , -0.48991929, ..., -0.38279496,
         0.13690441, -0.5701702 ],
       [ 0.68494724, -0.17162718, -0.23855735, ..., -0.38279496,
        -0.14369128,  0.38802615],
       ...,
       [ 0.44922154,  1.08781064,  0.26416653, ..., -0.38279496,
         0.6980958 ,  0.38802615],
       [-0.95727517,  0.45809173,  0.13848556, ..., -0.38279496,
        -1.82726546,  1.34622251],
       [ 1.26640398, -0.42351474, -0.48991929, ..., -0.38279496,
        -1.26607407, -0.5701702 ]])

In [119]:
with open('scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

In [120]:
### ANN
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping,TensorBoard
import datetime

In [121]:
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
]
)

In [122]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_4 (Dense)             (None, 64)                1216      
                                                                 
 dense_5 (Dense)             (None, 32)                2080      
                                                                 
 dense_6 (Dense)             (None, 16)                528       
                                                                 
 dense_7 (Dense)             (None, 1)                 17        
                                                                 
Total params: 3841 (15.00 KB)
Trainable params: 3841 (15.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [123]:
import tensorflow
opt=tensorflow.keras.optimizers.Adam(learning_rate=0.001)
loss=tensorflow.keras.losses.BinaryCrossentropy()
loss

<keras.src.losses.BinaryCrossentropy at 0x21c14a2b710>

In [124]:
model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['accuracy'])

In [125]:
#set up the TensrorBoard callback
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorflow_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

In [126]:
# set up early stopping
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

In [127]:
## train model
history = model.fit(
    X_train, y_train,
    validation_data=(X_test, y_test),
    epochs=100,
    callbacks=[early_stopping, tensorflow_callback]
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100


In [128]:
model.save('heart_model.h5')

  saving_api.save_model(


In [129]:
## load tensor board extension
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [130]:
%tensorboard --logdir logs/fit --host localhost


Reusing TensorBoard on port 6007 (pid 30656), started 1:35:42 ago. (Use '!kill 30656' to kill it.)

In [131]:
### load the pickle files
