In [23]:
#Heart Data
# Import the required libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import tensorflow as tf

In [24]:
#Basic setup
sns.set_theme(palette = "mako")

data = pd.read_csv('heart.csv')  # read csv file
data = data.drop(columns = ['row.names'], axis = 1)
df = data.copy()
data.head()

Unnamed: 0,sbp,tobacco,ldl,adiposity,famhist,typea,obesity,alcohol,age,chd
0,160,12.0,5.73,23.11,Present,49,25.3,97.2,52,1
1,144,0.01,4.41,28.61,Absent,55,28.87,2.06,63,1
2,118,0.08,3.48,32.28,Present,52,29.14,3.81,46,0
3,170,7.5,6.41,38.03,Present,51,31.99,24.26,58,1
4,134,13.6,3.5,27.78,Present,60,25.99,57.34,49,1


In [25]:
#Organizing categorical data
#Citation is from Joweria's heartdisease tutorial notebook
from pandas.core.arrays import categorical 
categorical_data = ["famhist"]

codes = {}  # Create a dictionary to store the codes of the conversion
for i in df:  # For each variable in the data frame 
  if i in categorical_data:  # For each variable that was classified as categorical
    specific_codes = {}  # Create a dictionary for that variable's code conversion 
    df[i] = df[i].astype("category")  # Convert that variable from 'object' datatype to category
    df[i] = df[i].cat.codes  # Create codes for the categories and replace in the dataset 
    coded = df[i].unique()  # Get the unique values (the codes) from the variable
    count = 0  # Count initialized. Later used for accessing specific indexes
    for j in data[i].unique():  # For each unique value
      specific_codes[j] = coded[count]  # Create an item in the dictionary with the value, and tell it which code correlates
      count += 1  # Count incremented
    codes[i] = specific_codes  # Create a new item in the overall dictionary with the dictionary created for this specific variable's codes


# View our created dictionary:
for i in codes:
  print("{} -".format(i))
  for j in codes[i]:
    print("{}: {}".format(j, codes[i][j]))
  print()

famhist -
Present: 1
Absent: 0



In [26]:
df.head() #checking that data is no longer text

Unnamed: 0,sbp,tobacco,ldl,adiposity,famhist,typea,obesity,alcohol,age,chd
0,160,12.0,5.73,23.11,1,49,25.3,97.2,52,1
1,144,0.01,4.41,28.61,0,55,28.87,2.06,63,1
2,118,0.08,3.48,32.28,1,52,29.14,3.81,46,0
3,170,7.5,6.41,38.03,1,51,31.99,24.26,58,1
4,134,13.6,3.5,27.78,1,60,25.99,57.34,49,1


In [27]:
df.isnull().sum()

sbp          0
tobacco      0
ldl          0
adiposity    0
famhist      0
typea        0
obesity      0
alcohol      0
age          0
chd          0
dtype: int64

In [92]:
from tensorflow.python.ops.gen_array_ops import prevent_gradient
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
fs_data = df.copy() # copy of dataset for feature selection
x = fs_data.iloc[:,[0,1,2,3,4,5,6,7,8]]  # independent variables
y = fs_data.iloc[:,[9]]    # target column / what we're trying to predict


#https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
#Found this easy way to split training data in the scikit documentation
#Shuffles data, and allocated 80% of it to be used for training, and 20% of it to be used for testing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2) # splitting up dataset to training and testing
training = pd.concat([x_train,y_train], axis=1)
training.to_csv("heart_train.csv",index=False)
testing = pd.concat([x_test,y_test], axis=1)
testing.to_csv("heart_test.csv",index=False)


In [93]:
normalizer = tf.keras.layers.Normalization(axis=-1)
normalizer.adapt(x_train)

In [98]:
print("--Make model--")
model = tf.keras.models.Sequential([
  normalizer,
  tf.keras.layers.Dense(128, activation='relu'),
  tf.keras.layers.Dropout(0.4),
  tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

print("--Fit model--")
model.fit(x_train, y_train, epochs=200, verbose=2)

--Make model--
--Fit model--
Epoch 1/200
12/12 - 1s - loss: 0.7135 - accuracy: 0.4824 - 975ms/epoch - 81ms/step
Epoch 2/200
12/12 - 0s - loss: 0.6352 - accuracy: 0.6341 - 35ms/epoch - 3ms/step
Epoch 3/200
12/12 - 0s - loss: 0.5905 - accuracy: 0.7127 - 31ms/epoch - 3ms/step
Epoch 4/200
12/12 - 0s - loss: 0.5696 - accuracy: 0.6992 - 37ms/epoch - 3ms/step
Epoch 5/200
12/12 - 0s - loss: 0.5505 - accuracy: 0.7209 - 40ms/epoch - 3ms/step
Epoch 6/200
12/12 - 0s - loss: 0.5429 - accuracy: 0.7182 - 31ms/epoch - 3ms/step
Epoch 7/200
12/12 - 0s - loss: 0.5476 - accuracy: 0.7290 - 31ms/epoch - 3ms/step
Epoch 8/200
12/12 - 0s - loss: 0.5368 - accuracy: 0.7236 - 35ms/epoch - 3ms/step
Epoch 9/200
12/12 - 0s - loss: 0.5286 - accuracy: 0.7290 - 44ms/epoch - 4ms/step
Epoch 10/200
12/12 - 0s - loss: 0.5276 - accuracy: 0.7100 - 44ms/epoch - 4ms/step
Epoch 11/200
12/12 - 0s - loss: 0.5247 - accuracy: 0.7290 - 30ms/epoch - 2ms/step
Epoch 12/200
12/12 - 0s - loss: 0.5212 - accuracy: 0.7344 - 33ms/epoch - 3ms

<keras.callbacks.History at 0x7f7111f47590>

In [102]:
print("--Evaluate model--")
model_loss1, model_acc1 = model.evaluate(x_train,  y_train, verbose=2)
model_loss2, model_acc2 = model.evaluate(x_test,  y_test, verbose=2)
print(f"Train / Test Accuracy: {model_acc1*100:.1f}% / {model_acc2*100:.1f}%")


--Evaluate model--
12/12 - 0s - loss: 0.3816 - accuracy: 0.8184 - 39ms/epoch - 3ms/step
3/3 - 0s - loss: 0.4924 - accuracy: 0.7634 - 27ms/epoch - 9ms/step
Train / Test Accuracy: 81.8% / 76.3%


In [None]:
#Ran this to save my model
model.save('CHD_Model.h5') #Changed name so it doesnt overwrite my current
#76.3% accuracy model

# New Section