# Heart Attack Prediction
###### Using data from https://www.kaggle.com/datasets/kamilpytlak/personal-key-indicators-of-heart-disease
###### The goal is to use decision trees to predict whether someone will have a heart attack.
###### Performance will be compared between different parameters and approaches.

In [1]:
# Import libraries and data
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
data = pd.read_csv("heart_2022_no_nans.csv")
data

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,4.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.60,71.67,27.99,No,No,Yes,Yes,"Yes, received Tdap",No,No
1,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,6.0,None of them,No,...,1.78,95.25,30.13,No,No,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,No
2,Alabama,Male,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,"6 or more, but not all",No,...,1.85,108.86,31.66,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
3,Alabama,Female,Fair,5.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,None of them,No,...,1.70,90.72,31.32,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,Yes
4,Alabama,Female,Good,3.0,15.0,Within past year (anytime less than 12 months ...,Yes,5.0,1 to 5,No,...,1.55,79.38,33.07,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246017,Virgin Islands,Male,Very good,0.0,0.0,Within past 2 years (1 year but less than 2 ye...,Yes,6.0,None of them,No,...,1.78,102.06,32.28,Yes,No,No,No,"Yes, received tetanus shot but not sure what type",No,No
246018,Virgin Islands,Female,Fair,0.0,7.0,Within past year (anytime less than 12 months ...,Yes,7.0,None of them,No,...,1.93,90.72,24.34,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,Yes
246019,Virgin Islands,Male,Good,0.0,15.0,Within past year (anytime less than 12 months ...,Yes,7.0,1 to 5,No,...,1.68,83.91,29.86,Yes,Yes,Yes,Yes,"Yes, received tetanus shot but not sure what type",No,Yes
246020,Virgin Islands,Female,Excellent,2.0,2.0,Within past year (anytime less than 12 months ...,Yes,7.0,None of them,No,...,1.70,83.01,28.66,No,Yes,Yes,No,"Yes, received tetanus shot but not sure what type",No,No


##### Data Preprocessing
###### SciKit learn trees can only handle numbers, so all values must be converted.
##### Numeric Values
###### Floats and integers are fine as is. There is no need to scale or bin them.
##### Binary Values
###### Yes/No and Male/Female are converted to 1 and 0.
##### Range Values
###### Values that can be converted to a range of numbers where the difference between them has meaning.
###### Ex. General Health. Poor to Excellent can be represented as 0-4.
##### Categorical Values
###### These must be represented using a one-hot encoding.
###### Ex. States have no numerical relation to each and there are far more than 2.

In [2]:
# Basic stats of numerical data
display(data.describe())

# Preprocessing - Store numeric values
numeric_columns = ["PhysicalHealthDays", "MentalHealthDays", "SleepHours", "HeightInMeters", "WeightInKilograms", "BMI"]
numeric_entries = data[numeric_columns].to_numpy()
print("Shape:",numeric_entries.shape)
numeric_entries

Unnamed: 0,PhysicalHealthDays,MentalHealthDays,SleepHours,HeightInMeters,WeightInKilograms,BMI
count,246022.0,246022.0,246022.0,246022.0,246022.0,246022.0
mean,4.119026,4.16714,7.021331,1.70515,83.615179,28.668136
std,8.405844,8.102687,1.440681,0.106654,21.323156,6.513973
min,0.0,0.0,1.0,0.91,28.12,12.02
25%,0.0,0.0,6.0,1.63,68.04,24.27
50%,0.0,0.0,7.0,1.7,81.65,27.46
75%,3.0,4.0,8.0,1.78,95.25,31.89
max,30.0,30.0,24.0,2.41,292.57,97.65


Shape: (246022, 6)


array([[  4.  ,   0.  ,   9.  ,   1.6 ,  71.67,  27.99],
       [  0.  ,   0.  ,   6.  ,   1.78,  95.25,  30.13],
       [  0.  ,   0.  ,   8.  ,   1.85, 108.86,  31.66],
       ...,
       [  0.  ,  15.  ,   7.  ,   1.68,  83.91,  29.86],
       [  2.  ,   2.  ,   7.  ,   1.7 ,  83.01,  28.66],
       [  0.  ,   0.  ,   5.  ,   1.83, 108.86,  32.55]])

In [3]:
# Examine possible values
binary_columns = [] # All columns with 2 unique values
entries = {} # Dict that stores all unique values by column name

for name in list(data.columns):
    values = set(data[name])
    entries.update({name: values})
    if len(values) == 2:
        binary_columns.append(name)
    if len(values) < 6: # Print some values
        print(name, values)

Sex {'Male', 'Female'}
GeneralHealth {'Very good', 'Fair', 'Good', 'Poor', 'Excellent'}
LastCheckupTime {'Within past year (anytime less than 12 months ago)', 'Within past 2 years (1 year but less than 2 years ago)', 'Within past 5 years (2 years but less than 5 years ago)', '5 or more years ago'}
PhysicalActivities {'Yes', 'No'}
RemovedTeeth {'All', '6 or more, but not all', 'None of them', '1 to 5'}
HadHeartAttack {'Yes', 'No'}
HadAngina {'Yes', 'No'}
HadStroke {'Yes', 'No'}
HadAsthma {'Yes', 'No'}
HadSkinCancer {'Yes', 'No'}
HadCOPD {'Yes', 'No'}
HadDepressiveDisorder {'Yes', 'No'}
HadKidneyDisease {'Yes', 'No'}
HadArthritis {'Yes', 'No'}
HadDiabetes {'No, pre-diabetes or borderline diabetes', 'Yes', 'No', 'Yes, but only during pregnancy (female)'}
DeafOrHardOfHearing {'Yes', 'No'}
BlindOrVisionDifficulty {'Yes', 'No'}
DifficultyConcentrating {'Yes', 'No'}
DifficultyWalking {'Yes', 'No'}
DifficultyDressingBathing {'Yes', 'No'}
DifficultyErrands {'Yes', 'No'}
SmokerStatus {'Never smo

In [4]:
# Preprocessing - convert binary options to 0 and 1
# The encoder determines 0 or 1 based on alphanumeric order
# Yes, Male = 1
# No, Female = 0

# Change all entries in CovidPos to Yes/No. Tested positive and Yes are equivalent enough to not justify 1-hot encoding
temp_data = data.replace(to_replace="Tested positive using home test without a health professional", value="Yes")
binary_columns.append("CovidPos")

# Remove HadHeartAttack because we want to predict this value
binary_columns.remove("HadHeartAttack")

display(temp_data[binary_columns])
binary_encoder = OrdinalEncoder()
binary_entries = binary_encoder.fit_transform(data[binary_columns])
print("Array shape:", binary_entries.shape)
print(binary_entries)

Unnamed: 0,Sex,PhysicalActivities,HadAngina,HadStroke,HadAsthma,HadSkinCancer,HadCOPD,HadDepressiveDisorder,HadKidneyDisease,HadArthritis,...,DifficultyWalking,DifficultyDressingBathing,DifficultyErrands,ChestScan,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,HighRiskLastYear,CovidPos
0,Female,Yes,No,No,No,No,No,No,No,Yes,...,No,No,No,No,No,No,Yes,Yes,No,No
1,Male,Yes,No,No,No,No,No,No,No,Yes,...,No,No,No,No,No,No,Yes,Yes,No,No
2,Male,No,No,No,No,No,No,No,No,Yes,...,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes
3,Female,Yes,No,No,No,Yes,No,Yes,No,Yes,...,Yes,No,No,No,No,No,Yes,Yes,No,Yes
4,Female,Yes,No,No,No,No,No,No,No,Yes,...,No,No,No,No,No,No,Yes,Yes,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246017,Male,Yes,No,No,No,No,No,No,No,No,...,No,No,No,No,Yes,No,No,No,No,No
246018,Female,Yes,No,No,No,No,No,Yes,No,No,...,No,No,No,No,No,No,No,No,No,Yes
246019,Male,Yes,No,Yes,No,No,No,No,No,Yes,...,No,No,No,No,Yes,Yes,Yes,Yes,No,Yes
246020,Female,Yes,No,No,No,No,No,No,No,No,...,No,No,No,No,No,Yes,Yes,No,No,No


Array shape: (246022, 23)
[[0. 1. 0. ... 1. 0. 0.]
 [1. 1. 0. ... 1. 0. 0.]
 [1. 0. 0. ... 1. 0. 2.]
 ...
 [1. 1. 0. ... 1. 0. 2.]
 [0. 1. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 1. 0. 2.]]


In [5]:
# Get target in its own series
y = data["HadHeartAttack"].replace({"Yes": 1, "No":0})
y

0         0
1         0
2         0
3         0
4         0
         ..
246017    0
246018    0
246019    0
246020    0
246021    1
Name: HadHeartAttack, Length: 246022, dtype: int64

In [6]:
# Preprocessing - convert columns with multiple values to numeric ones
range_columns = ["GeneralHealth", "LastCheckupTime", "RemovedTeeth", "AgeCategory"]

# Function converts categorical column values to a range
# col is the data, values are the possible entries, and order is the numbers to replace them with
# returns a new array
def to_range(col, values, order):
    values = list(values)
    replacements = {}
    for i in range(len(values)):
        replacements.update({values[i]: order[i]})
    return col.replace(replacements).to_numpy()


# Manually set the ordering for values
order = [[4,1,2,0,3],[5,2,4,1],[5,6,7,0],[18,25,30,35,40,45,50,55,60,65,70,75,80]]
i = 0
sorted_entries = [] # Convert the possible entries to sorted list
for name in range_columns:
    temp = sorted(list(entries[name]))
    sorted_entries.append(temp)
    if len(temp) != len(order[i]):
        print("Error: Order does not match entries")
    j = 0
    print(name)
    for j in range(len(order[i])):
        print(list(temp)[j], end=':(')
        print(order[i][j], end=") ")
        j += 1
    print("\n")
    i+=1

range_entries = [] # 2d array of updated values
i = 0
for name in range_columns:
    range_entries.append(to_range(data[name], sorted_entries[i], order[i]))
    i += 1

range_entries = np.array(range_entries).transpose() # Fix array shape
print("Shape:",range_entries.shape)
print(range_entries)

GeneralHealth
Excellent:(4) Fair:(1) Good:(2) Poor:(0) Very good:(3) 

LastCheckupTime
5 or more years ago:(5) Within past 2 years (1 year but less than 2 years ago):(2) Within past 5 years (2 years but less than 5 years ago):(4) Within past year (anytime less than 12 months ago):(1) 

RemovedTeeth
1 to 5:(5) 6 or more, but not all:(6) All:(7) None of them:(0) 

AgeCategory
Age 18 to 24:(18) Age 25 to 29:(25) Age 30 to 34:(30) Age 35 to 39:(35) Age 40 to 44:(40) Age 45 to 49:(45) Age 50 to 54:(50) Age 55 to 59:(55) Age 60 to 64:(60) Age 65 to 69:(65) Age 70 to 74:(70) Age 75 to 79:(75) Age 80 or older:(80) 

Shape: (246022, 4)
[[ 3  1  0 65]
 [ 3  1  0 70]
 [ 3  1  6 75]
 ...
 [ 2  1  5 65]
 [ 4  1  0 50]
 [ 3  1  0 70]]


In [7]:
# Preprocessing - 1-hot encoding
categorical_cols = ["State", "HadDiabetes", "SmokerStatus", "ECigaretteUsage", "RaceEthnicityCategory", "TetanusLast10Tdap"]
temp_data = data[categorical_cols]
encoder = OneHotEncoder()
encoder.fit(temp_data)
onehot_entries = encoder.transform(temp_data).toarray()
print(onehot_entries.shape)
onehot_entries

(246022, 75)


array([[1., 0., 0., ..., 1., 0., 0.],
       [1., 0., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [8]:
onehot_col = [] # New column names for 1-hot encoding
i = 0
for category in encoder.categories_:
    for col in category:
        # Add original column name to front
        onehot_col.append((str(categorical_cols[i]) + str("_") + str(col)))
    i += 1
# Example
onehot_col[-20:-1]

['HadDiabetes_No, pre-diabetes or borderline diabetes',
 'HadDiabetes_Yes',
 'HadDiabetes_Yes, but only during pregnancy (female)',
 'SmokerStatus_Current smoker - now smokes every day',
 'SmokerStatus_Current smoker - now smokes some days',
 'SmokerStatus_Former smoker',
 'SmokerStatus_Never smoked',
 'ECigaretteUsage_Never used e-cigarettes in my entire life',
 'ECigaretteUsage_Not at all (right now)',
 'ECigaretteUsage_Use them every day',
 'ECigaretteUsage_Use them some days',
 'RaceEthnicityCategory_Black only, Non-Hispanic',
 'RaceEthnicityCategory_Hispanic',
 'RaceEthnicityCategory_Multiracial, Non-Hispanic',
 'RaceEthnicityCategory_Other race only, Non-Hispanic',
 'RaceEthnicityCategory_White only, Non-Hispanic',
 'TetanusLast10Tdap_No, did not receive any tetanus shot in the past 10 years',
 'TetanusLast10Tdap_Yes, received Tdap',
 'TetanusLast10Tdap_Yes, received tetanus shot but not sure what type']

###### The additional answer types for smoking, tetanus shots, ect. are too ambiguous to convert to a range.
###### Ex. While not smoking is definitely better than smoking, it's unclear between being a former smoker or infrequent current one.

In [13]:
# Combine all the data
# Column names
column_names = numeric_columns + binary_columns + range_columns + onehot_col
print("Columns:", len(column_names))

# Data values
binary_entries.shape
entries = np.concatenate((numeric_entries, binary_entries), axis=1)
entries = np.concatenate((entries, range_entries), axis=1)
entries = np.concatenate((entries, onehot_entries), axis=1)
print("Data values shape:",entries.shape)

# Create dataframes
x = pd.DataFrame(entries, columns=column_names)
display(x)

Columns: 108
Data values shape: (246022, 108)


Unnamed: 0,PhysicalHealthDays,MentalHealthDays,SleepHours,HeightInMeters,WeightInKilograms,BMI,Sex,PhysicalActivities,HadAngina,HadStroke,...,ECigaretteUsage_Use them some days,"RaceEthnicityCategory_Black only, Non-Hispanic",RaceEthnicityCategory_Hispanic,"RaceEthnicityCategory_Multiracial, Non-Hispanic","RaceEthnicityCategory_Other race only, Non-Hispanic","RaceEthnicityCategory_White only, Non-Hispanic","TetanusLast10Tdap_No, did not receive any tetanus shot in the past 10 years","TetanusLast10Tdap_Yes, received Tdap","TetanusLast10Tdap_Yes, received tetanus shot but not sure what type","TetanusLast10Tdap_Yes, received tetanus shot, but not Tdap"
0,4.0,0.0,9.0,1.60,71.67,27.99,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1,0.0,0.0,6.0,1.78,95.25,30.13,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,0.0,8.0,1.85,108.86,31.66,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,5.0,0.0,9.0,1.70,90.72,31.32,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
4,3.0,15.0,5.0,1.55,79.38,33.07,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246017,0.0,0.0,6.0,1.78,102.06,32.28,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
246018,0.0,7.0,7.0,1.93,90.72,24.34,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
246019,0.0,15.0,7.0,1.68,83.91,29.86,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
246020,2.0,2.0,7.0,1.70,83.01,28.66,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


###### All values are now numbers. Most are 1 or 0 but a range of numbers and floats are fine.

In [10]:
# x is the processed data
# y is the heart attacks
# Split into training and validation
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.1, random_state = 1000)
clf = tree.DecisionTreeClassifier()
clf = clf.fit(x_train, y_train)

In [11]:
# Function takes 2 list-likes and returns the number of matches
# TODO: Include true/false negatives/positives
def compare(a,b):
    a = list(a)
    b = list(b)
    correct = 0
    total = 0
    for i in range(len(a)):
        if a[i] == b[i]:
            correct += 1
        total += 1
    return [correct, total]
results = clf.predict(x_test)
stats = compare(results, y_test)
print("Decision tree predicted", stats[0], "correct out of", stats[1], end=" ")
print("(%.2f%%)" % (stats[0]/stats[1]))

Decision tree predicted 22579 correct out of 24603 (0.92%)
