# Heart Attack Prediction
###### Using data from https://www.kaggle.com/datasets/kamilpytlak/personal-key-indicators-of-heart-disease
###### The goal is to use decision trees to predict whether someone will have a heart attack.
###### Performance will be compared between different parameters and approaches.

In [None]:
# Import libraries and data
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
# Load the survey export into a DataFrame and preview it
# (the file name suggests missing values were already removed - not checked here)
data = pd.read_csv(filepath_or_buffer="heart_2022_no_nans.csv")
data

##### Data Preprocessing
###### scikit-learn trees can only handle numbers, so all values must be converted.
##### Numeric Values
###### Floats and integers are fine as is. There is no need to scale or bin them.
##### Binary Values
###### Yes/No and Male/Female are converted to 1 and 0.
##### Range Values
###### Values that can be converted to a range of numbers where the difference between them has meaning.
###### Ex. General Health. Poor to Excellent can be represented as 0-4.
##### Categorical Values
###### These must be represented using a one-hot encoding.
###### Ex. States have no numerical relation to each other and there are far more than 2.

In [None]:
# Summary statistics for the numeric columns of the raw data
display(data.describe())

# Preprocessing - numeric columns are copied into an array unchanged
numeric_columns = [
    "PhysicalHealthDays",
    "MentalHealthDays",
    "SleepHours",
    "HeightInMeters",
    "WeightInKilograms",
    "BMI",
]
numeric_entries = data.loc[:, numeric_columns].to_numpy()
print("Shape:", numeric_entries.shape)
numeric_entries

In [None]:
# Examine possible values
# Dict that stores all unique values by column name
entries = {name: set(data[name]) for name in data.columns}

# All columns with exactly 2 unique values
binary_columns = [name for name, values in entries.items() if len(values) == 2]

# Print the value sets that are small enough to read at a glance
for name, values in entries.items():
    if len(values) < 6:
        print(name, values)

In [None]:
# Preprocessing - convert binary options to 0 and 1
# The encoder determines 0 or 1 based on alphanumeric order
# Yes, Male = 1
# No, Female = 0

# Change all entries in CovidPos to Yes/No. Tested positive and Yes are equivalent enough to not justify 1-hot encoding
temp_data = data.replace(
    to_replace="Tested positive using home test without a health professional",
    value="Yes",
)
# Guarded so that re-running this cell does not add CovidPos twice
if "CovidPos" not in binary_columns:
    binary_columns.append("CovidPos")

# Remove HadHeartAttack because we want to predict this value
# (guarded so that re-running this cell does not raise ValueError)
if "HadHeartAttack" in binary_columns:
    binary_columns.remove("HadHeartAttack")

display(temp_data[binary_columns])
binary_encoder = OrdinalEncoder()
# Encode temp_data, not data: only temp_data has the merged CovidPos answers,
# so encoding data would give CovidPos three codes instead of 0/1.
binary_entries = binary_encoder.fit_transform(temp_data[binary_columns])
print("Array shape:", binary_entries.shape)
print(binary_entries)

In [None]:
# Get target in its own series: 1 = "Yes" (had a heart attack), 0 = anything else.
# Comparing and casting gives an integer dtype directly; replacing strings with
# ints relied on pandas silently downcasting the object result, which is deprecated.
y = (data["HadHeartAttack"] == "Yes").astype(int)
y

In [None]:
# Preprocessing - columns whose text answers are converted to ordered numbers
range_columns = [
    "GeneralHealth",
    "LastCheckupTime",
    "RemovedTeeth",
    "AgeCategory",
]

def to_range(col, values, order):
    """Convert the categorical entries of a column to numbers.

    col is the data, values are the possible entries, and order holds the
    number that replaces the entry at the same position in values.
    Returns the converted column as a new array.
    """
    labels = list(values)
    # Pair every label with the number found at the same position in order
    mapping = {label: order[position] for position, label in enumerate(labels)}
    converted = col.replace(mapping)
    return converted.to_numpy()


# Manually chosen number for each entry, one list per range column.
# Each list follows the sorted order of that column's entries.
order = [
    [4, 1, 2, 0, 3],
    [5, 2, 4, 1],
    [5, 6, 7, 0],
    [18, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80],
]

sorted_entries = []  # Sorted possible entries, one list per range column
for idx, name in enumerate(range_columns):
    column_values = sorted(entries[name])
    codes = order[idx]
    sorted_entries.append(column_values)
    if len(column_values) != len(codes):
        print("Error: Order does not match entries")
    # Show which number each entry is mapped to so the ordering can be checked by eye
    print(name)
    for position, code in enumerate(codes):
        print(column_values[position], end=':(')
        print(code, end=") ")
    print("\n")

# One converted array per range column
range_entries = [
    to_range(data[name], sorted_entries[idx], order[idx])
    for idx, name in enumerate(range_columns)
]

range_entries = np.array(range_entries).T  # Fix array shape: rows become samples
print("Shape:", range_entries.shape)
print(range_entries)

In [None]:
# Preprocessing - 1-hot encoding
categorical_cols = [
    "State",
    "HadDiabetes",
    "SmokerStatus",
    "ECigaretteUsage",
    "RaceEthnicityCategory",
    "TetanusLast10Tdap",
]
temp_data = data[categorical_cols]
# fit() returns the fitted encoder, so it can be created and fitted in one step
encoder = OneHotEncoder().fit(temp_data)
# The encoder output is converted to a regular (dense) array
onehot_entries = encoder.transform(temp_data).toarray()
print(onehot_entries.shape)
onehot_entries

In [None]:
onehot_col = [] # New column names for 1-hot encoding
i = 0
for category in encoder.categories_:
    for col in category:
        # Add original column name to front
        onehot_col.append((str(categorical_cols[i]) + str("_") + str(col)))
    i += 1
# Example
onehot_col[-20:-1]

###### The additional answer types for smoking, tetanus shots, etc. are too ambiguous to convert to a range.
###### Ex. While not smoking is definitely better than smoking, it's unclear how being a former smoker compares to being an infrequent current one.

In [None]:
# Combine all the data
# Column names, in the same order as the arrays joined below
column_names = numeric_columns + binary_columns + range_columns + onehot_col
print("Columns:", len(column_names))

# Data values: join the feature arrays side by side in one call.
# The result gets its own name so the `entries` dict of unique column values
# is not overwritten, which would break re-running the range-conversion cell.
feature_matrix = np.concatenate(
    (numeric_entries, binary_entries, range_entries, onehot_entries),
    axis=1,
)
print("Data values shape:", feature_matrix.shape)

# Create dataframes
x = pd.DataFrame(feature_matrix, columns=column_names)
display(x)

###### All values are now numbers. Most are 1 or 0 but a range of numbers and floats are fine.

In [None]:
# x is the processed data
# y is the heart attacks
# Split into training and validation (10% held out, fixed seed for a repeatable split)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.1, random_state = 1000)
# Seed the tree as well: scikit-learn permutes the features at every split,
# so without a fixed random_state the fitted tree can differ between runs.
clf = tree.DecisionTreeClassifier(random_state = 1000)
clf = clf.fit(x_train, y_train)

In [None]:
def _format_rate(hits, total):
    """Return hits/total formatted as "(xx.xx%)", or "(n/a)" when total is 0."""
    if total == 0:
        return "(n/a)"
    return "(%.2f%%)" % (100 * hits / total)


def compare(a, b, printstats = True):
    """Compare predictions with true labels and count the outcomes.

    a is the prediction and b should be the true labels; a value of 1 means
    "had a heart attack" and any other value means "did not".

    Returns a dict with the keys "correct", "tp", "fp", "tn" and "fn".
    If a and b differ in length, an error is printed and 0 is returned.
    When printstats is true, three summary lines are printed: correct heart
    attack predictions out of all heart attack predictions, correct
    no-heart-attack predictions out of all no-heart-attack predictions, and
    the overall total.
    """
    if len(a) != len(b):
        print("Error: lists are of unequal length")
        return 0
    # True/false positives and negatives
    tp, fp, tn, fn = 0, 0, 0, 0
    for predicted, actual in zip(a, b):
        if predicted == actual:
            if predicted == 1:  # Had heart attack
                tp += 1
            else:  # Did not have a heart attack
                tn += 1
        # The prediction (not the true label) decides positive vs. negative
        elif predicted == 1:  # Predicted heart attack but is wrong
            fp += 1
        else:  # Predicted no heart attack but is wrong
            fn += 1
    correct = tp + tn
    if printstats:
        total = tp + tn + fp + fn
        # _format_rate prints "(n/a)" instead of dividing by zero, e.g. when
        # the model never predicts a heart attack
        print("Correct predicted heart attack:", tp, "out of", tp + fp, end=" ")
        print(_format_rate(tp, tp + fp))
        print("Correctly predicted no heart attack:", tn, "out of", tn + fn, end=" ")
        print(_format_rate(tn, tn + fn))
        print("Total:", correct, "out of", total, end=" ")
        print(_format_rate(correct, total))
    return {"correct": correct, "tp": tp, "fp": fp, "tn": tn, "fn": fn}

In [None]:
# Tree - evaluate the fitted classifier on the held-out test set
print("Predictions for tree on test set")
results = clf.predict(x_test)
stats = compare(a=results, b=y_test)

In [None]:
# Number of heart attacks (the sum of each label series) and total case count
print(f"Training set heart attacks: {np.sum(y_train)}")
print(f"Testing set heart attacks: {np.sum(y_test)}")
print(f"Total cases: {len(x_train) + len(x_test)}")

# Score the tree on the data it was trained on, for comparison with the test set
print("\nTraining data results")
train_predictions = clf.predict(x_train)
s = compare(train_predictions, y_train)

###### The model, while generally correct, performs poorly at predicting heart attacks.
###### This is likely due to the training data having comparatively few heart attacks and the tree overfitting (It is 100% accurate on training data).

In [None]:
# TODO
# Implement tree pruning and examine feature analysis