Ideas for analysis:
- Segment data by sex (biological differences may lead to a higher chance of heart disease in men)
- Proportion of men to women (EDA)

1. age - age in years

2. sex - sex (1 = male; 0 = female)

3. cp - chest pain type (1 = typical angina; 2 = atypical angina; 3 = non-anginal pain; 0 = asymptomatic)

4. trestbps - resting blood pressure (in mm Hg on admission to the hospital)

5. chol - serum cholesterol in mg/dl

6. fbs - fasting blood sugar > 120 mg/dl (1 = true; 0 = false)

7. restecg - resting electrocardiographic results (1 = normal; 2 = having ST-T wave abnormality; 0 = hypertrophy)

8. thalach - maximum heart rate achieved

9. exang - exercise induced angina (1 = yes; 0 = no)

10. oldpeak - ST depression induced by exercise relative to rest

11. slope - the slope of the peak exercise ST segment (2 = upsloping; 1 = flat; 0 = downsloping)

12. ca - number of major vessels (0-3) colored by fluoroscopy

13. thal - 2 = normal; 1 = fixed defect; 3 = reversible defect

14. num (the `output` column in `heart.csv`) - the predicted attribute - diagnosis of heart disease (angiographic disease status) (Value 0 = < 50% diameter narrowing; Value 1 = > 50% diameter narrowing)

In [59]:
import pandas as pd
import numpy as np

In [60]:
# Input files, resolved relative to the notebook's working directory.
HEART_CSV = 'heart.csv'
O2_CSV = 'o2Saturation.csv'

# Main table: one row per patient, including the column to predict.
heart = pd.read_csv(HEART_CSV)
# Companion file has no header row, so its single column is named here.
o2 = pd.read_csv(O2_CSV, names=['o2_saturation'])

In [117]:
# Run this for "output" column as the value to predict:
# every other column of `heart` becomes a feature.
target_col = 'output'
feature_cols = heart.columns.tolist()
feature_cols.remove(target_col)
X, y = heart[feature_cols], heart[target_col]

In [69]:
# Alternative experiment: use the "sex" column as the value to predict.
#
# This cell sits *after* the cell that selects "output" as the target, so on
# Restart Kernel -> Run All it would silently replace X and y and every result
# below would describe a different problem. It is therefore opt-in: set
# PREDICT_SEX = True, then re-run this cell and all cells below it.
PREDICT_SEX = False

if PREDICT_SEX:
    # Every column except the target becomes a feature.
    feature_cols = list(heart.columns)
    feature_cols.remove('sex')
    X = heart[feature_cols]
    y = heart['sex']

In [118]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
(features_train, features_test,
 labels_train, labels_test) = train_test_split(X, y, random_state=1, test_size=0.2)

In [119]:
from ml_helpers import evaluate_model
from svm import best_svc
from dt import best_dt
from adaboost import best_adaboost
from random_forest import best_random_forest
from sklearn.naive_bayes import GaussianNB

# Compare five classifier families on the current train/test split and keep the
# most accurate one in `best_model` as {"model": estimator, "params": dict}.
#
# The best_* helpers are project-local. As used here, each returns an
# (estimator, params) pair whose params dict carries an "accuracy" entry.
# NOTE(review): every helper is handed the test split, so it looks like
# hyper-parameters are chosen on the same data that accuracy is reported on --
# confirm inside the helpers and consider a validation split / cross-validation.

# Seed shared by the Decision Tree, Random Forest and AdaBoost searches.
random_state = 25
# Ensemble sizes tried for both Random Forest and AdaBoost.
n_estimators_list = [5, 10, 20, 50, 80, 100, 200, 500, 1000]

# Gaussian Naive Bayes (no hyper-parameters searched)
nb_model = GaussianNB()
print()
print("GAUSSIAN NAIVE BAYES CLASSIFIER")
print()
training_time, predict_time, nb_accuracy = evaluate_model(nb_model, features_train, features_test, labels_train, labels_test)
print(f"Training time: {round(training_time, 3)}s")
print(f"Prediction time: {round(predict_time, 3)}s")
print(f"Accuracy: {nb_accuracy}")

# Support Vector Machine
kernels = ["linear", "rbf"]
Cs = [0.01, 0.1, 1, 10, 100, 1000, 10000]
gammas = ["scale", "auto", 0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10]
svc, svc_params = best_svc(kernels, Cs, gammas, features_train, features_test, labels_train, labels_test)

# Decision Tree
min_splits = [2, 10, 20, 40, 50, 100]
dt, dt_params = best_dt(min_splits, random_state, features_train, features_test, labels_train, labels_test)

# Random Forest
min_samples_split = [2, 10, 20, 40, 50, 100]
rand_forest, rand_forest_params = best_random_forest(n_estimators_list, min_samples_split, random_state, features_train, features_test, labels_train, labels_test)

# AdaBoost
learning_rates = [0.001, 0.01, 0.05, 0.1, 0.25, 0.5, 1]
adaboost, adaboost_params = best_adaboost(n_estimators_list, learning_rates, random_state, features_train, features_test, labels_train, labels_test)

# Candidate estimators mapped to their result dicts. Insertion order matters:
# it decides which candidate wins when accuracies tie.
models = {  adaboost: adaboost_params,
            rand_forest: rand_forest_params,
            dt: dt_params,
            svc: svc_params,
            nb_model: {"accuracy": nb_accuracy}}

# max() returns the first of several equally accurate candidates, which matches
# a strict ">" scan in insertion order. Unlike such a scan started from 0, it
# always selects a model, so best_model is never left empty (which would raise
# KeyError below if every accuracy were 0).
best_estimator, best_params = max(models.items(), key=lambda item: item[1]["accuracy"])
top_accuracy = best_params["accuracy"]
best_model = {"model": best_estimator, "params": best_params}

print("Best model used: ")
print(f"Model = {best_model['model']}")
print(f"Parameters = {best_model['params']}")
# Run the winner through evaluate_model once more to report its timings and accuracy.
training_time, predict_time, accuracy = evaluate_model(best_model["model"], features_train, features_test, labels_train, labels_test)
print("-------------------------------------------------")
print(f"Training time: {round(training_time, 3)}s")
print(f"Prediction time: {round(predict_time, 3)}s")
print(f"Accuracy: {accuracy}")
print()


GAUSSIAN NAIVE BAYES CLASSIFIER

Training time: 0.002s
Prediction time: 0.002s
Accuracy: 0.7704918032786885

SUPPORT VECTOR CLASSIFIER

Kernel = linear---------------------------------
C = 0.01---------------------------------
Kernel = linear
C = 0.01
gamma = scale
Training time: 0.017s
Prediction time: 0.002s
Accuracy: 0.7540983606557377

C = 0.1---------------------------------
Kernel = linear
C = 0.1
gamma = scale
Training time: 0.037s
Prediction time: 0.002s
Accuracy: 0.7704918032786885

C = 1---------------------------------
Kernel = linear
C = 1
gamma = scale
Training time: 0.091s
Prediction time: 0.002s
Accuracy: 0.7377049180327869

C = 10---------------------------------
Kernel = linear
C = 10
gamma = scale
Training time: 5.04s
Prediction time: 0.002s
Accuracy: 0.7377049180327869

C = 100---------------------------------
Kernel = linear
C = 100
gamma = scale
Training time: 13.648s
Prediction time: 0.001s
Accuracy: 0.7377049180327869

C = 1000---------------------------------
K

Kernel = rbf
C = 10000
gamma = auto
Training time: 0.006s
Prediction time: 0.002s
Accuracy: 0.5081967213114754

Kernel = rbf
C = 10000
gamma = 0.001
Training time: 0.008s
Prediction time: 0.003s
Accuracy: 0.5573770491803278

Kernel = rbf
C = 10000
gamma = 0.01
Training time: 0.005s
Prediction time: 0.002s
Accuracy: 0.4918032786885246

Kernel = rbf
C = 10000
gamma = 0.05
Training time: 0.005s
Prediction time: 0.002s
Accuracy: 0.5409836065573771

Kernel = rbf
C = 10000
gamma = 0.1
Training time: 0.006s
Prediction time: 0.002s
Accuracy: 0.5081967213114754

Kernel = rbf
C = 10000
gamma = 0.5
Training time: 0.007s
Prediction time: 0.004s
Accuracy: 0.5081967213114754

Kernel = rbf
C = 10000
gamma = 1
Training time: 0.006s
Prediction time: 0.003s
Accuracy: 0.5081967213114754

Kernel = rbf
C = 10000
gamma = 5
Training time: 0.009s
Prediction time: 0.003s
Accuracy: 0.5081967213114754

Kernel = rbf
C = 10000
gamma = 10
Training time: 0.007s
Prediction time: 0.003s
Accuracy: 0.5081967213114754




n_estimators = 200
min_samples_split = 2
Training time: 0.262s
Prediction time: 0.018s
Accuracy: 0.8032786885245902

min_samples_split = 10---------------------------------
n_estimators = 200
min_samples_split = 10
Training time: 0.241s
Prediction time: 0.02s
Accuracy: 0.7704918032786885

min_samples_split = 20---------------------------------
n_estimators = 200
min_samples_split = 20
Training time: 0.232s
Prediction time: 0.024s
Accuracy: 0.7868852459016393

min_samples_split = 40---------------------------------
n_estimators = 200
min_samples_split = 40
Training time: 0.236s
Prediction time: 0.019s
Accuracy: 0.7868852459016393

min_samples_split = 50---------------------------------
n_estimators = 200
min_samples_split = 50
Training time: 0.232s
Prediction time: 0.018s
Accuracy: 0.8032786885245902

min_samples_split = 100---------------------------------
n_estimators = 200
min_samples_split = 100
Training time: 0.224s
Prediction time: 0.018s
Accuracy: 0.8032786885245902

n_estimators

n_estimators = 80
learning_rate = 0.01
Training time: 0.103s
Prediction time: 0.012s
Accuracy: 0.6721311475409836

learning_rate = 0.05---------------------------------
n_estimators = 80
learning_rate = 0.05
Training time: 0.097s
Prediction time: 0.012s
Accuracy: 0.7868852459016393

learning_rate = 0.1---------------------------------
n_estimators = 80
learning_rate = 0.1
Training time: 0.103s
Prediction time: 0.014s
Accuracy: 0.7704918032786885

learning_rate = 0.25---------------------------------
n_estimators = 80
learning_rate = 0.25
Training time: 0.104s
Prediction time: 0.013s
Accuracy: 0.7213114754098361

learning_rate = 0.5---------------------------------
n_estimators = 80
learning_rate = 0.5
Training time: 0.104s
Prediction time: 0.013s
Accuracy: 0.7049180327868853

learning_rate = 1---------------------------------
n_estimators = 80
learning_rate = 1
Training time: 0.104s
Prediction time: 0.011s
Accuracy: 0.7049180327868853

n_estimators = 100--------------------------------

In [120]:
# Short handle on the estimator stored under best_model["model"] by the
# model-comparison cell.
clf = best_model["model"]

In [125]:
# Rank the features by the importance the selected model assigns to them.
# `clf` is whichever model won the comparison, and not every candidate exposes
# `feature_importances_` (SVC and GaussianNB do not), so guard the lookup
# instead of letting the cell fail with AttributeError.
importances = getattr(clf, "feature_importances_", None)
if importances is None:
    print(f"{type(clf).__name__} does not expose feature_importances_; importances are shown as NaN.")
    # Same shape as the normal result so downstream code and display still work.
    feature_importance = pd.DataFrame(index=X.columns, columns=["Importance"], dtype=float)
else:
    feature_importance = pd.DataFrame(importances, index=X.columns, columns=["Importance"])
# Bare last expression -> rich table display, most important feature first.
feature_importance.sort_values("Importance", ascending=False)

Unnamed: 0,Importance
cp,0.2
slp,0.2
caa,0.2
thall,0.165351
exng,0.164576
age,0.070073
sex,0.0
trtbps,0.0
chol,0.0
fbs,0.0
