Setup

In [1]:
# Common imports
import numpy as np
import pandas as pd

# To plot pretty figures
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# to make this notebook's output identical at every run
np.random.seed(42)

# Ignore useless warnings
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

#Variable information

Age: age in years, 
Sex: 1 = male, 0 = female
CP (chest pain): 1 - typical angina, 2: atypical angina, 3: non-anginal pain, 4: asymptomatic
Trestbps: (resting blood pressure in mmHg)
Serum cholesterol levels: in mg/dl
FBS: fasting blood sugar > 120 mg/dl, 1 = true; 0 = false)
Restecg: Resting electrocardiographic results, 0: normal, 1: having ST-T wave abnormality (T  wave inversions and/or ST elevation or depression of > 0.05 mV), 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
Thalach: Maximum heart rate achieved
Exang: Exercise induced angina (1 = yes; 0 = no)
Oldpeak: ST depression induced by exercise relative to rest    
Slope: Slope of the peak exercise ST segment, 1: upsloping, 2: flat, 3: downsloping   
CA: Number of major vessels (0-3) colored by fluoroscopy 
Thal: 3 = normal; 6 = fixed defect; 7 = reversible defect
Num: diagnosis of heart disease (angiographic disease status), 0: < 50% diameter narrowing, 1: > 50% diameter narrowing (in any major vessel: attributes 59 through 68 are vessels)

In [2]:
# Get the data
heart = pd.read_csv('/Users/dustin.ellis/Desktop/heartdata.csv')
heart.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,28,1,2,130,132,0,2,185,0,0.0,?,?,?,0
1,29,1,2,120,243,0,0,160,0,0.0,?,?,?,0
2,29,1,2,140,?,0,0,170,0,0.0,?,?,?,0
3,30,0,1,170,237,0,1,170,0,0.0,?,?,6,0
4,31,0,2,100,219,0,1,150,0,0.0,?,?,?,0


In [3]:
#slope, ca, and thal all have missing values, so I will drop these columns to make the data useable
heart = heart.drop(['slope', 'ca', 'thal'], axis = 1) 

In [4]:
#heart.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,num
0,28,1,2,130,132,0,2,185,0,0.0,0
1,29,1,2,120,243,0,0,160,0,0.0,0
2,29,1,2,140,?,0,0,170,0,0.0,0
3,30,0,1,170,237,0,1,170,0,0.0,0
4,31,0,2,100,219,0,1,150,0,0.0,0


In [4]:
#heart.describe()

In [6]:
heart.keys()

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'num       '],
      dtype='object')

In [3]:
#heart['age'].value_counts().plot(kind = 'bar', figsize=(7,4))

middle aged people (late 40s-50s) appear to be the ones most likely affected by heart attack

In [2]:
#heart['sex'].value_counts().plot(kind = 'bar', figsize=(7,4))

Men were largely sampled in the study, so maybe worth looking into heart attack risk within sex

In [9]:
#Num: diagnosis of heart disease (angiographic disease status)
#0: < 50% diameter narrowing
#1: > 50% diameter narrowing (in any major vessel: attributes 59 through 68 are vessels) 

corr_matrix = heart.corr()
corr_matrix["num       "].sort_values(ascending=False)

num           1.000000
oldpeak       0.545700
cp            0.505864
sex           0.272781
age           0.159315
Name: num       , dtype: float64

Oldpeak: ST depression induced by exercise relative to rest. 
ST depression refers to a finding on an electrocardiogram,[1][2] wherein the trace in the ST segment is abnormally low below the baseline.

CP: Chestpain, 1 - typical angina, 2: atypical angina, 3: non-anginal pain, 4: asymptomatic. Understandably, someone with chest pain is likely to be at risk for heart disease.

Sex: Male or female, low + correlation with whether someone has heart disease or not. 

Age: Age in years, low + correlation with whether someone has heart disease or not. 

In [10]:
heart["num       "].value_counts()

0    188
1    106
Name: num       , dtype: int64

In [1]:
import seaborn as sns
sns.pairplot(data = heart, hue = 'oldpeak')

NameError: name 'heart' is not defined

In [12]:
heart.isin(['?']).sum()

age            0
sex            0
cp             0
trestbps       1
chol          23
fbs            8
restecg        1
thalach        1
exang          1
oldpeak        0
num            0
dtype: int64

In [13]:
#Get indexes where name column has value '?'
indexName = heart[heart['trestbps'] == '?'].index
 
#Delete these row indexes from dataFrame
heart.drop(indexName , inplace=True)

#Get indexes where name column has value '?'
indexName1 = heart[heart['chol'] == '?'].index
 
#Delete these row indexes from dataFrame
heart.drop(indexName1 , inplace=True)

#Get indexes where name column has value '?'
indexName2 = heart[heart['fbs'] == '?'].index
 
#Delete these row indexes from dataFrame
heart.drop(indexName2 , inplace=True)

#Get indexes where name column has value '?'
indexName3 = heart[heart['restecg'] == '?'].index
 
#Delete these row indexes from dataFrame
heart.drop(indexName3 , inplace=True)

#Get indexes where name column has value '?'
indexName4 = heart[heart['thalach'] == '?'].index
 
#Delete these row indexes from dataFrame
heart.drop(indexName4 , inplace=True)

#Get indexes where name column has value '?'
indexName5 = heart[heart['exang'] == '?'].index
 
#Delete these row indexes from dataFrame
heart.drop(indexName5 , inplace=True)

heart.shape

(261, 11)

In [14]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(heart, test_size=0.2, random_state=42)

#Set labels
train_labels = train_set["num       "] 
test_labels = test_set["num       "]

#Drop target from training and testing set
true_train_set = train_set.drop("num       ", axis = 1)
true_test_set = test_set.drop("num       ", axis = 1)

In [15]:
true_test_set.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak
33,39,1,2,120,204,0,0,145,0,0.0
165,55,1,4,140,229,0,0,110,1,0.5
195,38,1,4,120,282,0,0,170,0,0.0
82,46,1,4,110,238,0,1,140,1,1.0
184,61,0,4,130,294,0,1,120,1,1.0


In [16]:
true_train_set.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak
241,54,1,3,120,237,0,0,150,1,1.5
73,45,0,4,132,297,0,0,144,0,0.0
17,36,1,3,112,340,0,0,184,0,1.0
60,43,0,2,120,201,0,0,165,0,0.0
283,47,1,4,160,291,0,1,158,1,3.0


In [17]:
heart.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 261 entries, 0 to 293
Data columns (total 11 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   age         261 non-null    int64  
 1   sex         261 non-null    int64  
 2   cp          261 non-null    int64  
 3   trestbps    261 non-null    object 
 4   chol        261 non-null    object 
 5   fbs         261 non-null    object 
 6   restecg     261 non-null    object 
 7   thalach     261 non-null    object 
 8   exang       261 non-null    object 
 9   oldpeak     261 non-null    float64
 10  num         261 non-null    int64  
dtypes: float64(1), int64(4), object(6)
memory usage: 34.5+ KB


In [18]:
#Separate categorical and numerical variables for pipeline

cat_attr = ["trestbps", "chol", "fbs", "restecg", "thalach", "exang"]
num_attr = ["age", "sex", "cp", "oldpeak"]

In [19]:
#Combining categorical and numerical pipeline

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

combine_pipeline = ColumnTransformer([
    ("num", StandardScaler(), num_attr),
    ("cat", OneHotEncoder(handle_unknown = "ignore"), cat_attr),
    ])

In [20]:
#Fit and Transform Train
trans_train = combine_pipeline.fit_transform(true_train_set)


#Transform Test
trans_test = combine_pipeline.transform(true_test_set)

In [21]:
#Create SMOTE algorithm

from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state = 42)

#resample training set
x_smote_train, x_smote_labels = smote.fit_resample(trans_train, train_labels)

### Logistic Regression Model

In [22]:
from sklearn.linear_model import LogisticRegression

#Logistic Regression algorithm
log_reg = LogisticRegression(random_state = 42, solver='lbfgs', max_iter = 400)

In [23]:
log_reg.fit(x_smote_train, x_smote_labels)

LogisticRegression(max_iter=400, random_state=42)

In [24]:
#Logisitic Regression Model Score
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report

#Predict labels
pred_log_reg = log_reg.predict(trans_test)

#accuracy score
log_acc = accuracy_score(pred_log_reg, test_labels)
#precision score
log_prec = precision_score(pred_log_reg, test_labels, average = 'binary', pos_label = 1)
#recall_score
log_recall = recall_score(pred_log_reg, test_labels, average = 'binary', pos_label = 1)

print(log_acc, log_prec, log_recall)
print(classification_report(test_labels, pred_log_reg))

0.7924528301886793 0.85 0.68
              precision    recall  f1-score   support

           0       0.89      0.76      0.82        33
           1       0.68      0.85      0.76        20

    accuracy                           0.79        53
   macro avg       0.79      0.80      0.79        53
weighted avg       0.81      0.79      0.80        53



#### Automatic hyperparameter search

In [25]:
log_reg = LogisticRegression(random_state = 42)

#Hyperparameter choices
param_grid = [
    {"penalty" : ["l1", "l2", "elasticnet", "none"],
    "solver" : ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
     "max_iter" : [300, 400, 500, 1000]
    }
]

In [26]:
#Use of GridSearchCV
from sklearn.model_selection import GridSearchCV

hyper_log = GridSearchCV(log_reg, param_grid = param_grid, cv = 3, n_jobs = -1)

In [5]:
#find_log = hyper_log.fit(x_smote_train, x_smote_labels)

In [28]:
#Find best hyperparameters to use with log regression

find_log.best_estimator_

LogisticRegression(max_iter=400, penalty='none', random_state=42, solver='saga')

In [29]:
#Examine accuracy for training set and training labels
find_log.score(x_smote_train, x_smote_labels)

1.0

In [30]:
#Use on testing

log_reg2 = LogisticRegression(random_state = 42, solver = "saga", penalty = "none", max_iter = 400)
log_reg2.fit(x_smote_train, x_smote_labels)

#Predict labels
pred_log_reg = log_reg2.predict(trans_test)

#accuracy score
hyper_log_acc = accuracy_score(pred_log_reg, test_labels)
#precision score
hyper_log_prec = precision_score(pred_log_reg, test_labels, average = 'binary', pos_label = 1)
#recall_score
hyper_log_recall = recall_score(pred_log_reg, test_labels, average = 'binary', pos_label = 1)

print(hyper_log_acc, hyper_log_prec, hyper_log_recall)
print(classification_report(test_labels, pred_log_reg))

0.7735849056603774 0.75 0.6818181818181818
              precision    recall  f1-score   support

           0       0.84      0.79      0.81        33
           1       0.68      0.75      0.71        20

    accuracy                           0.77        53
   macro avg       0.76      0.77      0.76        53
weighted avg       0.78      0.77      0.78        53





### Support Vector Machine Model 

#### Without specific hyperparameters

In [31]:
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

#SVM Linear Algorithm
svm = LinearSVC(C = 1, loss = "hinge")
svm.fit(x_smote_train, x_smote_labels)



LinearSVC(C=1, loss='hinge')

In [33]:
pred_svm = svm.predict(trans_test)

#accuracy score
svm_acc = accuracy_score(pred_svm, test_labels)
#precision score
svm_prec = precision_score(pred_svm, test_labels, average = 'binary', pos_label = 1)
#recall_score
svm_recall = recall_score(pred_svm, test_labels, average = 'binary', pos_label = 1)

print(svm_acc, svm_prec, svm_recall)

0.7735849056603774 0.75 0.6818181818181818


In [34]:
svm = LinearSVC()

#Hyperparameter choices
param_grid = [
    {"penalty" : ["l1", "l2"],
    "loss" : ["hinge", "squared_hinge"],
     "C" : [1.0, 2.0, 3.0, 0.5]
    }
]

In [35]:
#Create GridSearch for SVM
hyper_lsvm = GridSearchCV(svm, param_grid = param_grid, cv = 3, n_jobs = -1) 

In [36]:
find_l_svm = hyper_lsvm.fit(x_smote_train, x_smote_labels)

        nan 0.86554397        nan 0.85017375        nan 0.86171255
        nan 0.83088301        nan 0.87717188]


In [37]:
find_l_svm.best_estimator_

LinearSVC(C=0.5)

In [38]:
#Training score
find_l_svm.score(x_smote_train, x_smote_labels)

0.9961538461538462

In [39]:
#Use on testing

svm_2 = LinearSVC(C = 0.5)
svm_2.fit(x_smote_train, x_smote_labels)

#Predict labels
pred_svm = svm_2.predict(trans_test)

#accuracy score
hyper_svm_acc = accuracy_score(pred_svm, test_labels)
#precision score
hyper_svm_prec = precision_score(pred_svm, test_labels, average = 'binary', pos_label = 1)
#recall_score
hyper_svm_recall = recall_score(pred_svm, test_labels, average = 'binary', pos_label = 1)

print(hyper_svm_acc, hyper_svm_prec, hyper_svm_recall)
print(classification_report(test_labels, pred_svm))

0.7735849056603774 0.75 0.6818181818181818
              precision    recall  f1-score   support

           0       0.84      0.79      0.81        33
           1       0.68      0.75      0.71        20

    accuracy                           0.77        53
   macro avg       0.76      0.77      0.76        53
weighted avg       0.78      0.77      0.78        53



### Naive Bayes Model

In [40]:
from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB()
bnb.fit(x_smote_train, x_smote_labels)

BernoulliNB()

In [41]:
pred_bnb = bnb.predict(trans_test)

#accuracy score
bnb_acc = accuracy_score(pred_bnb, test_labels)
#precision score
bnb_prec = precision_score(pred_bnb, test_labels, average = 'binary', pos_label = 1)
#recall_score
bnb_recall = recall_score(pred_bnb, test_labels, average = 'binary', pos_label = 1)

print(bnb_acc, bnb_prec, bnb_recall)

0.7735849056603774 0.7 0.7


In [42]:
bnb = BernoulliNB(alpha = 2.0)
bnb.fit(x_smote_train, x_smote_labels)

BernoulliNB(alpha=2.0)

In [43]:
pred_bnb = bnb.predict(trans_test)

#accuracy score
bnb_acc2 = accuracy_score(pred_bnb, test_labels)
#precision score
bnb_prec2 = precision_score(pred_bnb, test_labels, average = 'binary', pos_label = 1)
#recall_score
bnb_recall2 = recall_score(pred_bnb, test_labels, average = 'binary', pos_label = 1)

print(bnb_acc2, bnb_prec2, bnb_recall2)

0.7735849056603774 0.7 0.7


In [46]:
#Creation of metrics dataframe
import pandas as pd

model_metrics = pd.DataFrame([["Logistic Regression", log_acc, log_prec, log_recall], 
                              ["SVM", hyper_svm_acc, hyper_svm_prec, hyper_svm_recall],
                              ["Naive Bayes", bnb_acc, bnb_prec, bnb_recall]],
                              columns = ["Model", "Accuracy", "Precision", "Recall"])
model_metrics

Unnamed: 0,Model,Accuracy,Precision,Recall
0,Logistic Regression,0.792453,0.85,0.68
1,SVM,0.773585,0.75,0.681818
2,Naive Bayes,0.773585,0.7,0.7
