In [15]:
import pandas as pd
import numpy as np 
from sklearn import linear_model


In [16]:
hd_data = pd.read_csv("data/hd_data.csv")
hd_data

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium,Heart Disease
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3,Presence
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7,Absence
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7,Presence
3,64,1,4,128,263,0,0,105,1,0.2,2,1,7,Absence
4,74,0,2,120,269,0,2,121,1,0.2,1,1,3,Absence
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
265,52,1,3,172,199,1,0,162,0,0.5,1,0,7,Absence
266,44,1,2,120,263,0,0,173,0,0.0,1,0,7,Absence
267,56,0,2,140,294,0,2,153,0,1.3,2,0,3,Absence
268,57,1,4,140,192,0,0,148,0,0.4,2,0,6,Absence


In [17]:
hd_data.columns

Index(['Age', 'Sex', 'Chest pain type', 'BP', 'Cholesterol', 'FBS over 120',
       'EKG results', 'Max HR', 'Exercise angina', 'ST depression',
       'Slope of ST', 'Number of vessels fluro', 'Thallium', 'Heart Disease'],
      dtype='str')

In [18]:
#split features and outcomes
X = hd_data[['Age', 'Sex', 'Chest pain type', 'BP', 'Cholesterol', 'FBS over 120',
       'EKG results', 'Max HR', 'Exercise angina', 'ST depression',
       'Slope of ST', 'Number of vessels fluro', 'Thallium']]
hd_data['Heart Disease Numeric'] = hd_data['Heart Disease'].replace({'Absence': 0, 'Presence': 1})

y = hd_data['Heart Disease Numeric']  
X.head(3)

Unnamed: 0,Age,Sex,Chest pain type,BP,Cholesterol,FBS over 120,EKG results,Max HR,Exercise angina,ST depression,Slope of ST,Number of vessels fluro,Thallium
0,70,1,4,130,322,0,2,109,0,2.4,2,3,3
1,67,0,3,115,564,0,2,160,0,1.6,2,0,7
2,57,1,2,124,261,0,0,141,0,0.3,1,0,7


In [19]:
print(X.dtypes)
print(y.dtypes)

Age                          int64
Sex                          int64
Chest pain type              int64
BP                           int64
Cholesterol                  int64
FBS over 120                 int64
EKG results                  int64
Max HR                       int64
Exercise angina              int64
ST depression              float64
Slope of ST                  int64
Number of vessels fluro      int64
Thallium                     int64
dtype: object
object


In [20]:
y = y.astype(int)
y.dtypes

dtype('int64')

In [21]:
y

0      1
1      0
2      1
3      0
4      0
      ..
265    0
266    0
267    0
268    0
269    1
Name: Heart Disease Numeric, Length: 270, dtype: int64

In [22]:
from sklearn.model_selection import train_test_split

#split data into 70% training, 30% testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [23]:
#train data
reg = linear_model.BayesianRidge(max_iter=1000)
reg.fit(X_train, y_train)

0,1,2
,"max_iter  max_iter: int, default=300 Maximum number of iterations over the complete dataset before stopping independently of any early stopping criterion. .. versionchanged:: 1.3",1000
,"tol  tol: float, default=1e-3 Stop the algorithm if w has converged.",0.001
,"alpha_1  alpha_1: float, default=1e-6 Hyper-parameter : shape parameter for the Gamma distribution prior over the alpha parameter.",1e-06
,"alpha_2  alpha_2: float, default=1e-6 Hyper-parameter : inverse scale parameter (rate parameter) for the Gamma distribution prior over the alpha parameter.",1e-06
,"lambda_1  lambda_1: float, default=1e-6 Hyper-parameter : shape parameter for the Gamma distribution prior over the lambda parameter.",1e-06
,"lambda_2  lambda_2: float, default=1e-6 Hyper-parameter : inverse scale parameter (rate parameter) for the Gamma distribution prior over the lambda parameter.",1e-06
,"alpha_init  alpha_init: float, default=None Initial value for alpha (precision of the noise). If not set, alpha_init is 1/Var(y). .. versionadded:: 0.22",
,"lambda_init  lambda_init: float, default=None Initial value for lambda (precision of the weights). If not set, lambda_init is 1. .. versionadded:: 0.22",
,"compute_score  compute_score: bool, default=False If True, compute the log marginal likelihood at each iteration of the optimization.",False
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. The intercept is not treated as a probabilistic parameter and thus has no associated variance. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True


In [24]:
#print coefficients to see which contributes the most
print(reg.coef_)
print(reg.feature_names_in_)
print(reg.intercept_)


[-0.00164581  0.09324814  0.08481745  0.00191024  0.00046306 -0.0355875
  0.00791559 -0.00142066  0.09666815  0.07059312  0.06647226  0.10835131
  0.06234403]
['Age' 'Sex' 'Chest pain type' 'BP' 'Cholesterol' 'FBS over 120'
 'EKG results' 'Max HR' 'Exercise angina' 'ST depression' 'Slope of ST'
 'Number of vessels fluro' 'Thallium']
-0.5389700145843088


In [25]:
df = pd.DataFrame(np.array([reg.feature_names_in_, reg.coef_]).T, columns=["Feature", "Weight"])
df

Unnamed: 0,Feature,Weight
0,Age,-0.001646
1,Sex,0.093248
2,Chest pain type,0.084817
3,BP,0.00191
4,Cholesterol,0.000463
5,FBS over 120,-0.035587
6,EKG results,0.007916
7,Max HR,-0.001421
8,Exercise angina,0.096668
9,ST depression,0.070593


In [26]:
#test data
y_preds = reg.predict(X_test)

In [27]:
#metrics
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, r2_score
y_preds_bin = (y_preds > 0.5).astype(int) #make binary

accuracy = accuracy_score(y_test, y_preds_bin)
recall = recall_score(y_test, y_preds_bin)
precision = precision_score(y_test, y_preds_bin)
f1score = f1_score(y_test, y_preds_bin)
r2 = r2_score(y_test, y_preds_bin)

print("Classification recall:", recall)
print("Classification precision:", precision)
print("Classification f1 score:", f1score)


Classification recall: 0.65625
Classification precision: 0.9130434782608695
Classification f1 score: 0.7636363636363637
