In [1]:
# Utilizing the UCI heart disease dataset, see https://archive.ics.uci.edu/ml/datasets/Heart+Disease

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

In [2]:
# Read the heart-disease table from the local CSV file, then display
# per-column summary statistics (count, mean, std, min, quartiles, max).
df = pd.read_csv(filepath_or_buffer='heart.csv')
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


In [3]:
# The label is the 'target' column; every remaining column is a feature.
y = df['target']
X = df.drop('target', axis='columns')

# Hold out 25% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=123,
)

In [4]:
# Two-step pipeline: RobustScaler rescales every feature using statistics that
# are robust to outliers (centering plus scaling by the 25-75 quantile range);
# it does NOT remove outlying rows. The rescaled features are then fed to a
# logistic regression classifier.
# fit() returns the fitted pipeline itself, so it can be assigned directly.
clp = make_pipeline(RobustScaler(), LogisticRegression()).fit(X_train, y_train)
clp

Pipeline(memory=None,
     steps=[('robustscaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('logisticregression', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [5]:
# Score the fitted pipeline on the held-out test split and print per-class
# precision / recall / F1 with three decimals.
# target_names are matched to the sorted class labels, i.e. 0 -> 'healthy'
# and 1 -> 'sick'.
# NOTE(review): this assumes target == 1 encodes disease presence -- confirm
# against the dataset's documentation before interpreting the report.
y_test_pred = clp.predict(X_test)
print(
    classification_report(
        y_true=y_test,
        y_pred=y_test_pred,
        target_names=['healthy', 'sick'],
        digits=3,
    )
)

             precision    recall  f1-score   support

    healthy      0.811     0.769     0.789        39
       sick      0.769     0.811     0.789        37

avg / total      0.791     0.789     0.789        76



In [6]:
# Pair each feature name with its learned logistic-regression weight.
# The classifier is the pipeline's final step; the weights apply to the
# RobustScaler-transformed features, not to the raw column values.
# A positive weight pushes the prediction toward target == 1.
lr = clp.named_steps['logisticregression']
[(feature, weight) for feature, weight in zip(X.columns, lr.coef_[0])]

[('age', -0.21757115288637677),
 ('sex', -1.308533918856016),
 ('cp', 1.673682919222072),
 ('trestbps', -0.29232222797862023),
 ('chol', -0.11228729344765497),
 ('fbs', 0.14792707427214336),
 ('restecg', 0.564424230662709),
 ('thalach', 0.7324955449421936),
 ('exang', -0.6380038516989115),
 ('oldpeak', -1.0330724835007572),
 ('slope', 0.10133322393768755),
 ('ca', -0.7103340363437192),
 ('thal', -0.8873428445797749)]