# South African Heart Disease

In [1]:
import cvxpy as cp
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression

In [2]:
# Load the South African Heart Disease data and build the design matrix.
df = pd.read_csv("../data/South African Heart Disease.txt")

# Encode family history as an integer 0/1 indicator (1 == 'Present').
# pd.get_dummies returns boolean columns on recent pandas versions, which
# would turn df[features].values into an object-dtype array; an explicit
# integer encoding keeps X purely numeric for both scikit-learn and CVXPY.
df['famhist'] = (df['famhist'] == 'Present').astype(int)

target = ['chd']
features = ['sbp', 'tobacco', 'ldl', 'famhist', 'obesity', 'alcohol', 'age']

# X is (n_samples, n_features). y is kept 2-D, (n_samples, 1), because it is
# selected with a list of columns; y_flat is the 1-D view of the same labels.
X, y = df[features].values, df[target].values
y_flat = np.squeeze(y)
df[features + target].head()

Unnamed: 0,sbp,tobacco,ldl,famhist,obesity,alcohol,age,chd
0,160,12.0,5.73,1,25.3,97.2,52,1
1,144,0.01,4.41,0,28.87,2.06,63,1
2,118,0.08,3.48,1,29.14,3.81,46,0
3,170,7.5,6.41,1,31.99,24.26,58,1
4,134,13.6,3.5,1,25.99,57.34,49,1


## Baseline Accuracy Score

In [3]:
# Constant baseline: predict class 0 for every row and score it against the
# labels in y_flat (the same rows every model below is scored on).
baseline_pred = np.zeros(y_flat.shape[0])
baseline_accuracy = accuracy_score(y_flat, baseline_pred)
print(f'Baseline test accuracy score: {baseline_accuracy:0.3f}')

Baseline test accuracy score: 0.654


## Linear Regression

In [4]:
# Ordinary least squares fitted directly on the 0/1 label.
# y is 2-D (single target column), so reg.coef_ is 2-D as well and row 0
# holds the per-feature coefficients.
reg = LinearRegression().fit(X, y)
for feature, coef in zip(features, reg.coef_[0]):
    print(f'{feature:>14} {coef:>10.4f}')

# Threshold the fitted values at 0.5 to get 0/1 class predictions.
# Accuracy is measured on the same rows the model was fitted on.
fitted = np.squeeze(reg.predict(X))
y_hat = (fitted > 0.5).astype(int)
print('-------------------------')
print(f'accuracy score {accuracy_score(y_flat, y_hat):>10.4f}')

           sbp     0.0012
       tobacco     0.0170
           ldl     0.0359
       famhist     0.1806
       obesity    -0.0074
       alcohol    -0.0001
           age     0.0067
-------------------------
accuracy score     0.7359


## Linear Constrained Regression with CVXPY

In [5]:
# Least-squares fit with every feature coefficient constrained to be >= 0.
coef_ = cp.Variable(len(features))
intercept_ = cp.Variable(1)

# Use the 1-D labels (y_flat) so the residual is unambiguously shape (n,):
# X @ coef_ is (n,), whereas y is (n, 1). Mixing the two relies on
# version-specific CVXPY broadcasting rules (it may be rejected or expanded
# to (n, n)), so the shapes are made to match explicitly.
residual = y_flat - intercept_ - X @ coef_
loss = cp.sum_squares(residual)

constraints = [coef_ >= 0]
prob = cp.Problem(cp.Minimize(loss), constraints)
# The last expression is displayed as the cell output: the optimal loss value.
prob.solve()

81.87035937688988

In [6]:
# Report the constrained coefficients and the accuracy of the thresholded fit.
# Fail loudly if the solver produced no solution: Variable.value is None then,
# and the arithmetic below would otherwise fail with an unrelated error.
if coef_.value is None or intercept_.value is None:
    raise RuntimeError(f'CVXPY returned no solution (status: {prob.status})')

# Copy the solution into a new name instead of rebinding coef_, so the CVXPY
# variable stays intact and this cell can be re-run without re-solving.
coef_values = np.asarray(coef_.value).ravel()
for feature, coef in zip(features, coef_values):
    print(f'{feature:>14} {coef:>10.4f}')

# Threshold the fitted values at 0.5 to get 0/1 class predictions.
# Accuracy is measured on the same rows the model was fitted on.
y_hat = (np.squeeze(intercept_.value + X @ coef_values) > 0.5).astype(int)
print('-------------------------')
print(f'accuracy score {accuracy_score(y_flat, y_hat):>10.4f}')

           sbp     0.0010
       tobacco     0.0171
           ldl     0.0321
       famhist     0.1786
       obesity     0.0000
       alcohol     0.0000
           age     0.0063
-------------------------
accuracy score     0.7424
