In [73]:
import pandas as pd
import numpy as np

In [74]:
df = pd.read_csv("../data/South African Heart Disease.txt")
df['famhist'] = pd.get_dummies(df['famhist'])['Present']
target = ['chd']
features = ['sbp', 'tobacco', 'ldl', 'famhist', 'obesity', 'alcohol', 'age']
df[features + target].head()

Unnamed: 0,sbp,tobacco,ldl,famhist,obesity,alcohol,age,chd
0,160,12.0,5.73,1,25.3,97.2,52,1
1,144,0.01,4.41,0,28.87,2.06,63,1
2,118,0.08,3.48,1,29.14,3.81,46,0
3,170,7.5,6.41,1,31.99,24.26,58,1
4,134,13.6,3.5,1,25.99,57.34,49,1


In [75]:
X, y = df[features].values, np.squeeze(df[target].values)

In [76]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score

In [88]:
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, train_size=0.8, random_state=279, stratify=y)

In [78]:
i0 = np.where(y_train == 0)[0]
i1 = np.where(y_train == 1)[0]
selected = np.hstack((np.random.choice(i1, size=len(i0), replace=True), i0))
X_train = X_train[selected]
y_train = y_train[selected]

In [92]:
model = Pipeline([('logistic', LogisticRegression(solver='lbfgs', penalty='l2', random_state=1, max_iter=1000))])

In [93]:
grid_search = GridSearchCV(
    model, 
    {'logistic__C': np.linspace(0.0001, 0.001, 20)},
    cv=10,
    scoring='roc_auc'
).fit(X_train, y_train)
grid_search.best_params_

{'logistic__C': 0.0009526315789473685}

In [13]:
lr = LogisticRegression(solver='lbfgs').fit(X_train, y_train)

In [94]:
accuracy_score(y_test, grid_search.best_estimator_.predict(X_test))

0.7096774193548387

In [87]:
roc_auc_score(y_test, grid_search.best_estimator_.predict_proba(X_test)[:,1])

0.7986680327868853