# Logistic Regression and Feature Selection

In [37]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.dummy import DummyClassifier
import matplotlib.pyplot as plt
from evaluation_utils import *
import warnings
warnings.filterwarnings('ignore')

In [11]:
# Load data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

X_train = train.drop(columns='died')
X_test = test.drop(columns='died')
y_train = train.died
y_test = test.died

We will use logistic regression as a baseline using attributes from the patient baseline table including variables related to patient demographics and admission and discharge details for the ICU stay. Then, we will progressively add subsets of variables to see if there is any improvement.

### Baseline Logistic Regression Model with patient variables

In [18]:
X_train_patient = X_train.iloc[:, np.r_[0:5, 128:228, 236]]
X_test_patient = X_test.iloc[:, np.r_[0:5, 128:228, 236]]

In [19]:
lr_base = LogisticRegression(penalty='none',
                             max_iter=1000).fit(X_train_patient, y_train)

train_precision, train_recall, _ = (
    precision_recall_curve(y_train, lr_base.predict_proba(X_train_patient)[:, 1])
)
test_precision, test_recall, _ = (
    precision_recall_curve(y_test, lr_base.predict_proba(X_test_patient)[:, 1])
)

train_auprc = auc(train_recall, train_precision)
test_auprc = auc(test_recall, test_precision)

print(round(train_auprc, 3))
print(round(test_auprc, 3))

0.29
0.275


### Logistic Regression with APACHE prediction data and data on other patient conditions added

In [20]:
X_train_2 = X_train.iloc[:, np.r_[0:30, 128:228, 236]]
X_test_2 = X_test.iloc[:, np.r_[0:30, 128:228, 236]]

In [21]:
lr_mod_2 = LogisticRegression(penalty='none',
                              max_iter=1000).fit(X_train_2, y_train)

train_precision, train_recall, _ = (
    precision_recall_curve(y_train, lr_mod_2.predict_proba(X_train_2)[:, 1])
)
test_precision, test_recall, _ = (
    precision_recall_curve(y_test, lr_mod_2.predict_proba(X_test_2)[:, 1])
)

train_auprc = auc(train_recall, train_precision)
test_auprc = auc(test_recall, test_precision)

print(round(train_auprc, 3))
print(round(test_auprc, 3))

0.448
0.411


### Logistic Regression with hospital attributes added

In [22]:
X_train_3 = X_train.iloc[:, np.r_[0:31, 128:237]]
X_test_3 = X_test.iloc[:, np.r_[0:31, 128:237]]

In [23]:
lr_mod_3 = LogisticRegression(penalty='none').fit(X_train_3, y_train)

train_precision, train_recall, _ = (
    precision_recall_curve(y_train, lr_mod_3.predict_proba(X_train_3)[:, 1])
)
test_precision, test_recall, _ = (
    precision_recall_curve(y_test, lr_mod_3.predict_proba(X_test_3)[:, 1])
)

train_auprc = auc(train_recall, train_precision)
test_auprc = auc(test_recall, test_precision)

print(round(train_auprc, 3))
print(round(test_auprc, 3))

0.394
0.388


### Logistic Regression with drug attribute (number of infusions) added

In [24]:
X_train_4 = X_train.iloc[:, np.r_[0:32, 128:237]]
X_test_4 = X_test.iloc[:, np.r_[0:32, 128:237]]

In [25]:
lr_mod_4 = LogisticRegression(penalty='none').fit(X_train_4, y_train)

train_precision, train_recall, _ = (
    precision_recall_curve(y_train, lr_mod_4.predict_proba(X_train_4)[:, 1])
)
test_precision, test_recall, _ = (
    precision_recall_curve(y_test, lr_mod_4.predict_proba(X_test_4)[:, 1])
)

train_auprc = auc(train_recall, train_precision)
test_auprc = auc(test_recall, test_precision)

print(round(train_auprc, 3))
print(round(test_auprc, 3))

0.397
0.386


### Logistic Regression with lab result data

In [26]:
X_train_5 = X_train.iloc[:, np.r_[0:77, 128:237]]
X_test_5 = X_test.iloc[:, np.r_[0:77, 128:237]]

In [27]:
lr_mod_5 = LogisticRegression(penalty='none').fit(X_train_5, y_train)

train_precision, train_recall, _ = (
    precision_recall_curve(y_train, lr_mod_5.predict_proba(X_train_5)[:, 1])
)
test_precision, test_recall, _ = (
    precision_recall_curve(y_test, lr_mod_5.predict_proba(X_test_5)[:, 1])
)

train_auprc = auc(train_recall, train_precision)
test_auprc = auc(test_recall, test_precision)

print(round(train_auprc, 3))
print(round(test_auprc, 3))

0.406
0.383


### Logistic Regression with respiratory charting data added

In [28]:
X_train_6 = X_train.iloc[:, np.r_[0:77, 125:128, 128:237]]
X_test_6 = X_test.iloc[:, np.r_[0:77, 125:128, 128:237]]

In [29]:
lr_mod_6 = LogisticRegression(penalty='none').fit(X_train_6, y_train)

train_precision, train_recall, _ = (
    precision_recall_curve(y_train, lr_mod_6.predict_proba(X_train_6)[:, 1])
)
test_precision, test_recall, _ = (
    precision_recall_curve(y_test, lr_mod_6.predict_proba(X_test_6)[:, 1])
)

train_auprc = auc(train_recall, train_precision)
test_auprc = auc(test_recall, test_precision)

print(round(train_auprc, 3))
print(round(test_auprc, 3))

0.414
0.381


### Logistic Regression with vital sign data added

In [30]:
lr_mod_7 = LogisticRegression(penalty='none',
                              max_iter=1000).fit(X_train, y_train)

train_precision, train_recall, _ = (
    precision_recall_curve(y_train, lr_mod_7.predict_proba(X_train)[:, 1])
)
test_precision, test_recall, _ = (
    precision_recall_curve(y_test, lr_mod_7.predict_proba(X_test)[:, 1])
)

train_auprc = auc(train_recall, train_precision)
test_auprc = auc(test_recall, test_precision)

print(round(train_auprc, 3))
print(round(test_auprc, 3))

0.468
0.409


### Logistic Regression with lasso penalty

In [31]:
# Set up k-fold validation set
k_folds = KFold(n_splits=5, shuffle=True, random_state=670)
splits = list(k_folds.split(X_train, y_train))

param_grid = {'logit__C': [0.001, 0.01, 0.1, 1]}
    
components = [('scaler', StandardScaler()), 
              ('logit', LogisticRegression(penalty='l1',
                                           solver='liblinear'))]

pipe = Pipeline(components)
grid = GridSearchCV(pipe, param_grid, cv=splits, scoring='average_precision')

grid.fit(X_train, y_train)

print(grid.best_params_)

train_auprc = grid.score(X_train, y_train)
test_auprc = grid.score(X_test, y_test)
print(round(train_auprc, 3))
print(round(test_auprc, 3))

{'logit__C': 0.1}
0.516
0.447


In [36]:
# Get variables with zero and non-zero coefficients from LASSO
coefficients = grid.best_estimator_.named_steps['logit'].coef_
importance = np.abs(coefficients[0])
non_zero_vars = np.array(X_train.columns)[importance > 0]
non_zero_vars2 = np.array(X_train.columns)[importance > 0]
zero_vars = np.array(X_train.columns)[importance == 0]

print(non_zero_vars)
print(zero_vars)

['admissionweight' 'dischargeweight' 'icuduration'
 'weightdiffafterdischarge' 'intubated' 'vent' 'dialysis' 'urine' 'wbc'
 'temperature' 'respiratoryrate' 'sodium' 'heartrate' 'meanbp' 'ph'
 'albumin' 'glucose' 'bilirubin' 'fio2' 'pao2' 'pco2' 'bun'
 'meanapachescore' 'meanpredictedicumortality' 'meanpredictediculos'
 'meanventdays' 'immunosuppression' 'diabetes' 'teachingstatus'
 'numberofinfusions' 'lab_mean_ALT (SGPT)' 'lab_mean_AST (SGOT)'
 'lab_mean_BUN' 'lab_mean_Hct' 'lab_mean_Hgb' 'lab_mean_MCH'
 'lab_mean_MPV' 'lab_mean_O2 Sat (%)' 'lab_mean_PT' 'lab_mean_PTT'
 'lab_mean_RDW' 'lab_mean_WBC x 1000' 'lab_mean_albumin'
 'lab_mean_alkaline phos.' 'lab_mean_anion gap' 'lab_mean_bicarbonate'
 'lab_mean_calcium' 'lab_mean_chloride' 'lab_mean_creatinine'
 'lab_mean_eos' 'lab_mean_glucose' 'lab_mean_lactate' 'lab_mean_lymphs'
 'lab_mean_magnesium' 'lab_mean_monos' 'lab_mean_pH' 'lab_mean_paCO2'
 'lab_mean_paO2' 'lab_mean_phosphate' 'lab_mean_platelets x 1000'
 'lab_mean_sodium' 'lab_m

### Evaluate final model

In [38]:
# Evaluate baseline model
dummy_clf = DummyClassifier(strategy="most_frequent")
dummy_clf.fit(X_train, y_train)

train_preds = dummy_clf.predict(X_train)
test_preds = dummy_clf.predict(X_test)

# Compute precision
train_precision = precision_score(y_train, train_preds)
test_precision = precision_score(y_test, test_preds)

# Compute recall
train_recall = recall_score(y_train, train_preds)
test_recall = recall_score(y_test, test_preds)
        
# Compute F-beta score
train_f_score = fbeta_score(y_train, train_preds, beta=2)
test_f_score = fbeta_score(y_test, test_preds, beta=2)

print(train_precision, test_precision)
print(train_recall, test_recall)
print(train_f_score, test_f_score)

0.0 0.0
0.0 0.0
0.0 0.0


In [42]:
# Logistic Regression with lasso penalty
lasso_features = np.r_[1:16, 17:  18:35, 38:41, 43:46, 47, 49, 50:54, 56:71,
                       73:77, 79:81, 84:86, 87, 89, 93, 95, 97, 99:105, 107, 
                       109:111, 112:116, 118:120, 122, 124:140, 142:145, 
                       146:151, 152:156, 158, 160:162, 163:169, 173:175, 177, 
                       179:188, 189, 191, 194:197, 201, 204:208, 210:212, 213, 
                       216:218, 219, 221, 224, 226, 228, 230, 232:235, 236]

X_train_lasso = X_train.iloc[:, lasso_features]
X_test_lasso = X_test.iloc[:, lasso_features]

lr_lasso_mod = LogisticRegression(penalty='none',
                                  max_iter=1000).fit(X_train_lasso, y_train)

train_pred_probs = lr_lasso_mod.predict_proba(X_train_lasso)[:, 1]
test_pred_probs = lr_lasso_mod.predict_proba(X_test_lasso)[:, 1]

train_results = compute_metrics(train_pred_probs, y_train)
test_results = compute_metrics(test_pred_probs, y_test)

train_precision, train_recall, _ = (
    precision_recall_curve(y_train, 
                           lr_lasso_mod.predict_proba(X_train_lasso)[:, 1])
)
test_precision, test_recall, _ = (
    precision_recall_curve(y_test, 
                           lr_lasso_mod.predict_proba(X_test_lasso)[:, 1])
)

train_auprc = auc(train_recall, train_precision)
test_auprc = auc(test_recall, test_precision)

print(train_results)
print(test_results)
print(round(train_auprc, 3))
print(round(test_auprc, 3))

   threshold  precision    recall   f-score
9       0.09    0.24409  0.803797  0.551072
    threshold  precision   recall   f-score
12       0.12   0.277722  0.68642  0.530332
0.414
0.351
