# AKI - early/delayed renal treatment

*import libraries*

In [4]:
# Import libraries here
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
pd.options.mode.chained_assignment = None  # default='warn'

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import (RidgeCV, 
                                  LassoCV, 
                                  ElasticNetCV, 
                                  LogisticRegressionCV,
                                  LinearRegression,
                                  LogisticRegression)
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.metrics import (confusion_matrix, 
                             plot_confusion_matrix, 
                             roc_auc_score, 
                             plot_roc_curve, 
                             accuracy_score,
                            )
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC, SVC

import xgboost as xgb
import time
import warnings
warnings.filterwarnings('ignore')

In [34]:
# import data
aki_data = pd.read_csv('cleaned_data.csv')

In [35]:
pd.set_option('display.max_columns', 100)
aki_data.head()

Unnamed: 0,weight,aki_stage,delay_rrt,gender,admission_age,ethnicity,hematocrit_min,hematocrit_max,hemoglobin_min,hemoglobin_max,platelets_min,platelets_max,wbc_min,wbc_max,aniongap_min,aniongap_max,bicarbonate_min,bicarbonate_max,bun_min,bun_max,calcium_min,calcium_max,chloride_min,chloride_max,creatinine_min,creatinine_max,glucose_min,glucose_max,sodium_min,sodium_max,potassium_min,potassium_max,inr_max,pt_max,ptt_max,heart_rate_mean,sbp_mean,dbp_mean,mbp_mean,resp_rate_mean,temperature_mean,spo2_mean,glucose_mean,gcs_min
0,104.5,1,1,1,66.262081,unknown,31.4,31.9,11.3,11.3,246.0,247.0,11.2,11.2,20.0,23.0,23.0,24.0,109.0,114.0,8.7,9.5,106.0,114.0,3.0,3.1,172.0,317.0,146.0,154.0,3.9,4.1,1.3,14.6,47.5,89.625,132.5,68.5,83.090909,21.673077,37.405,96.909091,257.5,10
1,100.0,1,1,1,70.489938,black,34.0,34.2,11.3,11.4,185.0,186.0,12.1,13.5,14.0,16.0,20.0,21.0,36.0,41.0,7.5,7.7,103.0,104.0,1.1,1.1,106.0,140.0,134.0,135.0,4.4,4.8,1.3,13.9,25.0,84.208333,107.5,56.46,74.019231,17.134615,36.978,96.68,123.666667,3
2,98.95,3,0,1,37.700917,unknown,27.1,29.4,9.9,10.4,185.0,252.0,22.4,31.1,26.0,30.0,15.0,18.0,40.0,53.0,7.4,10.2,73.0,76.0,7.8,10.5,121.0,228.0,118.0,119.0,3.3,3.8,2.6,27.5,36.0,80.956522,101.958333,52.4375,67.541667,20.0,36.132143,95.25,152.625,6
3,105.7,3,1,0,59.967156,white,25.0,30.9,8.1,10.1,70.0,107.0,7.6,14.5,11.0,13.0,19.0,23.0,15.0,16.0,7.5,7.8,106.0,110.0,1.1,1.2,47.0,127.0,140.0,141.0,3.2,4.0,2.6,28.4,51.3,87.521739,108.0,56.4,70.7,24.086957,36.776667,96.478261,102.714286,13
4,69.0,1,0,1,79.06047,white,38.0,38.0,13.2,13.2,262.0,262.0,10.8,10.8,18.0,18.0,21.0,21.0,19.0,19.0,9.4,9.4,106.0,106.0,1.3,1.3,126.0,126.0,141.0,141.0,3.7,3.7,1.2,13.8,35.6,90.318182,140.8,71.85,88.285714,21.0,36.62,95.782609,124.0,15


In [36]:
# one-hot-encode categorical data ethnicity ?
# drop ethnicity
aki_data.drop(columns=['ethnicity'],inplace=True)

### Polynomial features 

Any of the features could be related to each other, or have an interactive effect with each other. Any synergistic effect on delay_rrt could increase the significant predictive power to the model. 


In [37]:
X = aki_data.drop(columns=['delay_rrt'])
y = aki_data['delay_rrt']

In [38]:
# Generating the polynomial features table.  
# instantiate
poly = PolynomialFeatures(include_bias=False, degree=2)

In [39]:
# fit and transform the variables in the numerical dataframe
X_poly = poly.fit_transform(X)
X_poly.shape

(1778, 945)

In [40]:
# Checking column names to all polynomial features
X_poly = pd.DataFrame(X_poly,columns=poly.get_feature_names(X.columns))


In [41]:
# Gennerating list of poly feature correlations
X_poly_corrs = X_poly.corrwith(y)

In [42]:
# Shows top 20 features most positively correlated with delay_rrt
X_poly_corrs.sort_values(ascending=False).head(20)

chloride_min temperature_mean       0.198818
chloride_min spo2_mean              0.182815
chloride_min^2                      0.180279
chloride_min                        0.179294
bicarbonate_min chloride_min        0.158487
bicarbonate_min chloride_max        0.152728
chloride_min sodium_min             0.148421
calcium_min chloride_min            0.145373
chloride_min chloride_max           0.144399
admission_age bicarbonate_min       0.140705
admission_age chloride_min          0.137339
bicarbonate_min temperature_mean    0.132319
bicarbonate_min sodium_min          0.130960
bicarbonate_min spo2_mean           0.130260
bicarbonate_min                     0.128809
bicarbonate_min sodium_max          0.126990
bicarbonate_min heart_rate_mean     0.123727
calcium_min chloride_max            0.121496
chloride_max temperature_mean       0.121356
chloride_min sodium_max             0.119685
dtype: float64

In [43]:
# Shows bottom 20 features most positively correlated with delay_rrt
X_poly_corrs.sort_values(ascending=False).tail(20) 

aniongap_max gcs_min            -0.248238
aniongap_max temperature_mean   -0.249200
aniongap_max sodium_max         -0.249964
aniongap_max spo2_mean          -0.250221
aniongap_max calcium_max        -0.254303
aniongap_max                    -0.254689
aniongap_max^2                  -0.255885
creatinine_max resp_rate_mean   -0.256791
creatinine_max gcs_min          -0.257502
hemoglobin_max creatinine_max   -0.257905
aniongap_min creatinine_max     -0.258068
aniongap_max sbp_mean           -0.259088
hematocrit_max creatinine_max   -0.259116
calcium_max creatinine_max      -0.259650
aniongap_min potassium_max      -0.260355
aniongap_max dbp_mean           -0.262543
creatinine_max potassium_max    -0.273823
aniongap_max mbp_mean           -0.274197
aniongap_max creatinine_max     -0.290806
aniongap_max potassium_max      -0.291583
dtype: float64

In [None]:
# # Adding interaction features into train set ?
# aki_data['aniongap_max*potassium_max'] = aki_data['aniongap_max'] * aki_data['potassium_max']
# aki_data['calcium_max*creatinine_max'] = aki_data['calcium_max'] * aki_data['creatinine_max']

**train_test_split**

In [5]:
X = aki_data.drop(columns=['delay_rrt'])
y = aki_data[['delay_rrt']]

X_train, y_train : 50%
<br> X_valid, y_valid : 25%
<br> X_test, y_test   : 25%

In [6]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

## train set

In [7]:
X_train.shape

(1191, 43)

In [8]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1191 entries, 435 to 1674
Data columns (total 43 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   weight            1191 non-null   float64
 1   aki_stage         1191 non-null   int64  
 2   gender            1191 non-null   int64  
 3   admission_age     1191 non-null   float64
 4   ethnicity         1191 non-null   object 
 5   hematocrit_min    1191 non-null   float64
 6   hematocrit_max    1191 non-null   float64
 7   hemoglobin_min    1191 non-null   float64
 8   hemoglobin_max    1191 non-null   float64
 9   platelets_min     1191 non-null   float64
 10  platelets_max     1191 non-null   float64
 11  wbc_min           1191 non-null   float64
 12  wbc_max           1191 non-null   float64
 13  aniongap_min      1191 non-null   float64
 14  aniongap_max      1191 non-null   float64
 15  bicarbonate_min   1191 non-null   float64
 16  bicarbonate_max   1191 non-null   float6

### Baseline model (with SMOTE)

From the correlated features, it seemed that 'aniongap_max' has the highest (negative) correlation. So will start with this. 

In [13]:
X_sm = aki_data[['aniongap_max']]
y_sm = aki_data[['delay_rrt']]

In [14]:
# imbalanced 
y_sm.value_counts(normalize=True)

delay_rrt
1            0.814961
0            0.185039
dtype: float64

In [18]:
# split data
Xsm_train, Xsm_test, ysm_train, ysm_test = train_test_split(X_sm, y_sm, test_size = 0.33, stratify =y)

In [19]:
# scaling
ss = StandardScaler()

Xsm_train_sc = ss.fit_transform(Xsm_train)
Xsm_test_sc = ss.transform(Xsm_test)

In [20]:
# create synthetic data for training set
smote = SMOTE()

In [21]:
# Xsmote_train, ysmote_train = smote.fit_sample(Xsm_train_sc, ysm_train)
Xsmote_train, ysmote_train = smote.fit_resample(Xsm_train_sc, ysm_train)

In [22]:
smote.fit

<bound method SamplerMixin.fit of SMOTE()>

In [23]:
# balanced
ysmote_train.value_counts()

delay_rrt
0            971
1            971
dtype: int64

In [25]:
# instantiate
lr = LogisticRegression()

# fit model
lr.fit(Xsmote_train, ysmote_train)

# score
lr.score(Xsmote_train, ysmote_train), lr.score(Xsm_test_sc, ysm_test)

(0.606076210092688, 0.6712095400340715)

So the **baseline accuracy** is 0.606 (for train set), and 0.671 (for test set)

### Logistic regression (with SMOTE)

In [44]:
# using (most) entire dataset
X = aki_data.drop(columns=['delay_rrt'])
y = aki_data[['delay_rrt']]

In [54]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 42, stratify =y)

In [55]:
pipe = Pipeline([
        ('scale', StandardScaler()),
        ('sampling', SMOTE(sampling_strategy='minority')),
        ('logreg', LogisticRegression(max_iter=1_000, solver='saga'))
    ])


In [56]:
# regardless of parameters, there is overfitting.
pipe_params = {
    'sampling__k_neighbors': [5, 10],   # 
    'logreg__penalty': ['l2', 'l1'],   # 
    'logreg__C': [1] # 
}

In [57]:
grid = GridSearchCV(pipe, pipe_params, scoring='roc_auc', n_jobs=-1)

In [58]:
grid.fit(X_train, y_train)


GridSearchCV(estimator=Pipeline(steps=[('scale', StandardScaler()),
                                       ('sampling',
                                        SMOTE(sampling_strategy='minority')),
                                       ('logreg',
                                        LogisticRegression(max_iter=1000,
                                                           solver='saga'))]),
             n_jobs=-1,
             param_grid={'logreg__C': [1], 'logreg__penalty': ['l2', 'l1'],
                         'sampling__k_neighbors': [5, 10]},
             scoring='roc_auc')

In [59]:
grid.best_params_

{'logreg__C': 1, 'logreg__penalty': 'l1', 'sampling__k_neighbors': 10}

In [60]:
grid.score(X_train, y_train), grid.score(X_test, y_test)

(0.8299082482913585, 0.7254424014433227)

Accuracy for train set is 0.831, while test set is 0.725. So using all the features will lead to overfitting (train accuracy >> test accuracy).