In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import shap
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, cross_val_score, cross_val_predict
from sklearn.metrics import matthews_corrcoef, confusion_matrix, roc_auc_score, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline
from sklearn.metrics import recall_score, accuracy_score
from sklearn.impute import KNNImputer

In [2]:
random_state = 7
cv_method = StratifiedKFold(n_splits=5)

In [3]:
df = pd.read_csv('../data/train_egfr_other_1_4.gz', index_col='id')
df.shape

(5603, 1240)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5603 entries, 3916 to 23911
Columns: 1240 entries, egfr__variance_larger_than_standard_deviation to cat8
dtypes: float64(1219), int64(1), object(20)
memory usage: 53.0+ MB


In [5]:
df['aki_1'].fillna(value=0, inplace=True)
df['aki_2'].fillna(value=0, inplace=True)
df['aki_3'].fillna(value=0, inplace=True)

In [6]:
df['gn'].value_counts()

no                 5181
igan                172
anca                 76
lupus nephritis      39
membranous           33
unspecific           30
fsgs                 28
mcd                  21
mp/mcgn              16
fgn/itg               5
pign                  2
Name: gn, dtype: int64

In [7]:
df.loc[df.gn == 'pign', 'gn'] = 'unspecific'
df.loc[df.gn == 'fgn/itg', 'gn'] = 'unspecific'

In [10]:
df.head()

Unnamed: 0_level_0,egfr__variance_larger_than_standard_deviation,egfr__has_duplicate_max,egfr__has_duplicate_min,egfr__has_duplicate,egfr__abs_energy,egfr__mean_abs_change,egfr__mean_change,egfr__mean_second_derivative_central,egfr__median,egfr__mean,...,cat3.5,cat4,cat4.5,cat5,cat5.5,cat6,cat6.5,cat7,cat7.5,cat8
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3916,1.0,0.0,0.0,0.0,6523.7549,10.63,-10.63,,56.865,56.865,...,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd
3918,0.0,0.0,0.0,0.0,2788.8961,,,,52.81,52.81,...,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd
3921,,,,,,,,,,,...,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd
3924,1.0,0.0,0.0,0.0,11451.7994,2.42,-2.42,,75.66,75.66,...,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd
3930,1.0,0.0,0.0,0.0,11206.0,7.0,0.6,-2.125,43.0,42.666667,...,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd


In [9]:
def to_category(df):
    cols = df.select_dtypes(include='object').columns
    for col in cols:
        df[col] = df[col].astype('category')
    return df

def drop_missing(df):
    threshold = len(df)*0.6
    df.dropna(axis=1, thresh=threshold, inplace=True)
    return df

def copy_df(df):
    return df.copy()

In [10]:
df_cleaned = (df.pipe(copy_df).pipe(drop_missing).pipe(to_category))
df_cleaned.shape

(5603, 438)

In [47]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5603 entries, 3916 to 23911
Columns: 438 entries, egfr__variance_larger_than_standard_deviation to cat8
dtypes: category(20), float64(417), int64(1)
memory usage: 18.0 MB


### Number of ESKD in-training

In [13]:
df_cleaned['cat1'].value_counts()

non_eskd    4409
eskd        1194
Name: cat1, dtype: int64

### Number of ESKD 2 years post-training

In [48]:
df_cleaned['cat3'].value_counts()

non_eskd    4174
eskd        1429
Name: cat3, dtype: int64

In [49]:
df_cleaned['cat6'].value_counts()

non_eskd    3918
eskd        1685
Name: cat6, dtype: int64

In [53]:
dropped_cols = list(df_cleaned.columns[-18:])
dropped_cols.remove('age.init')
dropped_cols.remove('cat1')
dropped_cols

['egfr.y',
 'cat0.5',
 'cat1.5',
 'cat2',
 'cat2.5',
 'cat3',
 'cat3.5',
 'cat4',
 'cat4.5',
 'cat5',
 'cat5.5',
 'cat6',
 'cat6.5',
 'cat7',
 'cat7.5',
 'cat8']

## ESKD Prediction 2 years post-training

In [81]:
X = df_cleaned.drop(dropped_cols,axis=1).copy()
y = df_cleaned['cat6']
X.shape, y.shape

((5603, 422), (5603,))

In [82]:
y_mapped = y.map({'non_eskd':'no', 'eskd':'yes'})
y_mapped

id
3916      no
3918      no
3921      no
3924      no
3930      no
        ... 
23807     no
23829    yes
23864     no
23882    yes
23911    yes
Name: cat6, Length: 5603, dtype: category
Categories (2, object): ['yes', 'no']

In [83]:
X_train, X_test, y_train, y_test = train_test_split(X, y_mapped, test_size=0.2, stratify=y_mapped, random_state=random_state)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4482, 422), (1121, 422), (4482,), (1121,))

In [84]:
index_eskd = X_test.query("cat1 == 'eskd'").index
index_eskd

Int64Index([ 7713,  4424, 14377, 16622,  5267, 10908,  7342,  8679, 12670,
            21681,
            ...
            17498, 12046,  4249, 13302, 18709, 16303, 18493, 11468,  5453,
             9887],
           dtype='int64', name='id', length=225)

In [85]:
X_test.drop(index_eskd, inplace=True)
y_test.drop(index_eskd, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [86]:
X_train.drop(columns='cat1', axis=1, inplace=True)
X_test.drop(columns='cat1', axis=1, inplace=True)

In [87]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4482 entries, 13384 to 21818
Columns: 421 entries, egfr__variance_larger_than_standard_deviation to age.init
dtypes: category(4), float64(416), int64(1)
memory usage: 14.3 MB


In [88]:
lab_encode = LabelEncoder()
y_train = lab_encode.fit_transform(y_train)
y_test = lab_encode.transform(y_test)

In [100]:
unique_values, counts = np.unique(y_test, return_counts=True)
ratio = dict(zip(unique_values, counts))
ratio
imbalance_ratio = ratio[0] / ratio[1]
imbalance_ratio

7.0

In [74]:
category_cols = list(X_train.select_dtypes(include='category').columns)
numeric_cols = list(X_train.select_dtypes(include=['int', 'float']).columns)
one_hot = OneHotEncoder(handle_unknown='ignore')
num_imputer = KNNImputer(n_neighbors=5)
len(category_cols), len(numeric_cols)

(4, 417)

In [91]:
preprocess = ColumnTransformer([('num_imputing', num_imputer, numeric_cols), ('cat_encoder', one_hot, category_cols)], remainder='passthrough')
X_train_processed = preprocess.fit_transform(X_train)
X_train_processed.shape

(4482, 432)

In [93]:
X_test_processed = preprocess.transform(X_test)

In [101]:
xgb_class = xgb.XGBClassifier(n_jobs=-1, random_state = random_state, n_estimators=1000, use_label_encoder=False, eval_metric='logloss', scale_pos_weight=imbalance_ratio)
sampler = SMOTETomek(random_state=random_state)
steps = [('resampling', sampler), ('model', xgb_class)]
pipeline = Pipeline(steps=steps)

In [102]:
pipeline.fit(X_train_processed, y_train)

Pipeline(steps=[('resampling', SMOTETomek(random_state=7)),
                ('model',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, eval_metric='logloss',
                               gamma=0, gpu_id=-1, importance_type='gain',
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=1000,
                               n_jobs=-1, num_parallel_tree=1, random_state=7,
                               reg_alpha=0, reg_lambda=1, scale_pos_weight=7.0,
                               subsample=1, tree_method='exact',
                               use_label_encoder=False, validate_parameters=1,
                               verbo

In [103]:
y_pred = pipeline.predict(X_test_processed)

In [104]:
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[734,  50],
       [ 68,  44]])

In [105]:
roc_auc_score(y_test, y_pred)

0.6645408163265306

In [106]:
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.92      0.94      0.93       784
           1       0.47      0.39      0.43       112

    accuracy                           0.87       896
   macro avg       0.69      0.66      0.68       896
weighted avg       0.86      0.87      0.86       896



In [107]:
print(matthews_corrcoef(y_test, y_pred))

0.35515634263219253


In [35]:
importance = pipeline.named_steps['model'].feature_importances_
importance.shape, type(importance)

((432,), numpy.ndarray)

In [36]:
category_label = list(preprocess.named_transformers_['cat_encoder'].get_feature_names(category_cols))
feature_label = numeric_cols + category_label

In [37]:
pd.set_option('display.max_rows', 400)
eval_df = pd.DataFrame({'label': feature_label, 'importance_value': importance})
eval_df.sort_values(by='importance_value', ascending=False).head(100)

Unnamed: 0,label,importance_value
326,urea__maximum,0.065696
244,albumin__median,0.028779
357,haematocrit__mean,0.027545
113,egfr__cwt_coefficients__coeff_1__w_2__widths_(...,0.027266
134,"egfr__change_quantiles__f_agg_""""""""mean""""""""__is...",0.02616
66,egfr__large_standard_deviation__r_0.3500000000...,0.022853
31,egfr__benford_correlation,0.021984
285,glucose__minimum,0.021909
200,egfr__approximate_entropy__m_2__r_0.5,0.019067
108,egfr__index_mass_quantile__q_0.9,0.019009


## ESKD Prediction 5 years post-training

In [38]:
X = df_rm_intrain.drop(dropped_cols,axis=1).copy()
y = df_rm_intrain['cat6']
X.shape, y.shape

((4409, 421), (4409,))

In [39]:
y_mapped = y.map({'non_eskd':'no', 'eskd':'yes'})
y_mapped

id
3916     no
3918     no
3921     no
3924     no
3930     no
         ..
23760    no
23771    no
23780    no
23807    no
23864    no
Name: cat6, Length: 4409, dtype: category
Categories (2, object): ['yes', 'no']

In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y_mapped, test_size=0.2, stratify=y_mapped, random_state=random_state)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3527, 421), (882, 421), (3527,), (882,))

In [41]:
lab_encode = LabelEncoder()
y_train = lab_encode.fit_transform(y_train)
y_test = lab_encode.transform(y_test)

In [42]:
unique_values, counts = np.unique(y_train, return_counts=True)
ratio = dict(zip(unique_values, counts))
ratio
imbalance_ratio = ratio[0] / ratio[1]
imbalance_ratio

7.974554707379135

In [43]:
category_cols = list(X_train.select_dtypes(include='category').columns)
numeric_cols = list(X_train.select_dtypes(include=['int', 'float']).columns)
one_hot = OneHotEncoder(handle_unknown='ignore')
num_imputer = KNNImputer(n_neighbors=5)
len(category_cols), len(numeric_cols)

(4, 417)

In [44]:
preprocess = ColumnTransformer([('num_imputing', num_imputer, numeric_cols), ('cat_encoder', one_hot, category_cols)], remainder='passthrough')
X_train_processed = preprocess.fit_transform(X_train)
X_train_processed.shape

(3527, 432)

In [45]:
X_test_processed = preprocess.transform(X_test)

In [53]:
xgb_class = xgb.XGBClassifier(n_jobs=-1, random_state = random_state, n_estimators=1000, use_label_encoder=False, eval_metric='logloss', scale_pos_weight=1.5*imbalance_ratio)
sampler = SMOTETomek(sampling_strategy=0.5, random_state=random_state)
steps = [('resampling', sampler), ('model', xgb_class)]
pipeline = Pipeline(steps=steps)

In [54]:
pipeline.fit(X_train_processed, y_train)

Pipeline(steps=[('resampling',
                 SMOTETomek(random_state=7, sampling_strategy=0.5)),
                ('model',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, eval_metric='logloss',
                               gamma=0, gpu_id=-1, importance_type='gain',
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=1000,
                               n_jobs=-1, num_parallel_tree=1, random_state=7,
                               reg_alpha=0, reg_lambda=1,
                               scale_pos_weight=11.961832061068703, subsample=1,
                               tree_method='exact', use_label_encoder=False,
    

In [55]:
y_pred = pipeline.predict(X_test_processed)

In [56]:
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[729,  55],
       [ 58,  40]])

In [57]:
roc_auc_score(y_test, y_pred)

0.6690051020408163

In [58]:
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.93      0.93      0.93       784
           1       0.42      0.41      0.41        98

    accuracy                           0.87       882
   macro avg       0.67      0.67      0.67       882
weighted avg       0.87      0.87      0.87       882



In [59]:
print(matthews_corrcoef(y_test, y_pred))

0.342650769622793
