In [1]:
import numpy as np
import pandas as pd
import datatable as dt
import xgboost as xgb
import shap
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, cross_val_score, cross_val_predict
from sklearn.metrics import matthews_corrcoef, confusion_matrix, roc_auc_score, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline
from sklearn.metrics import recall_score, accuracy_score
from sklearn.impute import KNNImputer

In [2]:
random_state = 7
cv_method = StratifiedKFold(n_splits=5)

# Training of tsfresh-selected Training Max 2 years and eGFR 6 times

In [3]:
df = dt.fread('/home/jupyter-dchristiadi85/PhD Project 1/data/train_selected_2_6.gz')
df = df.to_pandas()
df.shape

(4965, 23639)

In [4]:
df['aki_1'].fillna(value=0, inplace=True)
df['aki_2'].fillna(value=0, inplace=True)
df['aki_3'].fillna(value=0, inplace=True)

In [7]:
df['gn'].value_counts()

no                 4508
igan                184
anca                 85
lupus nephritis      45
unspecific           37
membranous           36
fsgs                 27
mcd                  24
mp/mcgn              19
Name: gn, dtype: int64

In [6]:
df.loc[df.gn == 'pign', 'gn'] = 'unspecific'
df.loc[df.gn == 'fgn/itg', 'gn'] = 'unspecific'

In [10]:
percent_missing = df.isnull().sum()*100 / len(df)
missing_df = pd.DataFrame({'column_name': df.columns, 'percent_miss': percent_missing})

In [11]:
missing_df.sort_values(by='percent_miss', ascending=False)

Unnamed: 0,column_name,percent_miss
23185,"crp__fft_coefficient__attr_""""""""""""""""real""""""""""""""...",100.0
23476,"crp__fft_coefficient__attr_""""""""""""""""angle""""""""""""...",100.0
23456,"crp__fft_coefficient__attr_""""""""""""""""angle""""""""""""...",100.0
23457,"crp__fft_coefficient__attr_""""""""""""""""angle""""""""""""...",100.0
23458,"crp__fft_coefficient__attr_""""""""""""""""angle""""""""""""...",100.0
...,...,...
23614,htn,0.0
23613,aki_3,0.0
23612,aki_2,0.0
23611,aki_1,0.0


### Number of patients reaching ESKD within training period 

In [8]:
df['cat2'].value_counts()

non_eskd    3708
eskd        1257
Name: cat2, dtype: int64

In [9]:
df.set_index('id', inplace=True)

In [10]:
def to_category(df):
    cols = df.select_dtypes(include='object').columns
    for col in cols:
        df[col] = df[col].astype('category')
    return df

def drop_missing(df):
    threshold = len(df)*0.6
    df.dropna(axis=1, thresh=threshold, inplace=True)
    return df

def copy_df(df):
    return df.copy()

In [11]:
df_cleaned = (df.pipe(copy_df).pipe(drop_missing))
df_cleaned.shape

(4965, 6924)

In [12]:
df_rm_intrain = df_cleaned.query("cat2 == 'non_eskd'")
df_rm_intrain.shape

(3708, 6924)

In [40]:
df_rm_intrain = dt.fread('/home/jupyter-dchristiadi85/PhD Project 1/data/cleaned60_selected_2_6.gz')
df_rm_intrain = df_rm_intrain.to_pandas()
df_rm_intrain.set_index('id', inplace=True)
df_rm_intrain.shape

(3708, 6924)

In [41]:
df_rm_intrain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3708 entries, 3916 to 23864
Columns: 6924 entries, ureacreat_ratio__variance_larger_than_standard_deviation to cat8
dtypes: float64(5079), int32(1), object(1844)
memory usage: 195.9+ MB


In [27]:
df_rm_intrain = (df_rm_intrain.pipe(copy_df).pipe(to_category))
df_rm_intrain.shape

Unnamed: 0_level_0,ureacreat_ratio__variance_larger_than_standard_deviation,ureacreat_ratio__has_duplicate_max,ureacreat_ratio__has_duplicate_min,ureacreat_ratio__has_duplicate,ureacreat_ratio__sum_values,ureacreat_ratio__abs_energy,ureacreat_ratio__mean_abs_change,ureacreat_ratio__mean_change,ureacreat_ratio__mean_second_derivative_central,ureacreat_ratio__median,...,cat3.5,cat4,cat4.5,cat5,cat5.5,cat6,cat6.5,cat7,cat7.5,cat8
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3916,False,False,False,False,0.152681,0.007990,0.016659,0.004164,-0.016659,0.049505,...,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd
3918,False,False,False,False,0.201586,0.013588,0.004620,-0.004620,-0.000227,0.067347,...,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd
3921,False,False,False,False,0.245740,0.021038,0.020792,0.020792,-0.008116,0.087324,...,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd
3924,False,False,False,False,0.185151,0.011960,0.015921,0.015921,0.006274,0.057534,...,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd
3930,False,False,False,False,0.496335,0.035761,0.011239,0.003173,0.002983,0.069399,...,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23771,False,False,False,False,0.160833,0.009284,0.024167,0.011667,0.024167,0.050000,...,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd
23775,False,False,False,False,0.044286,0.001961,,,,0.044286,...,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd
23780,False,False,False,False,0.078750,0.006202,,,,0.078750,...,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd
23823,,,,,,,,,,,...,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd


### Number of patients reaching ESKD in at 1y post-training 

In [14]:
df_rm_intrain['cat3'].value_counts()

non_eskd    3586
eskd         122
Name: cat3, dtype: int64

### Number of patients reaching ESKD in 2y post-training

In [15]:
df_rm_intrain['cat4'].value_counts()

non_eskd    3484
eskd         224
Name: cat4, dtype: int64

### Number of patients reaching ESKD in 5y post-training

In [16]:
df_rm_intrain['cat7'].value_counts()

non_eskd    3261
eskd         447
Name: cat7, dtype: int64

In [28]:
percent_missing = df_rm_intrain.isnull().sum()*100 / len(df_rm_intrain)
missing_df_rm_intrain = pd.DataFrame({'column_name': df_rm_intrain.columns, 'percent_miss': percent_missing})

In [29]:
missing_df_rm_intrain.sort_values(by='percent_miss', ascending=False)

Unnamed: 0,column_name,percent_miss
chloride__autocorrelation__lag_3,chloride__autocorrelation__lag_3,51.024811
chloride__partial_autocorrelation__lag_1,chloride__partial_autocorrelation__lag_1,51.024811
"chloride__augmented_dickey_fuller__attr_""""""""""""""""pvalue""""""""""""""""__autolag_""""""""""""""""AIC""""""""""""""""","chloride__augmented_dickey_fuller__attr_""""""""""""...",51.024811
"chloride__augmented_dickey_fuller__attr_""""""""""""""""teststat""""""""""""""""__autolag_""""""""""""""""AIC""""""""""""""""","chloride__augmented_dickey_fuller__attr_""""""""""""...",51.024811
chloride__partial_autocorrelation__lag_0,chloride__partial_autocorrelation__lag_0,50.970874
...,...,...
"crp__fft_coefficient__attr_""""""""""""""""imag""""""""""""""""__coeff_64","crp__fft_coefficient__attr_""""""""""""""""imag""""""""""""""...",0.000000
"crp__fft_coefficient__attr_""""""""""""""""imag""""""""""""""""__coeff_65","crp__fft_coefficient__attr_""""""""""""""""imag""""""""""""""...",0.000000
"crp__fft_coefficient__attr_""""""""""""""""imag""""""""""""""""__coeff_66","crp__fft_coefficient__attr_""""""""""""""""imag""""""""""""""...",0.000000
"crp__fft_coefficient__attr_""""""""""""""""imag""""""""""""""""__coeff_67","crp__fft_coefficient__attr_""""""""""""""""imag""""""""""""""...",0.000000


## Features and Target Input for Training 

In [30]:
dropped_cols = list(df_rm_intrain.columns[-18:])
dropped_cols.remove('age.init')

In [31]:
X = df_rm_intrain.drop(dropped_cols,axis=1).copy()
y = df_rm_intrain['cat7']
X.shape, y.shape

((3708, 6907), (3708,))

In [32]:
y_mapped = y.map({'non_eskd':'no', 'eskd':'yes'})
y_mapped

id
3916     no
3918     no
3921     no
3924     no
3930     no
         ..
23771    no
23775    no
23780    no
23823    no
23864    no
Name: cat7, Length: 3708, dtype: category
Categories (2, object): ['yes', 'no']

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y_mapped, test_size=0.3, stratify=y_mapped, random_state=random_state)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2595, 6907), (1113, 6907), (2595,), (1113,))

In [34]:
lab_encode = LabelEncoder()
y_train = lab_encode.fit_transform(y_train)
y_test = lab_encode.transform(y_test)

In [35]:
unique_values, counts = np.unique(y_train, return_counts=True)
ratio = dict(zip(unique_values, counts))
ratio

{0: 2282, 1: 313}

In [36]:
imbalance_ratio = ratio[0] / ratio[1]
imbalance_ratio

7.2907348242811505

In [44]:
category_cols = list(X_train.select_dtypes(include='category').columns)
numeric_cols = list(X_train.select_dtypes(include=['int', 'float']).columns)
one_hot = OneHotEncoder(handle_unknown='ignore')
num_imputer = KNNImputer(n_neighbors=5)
len(category_cols), len(numeric_cols)

(1828, 5079)

In [45]:
preprocess = ColumnTransformer([('num_imputing', num_imputer, numeric_cols), ('cat_encoder', one_hot, category_cols)], remainder='passthrough')
X_train_processed = preprocess.fit_transform(X_train)
X_train_processed.shape

invalid value encountered in reduce
invalid value encountered in reduce
invalid value encountered in reduce
invalid value encountered in reduce


(2595, 9470)

In [46]:
X_test_processed = preprocess.transform(X_test)

invalid value encountered in reduce
invalid value encountered in reduce
invalid value encountered in reduce


In [48]:
xgb_class = xgb.XGBClassifier(n_jobs=-1, random_state = random_state, n_estimators=1000, use_label_encoder=False, eval_metric='logloss', scale_pos_weight=imbalance_ratio)
sampler = SMOTETomek(sampling_strategy=0.5, random_state=random_state)
steps = [('resampling', sampler), ('model', xgb_class)]
pipeline = Pipeline(steps=steps)

In [50]:
pipeline.fit(X_train_processed, y_train)

Pipeline(steps=[('resampling',
                 SMOTETomek(random_state=7, sampling_strategy=0.5)),
                ('model',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, eval_metric='logloss',
                               gamma=0, gpu_id=-1, importance_type='gain',
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=1000,
                               n_jobs=-1, num_parallel_tree=1, random_state=7,
                               reg_alpha=0, reg_lambda=1,
                               scale_pos_weight=7.2907348242811505, subsample=1,
                               tree_method='exact', use_label_encoder=False,
    

In [51]:
y_pred = pipeline.predict(X_test_processed)

In [144]:
y_pred_prob = (pipeline.predict_proba(X_test)[:,1] >= 0.3).astype(bool)
y_pred_prob

array([False,  True, False, ..., False, False, False])

In [147]:
y_pred_prob = y_pred_prob.astype(int)
y_pred_prob

array([0, 1, 0, ..., 0, 0, 0])

In [52]:
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[935,  44],
       [ 79,  55]])

In [54]:
roc_auc_score(y_test, y_pred)

0.6827519704846554

In [55]:
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.92      0.96      0.94       979
           1       0.56      0.41      0.47       134

    accuracy                           0.89      1113
   macro avg       0.74      0.68      0.71      1113
weighted avg       0.88      0.89      0.88      1113



In [56]:
print(matthews_corrcoef(y_test, y_pred))

0.41782962996968953
