In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, cross_val_score, cross_val_predict
from sklearn.metrics import matthews_corrcoef, confusion_matrix, roc_auc_score, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline
from sklearn.metrics import recall_score, accuracy_score
from sklearn.impute import KNNImputer
%config Completer.use_jedi = False

In [2]:
random_state = 7
cv_method = StratifiedKFold(n_splits=5)
pd.set_option('display.max_rows', 100)

# Training of tsfresh-selected Training Max 2 years and eGFR 2 times

In [3]:
df = pd.read_csv('train_selected_1_4.gz')
df.shape

(5603, 23639)

In [4]:
df['aki_1'].fillna(value=0, inplace=True)
df['aki_2'].fillna(value=0, inplace=True)
df['aki_3'].fillna(value=0, inplace=True)

In [5]:
percent_missing = df.isnull().sum()*100 / len(df)
missing_df = pd.DataFrame({'column_name': df.columns, 'percent_miss': percent_missing})

In [6]:
missing_df.sort_values(by='percent_miss', ascending=False)

Unnamed: 0,column_name,percent_miss
"mch__fft_coefficient__attr_""""""""""""""""real""""""""""""""""__coeff_71","mch__fft_coefficient__attr_""""""""""""""""real""""""""""""""...",100.0
"platelet__fft_coefficient__attr_""""""""""""""""abs""""""""""""""""__coeff_75","platelet__fft_coefficient__attr_""""""""""""""""abs""""""...",100.0
"platelet__fft_coefficient__attr_""""""""""""""""angle""""""""""""""""__coeff_91","platelet__fft_coefficient__attr_""""""""""""""""angle""...",100.0
"platelet__fft_coefficient__attr_""""""""""""""""angle""""""""""""""""__coeff_90","platelet__fft_coefficient__attr_""""""""""""""""angle""...",100.0
"platelet__fft_coefficient__attr_""""""""""""""""angle""""""""""""""""__coeff_89","platelet__fft_coefficient__attr_""""""""""""""""angle""...",100.0
...,...,...
htn,htn,0.0
aki_3,aki_3,0.0
aki_2,aki_2,0.0
aki_1,aki_1,0.0


### Number of patients reaching ESKD within training period 

In [7]:
df['cat1'].value_counts()

non_eskd    4409
eskd        1194
Name: cat1, dtype: int64

In [8]:
df.set_index('id', inplace=True)

In [9]:
def to_category(df):
    cols = df.select_dtypes(include='object').columns
    for col in cols:
        df[col] = df[col].astype('category')
    return df

def drop_missing(df):
    threshold = len(df)*0.5
    df.dropna(axis=1, thresh=threshold, inplace=True)
    return df

def copy_df(df):
    return df.copy()

In [10]:
df_cleaned = (df.pipe(copy_df).pipe(to_category).pipe(drop_missing))
df_cleaned.shape

(5603, 6737)

In [11]:
df_rm_intrain = df_cleaned.query("cat1 == 'non_eskd'")
df_rm_intrain.shape

(4409, 6737)

### Number of patients reaching ESKD in at 1y post-training 

In [12]:
df_rm_intrain['cat2'].value_counts()

non_eskd    4282
eskd         127
Name: cat2, dtype: int64

### Number of patients reaching ESKD in 2y post-training

In [13]:
df_rm_intrain['cat3'].value_counts()

non_eskd    4174
eskd         235
Name: cat3, dtype: int64

### Number of patients reaching ESKD in 5y post-training

In [14]:
df_rm_intrain['cat6'].value_counts()

non_eskd    3918
eskd         491
Name: cat6, dtype: int64

In [15]:
percent_missing = df_rm_intrain.isnull().sum()*100 / len(df)
missing_df_rm_intrain = pd.DataFrame({'column_name': df_rm_intrain.columns, 'percent_miss': percent_missing})

In [16]:
missing_df_rm_intrain.sort_values(by='percent_miss', ascending=False)

Unnamed: 0,column_name,percent_miss
egfr__permutation_entropy__dimension_4__tau_1,egfr__permutation_entropy__dimension_4__tau_1,47.52811
egfr__kurtosis,egfr__kurtosis,47.52811
egfr__partial_autocorrelation__lag_0,egfr__partial_autocorrelation__lag_0,47.52811
"egfr__cwt_coefficients__coeff_3__w_2__widths_(2, 5, 10, 20)",egfr__cwt_coefficients__coeff_3__w_2__widths_(...,47.52811
"egfr__cwt_coefficients__coeff_3__w_5__widths_(2, 5, 10, 20)",egfr__cwt_coefficients__coeff_3__w_5__widths_(...,47.52811
...,...,...
dkd,dkd,0.00000
htn,htn,0.00000
aki_3,aki_3,0.00000
aki_2,aki_2,0.00000


## Features and Target Input for Training 

In [17]:
dropped_cols = list(df_rm_intrain.columns[-20:])
dropped_cols.remove('age.init')


In [18]:
X = df_rm_intrain.drop(dropped_cols,axis =1).copy()
y = df_rm_intrain['cat6'].values
X.shape, y.shape

((4409, 6718), (4409,))

In [19]:
category_cols = list(X.select_dtypes(include='category').columns)
numeric_cols = list(X.select_dtypes(include=['int', 'float']).columns)
one_hot = OneHotEncoder()
num_imputer = KNNImputer(n_neighbors=5)


In [20]:
y_mapped = y.map({'non_eskd':'no', 'eskd':'yes'})

In [22]:
y_mapped

['no', 'no', 'no', 'no', 'no', ..., 'no', 'no', 'no', 'no', 'no']
Length: 4409
Categories (2, object): ['yes', 'no']

In [23]:
lab_encode = LabelEncoder()
y_transformed = lab_encode.fit_transform(y_mapped)
y_transformed.shape

(4409,)

In [24]:
unique_values, counts = np.unique(y_transformed, return_counts=True)
ratio = dict(zip(unique_values, counts))
ratio

{0: 3918, 1: 491}

In [25]:

imbalance_ratio = ratio[0] / ratio[1]
imbalance_ratio

7.979633401221996

In [26]:
transformer = ColumnTransformer([('one_hot', one_hot, category_cols), ('num_imputer', num_imputer, numeric_cols)], remainder='passthrough')


In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.3, stratify=y_transformed)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3086, 6718), (1323, 6718), (3086,), (1323,))

In [28]:
imputed_X_train = transformer.fit_transform(X_train)
imputed_X_test = transformer.transform(X_test)

## ML Training 

In [33]:
xgb_class = xgb.XGBClassifier(n_jobs=-1, random_state = random_state, n_estimators=1000, use_label_encoder=False, eval_metric='logloss', scale_pos_weight=imbalance_ratio)
sampler = SMOTETomek(sampling_strategy=0.5, random_state=random_state)



In [34]:
steps = [('resampling', sampler), ('model', xgb_class)]
pipeline = Pipeline(steps=steps)

In [35]:
pipeline.fit(imputed_X_train, y_train)

Pipeline(steps=[('resampling',
                 SMOTETomek(random_state=7, sampling_strategy=0.5)),
                ('model',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, eval_metric='logloss',
                               gamma=0, gpu_id=-1, importance_type='gain',
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=1000,
                               n_jobs=-1, num_parallel_tree=1, random_state=7,
                               reg_alpha=0, reg_lambda=1,
                               scale_pos_weight=7.979633401221996, subsample=1,
                               tree_method='exact', use_label_encoder=False,
     

In [36]:
y_pred = pipeline.predict(imputed_X_test)

In [37]:
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[1128,   48],
       [  86,   61]])

In [38]:
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.93      0.96      0.94      1176
           1       0.56      0.41      0.48       147

    accuracy                           0.90      1323
   macro avg       0.74      0.69      0.71      1323
weighted avg       0.89      0.90      0.89      1323



In [39]:
print(matthews_corrcoef(y_test, y_pred))

0.42764666609783214


In [40]:
pipeline._final_estimator

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=1000, n_jobs=-1,
              num_parallel_tree=1, random_state=7, reg_alpha=0, reg_lambda=1,
              scale_pos_weight=7.979633401221996, subsample=1,
              tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)