In [36]:
import numpy as np
import pandas as pd
import datatable as dt
import xgboost as xgb
import shap
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, cross_val_score, cross_val_predict
from sklearn.metrics import matthews_corrcoef, confusion_matrix, roc_auc_score, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline
from sklearn.metrics import recall_score, accuracy_score
from sklearn.impute import KNNImputer

In [37]:
random_state = 7
cv_method = StratifiedKFold(n_splits=5)

In [38]:
df = dt.fread('/home/jupyter-dchristiadi85/PhD Project 1/data/testmin_2_6.gz')
df = df.to_pandas()
df.shape

(4965, 596)

In [39]:
df['aki_1'].fillna(value=0, inplace=True)
df['aki_2'].fillna(value=0, inplace=True)
df['aki_3'].fillna(value=0, inplace=True)

In [40]:
df['gn'].value_counts()

no                 4508
igan                184
anca                 85
lupus nephritis      45
membranous           36
unspecific           29
fsgs                 27
mcd                  24
mp/mcgn              19
fgn/itg               6
pign                  2
Name: gn, dtype: int64

In [41]:
df.loc[df.gn == 'pign', 'gn'] = 'unspecific'
df.loc[df.gn == 'fgn/itg', 'gn'] = 'unspecific'

In [42]:
pd.set_option('display.max_rows', 600)
percent_missing = df.isnull().sum()*100 / len(df)
missing_df = pd.DataFrame({'column_name': df.columns, 'percent_miss': percent_missing})
missing_df.sort_values(by='percent_miss', ascending=False)


Unnamed: 0,column_name,percent_miss
timedprotein__sum_values,timedprotein__sum_values,95.70997
timedprotein__minimum,timedprotein__minimum,95.70997
timedprotein__maximum,timedprotein__maximum,95.70997
timedprotein__root_mean_square,timedprotein__root_mean_square,95.70997
timedprotein__variance,timedprotein__variance,95.70997
timedprotein__standard_deviation,timedprotein__standard_deviation,95.70997
timedprotein__length,timedprotein__length,95.70997
timedprotein__mean,timedprotein__mean,95.70997
timedprotein__median,timedprotein__median,95.70997
albcreat_ratio__length,albcreat_ratio__length,89.98993


In [43]:
def to_category(df):
    cols = df.select_dtypes(include='object').columns
    for col in cols:
        df[col] = df[col].astype('category')
    return df

def drop_missing(df):
    threshold = len(df)*0.5
    df.dropna(axis=1, thresh=threshold, inplace=True)
    return df

def copy_df(df):
    return df.copy()

In [44]:
df_cleaned = (df.pipe(copy_df).pipe(drop_missing).pipe(to_category))
df_cleaned.shape

(4965, 396)

In [45]:
df_rm_intrain = df_cleaned.query("cat2 == 'non_eskd'")
df_rm_intrain.shape

(3708, 396)

In [46]:
df_rm_intrain.set_index('id', inplace=True)

In [47]:
df_rm_intrain['cat3'].value_counts()

non_eskd    3586
eskd         122
Name: cat3, dtype: int64

In [48]:
df_rm_intrain['cat4'].value_counts()

non_eskd    3484
eskd         224
Name: cat4, dtype: int64

In [49]:
df_rm_intrain['cat7'].value_counts()

non_eskd    3261
eskd         447
Name: cat7, dtype: int64

In [50]:
percent_missing = df_rm_intrain.isnull().sum()*100 / len(df_rm_intrain)
missing_df_rm_intrain = pd.DataFrame({'column_name': df_rm_intrain.columns, 'percent_miss': percent_missing})
missing_df_rm_intrain.sort_values(by='percent_miss', ascending=False)

Unnamed: 0,column_name,percent_miss
sta_dia__mean,sta_dia__mean,46.413161
sta_sys__mean,sta_sys__mean,46.413161
sta_dia__sum_values,sta_dia__sum_values,46.413161
sta_dia__median,sta_dia__median,46.413161
sta_sys__maximum,sta_sys__maximum,46.413161
sta_sys__root_mean_square,sta_sys__root_mean_square,46.413161
sta_sys__variance,sta_sys__variance,46.413161
sta_sys__standard_deviation,sta_sys__standard_deviation,46.413161
sta_sys__length,sta_sys__length,46.413161
sta_sys__minimum,sta_sys__minimum,46.413161


In [51]:
dropped_cols = list(df_rm_intrain.columns[-18:])
dropped_cols.remove('age.init')
dropped_cols


['egfr.y',
 'cat0.5',
 'cat1',
 'cat1.5',
 'cat2',
 'cat2.5',
 'cat3',
 'cat3.5',
 'cat4',
 'cat4.5',
 'cat5',
 'cat5.5',
 'cat6',
 'cat6.5',
 'cat7',
 'cat7.5',
 'cat8']

In [52]:
X = df_rm_intrain.drop(dropped_cols,axis=1).copy()
y = df_rm_intrain['cat7']
X.shape, y.shape

((3708, 378), (3708,))

In [60]:
X['aki_3'] = X['aki_3'].astype('float')

In [61]:
pd.set_option('display.max_info_columns', 600)
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3708 entries, 3916 to 23864
Data columns (total 378 columns):
 #    Column                               Dtype   
---   ------                               -----   
 0    albumin__sum_values                  float64 
 1    albumin__median                      float64 
 2    albumin__mean                        float64 
 3    albumin__length                      float64 
 4    albumin__standard_deviation          float64 
 5    albumin__variance                    float64 
 6    albumin__root_mean_square            float64 
 7    albumin__maximum                     float64 
 8    albumin__minimum                     float64 
 9    alkphos__sum_values                  float64 
 10   alkphos__median                      float64 
 11   alkphos__mean                        float64 
 12   alkphos__length                      float64 
 13   alkphos__standard_deviation          float64 
 14   alkphos__variance                    float64 
 15 

In [62]:
y_mapped = y.map({'non_eskd':'no', 'eskd':'yes'})
y_mapped

id
3916     no
3918     no
3921     no
3924     no
3930     no
         ..
23771    no
23775    no
23780    no
23823    no
23864    no
Name: cat7, Length: 3708, dtype: category
Categories (2, object): ['yes', 'no']

In [63]:
X_train, X_test, y_train, y_test = train_test_split(X, y_mapped, test_size=0.3, stratify=y_mapped, random_state=random_state)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2595, 378), (1113, 378), (2595,), (1113,))

In [64]:
lab_encode = LabelEncoder()
y_train = lab_encode.fit_transform(y_train)
y_test = lab_encode.transform(y_test)

In [65]:
unique_values, counts = np.unique(y_train, return_counts=True)
ratio = dict(zip(unique_values, counts))
ratio
imbalance_ratio = ratio[0] / ratio[1]
imbalance_ratio

7.2907348242811505

In [66]:
category_cols = list(X_train.select_dtypes(include='category').columns)
numeric_cols = list(X_train.select_dtypes(include=['int', 'float']).columns)
one_hot = OneHotEncoder(handle_unknown='ignore')
num_imputer = KNNImputer(n_neighbors=5)
len(category_cols), len(numeric_cols)

(4, 374)

In [67]:
preprocess = ColumnTransformer([('num_imputing', num_imputer, numeric_cols), ('cat_encoder', one_hot, category_cols)], remainder='passthrough')
X_train_processed = preprocess.fit_transform(X_train)
X_train_processed.shape

invalid value encountered in reduce
invalid value encountered in reduce
invalid value encountered in reduce
invalid value encountered in reduce


(2595, 389)

In [68]:
X_test_processed = preprocess.transform(X_test)

invalid value encountered in reduce
invalid value encountered in reduce
invalid value encountered in reduce


In [82]:
xgb_class = xgb.XGBClassifier(n_jobs=-1, random_state = random_state, n_estimators=1000, use_label_encoder=False, eval_metric='logloss', scale_pos_weight=imbalance_ratio)
sampler = SMOTETomek(sampling_strategy=0.5, random_state=random_state)
steps = [('resampling', sampler), ('model', xgb_class)]
pipeline = Pipeline(steps=steps)

In [83]:
pipeline.fit(X_train_processed, y_train)

Pipeline(steps=[('resampling',
                 SMOTETomek(random_state=7, sampling_strategy=0.5)),
                ('model',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, eval_metric='logloss',
                               gamma=0, gpu_id=-1, importance_type='gain',
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=1000,
                               n_jobs=-1, num_parallel_tree=1, random_state=7,
                               reg_alpha=0, reg_lambda=1,
                               scale_pos_weight=7.2907348242811505, subsample=1,
                               tree_method='exact', use_label_encoder=False,
    

In [84]:
y_pred = pipeline.predict(X_test_processed)

In [85]:
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[934,  45],
       [ 71,  63]])

In [87]:
roc_auc_score(y_test, y_pred)

0.7120919915234857

In [86]:
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.93      0.95      0.94       979
           1       0.58      0.47      0.52       134

    accuracy                           0.90      1113
   macro avg       0.76      0.71      0.73      1113
weighted avg       0.89      0.90      0.89      1113



In [88]:
print(matthews_corrcoef(y_test, y_pred))

0.46634046587700295


In [89]:
importance = pipeline.named_steps['model'].feature_importances_
importance.shape, type(importance)

((389,), numpy.ndarray)

In [90]:
category_label = list(preprocess.named_transformers_['cat_encoder'].get_feature_names(category_cols))
feature_label = numeric_cols + category_label

In [92]:
pd.set_option('display.max_rows', 400)
eval_df = pd.DataFrame({'label': feature_label, 'importance_value': importance})
eval_df.sort_values(by='importance_value', ascending=False).head(100)

Unnamed: 0,label,importance_value
56,creatinine__mean,0.159502
378,gn_anca,0.061471
3,albumin__length,0.027427
376,dkd_dkd,0.024637
65,egfr__mean,0.022795
87,haemoglobin__root_mean_square,0.022717
55,creatinine__median,0.014817
380,gn_igan,0.014383
28,caphos_product__median,0.013678
213,eosinophils__root_mean_square,0.01349


In [93]:
xgb_class = xgb.XGBClassifier(n_jobs=-1, random_state = random_state, n_estimators=1000, use_label_encoder=False, eval_metric='logloss', scale_pos_weight=1.5*imbalance_ratio)
sampler = SMOTETomek(sampling_strategy=0.5, random_state=random_state)
steps = [('resampling', sampler), ('model', xgb_class)]
pipeline = Pipeline(steps=steps)

In [94]:
pipeline.fit(X_train_processed, y_train)

Pipeline(steps=[('resampling',
                 SMOTETomek(random_state=7, sampling_strategy=0.5)),
                ('model',
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, eval_metric='logloss',
                               gamma=0, gpu_id=-1, importance_type='gain',
                               interaction_constraints='',
                               learning_rate=0.300000012, max_delta_step=0,
                               max_depth=6, min_child_weight=1, missing=nan,
                               monotone_constraints='()', n_estimators=1000,
                               n_jobs=-1, num_parallel_tree=1, random_state=7,
                               reg_alpha=0, reg_lambda=1,
                               scale_pos_weight=10.936102236421725, subsample=1,
                               tree_method='exact', use_label_encoder=False,
    

In [95]:
y_pred = pipeline.predict(X_test_processed)

In [96]:
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[931,  48],
       [ 68,  66]])

In [97]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.93      0.95      0.94       979
           1       0.58      0.49      0.53       134

    accuracy                           0.90      1113
   macro avg       0.76      0.72      0.74      1113
weighted avg       0.89      0.90      0.89      1113



In [98]:
matthews_corrcoef(y_test, y_pred)

0.47600304522977643