In [1]:
import numpy as np
import pandas as pd
import modin.modin.pandas as mod_pd
import xgboost as xgb
import shap
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, cross_val_score, cross_val_predict
from sklearn.metrics import matthews_corrcoef, confusion_matrix, roc_auc_score, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline
from sklearn.metrics import recall_score, accuracy_score
from sklearn.impute import KNNImputer

In [2]:
random_state = 7
cv_method = StratifiedKFold(n_splits=5)
pd.set_option('display.max_rows', 100)

# Training of tsfresh-selected Training Max 1 years and eGFR 4 times

In [3]:
df = ('/home/jupyter-dchristiadi85/PhD Project 1/data/train_selected_1_4.gz')
df = df.to_pandas()
df.shape

(5603, 23639)

In [4]:
df['aki_1'].fillna(value=0, inplace=True)
df['aki_2'].fillna(value=0, inplace=True)
df['aki_3'].fillna(value=0, inplace=True)

In [5]:
df['gn'].value_counts()

no                 5181
igan                172
anca                 76
lupus nephritis      39
membranous           33
unspecific           30
fsgs                 28
mcd                  21
mp/mcgn              16
fgn/itg               5
pign                  2
Name: gn, dtype: int64

In [6]:
df.loc[df.gn == 'pign', 'gn'] = 'unspecific'
df.loc[df.gn == 'fgn/itg', 'gn'] = 'unspecific'

In [7]:
percent_missing = df.isnull().sum()*100 / len(df)
missing_df = pd.DataFrame({'column_name': df.columns, 'percent_miss': percent_missing})

In [8]:
missing_df.sort_values(by='percent_miss', ascending=False)

Unnamed: 0,column_name,percent_miss
"urea__fft_coefficient__attr_""""""""""""""""real""""""""""""""""__coeff_67","urea__fft_coefficient__attr_""""""""""""""""real""""""""""""...",99.982152
"monocytes__fft_coefficient__attr_""""""""""""""""abs""""""""""""""""__coeff_65","monocytes__fft_coefficient__attr_""""""""""""""""abs""""...",99.982152
"calcium__fft_coefficient__attr_""""""""""""""""abs""""""""""""""""__coeff_73","calcium__fft_coefficient__attr_""""""""""""""""abs""""""""...",99.982152
"calcium__fft_coefficient__attr_""""""""""""""""abs""""""""""""""""__coeff_74","calcium__fft_coefficient__attr_""""""""""""""""abs""""""""...",99.982152
"calcium__fft_coefficient__attr_""""""""""""""""abs""""""""""""""""__coeff_75","calcium__fft_coefficient__attr_""""""""""""""""abs""""""""...",99.982152
...,...,...
"alt__fft_coefficient__attr_""""""""""""""""angle""""""""""""""""__coeff_60","alt__fft_coefficient__attr_""""""""""""""""angle""""""""""""...",0.000000
"alt__fft_coefficient__attr_""""""""""""""""angle""""""""""""""""__coeff_59","alt__fft_coefficient__attr_""""""""""""""""angle""""""""""""...",0.000000
"alt__fft_coefficient__attr_""""""""""""""""abs""""""""""""""""__coeff_99","alt__fft_coefficient__attr_""""""""""""""""abs""""""""""""""""...",0.000000
"alt__fft_coefficient__attr_""""""""""""""""abs""""""""""""""""__coeff_98","alt__fft_coefficient__attr_""""""""""""""""abs""""""""""""""""...",0.000000


### Number of patients reaching ESKD within training period 

In [9]:
df['cat1'].value_counts()

non_eskd    4409
eskd        1194
Name: cat1, dtype: int64

In [10]:
df.set_index('id', inplace=True)

In [11]:
def to_category(df):
    cols = df.select_dtypes(include='object').columns
    for col in cols:
        df[col] = df[col].astype('category')
    return df

def drop_missing(df):
    threshold = len(df)*0.6
    df.dropna(axis=1, thresh=threshold, inplace=True)
    return df

def copy_df(df):
    return df.copy()

In [12]:
df_cleaned = (df.pipe(copy_df).pipe(to_category).pipe(drop_missing))
df_cleaned.shape

(5603, 8985)

In [6]:
df_rm_intrain = df_cleaned.query("cat1 == 'non_eskd'")
df_rm_intrain.shape

(4409, 8986)

### Number of patients reaching ESKD in at 1y post-training 

In [7]:
df_rm_intrain['cat2'].value_counts()

non_eskd    4282
eskd         127
Name: cat2, dtype: int64

### Number of patients reaching ESKD in 2y post-training

In [8]:
df_rm_intrain['cat3'].value_counts()

non_eskd    4174
eskd         235
Name: cat3, dtype: int64

### Number of patients reaching ESKD in 5y post-training

In [9]:
df_rm_intrain['cat6'].value_counts()

non_eskd    3918
eskd         491
Name: cat6, dtype: int64

In [12]:
percent_missing = df_rm_intrain.isnull().sum()*100 / len(df_rm_intrain)
missing_df_rm_intrain = pd.DataFrame({'column_name': df_rm_intrain.columns, 'percent_miss': percent_missing})

In [13]:
missing_df_rm_intrain.sort_values(by='percent_miss', ascending=False)

Unnamed: 0,column_name,percent_miss
caphos_product__autocorrelation__lag_1,caphos_product__autocorrelation__lag_1,47.153550
caphos_product__autocorrelation__lag_0,caphos_product__autocorrelation__lag_0,47.153550
caphos_product__fourier_entropy__bins_3,caphos_product__fourier_entropy__bins_3,47.130869
caphos_product__fourier_entropy__bins_2,caphos_product__fourier_entropy__bins_2,47.130869
caphos_product__fourier_entropy__bins_100,caphos_product__fourier_entropy__bins_100,47.130869
...,...,...
"albumin__fft_coefficient__attr_""""""""""""""""real""""""""""""""""__coeff_79","albumin__fft_coefficient__attr_""""""""""""""""real""""""...",0.000000
"albumin__fft_coefficient__attr_""""""""""""""""real""""""""""""""""__coeff_78","albumin__fft_coefficient__attr_""""""""""""""""real""""""...",0.000000
"albumin__fft_coefficient__attr_""""""""""""""""real""""""""""""""""__coeff_77","albumin__fft_coefficient__attr_""""""""""""""""real""""""...",0.000000
"albumin__fft_coefficient__attr_""""""""""""""""real""""""""""""""""__coeff_76","albumin__fft_coefficient__attr_""""""""""""""""real""""""...",0.000000


## Features and Target Input for Training 

In [14]:
dropped_cols = list(df_rm_intrain.columns[-18:])
dropped_cols.remove('age.init')

In [15]:
X = df_rm_intrain.drop(dropped_cols,axis =1).copy()
y = df_rm_intrain['cat6']
X.shape, y.shape

((4409, 8969), (4409,))

In [16]:
y_mapped = y.map({'non_eskd':'no', 'eskd':'yes'})
y_mapped

0       no
1       no
2       no
3       no
4       no
        ..
5595    no
5596    no
5597    no
5598    no
5600    no
Name: cat6, Length: 4409, dtype: object

In [18]:
lab_encode = LabelEncoder()
y_transformed = lab_encode.fit_transform(y_mapped)
y_transformed.shape

(4409,)

In [20]:
unique_values, counts = np.unique(y_transformed, return_counts=True)
ratio = dict(zip(unique_values, counts))
ratio

{0: 3918, 1: 491}

In [21]:
imbalance_ratio = ratio[0] / ratio[1]
imbalance_ratio

7.979633401221996

In [43]:
X_train, X_test, y_train, y_test = train_test_split(X, y_mapped, test_size=0.3, stratify=y_mapped, random_state=random_state)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3086, 8969), (1323, 8969), (3086,), (1323,))

In [58]:
category_cols = list(X.select_dtypes(include='object').columns)
numeric_cols = list(X.select_dtypes(include=['number']).columns)
one_hot = OneHotEncoder()
num_imputer = KNNImputer(n_neighbors=5)

In [None]:
one_hot()

In [62]:
categorical_transformer = Pipeline([('one_hot', one_hot)])
numerical_transformer = Pipeline([('num_imputer', num_imputer)])
transformer = ColumnTransformer([('categorical', categorical_transformer, category_cols), ('numerical', numerical_transformer, numeric_cols)], remainder='passthrough')

In [64]:
transformer.fit(X_train)

TypeError: Encoders require their input to be uniformly strings or numbers. Got ['int', 'str']

In [53]:
xgb_class = xgb.XGBClassifier(n_jobs=-1, random_state = random_state, n_estimators=1000, use_label_encoder=False, eval_metric='logloss', scale_pos_weight=imbalance_ratio)
sampler = SMOTETomek(sampling_strategy=0.5, random_state=random_state)

In [54]:
steps = [('data_input', transformer), ('resampling', sampler), ('model', xgb_class)]
pipeline = Pipeline(steps=steps)

In [55]:
scoring = {'acc': 'accuracy', 'prec_macro':'precision_macro', 'rec_macro':'recall_macro', 'roc':'roc_auc'}

In [32]:
scores = cross_validate(pipeline, X_train, y=y_train, scoring=scoring, cv=cv_method, n_jobs=-1)

In [36]:
scores.keys()

dict_keys(['fit_time', 'score_time', 'test_acc', 'test_prec_macro', 'test_rec_macro', 'test_roc'])

In [39]:
scores['test_rec_macro'], scores['test_prec_macro'], scores['test_roc'], scores['test_acc']

(array([       nan,        nan,        nan, 0.67107003,        nan]),
 array([       nan,        nan,        nan, 0.75130148,        nan]),
 array([       nan,        nan,        nan, 0.80738919,        nan]),
 array([       nan,        nan,        nan, 0.89951378,        nan]))

In [56]:
pipeline.fit(X_train, y_train)

TypeError: Encoders require their input to be uniformly strings or numbers. Got ['int', 'str']

In [41]:
y_pred = pipeline.predict(X_test)

ValueError: Found unknown categories [True] in column 1807 during transform

In [152]:
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[1113,   63],
       [  77,   70]])

In [149]:
roc_auc_score(y_test, y_pred_prob)

0.721938775510204

In [153]:
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.94      0.95      0.94      1176
           1       0.53      0.48      0.50       147

    accuracy                           0.89      1323
   macro avg       0.73      0.71      0.72      1323
weighted avg       0.89      0.89      0.89      1323



In [154]:
print(matthews_corrcoef(y_test, y_pred))

0.44168443409167757
