In [1]:
import numpy as np
import pandas as pd
import datatable as dt
import xgboost as xgb
import shap
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_validate, cross_val_score, cross_val_predict
from sklearn.metrics import matthews_corrcoef, confusion_matrix, roc_auc_score, classification_report
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline
from sklearn.metrics import recall_score, accuracy_score
from sklearn.impute import KNNImputer

In [2]:
# Seed reused by the stochastic steps of this notebook (e.g. the train/test split).
random_state = 7
# Stratified 5-fold splitter used for cross-validation.
# NOTE(review): `random_state` is not passed here and shuffling is not enabled,
# so folds are built in row order — confirm that this is intended.
cv_method = StratifiedKFold(n_splits=5)

In [4]:
# Parse the '.gz' file with datatable and convert straight to pandas, so `df`
# is only ever bound to a pandas DataFrame.
df = dt.fread('../data/testmin_2_8.gz').to_pandas()
df.shape

(4338, 596)

In [5]:
# Treat a missing AKI flag as 0.
# Assign the filled columns back explicitly: calling `fillna(..., inplace=True)`
# on `df[col]` is a chained in-place operation that warns on recent pandas and
# no longer updates `df` once Copy-on-Write is enabled.
aki_cols = ['aki_1', 'aki_2', 'aki_3']
df[aki_cols] = df[aki_cols].fillna(0)

In [6]:
# Frequency of each value in the 'gn' column, used to spot very rare classes.
df['gn'].value_counts()

no                 3892
igan                180
anca                 83
lupus nephritis      45
membranous           32
unspecific           29
fsgs                 27
mcd                  24
mp/mcgn              19
fgn/itg               5
pign                  2
Name: gn, dtype: int64

In [7]:
# Fold the two rarest 'gn' classes ('pign', 'fgn/itg') into 'unspecific'.
rare_gn_mask = df['gn'].isin(['pign', 'fgn/itg'])
df.loc[rare_gn_mask, 'gn'] = 'unspecific'

In [8]:
# Preview the first rows to sanity-check the loaded columns and values.
df.head()

Unnamed: 0,id,albumin__sum_values,albumin__median,albumin__mean,albumin__length,albumin__standard_deviation,albumin__variance,albumin__root_mean_square,albumin__maximum,albumin__minimum,...,cat3.5,cat4,cat4.5,cat5,cat5.5,cat6,cat6.5,cat7,cat7.5,cat8
0,3918,127.0,42.0,42.333333,3.0,0.471405,0.222222,42.335958,43.0,42.0,...,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd
1,3921,135.0,45.0,45.0,3.0,0.816497,0.666667,45.007407,46.0,44.0,...,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd
2,3924,118.0,38.0,39.333333,3.0,1.885618,3.555556,39.378505,42.0,38.0,...,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd
3,3930,284.0,40.0,40.571429,7.0,1.399708,1.959184,40.595566,43.0,39.0,...,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd,non_eskd
4,3934,,,,,,,,,,...,non_eskd,non_eskd,non_eskd,eskd,eskd,eskd,eskd,eskd,eskd,eskd


In [9]:
# Percentage of missing values per column, shown from most to least sparse.
percent_missing = df.isna().sum() * 100 / len(df)
missing_df = pd.DataFrame(
    {
        'column_name': df.columns,
        'percent_miss': percent_missing,
    }
)
missing_df.sort_values('percent_miss', ascending=False)

Unnamed: 0,column_name,percent_miss
timedprotein__length,timedprotein__length,95.458737
timedprotein__minimum,timedprotein__minimum,95.458737
timedprotein__maximum,timedprotein__maximum,95.458737
timedprotein__root_mean_square,timedprotein__root_mean_square,95.458737
timedprotein__variance,timedprotein__variance,95.458737
...,...,...
htn,htn,0.000000
aki_3,aki_3,0.000000
aki_2,aki_2,0.000000
aki_1,aki_1,0.000000


In [10]:
def to_category(df):
    """Cast every object-dtype column of ``df`` to pandas ``category`` dtype.

    The frame is modified in place and also returned, so the function can be
    used as a step in a ``DataFrame.pipe`` chain.
    """
    for name in df.select_dtypes(include='object').columns:
        df[name] = df[name].astype('category')
    return df

def drop_missing(df, min_non_null_frac=0.5):
    """Drop columns of ``df`` that have too few non-missing values.

    A column is kept only if at least ``min_non_null_frac`` of its rows are
    non-null. The frame is modified in place and also returned, so the
    function can be used as a step in a ``DataFrame.pipe`` chain.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prune (mutated in place).
    min_non_null_frac : float, default 0.5
        Minimum fraction of non-null rows a column needs in order to be kept.
        Must lie in ``[0, 1]``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with sparse columns removed.

    Raises
    ------
    ValueError
        If ``min_non_null_frac`` is outside ``[0, 1]``.
    """
    if not 0 <= min_non_null_frac <= 1:
        raise ValueError(
            f"min_non_null_frac must be between 0 and 1, got {min_non_null_frac!r}"
        )
    # `thresh` is documented as an integer count of required non-null values.
    # Rounding up keeps the same columns as comparing against the raw float,
    # because non-null counts are whole numbers.
    threshold = int(np.ceil(len(df) * min_non_null_frac))
    df.dropna(axis=1, thresh=threshold, inplace=True)
    return df

def copy_df(df):
    """Return a deep copy of ``df``, leaving the original frame untouched."""
    duplicate = df.copy(deep=True)
    return duplicate

In [11]:
# Copy first so the in-place cleaning helpers never touch the raw `df`.
df_cleaned = (
    df
    .pipe(copy_df)
    .pipe(drop_missing)
    .pipe(to_category)
)
df_cleaned.shape

(4338, 387)

In [12]:
# Summarise dtypes and memory use after dropping sparse columns and casting to category.
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4338 entries, 0 to 4337
Columns: 387 entries, id to cat8
dtypes: category(21), float64(364), int32(2)
memory usage: 12.2 MB


In [13]:
# Keep only rows whose 'cat2' label is 'non_eskd'.
non_eskd_at_cat2 = df_cleaned['cat2'] == 'non_eskd'
df_rm_intrain = df_cleaned[non_eskd_at_cat2]
df_rm_intrain.shape

(3159, 387)

In [14]:
# Promote 'id' to the index by rebinding rather than mutating in place.
# The guard keeps this cell re-runnable: once 'id' is the index it is no longer
# a column, and a second `set_index('id')` would raise KeyError.
if 'id' in df_rm_intrain.columns:
    df_rm_intrain = df_rm_intrain.set_index('id')

In [15]:
# Class balance of the 'cat3' label within the filtered cohort.
df_rm_intrain['cat3'].value_counts()

non_eskd    3038
eskd         121
Name: cat3, dtype: int64

In [16]:
# Class balance of the 'cat4' label within the filtered cohort.
df_rm_intrain['cat4'].value_counts()

non_eskd    2939
eskd         220
Name: cat4, dtype: int64

In [17]:
# Class balance of the 'cat7' label (the modelling target selected below).
df_rm_intrain['cat7'].value_counts()

non_eskd    2719
eskd         440
Name: cat7, dtype: int64

In [18]:
# Columns to exclude from the feature matrix: the trailing 18 columns of the
# frame, except 'age.init', which stays available as a feature.
# NOTE(review): this relies on column position — confirm the column order is stable.
dropped_cols = df_rm_intrain.columns[-18:].tolist()
dropped_cols.remove('age.init')
dropped_cols

['egfr.y',
 'cat0.5',
 'cat1',
 'cat1.5',
 'cat2',
 'cat2.5',
 'cat3',
 'cat3.5',
 'cat4',
 'cat4.5',
 'cat5',
 'cat5.5',
 'cat6',
 'cat6.5',
 'cat7',
 'cat7.5',
 'cat8']

In [19]:
# Target is the 'cat7' label; features are every column not listed in `dropped_cols`.
y = df_rm_intrain['cat7']
X = df_rm_intrain.drop(columns=dropped_cols).copy()
X.shape, y.shape

((3159, 369), (3159,))

In [20]:
# Relabel the target: 'non_eskd' -> 'no', 'eskd' -> 'yes'.
target_labels = {
    'non_eskd': 'no',
    'eskd': 'yes',
}
y_mapped = y.map(target_labels)
y_mapped

id
3918      no
3921      no
3924      no
3930      no
3934     yes
        ... 
23771     no
23775     no
23780     no
23823     no
23864     no
Name: cat7, Length: 3159, dtype: category
Categories (2, object): ['yes', 'no']

In [21]:
# Stratified 70/30 hold-out split, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_mapped,
    test_size=0.3,
    stratify=y_mapped,
    random_state=random_state,
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2211, 369), (948, 369), (2211,), (948,))

In [22]:
# Fit the encoder on the training labels only, then apply the same mapping to
# both splits.
lab_encode = LabelEncoder().fit(y_train)
y_train = lab_encode.transform(y_train)
y_test = lab_encode.transform(y_test)

In [23]:
# Count each encoded class in the training labels and express the imbalance as
# (count of class 0) / (count of class 1).
unique_values, counts = np.unique(y_train, return_counts=True)
ratio = {label: count for label, count in zip(unique_values, counts)}
imbalance_ratio = ratio[0] / ratio[1]
imbalance_ratio

6.178571428571429

In [24]:
# Split the feature names by dtype so each group can get its own preprocessing.
category_cols = X_train.select_dtypes(include='category').columns.tolist()
numeric_cols = X_train.select_dtypes(include=['int', 'float']).columns.tolist()

# Preprocessing building blocks: a KNN imputer and a one-hot encoder that
# ignores categories unseen during fit.
num_imputer = KNNImputer(n_neighbors=5)
one_hot = OneHotEncoder(handle_unknown='ignore')

len(category_cols), len(numeric_cols)

(5, 364)

In [25]:
# List the categorical feature names collected above.
category_cols

['aki_3', 'htn', 'dkd', 'gn', 'gender']