In [1]:
import numpy as np
import os, shutil
import pandas as pd

%config Completer.use_jedi = False

In [9]:
pd.set_option('display.max_columns', 30)
pd.set_option("display.max_rows", 15000)

## Creation of Features Matrix and Target

In [205]:
df = pd.read_csv('train_ready_eff_2_6.gz')
df.shape

(5318, 12511)

In [206]:
percent_missing = df.isnull().sum()*100 / len(df)
missing_df = pd.DataFrame({'column_name': df.columns, 'percent_miss': percent_missing})

In [207]:
missing_df.sort_values(by='percent_miss', ascending=False)

Unnamed: 0,column_name,percent_miss
creatinine__query_similarity_count__query_None__threshold_0.0,creatinine__query_similarity_count__query_None...,100.0
potassium__query_similarity_count__query_None__threshold_0.0,potassium__query_similarity_count__query_None_...,100.0
caphos_product__query_similarity_count__query_None__threshold_0.0,caphos_product__query_similarity_count__query_...,100.0
calcium__query_similarity_count__query_None__threshold_0.0,calcium__query_similarity_count__query_None__t...,100.0
haemoglobin__query_similarity_count__query_None__threshold_0.0,haemoglobin__query_similarity_count__query_Non...,100.0
ureacreat_ratio__query_similarity_count__query_None__threshold_0.0,ureacreat_ratio__query_similarity_count__query...,100.0
glucose__query_similarity_count__query_None__threshold_0.0,glucose__query_similarity_count__query_None__t...,100.0
urea__query_similarity_count__query_None__threshold_0.0,urea__query_similarity_count__query_None__thre...,100.0
chloride__query_similarity_count__query_None__threshold_0.0,chloride__query_similarity_count__query_None__...,100.0
albumin__query_similarity_count__query_None__threshold_0.0,albumin__query_similarity_count__query_None__t...,100.0


In [208]:
df['eskd_intraining'].value_counts()

non_eskd    4687
eskd         631
Name: eskd_intraining, dtype: int64

In [209]:
df.query("eskd_intraining != 'eskd'", inplace=True)
df.shape

(4687, 12511)

In [210]:
df.set_index('id', inplace=True)

In [211]:
def drop_missing(df):
    col_missing_thresh_col = len(df)*0.7
#     row_missing_thresh_col = len(df.columns)*0.3
    df.dropna(axis=1, thresh=col_missing_thresh_col, inplace=True)
#     df.dropna(axis=0, thresh=row_missing_thresh_col, inplace=True)
    
    return df

def to_category(df):
    cols = df.select_dtypes(include='object').columns
    for col in cols:
        df[col] = df[col].astype('category')
    return df

def copy_df(df):
    return df.copy()

In [212]:
df_cleaned = (df.pipe(copy_df).pipe(drop_missing).pipe(to_category))
df_cleaned.shape

(4687, 3841)

In [213]:
percent_missing = df_cleaned.isnull().sum()*100 / len(df)
missing_df_cleaned = pd.DataFrame({'column_name': df_cleaned.columns, 'percent_miss': percent_missing})
missing_df_cleaned.sort_values(by='percent_miss', ascending=False)

Unnamed: 0,column_name,percent_miss
wcc__autocorrelation__lag_2,wcc__autocorrelation__lag_2,29.827182
"wcc__cwt_coefficients__coeff_2__w_5__widths_(2, 5, 10, 20)",wcc__cwt_coefficients__coeff_2__w_5__widths_(2...,29.78451
"wcc__cwt_coefficients__coeff_2__w_2__widths_(2, 5, 10, 20)",wcc__cwt_coefficients__coeff_2__w_2__widths_(2...,29.78451
"wcc__cwt_coefficients__coeff_2__w_20__widths_(2, 5, 10, 20)",wcc__cwt_coefficients__coeff_2__w_20__widths_(...,29.78451
wcc__mean_second_derivative_central,wcc__mean_second_derivative_central,29.78451
wcc__skewness,wcc__skewness,29.78451
"wcc__cwt_coefficients__coeff_2__w_10__widths_(2, 5, 10, 20)",wcc__cwt_coefficients__coeff_2__w_10__widths_(...,29.78451
wcc__permutation_entropy__dimension_3__tau_1,wcc__permutation_entropy__dimension_3__tau_1,29.78451
"chloride__cwt_coefficients__coeff_2__w_5__widths_(2, 5, 10, 20)",chloride__cwt_coefficients__coeff_2__w_5__widt...,29.741839
"chloride__cwt_coefficients__coeff_2__w_10__widths_(2, 5, 10, 20)",chloride__cwt_coefficients__coeff_2__w_10__wid...,29.741839


In [214]:
df_cleaned['eskd_2y'].value_counts()

non_eskd    4589
eskd          98
Name: eskd_2y, dtype: int64

In [215]:
df_cleaned['eskd_5y'].value_counts()

non_eskd    4466
eskd         221
Name: eskd_5y, dtype: int64

In [216]:
imbalance_ratio = df_cleaned['eskd_5y'].value_counts().non_eskd / df_cleaned['eskd_5y'].value_counts().eskd
imbalance_ratio

20.20814479638009

In [217]:
df_cleaned_numeric = df_cleaned.select_dtypes(include=['int', 'float'])
inf_cols_name = list(df_cleaned_numeric.columns.to_series()[np.isinf(df_cleaned_numeric).any()])

In [218]:
df_cleaned.drop(inf_cols_name, axis=1, inplace=True)

In [219]:
X = df_cleaned.drop(['egfr.y', 'eskd_intraining', 'eskd_2y', 'eskd_5y'], axis=1)
y = df_cleaned['eskd_5y'].values

In [220]:
y = y.map({'non_eskd':'no', 'eskd': 'yes'})

In [221]:
y

['no', 'no', 'no', 'no', 'no', ..., 'no', 'no', 'no', 'no', 'no']
Length: 4687
Categories (2, object): ['yes', 'no']

In [222]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
category_cols = list(X.select_dtypes(include='category').columns)
one_hot = OneHotEncoder()
transformer = ColumnTransformer([('one_hot', one_hot, category_cols)],remainder='passthrough')
X_transformed = transformer.fit_transform(X)

In [223]:
from sklearn.preprocessing import LabelEncoder
lab_encode = LabelEncoder()

In [224]:
y_transformed = lab_encode.fit_transform(y)

In [225]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y_transformed, stratify=y_transformed,
                                                   test_size=0.3, random_state=7)

In [226]:
import xgboost as xgb
xgb_class = xgb.XGBClassifier(n_jobs=-1, random_state = 7, n_estimators=1000, scale_pos_weight=imbalance_ratio,
                             use_label_encoder=False)
xgb_class

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=1000, n_jobs=-1, num_parallel_tree=None,
              random_state=7, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=20.20814479638009, subsample=None,
              tree_method=None, use_label_encoder=False,
              validate_parameters=None, verbosity=None)

In [227]:
xgb_class.fit(X=X_train, y=y_train, eval_metric='error')

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=1000, n_jobs=-1, num_parallel_tree=1, random_state=7,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=20.20814479638009,
              subsample=1, tree_method='exact', use_label_encoder=False,
              validate_parameters=1, verbosity=None)

In [228]:
y_pred = xgb_class.predict(X_test)

In [229]:
from sklearn.metrics import roc_auc_score, classification_report, matthews_corrcoef, confusion_matrix

In [230]:
confusion_matrix(y_test, y_pred)

array([[1325,   16],
       [  55,   11]])

In [231]:
matthews_corrcoef(y_test, y_pred)

0.23848125836155976

In [232]:
params = {'n_estimators':[100, 250, 500, 750, 1000],
          'max_depth':[1, 5, 7, 10, 15, 20, 50, 100] ,
          'learning_rate':[0.001, 0.01, 0.1, 1, 10],
          'subsample':[0.3, 0.5, 0.7, 0.9],
          'colsample_bytree':[0.3, 0.5, 0.7, 0.9],
          'scale_pos_weight':[10, 20, 30, 40],
}

In [233]:
from sklearn.model_selection import StratifiedKFold, GridSearchCV

In [235]:
cv_method = StratifiedKFold(n_splits=5)

In [237]:
model_xgb = GridSearchCV(estimator=xgb_class, cv= cv_method, param_grid=params)

In [None]:
model_xgb.fit(X_train, y_train)











































In [None]:
model_xgb.best_params_

In [29]:
y_pred = model_xgb.predict(X_test)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [32]:
conf_matrix = confusion_matrix(y_test, y_pred)
conf_matrix

array([[1233,    2],
       [  36,    1]])

In [33]:
mcc = matthews_corrcoef(y_test,y_pred=y_pred)
mcc

0.08802508244377058

### SMAC Hyperparameter Tuning