In [1]:
# Importing the libraries
import pandas as pd
import numpy as np

from math import nan
from sklearn.utils import resample
from sklearn.model_selection import train_test_split,cross_val_score,StratifiedKFold,RandomizedSearchCV
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from scikit_posthocs import posthoc_nemenyi_friedman

import shap
from scipy import stats

In [2]:
# load data and see how it looks like
train = pd.read_csv('train', index_col = 0)
train

Unnamed: 0,AGE,SEX,INF_ANAM,STENOK_AN,FK_STENOK,IBS_POST,GB,SIM_GIPERT,DLIT_AG,ZSN_A,...,LID_KB,NITR_S,LID_S_n,B_BLOK_S_n,ANT_CA_S_n,GEPAR_S_n,ASP_S_n,TIKL_S_n,TRENT_S_n,LET_IS
1414,60.0,0,0.0,5.0,2.0,1.0,2.0,0.0,6.0,0.0,...,,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0
227,57.0,0,1.0,1.0,2.0,2.0,2.0,0.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0
280,52.0,1,1.0,0.0,0.0,2.0,2.0,0.0,2.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0
1538,73.0,0,1.0,6.0,3.0,,2.0,0.0,6.0,4.0,...,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1
1418,78.0,0,0.0,2.0,2.0,2.0,2.0,0.0,,0.0,...,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
274,55.0,1,3.0,0.0,0.0,2.0,2.0,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0
238,73.0,1,1.0,4.0,2.0,2.0,2.0,0.0,,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0
1444,65.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1
409,75.0,0,0.0,0.0,0.0,2.0,2.0,0.0,2.0,0.0,...,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0


In [3]:
train.isnull().sum()

AGE           6
SEX           0
INF_ANAM      4
STENOK_AN    90
FK_STENOK    62
             ..
GEPAR_S_n    12
ASP_S_n      12
TIKL_S_n     11
TRENT_S_n    11
LET_IS        0
Length: 99, dtype: int64

In [4]:
newtrain = train.loc[:, (train.isnull().sum(axis=0) < 1700/2)]
newtrain.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1360 entries, 1414 to 444
Data columns (total 99 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   AGE            1354 non-null   float64
 1   SEX            1360 non-null   int64  
 2   INF_ANAM       1356 non-null   float64
 3   STENOK_AN      1270 non-null   float64
 4   FK_STENOK      1298 non-null   float64
 5   IBS_POST       1317 non-null   float64
 6   GB             1353 non-null   float64
 7   SIM_GIPERT     1353 non-null   float64
 8   DLIT_AG        1165 non-null   float64
 9   ZSN_A          1315 non-null   float64
 10  nr_11          1342 non-null   float64
 11  nr_01          1342 non-null   float64
 12  nr_02          1342 non-null   float64
 13  nr_03          1342 non-null   float64
 14  nr_04          1342 non-null   float64
 15  nr_07          1342 non-null   float64
 16  nr_08          1342 non-null   float64
 17  np_01          1346 non-null   float64
 18  np_04 

In [5]:
newtrain.fillna(newtrain[['AGE', 'INF_ANAM', 'STENOK_AN', 'FK_STENOK', 'IBS_POST', 'GB', 'DLIT_AG', 'ZSN_A',
                         'S_AD_ORIT', 'D_AD_ORIT', 'ant_im', 'lat_im', 'inf_im', 'post_im', 'K_BLOOD',
                         'NA_BLOOD', 'ALT_BLOOD', 'AST_BLOOD', 'L_BLOOD', 'ROE', 'TIME_B_S']].median(), inplace=True)

In [6]:
for column in newtrain[newtrain.columns[~newtrain.columns.isin(['AGE', 'INF_ANAM', 'STENOK_AN', 'FK_STENOK', 'IBS_POST', 'GB', 'DLIT_AG', 'ZSN_A',
                         'S_AD_ORIT', 'D_AD_ORIT', 'ant_im', 'lat_im', 'inf_im', 'post_im', 'K_BLOOD',
                         'NA_BLOOD', 'ALT_BLOOD', 'AST_BLOOD', 'L_BLOOD', 'ROE', 'TIME_B_S'])]]:
    newtrain[column].fillna(newtrain[column].mode()[0], inplace=True)

In [7]:
newtrain.isnull().sum()

AGE          0
SEX          0
INF_ANAM     0
STENOK_AN    0
FK_STENOK    0
            ..
GEPAR_S_n    0
ASP_S_n      0
TIKL_S_n     0
TRENT_S_n    0
LET_IS       0
Length: 99, dtype: int64

In [8]:
# Load data and see how it looks like
test = pd.read_csv('test', index_col = 0)
test

Unnamed: 0,AGE,SEX,INF_ANAM,STENOK_AN,FK_STENOK,IBS_POST,GB,SIM_GIPERT,DLIT_AG,ZSN_A,...,LID_KB,NITR_S,LID_S_n,B_BLOK_S_n,ANT_CA_S_n,GEPAR_S_n,ASP_S_n,TIKL_S_n,TRENT_S_n,LET_IS
455,50.0,1,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0
1499,64.0,0,0.0,5.0,3.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1
1045,61.0,1,1.0,5.0,2.0,1.0,2.0,0.0,6.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0
187,75.0,0,0.0,6.0,1.0,2.0,2.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0
1472,62.0,0,0.0,6.0,2.0,1.0,3.0,0.0,6.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,51.0,1,0.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,...,,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0
207,76.0,0,0.0,1.0,2.0,2.0,2.0,0.0,7.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0
779,83.0,0,1.0,2.0,2.0,1.0,3.0,0.0,,0.0,...,,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0
954,76.0,1,0.0,4.0,2.0,2.0,2.0,0.0,,2.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0


In [9]:
newtest = test.loc[:, (test.isnull().sum(axis=0) < 1700/2)]
newtest.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 340 entries, 455 to 1428
Data columns (total 99 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   AGE            338 non-null    float64
 1   SEX            340 non-null    int64  
 2   INF_ANAM       340 non-null    float64
 3   STENOK_AN      324 non-null    float64
 4   FK_STENOK      329 non-null    float64
 5   IBS_POST       332 non-null    float64
 6   GB             338 non-null    float64
 7   SIM_GIPERT     339 non-null    float64
 8   DLIT_AG        287 non-null    float64
 9   ZSN_A          331 non-null    float64
 10  nr_11          337 non-null    float64
 11  nr_01          337 non-null    float64
 12  nr_02          337 non-null    float64
 13  nr_03          337 non-null    float64
 14  nr_04          337 non-null    float64
 15  nr_07          337 non-null    float64
 16  nr_08          337 non-null    float64
 17  np_01          336 non-null    float64
 18  np_04  

In [10]:
newtest.fillna(newtrain[['AGE', 'INF_ANAM', 'STENOK_AN', 'FK_STENOK', 'IBS_POST', 'GB', 'DLIT_AG', 'ZSN_A',
                         'S_AD_ORIT', 'D_AD_ORIT', 'ant_im', 'lat_im', 'inf_im', 'post_im', 'K_BLOOD',
                         'NA_BLOOD', 'ALT_BLOOD', 'AST_BLOOD', 'L_BLOOD', 'ROE', 'TIME_B_S']].median(), inplace=True)

In [11]:
for column in newtest[newtest.columns[~newtest.columns.isin(['AGE', 'INF_ANAM', 'STENOK_AN', 'FK_STENOK', 'IBS_POST', 'GB', 'DLIT_AG', 'ZSN_A',
                         'S_AD_ORIT', 'D_AD_ORIT', 'ant_im', 'lat_im', 'inf_im', 'post_im', 'K_BLOOD',
                         'NA_BLOOD', 'ALT_BLOOD', 'AST_BLOOD', 'L_BLOOD', 'ROE', 'TIME_B_S'])]]:
    newtest[column].fillna(newtest[column].mode()[0], inplace=True)

In [12]:
newtest.isnull().sum()

AGE          0
SEX          0
INF_ANAM     0
STENOK_AN    0
FK_STENOK    0
            ..
GEPAR_S_n    0
ASP_S_n      0
TIKL_S_n     0
TRENT_S_n    0
LET_IS       0
Length: 99, dtype: int64

In [13]:
# separate x_train and y_train from train (with imputation)
X_trainimp = newtrain.loc[:, newtrain.columns!='LET_IS']
y_trainimp = pd.Series(newtrain['LET_IS'])

# separate x_test and y_test from test (with imputation)
X_testimp = newtest.loc[:, newtest.columns!='LET_IS']
y_testimp = pd.Series(newtest['LET_IS'])

# check the first class is 0 (alive), and the second class is 1 (dead). The result shouldn't change
y_trainimp.value_counts()
y_testimp.value_counts()

0    286
1     54
Name: LET_IS, dtype: int64

In [14]:
# separate x_train and y_train from train (without imputation)
X_train = train.loc[:, train.columns!='LET_IS']
y_train = pd.Series(train['LET_IS'])

# separate x_test and y_test from test (without imputation)
X_test = test.loc[:, test.columns!='LET_IS']
y_test = pd.Series(test['LET_IS'])

# check the first class is 0 (alive), and the second class is 1 (dead)
y_train.value_counts()
y_test.value_counts()

0    286
1     54
Name: LET_IS, dtype: int64

In [15]:
# XGBoost, our best model
the_bestmodel = xgb.XGBClassifier(
    base_score=None, 
    booster=None, 
    colsample_bylevel=None,
    colsample_bynode=None,
    colsample_bytree=0.9,
    enable_categorical=False, 
    gamma=0.4, 
    gpu_id=None,
    importance_type=None, 
    interaction_constraints=None,
    learning_rate=None,
    max_delta_step=None, 
    max_depth=7,
    min_child_weight=3, 
    missing=nan, 
    monotone_constraints=None,
    n_estimators=100, 
    n_jobs=None, 
    num_parallel_tree=None,
    predictor=None, 
    random_state=1, 
    reg_alpha=1e-05,
    reg_lambda=None, 
    scale_pos_weight=None, 
    subsample=0.7,
    tree_method=None, 
    use_label_encoder=False,
    validate_parameters=None,
    verbosity=None
)

In [16]:
# The best model using imputed data
the_bestmodel.fit(X_trainimp,y_trainimp)
yhat_imp = the_bestmodel.predict(X_testimp)
f1score_imp = f1_score(yhat_imp,y_testimp)
print(f1score_imp)
print(X_testimp)

df_imp = pd.DataFrame(data = X_testimp, columns = ['AGE', 'SEX',
                                            'INF_ANAM', 'STENOK_AN',
                                            'FK_STENOK', 'IBS_POST',
                                            'GB', 'SIM_GIPERT', 'DLIT_AG', 
                                            'ZSN_A', 'nr_11', 'nr_01', 'nr_02',
                                            'nr_03', 'nr_04', 'nr_07', 'nr_08',
                                            'np_01', 'np_04', 'np_05', 'np_07',
                                            'np_08', 'np_09', 'np_10', 'endocr_01',
                                            'endocr_02', 'endocr_03', 'zab_leg_01',
                                            'zab_leg_02', 'zab_leg_03', 'zab_leg_04',
                                            'zab_leg_06', 'S_AD_ORIT', 'D_AD_ORIT',
                                            'O_L_POST', 'K_SH_POST', 'MP_TP_POST', 
                                            'SVT_POST', 'GT_POST', 'FIB_G_POST',
                                            'ant_im', 'lat_im', 'inf_im', 'post_im',
                                            'IM_PG_P', 'ritm_ecg_p_01', 'ritm_ecg_p_02',
                                            'ritm_ecg_p_04', 'ritm_ecg_p_06', 'ritm_ecg_p_07',
                                            'ritm_ecg_p_08', 'n_r_ecg_p_01', 'n_r_ecg_p_02',
                                            'n_r_ecg_p_03', 'n_r_ecg_p_04', 'n_r_ecg_p_05',
                                            'n_r_ecg_p_06', 'n_r_ecg_p_08', 'n_r_ecg_p_09', 
                                            'n_r_ecg_p_10', 'n_p_ecg_p_01', 'n_p_ecg_p_03',
                                            'n_p_ecg_p_04', 'n_p_ecg_p_05', 'n_p_ecg_p_06',
                                            'n_p_ecg_p_07', 'n_p_ecg_p_08', 'n_p_ecg_p_09',
                                            'n_p_ecg_p_10', 'n_p_ecg_p_11', 'n_p_ecg_p_12',
                                            'fibr_ter_01', 'fibr_ter_02', 'fibr_ter_03',
                                            'fibr_ter_05', 'fibr_ter_06', 'fibr_ter_07',
                                            'fibr_ter_08', 'GIPO_K', 'K_BLOOD',
                                            'GIPER_NA', 'NA_BLOOD', 'ALT_BLOOD',
                                            'AST_BLOOD', 'L_BLOOD', 'ROE', 'TIME_B_S',
                                            'NA_KB',
                                            'NOT_NA_KB', 
                                            'LID_KB', 'NITR_S',
                                            'LID_S_n', 'B_BLOK_S_n',
                                            'ANT_CA_S_n', 'GEPAR_S_n',
                                            'ASP_S_n', 'TIKL_S_n', 'TRENT_S_n'])

df_imp

0.5111111111111111
       AGE  SEX  INF_ANAM  STENOK_AN  FK_STENOK  IBS_POST   GB  SIM_GIPERT  \
455   50.0    1       0.0        0.0        0.0       2.0  0.0         0.0   
1499  64.0    0       0.0        5.0        3.0       2.0  0.0         0.0   
1045  61.0    1       1.0        5.0        2.0       1.0  2.0         0.0   
187   75.0    0       0.0        6.0        1.0       2.0  2.0         0.0   
1472  62.0    0       0.0        6.0        2.0       1.0  3.0         0.0   
...    ...  ...       ...        ...        ...       ...  ...         ...   
749   51.0    1       0.0        1.0        2.0       1.0  0.0         0.0   
207   76.0    0       0.0        1.0        2.0       2.0  2.0         0.0   
779   83.0    0       1.0        2.0        2.0       1.0  3.0         0.0   
954   76.0    1       0.0        4.0        2.0       2.0  2.0         0.0   
1428  76.0    0       1.0        1.0        2.0       1.0  3.0         0.0   

      DLIT_AG  ZSN_A  ...  NOT_NA_KB  LID_KB

Unnamed: 0,AGE,SEX,INF_ANAM,STENOK_AN,FK_STENOK,IBS_POST,GB,SIM_GIPERT,DLIT_AG,ZSN_A,...,NOT_NA_KB,LID_KB,NITR_S,LID_S_n,B_BLOK_S_n,ANT_CA_S_n,GEPAR_S_n,ASP_S_n,TIKL_S_n,TRENT_S_n
455,50.0,1,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
1499,64.0,0,0.0,5.0,3.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
1045,61.0,1,1.0,5.0,2.0,1.0,2.0,0.0,6.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
187,75.0,0,0.0,6.0,1.0,2.0,2.0,0.0,7.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1472,62.0,0,0.0,6.0,2.0,1.0,3.0,0.0,6.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,51.0,1,0.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
207,76.0,0,0.0,1.0,2.0,2.0,2.0,0.0,7.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
779,83.0,0,1.0,2.0,2.0,1.0,3.0,0.0,3.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
954,76.0,1,0.0,4.0,2.0,2.0,2.0,0.0,3.0,2.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [6]:
# The best model using not imputed data
the_bestmodel.fit(X_train,y_train)
yhat = the_bestmodel.predict(X_test)
f1score = f1_score(yhat,y_test)
print(f1score)
print(X_test)

df = pd.DataFrame(data = X_test, columns = ['AGE', 'SEX',
                                            'INF_ANAM', 'STENOK_AN',
                                            'FK_STENOK', 'IBS_POST',
                                            'GB', 'SIM_GIPERT', 'DLIT_AG', 
                                            'ZSN_A', 'nr_11', 'nr_01', 'nr_02',
                                            'nr_03', 'nr_04', 'nr_07', 'nr_08',
                                            'np_01', 'np_04', 'np_05', 'np_07',
                                            'np_08', 'np_09', 'np_10', 'endocr_01',
                                            'endocr_02', 'endocr_03', 'zab_leg_01',
                                            'zab_leg_02', 'zab_leg_03', 'zab_leg_04',
                                            'zab_leg_06', 'S_AD_ORIT', 'D_AD_ORIT',
                                            'O_L_POST', 'K_SH_POST', 'MP_TP_POST', 
                                            'SVT_POST', 'GT_POST', 'FIB_G_POST',
                                            'ant_im', 'lat_im', 'inf_im', 'post_im',
                                            'IM_PG_P', 'ritm_ecg_p_01', 'ritm_ecg_p_02',
                                            'ritm_ecg_p_04', 'ritm_ecg_p_06', 'ritm_ecg_p_07',
                                            'ritm_ecg_p_08', 'n_r_ecg_p_01', 'n_r_ecg_p_02',
                                            'n_r_ecg_p_03', 'n_r_ecg_p_04', 'n_r_ecg_p_05',
                                            'n_r_ecg_p_06', 'n_r_ecg_p_08', 'n_r_ecg_p_09', 
                                            'n_r_ecg_p_10', 'n_p_ecg_p_01', 'n_p_ecg_p_03',
                                            'n_p_ecg_p_04', 'n_p_ecg_p_05', 'n_p_ecg_p_06',
                                            'n_p_ecg_p_07', 'n_p_ecg_p_08', 'n_p_ecg_p_09',
                                            'n_p_ecg_p_10', 'n_p_ecg_p_11', 'n_p_ecg_p_12',
                                            'fibr_ter_01', 'fibr_ter_02', 'fibr_ter_03',
                                            'fibr_ter_05', 'fibr_ter_06', 'fibr_ter_07',
                                            'fibr_ter_08', 'GIPO_K', 'K_BLOOD',
                                            'GIPER_NA', 'NA_BLOOD', 'ALT_BLOOD',
                                            'AST_BLOOD', 'L_BLOOD', 'ROE', 'TIME_B_S',
                                            'NA_KB',
                                            'NOT_NA_KB', 
                                            'LID_KB', 'NITR_S',
                                            'LID_S_n', 'B_BLOK_S_n',
                                            'ANT_CA_S_n', 'GEPAR_S_n',
                                            'ASP_S_n', 'TIKL_S_n', 'TRENT_S_n'])

df

0.6222222222222222
       AGE  SEX  INF_ANAM  STENOK_AN  FK_STENOK  IBS_POST   GB  SIM_GIPERT  \
455   50.0    1       0.0        0.0        0.0       2.0  0.0         0.0   
1499  64.0    0       0.0        5.0        3.0       2.0  0.0         0.0   
1045  61.0    1       1.0        5.0        2.0       1.0  2.0         0.0   
187   75.0    0       0.0        6.0        1.0       2.0  2.0         0.0   
1472  62.0    0       0.0        6.0        2.0       1.0  3.0         0.0   
...    ...  ...       ...        ...        ...       ...  ...         ...   
749   51.0    1       0.0        1.0        2.0       1.0  0.0         0.0   
207   76.0    0       0.0        1.0        2.0       2.0  2.0         0.0   
779   83.0    0       1.0        2.0        2.0       1.0  3.0         0.0   
954   76.0    1       0.0        4.0        2.0       2.0  2.0         0.0   
1428  76.0    0       1.0        1.0        2.0       1.0  3.0         0.0   

      DLIT_AG  ZSN_A  ...  NOT_NA_KB  LID_KB

Unnamed: 0,AGE,SEX,INF_ANAM,STENOK_AN,FK_STENOK,IBS_POST,GB,SIM_GIPERT,DLIT_AG,ZSN_A,...,NOT_NA_KB,LID_KB,NITR_S,LID_S_n,B_BLOK_S_n,ANT_CA_S_n,GEPAR_S_n,ASP_S_n,TIKL_S_n,TRENT_S_n
455,50.0,1,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
1499,64.0,0,0.0,5.0,3.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
1045,61.0,1,1.0,5.0,2.0,1.0,2.0,0.0,6.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
187,75.0,0,0.0,6.0,1.0,2.0,2.0,0.0,7.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1472,62.0,0,0.0,6.0,2.0,1.0,3.0,0.0,6.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,51.0,1,0.0,1.0,2.0,1.0,0.0,0.0,0.0,0.0,...,,,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
207,76.0,0,0.0,1.0,2.0,2.0,2.0,0.0,7.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
779,83.0,0,1.0,2.0,2.0,1.0,3.0,0.0,,0.0,...,,,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
954,76.0,1,0.0,4.0,2.0,2.0,2.0,0.0,,2.0,...,,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [8]:
# Alive result (without imputed data)

# force_plot() takes three values: the base value (explainer.expected_value), the SHAP values (shap_values) and the matrix of 
# feature values (X_test)
# The base value (expected value) which is around 0.84, is the average target value of the model output over the training data (X_train)
# The output value, which is 1, is the prediction for that observation (the prediction of the first row in X_test)
# Red/blue: Features that push the prediction higher (to the right) are shown in red, and those pushing the prediction lower are in blue.
# From the result, for example, we can see that the TIME_B_S and AGE are both red. They both have a positive impact on pushing 
# the model higher.  
# However, S_AD_ORIT has the biggest impact compared with other features.

shap.initjs()
explainer = shap.KernelExplainer(the_bestmodel.predict_proba,X_train)
shap_values = explainer.shap_values(df.iloc[0,:])
shap.force_plot(explainer.expected_value[0], shap_values[0], df.iloc[0,:])


Using 1360 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


In [18]:
# Alive result (with imputed data)

# The result is a little different after using imputed data.

# The base value (expected value) which is around 0.84, is the average target value of the model output over the training data (X_train)
# It is the same.

# The output value, which is 1, is the prediction for that observation (the prediction of the first row in X_test). It is the same.

# Red/blue: Features that push the prediction higher (to the right) are shown in red, and those pushing the prediction lower are in blue.
# From the result, for example, we can see now there is a feature, ant_im is blue. 
# ant_im has a negative impact on pushing the model lower.  
# It is different from the result withour imputed data. The previous result has no blue.

shap.initjs()
explainer = shap.KernelExplainer(the_bestmodel.predict_proba,X_trainimp)
shap_values = explainer.shap_values(df_imp.iloc[0,:])
shap.force_plot(explainer.expected_value[0], shap_values[0], df_imp.iloc[0,:])

Using 1360 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


In [18]:
# Alive result (without imputed data)

# The global force plot allows us to see shap values made-up by record. X shows each record, and positive shap values show in red
# and negative shap values are blue.
# Here I chose the X axis as the sample order by output value, and the y axis as the prediction (f(x))
# When I moved the X axis to 1 and click the diagram, there's a popup message saying 'This original index of the sample you clicked is 94'
# This sample has more red contribution than the blue ones, the prediction for this record would be higher than the expected value.

shap_values = explainer.shap_values(df)
shap.force_plot(explainer.expected_value[0], shap_values[0], df)

  0%|          | 0/340 [00:00<?, ?it/s]

In [19]:
# Alive result (with imputed data)

# The global force plot allows us to see shap values made-up by record. X shows each record, and positive shap values show in red
# and negative shap values are blue.
# It is similar with the non-imputed data.

shap_values = explainer.shap_values(df_imp)
shap.force_plot(explainer.expected_value[0], shap_values[0], df_imp)

  0%|          | 0/340 [00:00<?, ?it/s]

In [19]:
# Dead result (without imputed data)

# As the description before. Shap values closer to zero means the feature contributes little to the prediction. 
# The base value (expected value) which is around 0.16, is the average target value of the model output over the training data (X_train)
# The output value, which is 0, is the prediction for that observation (the prediction of the first row in X_test)
# Red/blue: Features that push the prediction higher (to the right) are shown in red, and those pushing the prediction lower are in blue.
# From the result, only ant_im (ECG changes in leads V1–V4) has the positive impact, and other features push the prediction lower.
# It is interesting to observe that compared with the alive result, most results are in the opposite direction, 
# like S_AD_ORIT (Systolic blood pressure according to intensive care unit).

shap.initjs()
explainer = shap.KernelExplainer(the_bestmodel.predict_proba,X_train)
shap_values = explainer.shap_values(df.iloc[0,:])
shap.force_plot(explainer.expected_value[1], shap_values[1], df.iloc[0,:])

Using 1360 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


In [20]:
# Dead result (with imputed data)

# The pattern of the result looks similar with the previous one. Only ant_im is red, while other features are blue. However, the 
# features which push the prediction lower changed. For example, TIME_B_S has the biggest effect on pushing the prediction lower.

shap.initjs()
explainer = shap.KernelExplainer(the_bestmodel.predict_proba,X_trainimp)
shap_values = explainer.shap_values(df_imp.iloc[0,:])
shap.force_plot(explainer.expected_value[1], shap_values[1], df_imp.iloc[0,:])

Using 1360 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


In [20]:
# The second class (1, dead) without imputed data.

# When I moved the X axis to 1 and click the diagram, there's a popup message saying 'This original index of the sample you clicked is 117'
# This sample has more positive red contributions than blue ones, the prediction for this record would be higher than the expected value.
# The graph looks very different from the previous class result.

shap_values = explainer.shap_values(df)
shap.force_plot(explainer.expected_value[1], shap_values[1], df)

  0%|          | 0/340 [00:00<?, ?it/s]

In [24]:
# The second  class (1, dead) with imputed data.

# The global force plot allows us to see shap values made-up by record. X shows each record, and positive shap values show in red
# and negative shap values are blue.
# It is similar with the without imputed data.

shap_values = explainer.shap_values(df_imp)
shap.force_plot(explainer.expected_value[1], shap_values[1], df_imp)

  0%|          | 0/340 [00:00<?, ?it/s]

## Group project step 9
## Discussion and Limitations: to comment on your results and to enumerate the limitations and future work.

1. Different ways to do feature selection will also change the result of feature selection. However, we've done our best to do feature selection.
2. XGBoost got the best result, but it took a long time.
3. It would be better to compare the SHAP result using the data that only includes features in feature selection. Because it is not easy to see the features impact on global force, I think it is because there are too many features.
4. It seems the SHAP result didn't save, so we need to rerun the code every time. It takes more time to finish and confirm the result.
5. Changing the XGBoost hyperparameters may still change the result of SHAP.
6. We didn't have time to compare the result using feature selection results in the dataset. The performance for the classifier and SHAP must change if using that. Therefore, it is the future work.
7. Everyone tried to do their best to finish the project!