In [45]:
import pandas as pd
import numpy as np
import impyute as impy
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import copy
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from scipy import stats
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score
from scikit_posthocs import posthoc_nemenyi_friedman

In [33]:
mi=pd.read_csv('Myocardial infarction complications Database.csv')
drop_columns = ['ID',
               'R_AB_1_n', 'R_AB_2_n', 'R_AB_3_n', 'NA_R_1_n', 'NA_R_2_n',
               'NA_R_3_n', 'NOT_NA_1_n', 'NOT_NA_2_n', 'NOT_NA_3_n',
               'FIBR_PREDS', 'PREDS_TAH', 'JELUD_TAH', 'FIBR_JELUD', 'A_V_BLOK',
               'OTEK_LANC', 'RAZRIV', 'DRESSLER', 'ZSN', 'REC_IM', 'P_IM_STEN']

mi = mi.replace({'LET_IS': [i for i in range(1, 8)]}, 1)
mi = mi.drop(columns=drop_columns)
drop_columns = ['IBS_NASL','D_AD_KBRIG','S_AD_KBRIG','KFK_BLOOD' ]

mi = mi.drop(columns=drop_columns)
binary=['SEX',
 'SIM_GIPERT',
 'nr_11',
 'nr_01',
 'nr_02',
 'nr_03',
 'nr_04',
 'nr_07',
 'nr_08',
 'np_01',
 'np_04',
 'np_05',
 'np_07',
 'np_08',
 'np_09',
 'np_10',
 'endocr_01',
 'endocr_02',
 'endocr_03',
 'zab_leg_01',
 'zab_leg_02',
 'zab_leg_03',
 'zab_leg_04',
 'zab_leg_06',
 'O_L_POST',
 'K_SH_POST',
 'MP_TP_POST',
 'SVT_POST',
 'GT_POST',
 'FIB_G_POST',
 'IM_PG_P',
 'ritm_ecg_p_01',
 'ritm_ecg_p_02',
 'ritm_ecg_p_04',
 'ritm_ecg_p_06',
 'ritm_ecg_p_07',
 'ritm_ecg_p_08',
 'n_r_ecg_p_01',
 'n_r_ecg_p_02',
 'n_r_ecg_p_03',
 'n_r_ecg_p_04',
 'n_r_ecg_p_05',
 'n_r_ecg_p_06',
 'n_r_ecg_p_08',
 'n_r_ecg_p_09',
 'n_r_ecg_p_10',
 'n_p_ecg_p_01',
 'n_p_ecg_p_03',
 'n_p_ecg_p_04',
 'n_p_ecg_p_05',
 'n_p_ecg_p_06',
 'n_p_ecg_p_07',
 'n_p_ecg_p_08',
 'n_p_ecg_p_09',
 'n_p_ecg_p_10',
 'n_p_ecg_p_11',
 'n_p_ecg_p_12',
 'fibr_ter_01',
 'fibr_ter_02',
 'fibr_ter_03',
 'fibr_ter_05',
 'fibr_ter_06',
 'fibr_ter_07',
 'fibr_ter_08',
 'GIPO_K',
 'GIPER_NA',
 'NA_KB',
 'NOT_NA_KB',
 'LID_KB',
 'NITR_S',
 'LID_S_n',
 'B_BLOK_S_n',
 'ANT_CA_S_n',
 'GEPAR_S_n',
 'ASP_S_n',
 'TIKL_S_n',
 'TRENT_S_n']
cato=['INF_ANAM',
 'STENOK_AN',
 'FK_STENOK',
 'IBS_POST',
 'GB',
 'DLIT_AG',
 'ZSN_A',
 'ant_im',
 'lat_im',
 'inf_im',
 'post_im',
 'TIME_B_S']
continuous=['AGE',
 'S_AD_ORIT',
 'D_AD_ORIT',
 'K_BLOOD',
 'NA_BLOOD',
 'ALT_BLOOD',
 'AST_BLOOD',
 'L_BLOOD',
 'ROE']
numerical=cato+continuous 

In [34]:
X = mi.loc[:, mi.columns!='LET_IS']
y = mi['LET_IS']
X_train, X_rem, y_train, y_rem = train_test_split(X,y, train_size=0.6)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, test_size=0.5)

### Imputation Exploration

In [35]:
# We need to find the imputation method that works best for our dataset
# We will test the performace of a number of base classifiers against different methods and pick the best one
# Base classiers
neigh = KNeighborsClassifier()
log = LogisticRegression(random_state=1,max_iter=10000)
dt= DecisionTreeClassifier(random_state=1)
svm=SVC(random_state=1)
rf=RandomForestClassifier(random_state=1)
classifers=[neigh,log,dt,svm,rf]
classiferss=['KNN','Logistic Regression','Decision Tree','Support Vector Machine','Random Forest']

In [37]:
#Method 1
imp = IterativeImputer(random_state=1)
X_train_imp=pd.DataFrame(imp.fit_transform(X_train),index=X_train.index,columns=X_train.columns)
X_valid_imp=pd.DataFrame(imp.fit_transform(X_valid),index=X_valid.index,columns=X_valid.columns)

m1=''
for i in range(len(classifers)):
    result=classifers[i].fit(X_train_imp,y_train)
    y_pred=result.predict(X_valid_imp)
    f1score=f1_score(y_valid,y_pred)
    recall=recall_score(y_valid,y_pred)
    m1=m1+str("%s: F1Score: %f Recall: %f\n"%(classiferss[i],f1score,recall))



In [38]:
#Method 2
imp = KNNImputer(n_neighbors=5)
X_train_imp=pd.DataFrame(imp.fit_transform(X_train),index=X_train.index,columns=X_train.columns)
X_valid_imp=pd.DataFrame(imp.fit_transform(X_valid),index=X_valid.index,columns=X_valid.columns)

m2=''
for i in range(len(classifers)):
    result=classifers[i].fit(X_train_imp,y_train)
    y_pred=result.predict(X_valid_imp)
    f1score=f1_score(y_valid,y_pred)
    recall=recall_score(y_valid,y_pred)
    m2=m2+str("%s: F1Score: %f Recall: %f\n"%(classiferss[i],f1score,recall))

In [39]:
#Method 3
X_train_imp=copy.deepcopy(X_train)
X_valid_imp=copy.deepcopy(X_valid)

X_train_imp.update(pd.DataFrame(impy.median(X_train[binary+cato].to_numpy()), index=X_train[binary+cato].index,columns=X_train[binary+cato].columns))
X_train_imp.update(pd.DataFrame(impy.mice(X_train[continuous].to_numpy()), index=X_train[continuous].index,columns=X_train[continuous].columns))

X_valid_imp.update(pd.DataFrame(impy.median(X_valid[binary+cato].to_numpy()), index=X_valid[binary+cato].index,columns=X_valid[binary+cato].columns))
X_valid_imp.update(pd.DataFrame(impy.mice(X_valid[continuous].to_numpy()), index=X_valid[continuous].index,columns=X_valid[continuous].columns))

m3=''
for i in range(len(classifers)):
    result=classifers[i].fit(X_train_imp,y_train)
    y_pred=result.predict(X_valid_imp)
    f1score=f1_score(y_valid,y_pred)
    recall=recall_score(y_valid,y_pred)
    m3=m3+str("%s: F1Score: %f Recall: %f\n"%(classiferss[i],f1score,recall))

In [40]:
#Method 4
X_train_imp=copy.deepcopy(X_train)
X_valid_imp=copy.deepcopy(X_valid)

X_train_imp.update(pd.DataFrame(impy.median(X_train_imp[binary+cato].to_numpy()), index=X_train[binary+cato].index,columns=X_train[binary+cato].columns))
X_train_imp.update(pd.DataFrame(impy.fast_knn(X_train_imp[continuous].to_numpy()), index=X_train[continuous].index,columns=X_train[continuous].columns))

X_valid_imp.update(pd.DataFrame(impy.median(X_valid_imp[binary+cato].to_numpy()), index=X_valid[binary+cato].index,columns=X_valid[binary+cato].columns))
X_valid_imp.update(pd.DataFrame(impy.fast_knn(X_valid_imp[continuous].to_numpy()), index=X_valid[continuous].index,columns=X_valid[continuous].columns))

m4=''
for i in range(len(classifers)):
    result=classifers[i].fit(X_train_imp,y_train)
    y_pred=result.predict(X_valid_imp)
    f1score=f1_score(y_valid,y_pred)
    recall=recall_score(y_valid,y_pred)
    m4=m4+str("%s: F1Score: %f Recall: %f\n"%(classiferss[i],f1score,recall))

In [41]:
#Method 5
X_train_imp=copy.deepcopy(X_train)
X_valid_imp=copy.deepcopy(X_valid)


X_train_imp.update(pd.DataFrame(impy.median(X_train_imp[binary+cato].to_numpy()), index=X_train[binary+cato].index,columns=X_train[binary+cato].columns))
X_train_imp.update(pd.DataFrame(impy.em(X_train_imp[continuous].to_numpy()), index=X_train[continuous].index,columns=X_train[continuous].columns))

X_valid_imp.update(pd.DataFrame(impy.median(X_valid_imp[binary+cato].to_numpy()), index=X_valid[binary+cato].index,columns=X_valid[binary+cato].columns))
X_valid_imp.update(pd.DataFrame(impy.em(X_valid_imp[continuous].to_numpy()), index=X_valid[continuous].index,columns=X_valid[continuous].columns))

m5=''
for i in range(len(classifers)):
    result=classifers[i].fit(X_train_imp,y_train)
    y_pred=result.predict(X_valid_imp)
    f1score=f1_score(y_valid,y_pred)
    recall=recall_score(y_valid,y_pred)
    m5=m5+str("%s: F1Score: %f Recall: %f\n"%(classiferss[i],f1score,recall))

In [47]:
print("Method #1:\n"+m1)
print("Method #2:\n"+m2)
print("Method #3:\n"+m3)
print("Method #4:\n"+m4)
print("Method #5:\n"+m5)

Method #1:
KNN: F1Score: 0.275862 Recall: 0.184615
Logistic Regression: F1Score: 0.560748 Recall: 0.461538
Decision Tree: F1Score: 0.403361 Recall: 0.369231
Support Vector Machine: F1Score: 0.115942 Recall: 0.061538
Random Forest: F1Score: 0.329114 Recall: 0.200000

Method #2:
KNN: F1Score: 0.261905 Recall: 0.169231
Logistic Regression: F1Score: 0.540000 Recall: 0.415385
Decision Tree: F1Score: 0.459016 Recall: 0.430769
Support Vector Machine: F1Score: 0.059701 Recall: 0.030769
Random Forest: F1Score: 0.311688 Recall: 0.184615

Method #3:
KNN: F1Score: 0.282609 Recall: 0.200000
Logistic Regression: F1Score: 0.547170 Recall: 0.446154
Decision Tree: F1Score: 0.439024 Recall: 0.415385
Support Vector Machine: F1Score: 0.115942 Recall: 0.061538
Random Forest: F1Score: 0.285714 Recall: 0.169231

Method #4:
KNN: F1Score: 0.285714 Recall: 0.200000
Logistic Regression: F1Score: 0.524272 Recall: 0.415385
Decision Tree: F1Score: 0.434109 Recall: 0.430769
Support Vector Machine: F1Score: 0.115942 

In [48]:
#Preperation to see if any Method is statistically significantly better
m1g=[m1.split(' ')[2],m1.split(' ')[7],m1.split(' ')[12],m1.split(' ')[18],m1.split(' ')[23]]
m2g=[m2.split(' ')[2],m2.split(' ')[7],m2.split(' ')[12],m2.split(' ')[18],m2.split(' ')[23]]
m3g=[m3.split(' ')[2],m3.split(' ')[7],m3.split(' ')[12],m3.split(' ')[18],m3.split(' ')[23]]
m4g=[m4.split(' ')[2],m4.split(' ')[7],m4.split(' ')[12],m4.split(' ')[18],m4.split(' ')[23]]
m5g=[m5.split(' ')[2],m5.split(' ')[7],m5.split(' ')[12],m5.split(' ')[18],m5.split(' ')[23]]
bg = [ float(x) for x in bg ]
m1g = [ float(x) for x in m1g ]
m2g = [ float(x) for x in m2g ]
m3g = [ float(x) for x in m3g ]
m4g = [ float(x) for x in m4g ]
m5g = [ float(x) for x in m5g ]

In [49]:
#Stastical Significance of methods
groups=[m1g,m2g,m3g,m4g,m5g]
#Check if Friedman test is signifiant
chi_square,p_value_mean=stats.friedmanchisquare(*groups)
print(p_value_mean)

0.34254747982605815
          0         1         2         3         4
0  1.000000  0.900000  0.900000  0.900000  0.374437
1  0.900000  1.000000  0.900000  0.900000  0.724319
2  0.900000  0.900000  1.000000  0.900000  0.374437
3  0.900000  0.900000  0.900000  1.000000  0.724319
4  0.374437  0.724319  0.374437  0.724319  1.000000


### It is shown that each method is not significantly better than any other method. Although it is shown that Method 3 has the broadest improvement over all classifiers. We will chose this method to proceed.