# Myocardial Infraction Complications Analysis

## 0. Introduction

In [420]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [421]:
data = pd.read_csv("MI.data", header=None)
data.columns = ["ID", "AGE", "SEX", "INF_ANAM", "STENOK_AN", "FK_STENOK", "IBS_POST", "IBS_NASL", "GB", "SIM_GIPERT", "DLIT_AG", "ZSN_A", "nr_11", "nr_01", "nr_02", "nr_03", "nr_04", "nr_07", "nr_08", "np_01", "np_04", "np_05", "np_07", "np_08", "np_09", "np_10", "endocr_01", "endocr_02", "endocr_03", "zab_leg_01", "zab_leg_02", "zab_leg_03", "zab_leg_04", "zab_leg_06", "S_AD_KBRIG", "D_AD_KBRIG", "S_AD_ORIT", "D_AD_ORIT", "O_L_POST", "K_SH_POST", "MP_TP_POST", "SVT_POST", "GT_POST", "FIB_G_POST", "ant_im", "lat_im", "inf_im", "post_im", "IM_PG_P", "ritm_ecg_p_01", "ritm_ecg_p_02", "ritm_ecg_p_04", "ritm_ecg_p_06", "ritm_ecg_p_07", "ritm_ecg_p_08", "n_r_ecg_p_01", "n_r_ecg_p_02", "n_r_ecg_p_03", "n_r_ecg_p_04", "n_r_ecg_p_05", "n_r_ecg_p_06", "n_r_ecg_p_08", "n_r_ecg_p_09", "n_r_ecg_p_10", "n_p_ecg_p_01", "n_p_ecg_p_03", "n_p_ecg_p_04", "n_p_ecg_p_05", "n_p_ecg_p_06", "n_p_ecg_p_07", "n_p_ecg_p_08", "n_p_ecg_p_09", "n_p_ecg_p_10", "n_p_ecg_p_11", "n_p_ecg_p_12", "fibr_ter_01", "fibr_ter_02", "fibr_ter_03", "fibr_ter_05", "fibr_ter_06", "fibr_ter_07", "fibr_ter_08", "GIPO_K", "K_BLOOD", "GIPER_NA", "NA_BLOOD", "ALT_BLOOD", "AST_BLOOD", "KFK_BLOOD", "L_BLOOD", "ROE", "TIME_B_S", "R_AB_1_n", "R_AB_2_n", "R_AB_3_n", "NA_KB", "NOT_NA_KB", "LID_KB", "NITR_S", "NA_R_1_n", "NA_R_2_n", "NA_R_3_n", "NOT_NA_1_n", "NOT_NA_2_n", "NOT_NA_3_n", "LID_S_n", "B_BLOK_S_n", "ANT_CA_S_n", "GEPAR_S_n", "ASP_S_n", "TIKL_S_n", "TRENT_S_n", "FIBR_PREDS", "PREDS_TAH", "JELUD_TAH", "FIBR_JELUD", "A_V_BLOK", "OTEK_LANC", "RAZRIV", "DRESSLER", "ZSN", "REC_IM", "P_IM_STEN", "LET_IS"]
data.replace("?", np.NaN, inplace=True)
data = data.apply(pd.to_numeric, errors = "coerce")
data

Unnamed: 0,ID,AGE,SEX,INF_ANAM,STENOK_AN,FK_STENOK,IBS_POST,IBS_NASL,GB,SIM_GIPERT,...,JELUD_TAH,FIBR_JELUD,A_V_BLOK,OTEK_LANC,RAZRIV,DRESSLER,ZSN,REC_IM,P_IM_STEN,LET_IS
0,1,77.0,1,2.0,1.0,1.0,2.0,,3.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,2,55.0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,3,52.0,1,0.0,0.0,0.0,2.0,,2.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,4,68.0,0,0.0,0.0,0.0,2.0,,2.0,0.0,...,0,0,0,0,0,0,1,0,0,0
4,5,60.0,1,0.0,0.0,0.0,2.0,,3.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1695,1696,77.0,0,0.0,4.0,2.0,1.0,,2.0,0.0,...,0,0,1,0,1,0,0,0,0,3
1696,1697,70.0,0,0.0,6.0,2.0,1.0,,2.0,0.0,...,0,0,0,0,0,0,0,0,0,1
1697,1698,55.0,1,3.0,6.0,2.0,2.0,,0.0,0.0,...,0,0,0,0,0,0,0,1,0,6
1698,1699,79.0,0,2.0,2.0,2.0,1.0,,2.0,0.0,...,0,0,0,1,0,0,0,0,0,1


## 1. Train-Validation-Test Split

In [422]:
from sklearn.model_selection import train_test_split
X = data.iloc[:, 1:112]
y = [data["ZSN"], data["FIBR_PREDS"], data["P_IM_STEN"], data["REC_IM"], data["OTEK_LANC"]]

X_train, X_val, X_test, y_train, y_val, y_test = [], [], [], [], [], []

# ! Caution
# i = 0 => ZSN or Chronic heart failure
# i = 1 => FIBR_PREDS or Atrial fibrillation
# i = 2 => P_IM_STEN or Post-infarction angina
# i = 3 => REC_IM or Relapse of the myocardial infarction
# i = 4 => OTEK_LANC or Pulmonary edema

for i in range(len(y)):
    # Separate train/test split for each target variable
    # Split the data into train and temporary sets
    X_train_i, X_temp_i, y_train_i, y_temp_i = train_test_split(X, y[i], train_size=0.7, random_state=0, shuffle = False)
    
    # Split the temporary set into validation and test sets
    X_val_i, X_test_i, y_val_i, y_test_i = train_test_split(X_temp_i, y_temp_i, train_size=0.5, random_state=0, shuffle = False)
    X_train.append(X_train_i)
    X_val.append(X_val_i)
    X_test.append(X_test_i)
    y_train.append(y_train_i)
    y_val.append(y_val_i)
    y_test.append(y_test_i)


In [423]:
# Check the number of samples in each set and print as a table
results = {"Set": ["Train", "Validation", "Test"]}
for i in range(len(y)):
    results[f"y{i}"] = [len(y_train[i]), len(y_val[i]), len(y_test[i])]
results = pd.DataFrame(results)
results

Unnamed: 0,Set,y0,y1,y2,y3,y4
0,Train,1190,1190,1190,1190,1190
1,Validation,255,255,255,255,255
2,Test,255,255,255,255,255


## 2. EDA

### Data Overview

In [424]:
data

Unnamed: 0,ID,AGE,SEX,INF_ANAM,STENOK_AN,FK_STENOK,IBS_POST,IBS_NASL,GB,SIM_GIPERT,...,JELUD_TAH,FIBR_JELUD,A_V_BLOK,OTEK_LANC,RAZRIV,DRESSLER,ZSN,REC_IM,P_IM_STEN,LET_IS
0,1,77.0,1,2.0,1.0,1.0,2.0,,3.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,2,55.0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,3,52.0,1,0.0,0.0,0.0,2.0,,2.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,4,68.0,0,0.0,0.0,0.0,2.0,,2.0,0.0,...,0,0,0,0,0,0,1,0,0,0
4,5,60.0,1,0.0,0.0,0.0,2.0,,3.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1695,1696,77.0,0,0.0,4.0,2.0,1.0,,2.0,0.0,...,0,0,1,0,1,0,0,0,0,3
1696,1697,70.0,0,0.0,6.0,2.0,1.0,,2.0,0.0,...,0,0,0,0,0,0,0,0,0,1
1697,1698,55.0,1,3.0,6.0,2.0,2.0,,0.0,0.0,...,0,0,0,0,0,0,0,1,0,6
1698,1699,79.0,0,2.0,2.0,2.0,1.0,,2.0,0.0,...,0,0,0,1,0,0,0,0,0,1


### Target Balance Check

In [425]:
# The number of instances in the dataset
number_of_instances = len(data)
# Create a table to show the balance of each target variable
results = {"Target": [], "0": [], "1": []}
for i in range(len(y)):
    results["Target"].append(f"y{i}")
    results["0"].append(y[i].value_counts()[0] / number_of_instances)
    results["1"].append(y[i].value_counts()[1] / number_of_instances)
results = pd.DataFrame(results)
results

Unnamed: 0,Target,0,1
0,y0,0.768235,0.231765
1,y1,0.9,0.1
2,y2,0.912941,0.087059
3,y3,0.906471,0.093529
4,y4,0.906471,0.093529


### Check for missing values

In [426]:
# Create a table representing number of missing values of each feature, sort them descendingly
missing_values = X.isnull().sum().sort_values(ascending=False)
missing_values = missing_values[missing_values > 0]
missing_values = pd.DataFrame(missing_values, columns=["Number of Missing Values"])
missing_values


Unnamed: 0,Number of Missing Values
KFK_BLOOD,1696
IBS_NASL,1628
D_AD_KBRIG,1076
S_AD_KBRIG,1076
NOT_NA_KB,686
...,...
zab_leg_06,7
zab_leg_04,7
NA_R_1_n,5
INF_ANAM,4


### Data Information of Feature

In [427]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1700 entries, 0 to 1699
Columns: 111 entries, AGE to TRENT_S_n
dtypes: float64(110), int64(1)
memory usage: 1.4 MB


In [428]:
X.describe()

Unnamed: 0,AGE,SEX,INF_ANAM,STENOK_AN,FK_STENOK,IBS_POST,IBS_NASL,GB,SIM_GIPERT,DLIT_AG,...,NOT_NA_1_n,NOT_NA_2_n,NOT_NA_3_n,LID_S_n,B_BLOK_S_n,ANT_CA_S_n,GEPAR_S_n,ASP_S_n,TIKL_S_n,TRENT_S_n
count,1692.0,1700.0,1696.0,1594.0,1627.0,1649.0,72.0,1691.0,1692.0,1452.0,...,1690.0,1590.0,1569.0,1690.0,1689.0,1687.0,1683.0,1683.0,1684.0,1684.0
mean,61.856974,0.626471,0.554835,2.316186,1.205286,1.160703,0.375,1.393258,0.033688,3.34022,...,0.331953,0.113208,0.084767,0.283432,0.127294,0.666864,0.714795,0.74391,0.017815,0.202494
std,11.259936,0.483883,0.836801,2.440586,1.040814,0.8014,0.48752,1.088803,0.180478,3.098646,...,0.628311,0.399514,0.355107,0.450798,0.333401,0.471474,0.451646,0.436602,0.132317,0.401978
min,26.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,54.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,63.0,1.0,0.0,1.0,2.0,1.0,0.0,2.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
75%,70.0,1.0,1.0,5.0,2.0,2.0,1.0,2.0,0.0,7.0,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0
max,92.0,1.0,3.0,6.0,4.0,2.0,1.0,3.0,1.0,7.0,...,4.0,3.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [429]:
continuous_features = ["AGE", "S_AD_ORIT", "D_AD_ORIT", "K_BLOOD", "NA_BLOOD", "ALT_BLOOD", "AST_BLOOD", "L_BLOOD", "ROE"]
categorical_features = ["SEX" ,"INF_ANAM", "STENOK_AN", "FK_STENOK", "IBS_POST", "GB", "SIM_GIPERT", "DLIT_AG", "ZSN_A", "nr_11", "nr_01", "nr_02", "nr_03", "nr_04", "nr_07", "nr_08", "np_01", "np_04", "np_05", "np_07", "np_08", "np_09", "np_10", "endocr_01", "endocr_02", "endocr_03", "zab_leg_01", "zab_leg_02", "zab_leg_03", "zab_leg_04", "zab_leg_06", "O_L_POST", "K_SH_POST", "MP_TP_POST", "SVT_POST", "GT_POST", "FIB_G_POST", "ant_im", "lat_im", "inf_im", "post_im", "IM_PG_P", "ritm_ecg_p_01", "ritm_ecg_p_02", "ritm_ecg_p_04", "ritm_ecg_p_06", "ritm_ecg_p_07", "ritm_ecg_p_08", "n_r_ecg_p_01", "n_r_ecg_p_02", "n_r_ecg_p_03", "n_r_ecg_p_04", "n_r_ecg_p_05", "n_r_ecg_p_06", "n_r_ecg_p_08", "n_r_ecg_p_09", "n_r_ecg_p_10", "n_p_ecg_p_01", "n_p_ecg_p_03", "n_p_ecg_p_04", "n_p_ecg_p_05", "n_p_ecg_p_06", "n_p_ecg_p_07", "n_p_ecg_p_08", "n_p_ecg_p_09", "n_p_ecg_p_10", "n_p_ecg_p_11", "n_p_ecg_p_12", "fibr_ter_01", "fibr_ter_02", "fibr_ter_03", "fibr_ter_05", "fibr_ter_06", "fibr_ter_07", "fibr_ter_08", "GIPO_K", "GIPER_NA", "TIME_B_S", "R_AB_1_n", "R_AB_2_n", "R_AB_3_n", "NITR_S", "NA_R_1_n", "NA_R_2_n", "NA_R_3_n", "NOT_NA_1_n", "NOT_NA_2_n", "NOT_NA_3_n", "LID_S_n", "B_BLOK_S_n", "ANT_CA_S_n", "GEPAR_S_n", "ASP_S_n", "TIKL_S_n", "TRENT_S_n"]

### Data Distribution

In [430]:
for continuous_feature in continuous_features:
    skew_value = abs(X_train[0][continuous_feature].skew())
    print(f"{continuous_feature} skew: {skew_value}")

AGE skew: 0.15108305442457992
S_AD_ORIT skew: 0.5409602704221258
D_AD_ORIT skew: 0.5503704578194044
K_BLOOD skew: 1.006181101800303
NA_BLOOD skew: 0.04373144416584097
ALT_BLOOD skew: 2.1427276814427985
AST_BLOOD skew: 2.6450289454994618
L_BLOOD skew: 1.358850262629433
ROE skew: 2.8319741042642663


### Correlation Matrix


In [431]:
# plt.figure(figsize=(40, 30))
# sns.heatmap(X_train[0].corr(), annot=True, cmap='coolwarm', fmt=".2f", annot_kws={"size": 4})  # Adjust size as needed
# plt.savefig('heatmap.pdf', format='pdf')  # Save the plot as a PDF file
# plt.show()

## 3. Preprocessing

In [432]:
%store X_train
X_train[0]

Stored 'X_train' (list)


Unnamed: 0,AGE,SEX,INF_ANAM,STENOK_AN,FK_STENOK,IBS_POST,IBS_NASL,GB,SIM_GIPERT,DLIT_AG,...,NOT_NA_1_n,NOT_NA_2_n,NOT_NA_3_n,LID_S_n,B_BLOK_S_n,ANT_CA_S_n,GEPAR_S_n,ASP_S_n,TIKL_S_n,TRENT_S_n
0,77.0,1,2.0,1.0,1.0,2.0,,3.0,0.0,7.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
1,55.0,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0
2,52.0,1,0.0,0.0,0.0,2.0,,2.0,0.0,2.0,...,3.0,2.0,2.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
3,68.0,0,0.0,0.0,0.0,2.0,,2.0,0.0,3.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
4,60.0,1,0.0,0.0,0.0,2.0,,3.0,0.0,7.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1185,78.0,1,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1186,59.0,1,1.0,3.0,2.0,1.0,,2.0,0.0,,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
1187,69.0,0,0.0,6.0,2.0,1.0,,2.0,0.0,,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
1188,53.0,1,0.0,0.0,0.0,0.0,,2.0,0.0,,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0


In [433]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

class ColumnDropper(BaseEstimator, TransformerMixin):
    def __init__(self, threshold):
        self.columns_to_drop = None
        self.threshold = threshold

    def fit(self, X, y=None):
        X = pd.DataFrame(X)
        self.columns_to_drop = X.columns[X.isnull().sum() > self.threshold]
        return self

    def transform(self, X):
        # Ensure the input is a DataFrame
        X = pd.DataFrame(X)
        global X_keep 
        X_keep = list(X.columns[~X.columns.isin(self.columns_to_drop)])
        return X.drop(columns=self.columns_to_drop)
    

class RowDropper(BaseEstimator, TransformerMixin):
    def __init__(self, threshold):
        self.rows_to_drop = None
        self.threshold = threshold

    def fit(self, X, y=None):
        self.rows_to_drop = X.index[X.isnull().sum(axis=1) > self.threshold]
        return self

    def transform(self, X):
        # Ensure the input is a DataFrame
        X = pd.DataFrame(X)
        return X.drop(index=self.rows_to_drop)
    

In [434]:
binary_features = ["SEX", "IBS_NASL", "SIM_GIPERT", "nr_11", "nr_01", "nr_02", "nr_03", "nr_04", "nr_07", "nr_08", "np_01", "np_04", "np_05", "np_07", "np_08", "np_09", "np_10", "endocr_01", "endocr_02", "endcor_03", "zab_leg_01", "zab_leg_02", "zab_leg_03", "zab_leg_04", "zab_leg_06", "O_L_POST", "K_SH_POST", "MP_TP_POST", "SVT_POST", "GT_POST", "FIB_G_POST", "IM_PG_P", "ritm_ecg_p_01", "ritm_ecg_p_02", "ritm_ecg_p_04", "ritm_ecg_p_06", "ritm_ecg_p_07", "ritm_ecg_p_08", "n_r_ecg_p_01", "n_r_ecg_p_02", "n_r_ecg_p_03", "n_r_ecg_p_04", "n_r_ecg_p_05", "n_r_ecg_p_06", "n_r_ecg_p_08", "n_r_ecg_p_09", "n_r_ecg_p_10", "n_p_ecg_p_01", "n_p_ecg_p_03", "n_p_ecg_p_04", "n_p_ecg_p_05", "n_p_ecg_p_06", "n_p_ecg_p_07", "n_p_ecg_p_08", "n_p_ecg_p_09", "n_p_ecg_p_10", "n_p_ecg_p_11", "n_p_ecg_p_12", "fibr_ter_01", "fibr_ter_02", "fibr_ter_03", "fibr_ter_05", "fibr_ter_06", "fibr_ter_07", "fibr_ter_08", "GIPO_K", "GIPER_NA", "NA_KB", "NOT_NA_KB", "LID_KB", "NITR_S", "LID_S_n", "B_BLOK_S_n", "ANT_CA_S_n", "GEPAR_S_n", "ASP_S_n", "TIKL_S_n", "TRENT_S_n"]

In [435]:

# Define outliers handler class
class OutliersHandler(BaseEstimator, TransformerMixin):
    def __init__(self, coefficient):
        self.coefficient = coefficient
        self.lower_bounds = None
        self.upper_bounds = None

    def fit(self, X, y=None):
        X = pd.DataFrame(X)
        q1 = X.quantile(0.25)
        q3 = X.quantile(0.75)
        iqr = q3 - q1
        self.lower_bounds = q1 - iqr * self.coefficient
        self.upper_bounds = q3 + iqr * self.coefficient
        return self

    def transform(self, X):
        # Ensure the input is a DataFrame
        X = pd.DataFrame(X)
        for feature in X.columns:
            if feature not in binary_features: # if feature is not binary
                X[feature] = X[feature].clip(self.lower_bounds[feature], self.upper_bounds[feature])
        return X

In [436]:
def impute_by_type(X, continuous_features, categorical_features):
    """Fills missing values based on data type, handling potential errors, works with subsets"""
    for feature in categorical_features:
        if feature in X.columns:
            try:
                X[feature].fillna(X[feature].mode()[0], inplace=True)
            except KeyError:
                pass
    for feature in continuous_features:
        if feature in X.columns:
            try:
                X[feature].fillna(X[feature].mean(axis=0), inplace=True)
            except KeyError:
                pass
    return X

In [437]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer

# Define dropper pipeline
dropper = Pipeline(steps=[
    ('column_dropper', ColumnDropper(threshold=100)),
    ('row_dropper', RowDropper(threshold=100))
])

# Define preprocessing pipeline
preprocessing_pipeline = Pipeline(steps=[
    ('dropper', dropper),
    ('outliers_clipper',OutliersHandler(coefficient=1.5)),
    ('imputation', FunctionTransformer(impute_by_type, kw_args={"continuous_features":continuous_features, "categorical_features":categorical_features })),  # Fill missing values using mean
    ('scaling', StandardScaler())  # Standardize features by removing the mean and scaling to unit variance
])

# Apply the preprocessing pipeline to each set
for i in range(len(y)):
    X_train[i] = preprocessing_pipeline.fit_transform(X_train[i])
    X_val[i] = preprocessing_pipeline.transform(X_val[i])
    X_test[i] = preprocessing_pipeline.transform(X_test[i])

preprocessed_data = pd.DataFrame(data=X_train[0], columns=X_keep)
%store preprocessed_data
preprocessed_data

Stored 'preprocessed_data' (DataFrame)


Unnamed: 0,AGE,SEX,INF_ANAM,STENOK_AN,FK_STENOK,IBS_POST,GB,SIM_GIPERT,ZSN_A,nr_11,...,NOT_NA_1_n,NOT_NA_2_n,NOT_NA_3_n,LID_S_n,B_BLOK_S_n,ANT_CA_S_n,GEPAR_S_n,ASP_S_n,TIKL_S_n,TRENT_S_n
0,1.496141,0.708890,2.003196,-0.411275,-0.122760,1.080799,1.520508,-0.184075,0.0,-0.171499,...,-0.561916,0.0,0.0,1.600898,-0.404036,-1.584400,0.633758,0.537842,-0.14649,-0.526141
1,-0.475808,0.708890,0.677019,-0.836485,-1.077563,-1.383091,-1.256072,-0.184075,0.0,-0.171499,...,1.154848,0.0,0.0,1.600898,-0.404036,0.631154,0.633758,0.537842,-0.14649,1.900632
2,-0.744710,0.708890,-0.649158,-0.836485,-1.077563,1.080799,0.594981,-0.184075,0.0,-0.171499,...,3.729994,0.0,0.0,1.600898,2.475025,-1.584400,0.633758,0.537842,-0.14649,-0.526141
3,0.689435,-1.410656,-0.649158,-0.836485,-1.077563,1.080799,0.594981,-0.184075,0.0,-0.171499,...,-0.561916,0.0,0.0,-0.624649,-0.404036,0.631154,0.633758,0.537842,-0.14649,-0.526141
4,-0.027638,0.708890,-0.649158,-0.836485,-1.077563,1.080799,1.520508,-0.184075,0.0,-0.171499,...,-0.561916,0.0,0.0,-0.624649,-0.404036,0.631154,-1.577890,0.537842,-0.14649,1.900632
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1185,1.585775,0.708890,-0.649158,-0.836485,-1.077563,-1.383091,-1.256072,-0.184075,0.0,-0.171499,...,-0.561916,0.0,0.0,-0.624649,-0.404036,0.631154,-1.577890,0.537842,-0.14649,-0.526141
1186,-0.117272,0.708890,0.677019,0.439146,0.832042,-0.151146,0.594981,-0.184075,0.0,-0.171499,...,-0.561916,0.0,0.0,1.600898,2.475025,-1.584400,0.633758,0.537842,-0.14649,-0.526141
1187,0.779069,-1.410656,-0.649158,1.714776,0.832042,-0.151146,0.594981,-0.184075,0.0,-0.171499,...,-0.561916,0.0,0.0,-0.624649,-0.404036,0.631154,-1.577890,0.537842,-0.14649,-0.526141
1188,-0.655076,0.708890,-0.649158,-0.836485,-1.077563,-1.383091,0.594981,-0.184075,0.0,-0.171499,...,-0.561916,0.0,0.0,1.600898,-0.404036,0.631154,-1.577890,0.537842,-0.14649,-0.526141


## 4. Model Selection

In [438]:
# write a section using Cost Sensitive Decision Tree and print the results as a table. A table includes several rows representing each of a metrics for imbalanced data. Each colummn represents each target variable.
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Define the model
model = DecisionTreeClassifier(random_state=0, class_weight='balanced')

# Train the model for each target variable
results = {"Metric": ["Accuracy", "Precision", "Recall", "F1 Score"]}
for i in range(len(y)):
    model.fit(X_train[i], y_train[i])
    y_pred = model.predict(X_val[i])
    results[f"y{i}"] = [accuracy_score(y_val[i], y_pred), precision_score(y_val[i], y_pred), recall_score(y_val[i], y_pred), f1_score(y_val[i], y_pred)]

results = pd.DataFrame(results)
# Change y0,y1,y2,y3,y4 to ZSN, FIBR_PREDS, P_IM_STEN, REC_IM, OTEK_LANC
results.columns = ["Metric", "ZSN", "FIBR_PREDS", "P_IM_STEN", "REC_IM", "OTEK_LANC"]

results



Unnamed: 0,Metric,ZSN,FIBR_PREDS,P_IM_STEN,REC_IM,OTEK_LANC
0,Accuracy,0.658824,0.713725,0.698039,0.745098,0.709804
1,Precision,0.22807,0.407407,0.222222,0.173913,0.282051
2,Recall,0.232143,0.161765,0.059701,0.08,0.192982
3,F1 Score,0.230088,0.231579,0.094118,0.109589,0.229167


## 5. Hyperparameter Tuning

### Decision Tree

#### SMOTE + balanced

In [439]:
# apply grid search for hyperparameter tuning for: first is the SMOTE upsampling ratio, second is the hyperparameter tuning for the cost-sensitive parameters.
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
from imblearn.metrics import geometric_mean_score

# Define the pipeline
pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=0)),
    ('model', DecisionTreeClassifier(random_state=0, class_weight='balanced'))
])

# Post-pruning
# path = DecisionTreeClassifier(random_state=0, class_weight={0: 1, 1: 3}).cost_complexity_pruning_path(X_train[0], y_train[0])
# ccp_alphas, impurities = path.ccp_alphas, path.impurities
# candidate_alphas = ccp_alphas[::10]

# Define the hyperparameters
param_grid = {
    'smote__sampling_strategy': [0.5, 0.6, 0.7],
    'model__criterion': ['gini', 'entropy'],
    'model__max_depth': [5, 10, 15, 20],
    'model__min_samples_split': [2, 5, 10],
    # 'model__ccp_alpha': candidate_alphas,
}

# Define the grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1')

# Train the model for all target variables and print the results as a table. A table includes several rows representing each of a metrics for imbalanced data. Each colummn represents each target variable.
results_SMOTE_balanced = {"Metric": ["Accuracy", "Precision", "Recall", "F1 Score", "ROC AUC", "G mean"]}
for i in range(len(y)):
    # pipeline.set_params(model__class_weight=weights[i])
    grid_search.fit(X_train[i], y_train[i])
    y_pred = grid_search.predict(X_val[i])
    results_SMOTE_balanced[f"y{i}"] = [
        accuracy_score(y_val[i], y_pred), 
        precision_score(y_val[i], y_pred), 
        recall_score(y_val[i], y_pred), 
        f1_score(y_val[i], y_pred),
        roc_auc_score(y_val[i], y_pred),
        geometric_mean_score(y_val[i], y_pred)
    ]
    print("========================================") # print the best hyperparameters for the current target
    print(f"Hyperparameters for y{i}")
    for name, param in grid_search.best_estimator_.get_params().items():
        print(f"{name}: {param}")

results_SMOTE_balanced = pd.DataFrame(results_SMOTE_balanced)
# Change y0,y1,y2,y3,y4 to ZSN, FIBR_PREDS, P_IM_STEN, REC_IM, OTEK_LANC
results_SMOTE_balanced.columns = ["Metric", "ZSN", "FIBR_PREDS", "P_IM_STEN", "REC_IM", "OTEK_LANC"]

results_SMOTE_balanced

Hyperparameters for y0
memory: None
steps: [('smote', SMOTE(random_state=0, sampling_strategy=0.5)), ('model', DecisionTreeClassifier(class_weight='balanced', max_depth=5,
                       min_samples_split=10, random_state=0))]
verbose: False
smote: SMOTE(random_state=0, sampling_strategy=0.5)
model: DecisionTreeClassifier(class_weight='balanced', max_depth=5,
                       min_samples_split=10, random_state=0)
smote__k_neighbors: 5
smote__n_jobs: None
smote__random_state: 0
smote__sampling_strategy: 0.5
model__ccp_alpha: 0.0
model__class_weight: balanced
model__criterion: gini
model__max_depth: 5
model__max_features: None
model__max_leaf_nodes: None
model__min_impurity_decrease: 0.0
model__min_samples_leaf: 1
model__min_samples_split: 10
model__min_weight_fraction_leaf: 0.0
model__monotonic_cst: None
model__random_state: 0
model__splitter: best
Hyperparameters for y1
memory: None
steps: [('smote', SMOTE(random_state=0, sampling_strategy=0.6)), ('model', DecisionTreeCla

Unnamed: 0,Metric,ZSN,FIBR_PREDS,P_IM_STEN,REC_IM,OTEK_LANC
0,Accuracy,0.560784,0.678431,0.658824,0.74902,0.756863
1,Precision,0.262712,0.414634,0.327586,0.181818,0.444444
2,Recall,0.553571,0.5,0.283582,0.08,0.350877
3,F1 Score,0.356322,0.453333,0.304,0.111111,0.392157
4,ROC AUC,0.558193,0.621658,0.538068,0.496098,0.612307
5,G mean,0.558174,0.609637,0.474082,0.27014,0.553692


#### Cost-sensitive Decision Tree

In [452]:
# Define the pipeline
pipeline = Pipeline(steps=[
    ('model', DecisionTreeClassifier(random_state=0))
])

# Define class weights
class_weights_zsn = {0: 1, 1: y[0].value_counts()[0] / y[0].value_counts()[1]}
class_weights_fibr_preds = {0: 1, 1: y[1].value_counts()[0] / y[1].value_counts()[1]}
class_weights_p_im_sten = {0: 1, 1: y[2].value_counts()[0] / y[2].value_counts()[1]}
class_weights_rec_im = {0: 1, 1: y[3].value_counts()[0] / y[3].value_counts()[1]}
class_weights_otek_lanc = {0: 1, 1: y[4].value_counts()[0] / y[4].value_counts()[1]}
weights = [class_weights_zsn, class_weights_fibr_preds, class_weights_p_im_sten, class_weights_rec_im, class_weights_otek_lanc]

# Post-pruning
# path = DecisionTreeClassifier(random_state=0, class_weight={0: 1, 1: 3}).cost_complexity_pruning_path(X_train[0], y_train[0])
# ccp_alphas, impurities = path.ccp_alphas, path.impurities
# candidate_alphas = ccp_alphas[::10]

# Define the hyperparameters
param_grid = {
    'model__criterion': ['gini', 'entropy'],
    'model__max_depth': [5, 10, 15, 20],
    'model__min_samples_split': [2, 5, 10],
    # 'model__ccp_alpha': candidate_alphas,
}

# Define the grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1')

# Train the model for all target variables and print the results as a table. A table includes several rows representing each of a metrics for imbalanced data. Each colummn represents each target variable.
results_cost_sensitive = {"Metric": ["Accuracy", "Precision", "Recall", "F1 Score", "ROC AUC", "G mean"]}
for i in range(len(y)):
    pipeline.set_params(model__class_weight=weights[i]) # Adding weights to the current model
    grid_search.fit(X_train[i], y_train[i])
    y_pred = grid_search.predict(X_val[i])
    results_cost_sensitive[f"y{i}"] = [
        accuracy_score(y_val[i], y_pred),
        precision_score(y_val[i], y_pred),
        recall_score(y_val[i], y_pred),
        f1_score(y_val[i], y_pred),
        roc_auc_score(y_val[i], y_pred),
        geometric_mean_score(y_val[i], y_pred)
    ]
    print("========================================") # print the best hyperparameters for the current target
    print(f"Hyperparameters for y{i}")
    for name, param in grid_search.best_estimator_.get_params().items():
        print(f"{name}: {param}")

results_cost_sensitive = pd.DataFrame(results_cost_sensitive)
# Change y0,y1,y2,y3,y4 to ZSN, FIBR_PREDS, P_IM_STEN, REC_IM, OTEK_LANC
results_cost_sensitive.columns = ["Metric", "ZSN", "FIBR_PREDS", "P_IM_STEN", "REC_IM", "OTEK_LANC"]

results_cost_sensitive

Hyperparameters for y0
memory: None
steps: [('model', DecisionTreeClassifier(class_weight={0: 1, 1: 3.314720812182741},
                       criterion='entropy', max_depth=5, random_state=0))]
verbose: False
model: DecisionTreeClassifier(class_weight={0: 1, 1: 3.314720812182741},
                       criterion='entropy', max_depth=5, random_state=0)
model__ccp_alpha: 0.0
model__class_weight: {0: 1, 1: 3.314720812182741}
model__criterion: entropy
model__max_depth: 5
model__max_features: None
model__max_leaf_nodes: None
model__min_impurity_decrease: 0.0
model__min_samples_leaf: 1
model__min_samples_split: 2
model__min_weight_fraction_leaf: 0.0
model__monotonic_cst: None
model__random_state: 0
model__splitter: best
Hyperparameters for y1
memory: None
steps: [('model', DecisionTreeClassifier(class_weight={0: 1, 1: 9.0}, criterion='entropy',
                       max_depth=15, min_samples_split=5, random_state=0))]
verbose: False
model: DecisionTreeClassifier(class_weight={0: 1, 1: 9.0

Unnamed: 0,Metric,ZSN,FIBR_PREDS,P_IM_STEN,REC_IM,OTEK_LANC
0,Accuracy,0.458824,0.647059,0.639216,0.709804,0.705882
1,Precision,0.267045,0.28,0.349398,0.147059,0.32
2,Recall,0.839286,0.205882,0.432836,0.1,0.280702
3,F1 Score,0.405172,0.237288,0.386667,0.119048,0.299065
4,ROC AUC,0.595522,0.506684,0.572801,0.479268,0.554492
5,G mean,0.543347,0.407734,0.555437,0.293008,0.482183


### Cost-sensitive + SMOTE

In [449]:
# Define the pipeline
pipeline = Pipeline(steps=[
    ('smote', SMOTE(random_state=0)),
    ('model', DecisionTreeClassifier(random_state=0))
])

# Define class weights
class_weights_zsn = {}
class_weights_fibr_preds = {}
class_weights_p_im_sten = {}
class_weights_rec_im = {}
class_weights_otek_lanc = {}
weights = [class_weights_zsn, class_weights_fibr_preds, class_weights_p_im_sten, class_weights_rec_im, class_weights_otek_lanc]

# Post-pruning
# path = DecisionTreeClassifier(random_state=0, class_weight={0: 1, 1: 3}).cost_complexity_pruning_path(X_train[0], y_train[0])
# ccp_alphas, impurities = path.ccp_alphas, path.impurities
# candidate_alphas = ccp_alphas[::10]

# Define the hyperparameters
param_grid = {
    'model__criterion': ['gini', 'entropy'],
    'model__max_depth': [5, 10, 15, 20],
    'model__min_samples_split': [2, 5, 10],
    # 'model__ccp_alpha': candidate_alphas,
}

# Define the grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1')

# Train the model for all target variables and print the results as a table. A table includes several rows representing each of a metrics for imbalanced data. Each colummn represents each target variable.
results_SMOTE_cost_sensitive = {"Metric": ["Accuracy", "Precision", "Recall", "F1 Score", "ROC AUC", "G mean"]}
for i in range(len(y)):
    best_sampling_strategy = {}
    jmax = 0
    score_max = 0
    for j in [0.5, 0.6]:
        pipeline.set_params(smote__sampling_strategy=j)
        pipeline.set_params(model__class_weight={0: 1, 1: (1 / j)}) # Adding weights to the current model
        grid_search.fit(X_train[i], y_train[i])
        y_pred = grid_search.predict(X_val[i])
        if score_max <= f1_score(y_val[i], y_pred):
            score_max = f1_score(y_val[i], y_pred)
            jmax = j
            results_SMOTE_cost_sensitive[f"y{i}"] = [
                accuracy_score(y_val[i], y_pred),
                precision_score(y_val[i], y_pred),
                recall_score(y_val[i], y_pred),
                f1_score(y_val[i], y_pred),
                roc_auc_score(y_val[i], y_pred),
                geometric_mean_score(y_val[i], y_pred)
            ]
        dictionary = {}
        for name, param in grid_search.best_estimator_.get_params().items():
            dictionary[name] = param
        best_sampling_strategy[j] = dictionary
    print("========================================") # print the best hyperparameters for the current target
    print(f"Hyperparameters for y{i}")
    print("Best SMOTE sampling strategy: ", jmax)
    print("Best hyperparameters: ", best_sampling_strategy[jmax])

results_SMOTE_cost_sensitive = pd.DataFrame(results_SMOTE_cost_sensitive)
# Change y0,y1,y2,y3,y4 to ZSN, FIBR_PREDS, P_IM_STEN, REC_IM, OTEK_LANC
# results_SMOTE_cost_sensitive.columns = ["Metric", "ZSN", "FIBR_PREDS", "P_IM_STEN", "REC_IM", "OTEK_LANC"]

# results_SMOTE_cost_sensitive

Hyperparameters for y0
Best SMOTE sampling strategy:  0.5
Best hyperparameters:  {'memory': None, 'steps': [('smote', SMOTE(random_state=0, sampling_strategy=0.5)), ('model', DecisionTreeClassifier(class_weight={0: 1, 1: 2.0}, max_depth=5,
                       min_samples_split=5, random_state=0))], 'verbose': False, 'smote': SMOTE(random_state=0, sampling_strategy=0.5), 'model': DecisionTreeClassifier(class_weight={0: 1, 1: 2.0}, max_depth=5,
                       min_samples_split=5, random_state=0), 'smote__k_neighbors': 5, 'smote__n_jobs': None, 'smote__random_state': 0, 'smote__sampling_strategy': 0.5, 'model__ccp_alpha': 0.0, 'model__class_weight': {0: 1, 1: 2.0}, 'model__criterion': 'gini', 'model__max_depth': 5, 'model__max_features': None, 'model__max_leaf_nodes': None, 'model__min_impurity_decrease': 0.0, 'model__min_samples_leaf': 1, 'model__min_samples_split': 5, 'model__min_weight_fraction_leaf': 0.0, 'model__monotonic_cst': None, 'model__random_state': 0, 'model__split

In [451]:
results_SMOTE_cost_sensitive
results_SMOTE_cost_sensitive.columns = ["Metric", "ZSN", "FIBR_PREDS", "P_IM_STEN", "REC_IM", "OTEK_LANC"]

results_SMOTE_cost_sensitive

Unnamed: 0,Metric,ZSN,FIBR_PREDS,P_IM_STEN,REC_IM,OTEK_LANC
0,Accuracy,0.560784,0.701961,0.658824,0.764706,0.72549
1,Precision,0.262712,0.409091,0.327586,0.1875,0.367347
2,Recall,0.553571,0.264706,0.283582,0.06,0.315789
3,F1 Score,0.356322,0.321429,0.304,0.090909,0.339623
4,ROC AUC,0.558193,0.562834,0.538068,0.498293,0.579612
5,G mean,0.558174,0.477391,0.474082,0.237055,0.516089


### Random Forest

In [None]:
# # Define the pipeline
# from sklearn.ensemble import RandomForestClassifier
# pipeline = Pipeline(steps=[
#     ('smote', SMOTE(random_state=0)),
#     ('model', RandomForestClassifier(random_state=0))
# ])
# 
# # Define the hyperparameters
# param_grid = {
#     'smote__sampling_strategy': [0.5, 0.6, 0.7],
#     'model__criterion': ['gini', 'entropy'],
#     'model__max_depth': [5, 10, 15, 20],
#     'model__min_samples_split': [2, 5, 10],
#     'model__n_estimators': [5, 10, 15],
# }
# 
# # Define the grid search
# grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1')
# 
# # Train the model for all target variables and print the results as a table. A table includes several rows representing each of a metrics for imbalanced data. Each colummn represents each target variable.
# results_random_forest = {"Metric": ["Accuracy", "Precision", "Recall", "F1 Score", "ROC AUC", "G mean"]}
# for i in range(len(y)):
#     pipeline.set_params(model__class_weight=weights[i])
#     grid_search.fit(X_train[i], y_train[i])
#     y_pred = grid_search.predict(X_val[i])
#     results_random_forest[f"y{i}"] = [
#         accuracy_score(y_val[i], y_pred), 
#         precision_score(y_val[i], y_pred), 
#         recall_score(y_val[i], y_pred), 
#         f1_score(y_val[i], y_pred),
#         roc_auc_score(y_val[i], y_pred),
#         geometric_mean_score(y_val[i], y_pred)
#     ]
#     print("========================================") # print the best hyperparameters for the current target
#     print(f"Hyperparameters for y{i}")
#     for name, param in grid_search.best_estimator_.get_params().items():
#         print(f"{name}: {param}")
# 
# results_random_forest = pd.DataFrame(results_random_forest)
# 
# # Change y0,y1,y2,y3,y4 to ZSN, FIBR_PREDS, P_IM_STEN, REC_IM, OTEK_LANC
# results_random_forest.columns = ["Metric", "ZSN", "FIBR_PREDS", "P_IM_STEN", "REC_IM", "OTEK_LANC"]
# 
# results_random_forest

### ZSN

In [454]:
# Create a table to compare multiple metrics: accuracy, precision, recall, f1 score, roc auc, g mean of multiple classifiers for target ZSN
results_zsn = {"Metric": ["Accuracy", "Precision", "Recall", "F1 Score", "ROC AUC", "G mean"],
               "Decision Tree: SMOTE": results_SMOTE_balanced["ZSN"],
               "Decision Tree: Cost-sensitive": results_cost_sensitive["ZSN"],
               "Decision Tree: SMOTE + Cost-sensitive": results_SMOTE_cost_sensitive["ZSN"]}
results_zsn = pd.DataFrame(results_zsn)
results_zsn.columns = ["Metric", "Decision Tree: SMOTE", "Decision Tree: Cost-sensitive", "Decision Tree: SMOTE + Cost-sensitive"]

results_zsn

Unnamed: 0,Metric,Decision Tree: SMOTE,Decision Tree: Cost-sensitive,Decision Tree: SMOTE + Cost-sensitive
0,Accuracy,0.560784,0.458824,0.560784
1,Precision,0.262712,0.267045,0.262712
2,Recall,0.553571,0.839286,0.553571
3,F1 Score,0.356322,0.405172,0.356322
4,ROC AUC,0.558193,0.595522,0.558193
5,G mean,0.558174,0.543347,0.558174


### FIBR_PREDS

In [453]:
results_fibr_preds = {"Metric": ["Accuracy", "Precision", "Recall", "F1 Score", "ROC AUC", "G mean"],
               "Decision Tree: SMOTE": results_SMOTE_balanced["FIBR_PREDS"],
               "Decision Tree: Cost-sensitive": results_cost_sensitive["FIBR_PREDS"],
               "Decision Tree: SMOTE + Cost-sensitive": results_SMOTE_cost_sensitive["FIBR_PREDS"]}
results_fibr_preds = pd.DataFrame(results_fibr_preds)
results_fibr_preds.columns = ["Metric", "Decision Tree: SMOTE", "Decision Tree: Cost-sensitive", "Decision Tree: SMOTE + Cost-sensitive"]

results_fibr_preds

Unnamed: 0,Metric,Decision Tree: SMOTE,Decision Tree: Cost-sensitive,Decision Tree: SMOTE + Cost-sensitive
0,Accuracy,0.678431,0.647059,0.701961
1,Precision,0.414634,0.28,0.409091
2,Recall,0.5,0.205882,0.264706
3,F1 Score,0.453333,0.237288,0.321429
4,ROC AUC,0.621658,0.506684,0.562834
5,G mean,0.609637,0.407734,0.477391


### P_IM_STEN

In [455]:
results_p_im_sten = {"Metric": ["Accuracy", "Precision", "Recall", "F1 Score", "ROC AUC", "G mean"],
                      "Decision Tree: SMOTE": results_SMOTE_balanced["P_IM_STEN"],
                      "Decision Tree: Cost-sensitive": results_cost_sensitive["P_IM_STEN"],
                      "Decision Tree: SMOTE + Cost-sensitive": results_SMOTE_cost_sensitive["P_IM_STEN"]}
results_p_im_sten = pd.DataFrame(results_p_im_sten)
results_p_im_sten.columns = ["Metric", "Decision Tree: SMOTE", "Decision Tree: Cost-sensitive", "Decision Tree: SMOTE + Cost-sensitive"]

results_p_im_sten

Unnamed: 0,Metric,Decision Tree: SMOTE,Decision Tree: Cost-sensitive,Decision Tree: SMOTE + Cost-sensitive
0,Accuracy,0.658824,0.639216,0.658824
1,Precision,0.327586,0.349398,0.327586
2,Recall,0.283582,0.432836,0.283582
3,F1 Score,0.304,0.386667,0.304
4,ROC AUC,0.538068,0.572801,0.538068
5,G mean,0.474082,0.555437,0.474082


### REC_IM

In [456]:
results_rec_im = {"Metric": ["Accuracy", "Precision", "Recall", "F1 Score", "ROC AUC", "G mean"],
                      "Decision Tree: SMOTE": results_SMOTE_balanced["REC_IM"],
                      "Decision Tree: Cost-sensitive": results_cost_sensitive["REC_IM"],
                      "Decision Tree: SMOTE + Cost-sensitive": results_SMOTE_cost_sensitive["REC_IM"]}
results_rec_im = pd.DataFrame(results_rec_im)
results_rec_im.columns = ["Metric", "Decision Tree: SMOTE", "Decision Tree: Cost-sensitive", "Decision Tree: SMOTE + Cost-sensitive"]

results_rec_im

Unnamed: 0,Metric,Decision Tree: SMOTE,Decision Tree: Cost-sensitive,Decision Tree: SMOTE + Cost-sensitive
0,Accuracy,0.74902,0.709804,0.764706
1,Precision,0.181818,0.147059,0.1875
2,Recall,0.08,0.1,0.06
3,F1 Score,0.111111,0.119048,0.090909
4,ROC AUC,0.496098,0.479268,0.498293
5,G mean,0.27014,0.293008,0.237055


### OTEK_LANC

In [457]:
results_otek_lanc = {"Metric": ["Accuracy", "Precision", "Recall", "F1 Score", "ROC AUC", "G mean"],
                      "Decision Tree: SMOTE": results_SMOTE_balanced["OTEK_LANC"],
                      "Decision Tree: Cost-sensitive": results_cost_sensitive["OTEK_LANC"],
                      "Decision Tree: SMOTE + Cost-sensitive": results_SMOTE_cost_sensitive["OTEK_LANC"]}
results_otek_lanc = pd.DataFrame(results_otek_lanc)
results_otek_lanc.columns = ["Metric", "Decision Tree: SMOTE", "Decision Tree: Cost-sensitive", "Decision Tree: SMOTE + Cost-sensitive"]

results_otek_lanc

Unnamed: 0,Metric,Decision Tree: SMOTE,Decision Tree: Cost-sensitive,Decision Tree: SMOTE + Cost-sensitive
0,Accuracy,0.756863,0.705882,0.72549
1,Precision,0.444444,0.32,0.367347
2,Recall,0.350877,0.280702,0.315789
3,F1 Score,0.392157,0.299065,0.339623
4,ROC AUC,0.612307,0.554492,0.579612
5,G mean,0.553692,0.482183,0.516089


## 6. Model Evaluation

### ZSN

### FIBR_PREDS

### P_IM_STEN

### REC_IM

### OTEK_LANC