# Myocardial Infarction Complications Analysis

## 0. Introduction

In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the raw table; the file has no header row, so column names are assigned explicitly below.
data = pd.read_csv("MI.data", header=None)
data.columns = ["ID", "AGE", "SEX", "INF_ANAM", "STENOK_AN", "FK_STENOK", "IBS_POST", "IBS_NASL", "GB", "SIM_GIPERT", "DLIT_AG", "ZSN_A", "nr_11", "nr_01", "nr_02", "nr_03", "nr_04", "nr_07", "nr_08", "np_01", "np_04", "np_05", "np_07", "np_08", "np_09", "np_10", "endocr_01", "endocr_02", "endocr_03", "zab_leg_01", "zab_leg_02", "zab_leg_03", "zab_leg_04", "zab_leg_06", "S_AD_KBRIG", "D_AD_KBRIG", "S_AD_ORIT", "D_AD_ORIT", "O_L_POST", "K_SH_POST", "MP_TP_POST", "SVT_POST", "GT_POST", "FIB_G_POST", "ant_im", "lat_im", "inf_im", "post_im", "IM_PG_P", "ritm_ecg_p_01", "ritm_ecg_p_02", "ritm_ecg_p_04", "ritm_ecg_p_06", "ritm_ecg_p_07", "ritm_ecg_p_08", "n_r_ecg_p_01", "n_r_ecg_p_02", "n_r_ecg_p_03", "n_r_ecg_p_04", "n_r_ecg_p_05", "n_r_ecg_p_06", "n_r_ecg_p_08", "n_r_ecg_p_09", "n_r_ecg_p_10", "n_p_ecg_p_01", "n_p_ecg_p_03", "n_p_ecg_p_04", "n_p_ecg_p_05", "n_p_ecg_p_06", "n_p_ecg_p_07", "n_p_ecg_p_08", "n_p_ecg_p_09", "n_p_ecg_p_10", "n_p_ecg_p_11", "n_p_ecg_p_12", "fibr_ter_01", "fibr_ter_02", "fibr_ter_03", "fibr_ter_05", "fibr_ter_06", "fibr_ter_07", "fibr_ter_08", "GIPO_K", "K_BLOOD", "GIPER_NA", "NA_BLOOD", "ALT_BLOOD", "AST_BLOOD", "KFK_BLOOD", "L_BLOOD", "ROE", "TIME_B_S", "R_AB_1_n", "R_AB_2_n", "R_AB_3_n", "NA_KB", "NOT_NA_KB", "LID_KB", "NITR_S", "NA_R_1_n", "NA_R_2_n", "NA_R_3_n", "NOT_NA_1_n", "NOT_NA_2_n", "NOT_NA_3_n", "LID_S_n", "B_BLOK_S_n", "ANT_CA_S_n", "GEPAR_S_n", "ASP_S_n", "TIKL_S_n", "TRENT_S_n", "FIBR_PREDS", "PREDS_TAH", "JELUD_TAH", "FIBR_JELUD", "A_V_BLOK", "OTEK_LANC", "RAZRIV", "DRESSLER", "ZSN", "REC_IM", "P_IM_STEN", "LET_IS"]
# "?" is treated as the missing-value marker. np.nan is used because the
# np.NaN alias no longer exists in NumPy 2.x.
data = data.replace("?", np.nan)
# Force every column to a numeric dtype; anything unparsable becomes NaN.
data = data.apply(pd.to_numeric, errors="coerce")
data

In [None]:
# Columns named "<prefix>_<1|2|3>_n"; they are removed from the feature matrix
# before modelling.
related_time_features = [
    f"{prefix}_{period}_n"
    for prefix in ("R_AB", "NA_R", "NOT_NA")
    for period in (1, 2, 3)
]

## 1. Train-Test Split

In [None]:
from sklearn.model_selection import train_test_split
# Feature matrix: the columns at positions 1..111 (AGE through TRENT_S_n),
# with the time-related columns removed.
feature_frame = data.iloc[:, 1:112]
X = feature_frame.drop(columns=related_time_features)

# One target series per complication. The list order defines the index `i`
# used by every later cell:
#   i = 0 => ZSN or Chronic heart failure
#   i = 1 => FIBR_PREDS or Atrial fibrillation
#   i = 2 => P_IM_STEN or Post-infarction angina
#   i = 3 => REC_IM or Relapse of the myocardial infarction
#   i = 4 => OTEK_LANC or Pulmonary edema
target_names = ["ZSN", "FIBR_PREDS", "P_IM_STEN", "REC_IM", "OTEK_LANC"]
y = [data[name] for name in target_names]

X_train, X_test, y_train, y_test = [], [], [], []

# Each target gets its own 70/30 train/test split, stratified on that target,
# so the partitions differ from one target to the next.
for target in y:
    train_features, test_features, train_labels, test_labels = train_test_split(
        X, target, train_size=0.7, random_state=0, stratify=target
    )
    X_train.append(train_features)
    X_test.append(test_features)
    y_train.append(train_labels)
    y_test.append(test_labels)


In [None]:
# Tabulate how many samples ended up in the train and test partition of each target.
split_sizes = {"Set": ["Train", "Test"]}
split_sizes.update(
    (f"y{idx}", [len(train_labels), len(test_labels)])
    for idx, (train_labels, test_labels) in enumerate(zip(y_train, y_test))
)
results = pd.DataFrame(split_sizes)
results

## 2. EDA

### Data Overview

In [None]:
# Display the loaded DataFrame (all columns, after numeric coercion) for a first look.
data

### Target Balance Check

In [None]:
# Total number of rows in the dataset; used as the denominator for class shares.
number_of_instances = len(data)
# Count each class once per target, then express the counts as a share of all rows.
class_counts = [target.value_counts() for target in y]
results = pd.DataFrame({
    "Target": [f"y{idx}" for idx in range(len(class_counts))],
    "0": [counts[0] / number_of_instances for counts in class_counts],
    "1": [counts[1] / number_of_instances for counts in class_counts],
})
results

### Check for missing values

In [None]:
# Per-feature count of missing values, largest first; features without any
# missing value are left out of the table.
sorted_missing_counts = X.isna().sum().sort_values(ascending=False)
missing_values = sorted_missing_counts[sorted_missing_counts > 0].to_frame(
    name="Number of Missing Values"
)
missing_values


### Summary Statistics of Features

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every feature column.
X.describe()

In [None]:
# Features treated as continuous by the outlier clipping and mean imputation steps.
continuous_features = [
    "AGE",
    "S_AD_ORIT",
    "D_AD_ORIT",
    "K_BLOOD",
    "NA_BLOOD",
    "ALT_BLOOD",
    "AST_BLOOD",
    "L_BLOOD",
    "ROE",
]
# Every remaining feature column, in original column order.
is_continuous = X.columns.isin(continuous_features)
binary_categorical_features = X.columns[~is_continuous].tolist()

In [None]:
# A non-continuous feature is considered binary when its largest observed value is 1.
binary_features = [
    name for name in binary_categorical_features if X[name].max() == 1
]

In [None]:
# Whatever is neither continuous nor binary is handled as categorical.
binary_lookup = set(binary_features)
categorical_features = [
    name for name in binary_categorical_features if name not in binary_lookup
]

## 3. Preprocessing

In [None]:
import copy
unprocessed_data = X_train
%store unprocessed_data
unprocessed_data[0]

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

class ColumnDropper(BaseEstimator, TransformerMixin):
    """Remove columns that have too many missing values.

    ``fit`` records the columns whose count of missing values is strictly
    greater than ``threshold``; ``transform`` drops exactly those columns.

    Side effect: ``transform`` also stores the labels of the retained columns
    in the module-level global ``X_keep``.
    """

    def __init__(self, threshold):
        self.threshold = threshold
        # Filled in by fit(); labels of the columns to remove.
        self.columns_to_drop = None

    def fit(self, X, y=None):
        """Determine which columns exceed the missing-value threshold."""
        frame = pd.DataFrame(X)
        missing_per_column = frame.isnull().sum()
        self.columns_to_drop = frame.columns[missing_per_column > self.threshold]
        return self

    def transform(self, X):
        """Return ``X`` as a DataFrame without the columns selected in ``fit``."""
        global X_keep
        # Accept array-likes as well as DataFrames.
        frame = pd.DataFrame(X)
        retained = ~frame.columns.isin(self.columns_to_drop)
        X_keep = list(frame.columns[retained])
        return frame.drop(columns=self.columns_to_drop)
    

class RowDropper(BaseEstimator, TransformerMixin):
    """Remove rows that have too many missing values.

    ``fit`` records the index labels of the rows whose count of missing
    values is strictly greater than ``threshold``; ``transform`` drops the
    rows carrying those labels.

    NOTE(review): only ``X`` is filtered here. If any row is actually dropped,
    the caller has to drop the matching labels from ``y`` itself.
    """

    def __init__(self, threshold):
        # Filled in by fit(); index labels of the rows to remove.
        self.rows_to_drop = None
        self.threshold = threshold

    def fit(self, X, y=None):
        """Determine which rows exceed the missing-value threshold."""
        # Accept array-likes as well as DataFrames, mirroring transform().
        X = pd.DataFrame(X)
        self.rows_to_drop = X.index[X.isnull().sum(axis=1) > self.threshold]
        return self

    def transform(self, X):
        """Return ``X`` as a DataFrame without the rows selected in ``fit``."""
        # Ensure the input is a DataFrame
        X = pd.DataFrame(X)
        # The data being transformed may not contain the fitted labels (for
        # example a test split with a disjoint index); skip absent labels
        # instead of raising KeyError.
        return X.drop(index=self.rows_to_drop, errors="ignore")
    

In [None]:
# Define outliers handler class
class OutliersHandler(BaseEstimator, TransformerMixin):
    """Clip continuous features to IQR-based bounds learned in ``fit``.

    For every column, ``fit`` computes ``Q1 - coefficient * IQR`` and
    ``Q3 + coefficient * IQR``. ``transform`` clips only the columns listed
    in the module-level ``continuous_features``; all other columns pass
    through unchanged.
    """

    def __init__(self, coefficient):
        self.coefficient = coefficient
        # Filled in by fit(); per-column clipping limits.
        self.lower_bounds = None
        self.upper_bounds = None

    def fit(self, X, y=None):
        """Learn the lower and upper clipping bound of every column."""
        X = pd.DataFrame(X)
        q1 = X.quantile(0.25)
        q3 = X.quantile(0.75)
        iqr = q3 - q1
        self.lower_bounds = q1 - iqr * self.coefficient
        self.upper_bounds = q3 + iqr * self.coefficient
        return self

    def transform(self, X):
        """Return a clipped copy of ``X``; the input is left unmodified."""
        # pd.DataFrame(X) does not copy an existing DataFrame, so copy
        # explicitly to avoid writing clipped values into the caller's data.
        X = pd.DataFrame(X).copy()
        for feature in X.columns:
            # Only continuous features are clipped; clipping the binary or
            # categorical ones would probably discard information.
            if feature in continuous_features:
                X[feature] = X[feature].clip(self.lower_bounds[feature], self.upper_bounds[feature])
        return X

In [None]:
def _first_mode(series):
    """Return the most frequent non-missing value of ``series``, or None if there is none."""
    modes = series.mode()
    return None if modes.empty else modes.iloc[0]


def _fill_missing(X, features, statistic):
    """Fill NaNs of each listed column that exists in ``X`` with ``statistic(column)``."""
    if features is None:
        return
    for feature in features:
        if feature not in X.columns:
            continue
        fill_value = statistic(X[feature])
        # Nothing usable to fill with (e.g. the column is entirely missing).
        if fill_value is None or pd.isna(fill_value):
            continue
        # Assign back instead of a chained ``fillna(..., inplace=True)``: the
        # chained form acts on a temporary and is a no-op under Copy-on-Write.
        X[feature] = X[feature].fillna(fill_value)


def impute_by_type(X, continuous_features=None, categorical_features=None, binary_features=None, binary_categorical_features=None):
    """Fill missing values column by column, choosing the statistic by feature type.

    Categorical and binary features are filled with the column mode and
    continuous features with the column mean. ``binary_categorical_features``
    is a fallback that is mode-filled only when neither ``binary_features``
    nor ``categorical_features`` is given. Listed features that are not
    columns of ``X`` are skipped, so the function works on column subsets.
    Columns with no non-missing value are left as they are.

    The statistics are computed from ``X`` itself on every call.

    Args:
        X: DataFrame to impute. It is modified in place.
        continuous_features: Column names to fill with the mean, or None.
        categorical_features: Column names to fill with the mode, or None.
        binary_features: Column names to fill with the mode, or None.
        binary_categorical_features: Fallback column names to fill with the
            mode, or None.

    Returns:
        The same DataFrame object ``X``, with missing values filled.
    """
    _fill_missing(X, categorical_features, _first_mode)
    _fill_missing(X, continuous_features, lambda column: column.mean(axis=0))
    _fill_missing(X, binary_features, _first_mode)
    if binary_features is None and categorical_features is None:
        _fill_missing(X, binary_categorical_features, _first_mode)
    return X

In [None]:
# from sklearn.impute import KNNImputer
# def impute_by_type(X, continuous_features=None, categorical_features=None, binary_features=None):
#     """Fills missing values based on data type, handling potential errors, works with subsets"""
#     if categorical_features is not None:
#         for feature in categorical_features:
#             if feature in X.columns:
#                 try:
#                     X[feature].fillna(X[feature].mode()[0], inplace=True)
#                 except KeyError:
#                     pass
#     imputer = KNNImputer(n_neighbors=5)
#     if continuous_features is not None:
#         try:
#             X_continuous = pd.DataFrame(imputer.fit_transform(X[continuous_features]))
#             X.update(X_continuous)
#         except KeyError:
#             pass
#     if binary_features is not None:
#         try:
#             X_binary = pd.DataFrame(imputer.fit_transform(X[binary_features]))
#             X.update(X_binary)
#         except KeyError:
#             pass
#     return X

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import FunctionTransformer

# Define dropper pipeline
dropper = Pipeline(steps=[
    ('column_dropper', ColumnDropper(threshold=100)),
    ('row_dropper', RowDropper(threshold=100))
])

# Define preprocessing pipeline
preprocessing_pipeline = Pipeline(steps=[
    ('dropper', dropper),
    ('outliers_clipper',OutliersHandler(coefficient=1.5)),
    ('imputation', FunctionTransformer(impute_by_type, kw_args={"continuous_features":continuous_features, "binary_features": binary_features, "categorical_features": categorical_features })),  # Fill missing values using mean/mode
    ('scaling', StandardScaler())  # Standardize features by removing the mean and scaling to unit variance
])

# Apply the preprocessing pipeline to each set
for i in range(len(y)):
    X_train[i] = preprocessing_pipeline.fit_transform(X_train[i])
    X_test[i] = preprocessing_pipeline.transform(X_test[i])

#%store X_train
#%store y_train
#%store X_test
#%store y_test

preprocessed_data = pd.DataFrame(data=X_train[0], columns=X_keep)
%store preprocessed_data
preprocessed_data

## 4. Model Selection

In [None]:
#create a pipeline to predict y using svm
from sklearn.svm import SVC

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

# Define the pipeline
model_pipeline = Pipeline(steps=[
    ('model', SVC())
])

# Define the hyperparameters grid.
# gamma has no effect on the linear kernel, so it is searched only for rbf;
# a single combined grid would refit each linear candidate once per gamma value.
c_values = [0.1, 1, 10, 100, 1000]
param_grid = [
    {'model__kernel': ['rbf'], 'model__C': c_values, 'model__gamma': [1, 0.1, 0.01, 0.001, 0.0001]},
    {'model__kernel': ['linear'], 'model__C': c_values},
]

# Define the grid search
grid_search = GridSearchCV(model_pipeline, param_grid, scoring = 'roc_auc', cv=5, n_jobs=-1)

# Define a list to store the best parameters (one entry per target, in target order)
best_params = []

# Fit the grid search for each target variable
for i, (features, labels) in enumerate(zip(X_train, y_train)):
    grid_search.fit(features, labels)
    # Print the best parameters
    print("-----------------------------------------------------------------------")
    print(f"Best parameters for y{i}: {grid_search.best_params_}")
    # Store the parameters
    best_params.append(grid_search.best_params_)

    

## 5. Model Evaluation

In [None]:
# Create 5 tables for evaluating 5 targets, each table contains 1 column representing the model, 5 rows representing the metrics, which are accuracy, precision, recall, f1-score, and ROC AUC
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

# Human-readable description of each target, in the same order as `y`.
target_descriptions = [
    "ZSN or Chronic heart failure",
    "FIBR_PREDS or Atrial fibrillation",
    "P_IM_STEN or Post-infarction angina",
    "REC_IM or Relapse of the myocardial infarction",
    "OTEK_LANC or Pulmonary edema",
]

for i, description in enumerate(target_descriptions):
    # Set the best parameters for the model in the pipeline
    model_pipeline.set_params(**best_params[i])

    # Fit the model with the best parameters
    model_pipeline.fit(X_train[i], y_train[i])

    # Predict the target variable (hard labels for the threshold-based metrics)
    y_pred = model_pipeline.predict(X_test[i])
    # Continuous decision scores for ROC AUC; computing it from hard 0/1
    # predictions collapses the ROC curve to a single operating point.
    y_score = model_pipeline.decision_function(X_test[i])

    # Calculate the metrics
    accuracy = accuracy_score(y_test[i], y_pred)
    precision = precision_score(y_test[i], y_pred)
    recall = recall_score(y_test[i], y_pred)
    f1 = f1_score(y_test[i], y_pred)
    roc_auc = roc_auc_score(y_test[i], y_score)

    # Store the metrics
    metrics_table = pd.DataFrame({
        "Metric": ["Accuracy", "Precision", "Recall", "F1", "ROC AUC"],
        "SVM": [accuracy, precision, recall, f1, roc_auc],
    })
    print("-------------------------------------------------")
    print(f"Result for {description}")
    print(metrics_table)
    
