# Environment Set up

## Mount drive

In [1]:
try:
  from google.colab import drive
  drive.mount('/content/drive')
  import sys
  path_to_project = '/content/drive/MyDrive/FairAlgorithm'
  sys.path.append(path_to_project)
  !sudo apt install libcairo2-dev pkg-config python3-dev
  !{sys.executable} -m pip install -U ydata-profiling[notebook]
  !pip install jupyter-contrib-nbextensions
  !jupyter nbextension enable --py widgetsnbextension
  IN_COLAB = True
except:
  IN_COLAB = False

Mounted at /content/drive
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
python3-dev is already the newest version (3.10.6-1~22.04.1).
python3-dev set to manually installed.
The following packages were automatically installed and are no longer required:
  libbz2-dev libpkgconf3 libreadline-dev
Use 'sudo apt autoremove' to remove them.
The following additional packages will be installed:
  libblkid-dev libblkid1 libcairo-script-interpreter2 libffi-dev
  libglib2.0-dev libglib2.0-dev-bin libice-dev liblzo2-2 libmount-dev
  libmount1 libpixman-1-dev libselinux1-dev libsepol-dev libsm-dev
  libxcb-render0-dev libxcb-shm0-dev
Suggested packages:
  libcairo2-doc libgirepository1.0-dev libglib2.0-doc libgdk-pixbuf2.0-bin
  | libgdk-pixbuf2.0-dev libxml2-utils libice-doc cryptsetup-bin libsm-doc
The following packages will be REMOVED:
  pkgconf r-base-dev
The following NEW packages will be installed:
  libblkid-dev libcairo-script-interpreter2 

## Import Libraries

In [2]:
import numpy as np
import pandas as pd
import plotly
np.random.seed(0)
from rich import print
from rich.columns import Columns
from rich.panel import Panel
from rich.align import Align
from source.utils.print_util import *
from source.utils.data_preprocessing import *
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport
import seaborn as sns

# Data loading and Config

## Configuration

In [3]:
dataset_path = path_to_project + '/data/raw/myocardial-infarction.csv' if IN_COLAB else 'data/raw/myocardial-infarction.csv'
X = ["ID", "AGE", "SEX", "INF_ANAM", "STENOK_AN", "FK_STENOK", "IBS_POST", "IBS_NASL", "GB", "SIM_GIPERT", "DLIT_AG", "ZSN_A", "nr_11", "nr_01", "nr_02", "nr_03", "nr_04", "nr_07", "nr_08", "np_01", "np_04", "np_05", "np_07", "np_08", "np_09", "np_10", "endocr_01", "endocr_02", "endocr_03", "zab_leg_01", "zab_leg_02", "zab_leg_03", "zab_leg_04", "zab_leg_06", "S_AD_KBRIG", "D_AD_KBRIG", "S_AD_ORIT", "D_AD_ORIT", "O_L_POST", "K_SH_POST", "MP_TP_POST", "SVT_POST", "GT_POST", "FIB_G_POST", "ant_im", "lat_im", "inf_im", "post_im", "IM_PG_P", "ritm_ecg_p_01", "ritm_ecg_p_02", "ritm_ecg_p_04", "ritm_ecg_p_06", "ritm_ecg_p_07", "ritm_ecg_p_08", "n_r_ecg_p_01", "n_r_ecg_p_02", "n_r_ecg_p_03", "n_r_ecg_p_04", "n_r_ecg_p_05", "n_r_ecg_p_06", "n_r_ecg_p_08", "n_r_ecg_p_09", "n_r_ecg_p_10", "n_p_ecg_p_01", "n_p_ecg_p_03", "n_p_ecg_p_04", "n_p_ecg_p_05", "n_p_ecg_p_06", "n_p_ecg_p_07", "n_p_ecg_p_08", "n_p_ecg_p_09", "n_p_ecg_p_10", "n_p_ecg_p_11", "n_p_ecg_p_12", "fibr_ter_01", "fibr_ter_02", "fibr_ter_03", "fibr_ter_05", "fibr_ter_06", "fibr_ter_07", "fibr_ter_08", "GIPO_K", "K_BLOOD", "GIPER_NA", "NA_BLOOD", "ALT_BLOOD", "AST_BLOOD", "KFK_BLOOD", "L_BLOOD", "ROE", "TIME_B_S", "R_AB_1_n", "R_AB_2_n", "R_AB_3_n", "NA_KB", "NOT_NA_KB", "LID_KB", "NITR_S", "NA_R_1_n", "NA_R_2_n", "NA_R_3_n", "NOT_NA_1_n", "NOT_NA_2_n", "NOT_NA_3_n","LID_S_n", "B_BLOK_S_n", "ANT_CA_S_n", "GEPAR_S_n", "ASP_S_n", "TIKL_S_n", "TRENT_S_n", "FIBR_PREDS", "PREDS_TAH", "JELUD_TAH", "FIBR_JELUD", "A_V_BLOK", "OTEK_LANC", "RAZRIV", "DRESSLER", "ZSN", "REC_IM", "P_IM_STEN", "LET_IS"]
Y = 'Outcome'
Y_labels = 'Complication','No Complication'
sensible_attribute = "SEX"
G = ['SEX']

In [4]:
sensitive_attribute_map = {
    "SEX": {
        0: "Female",
        1: "Male",
    }
}

## Load Data

In [75]:
df=pd.read_csv(dataset_path)
df.head(10)

Unnamed: 0,ID,AGE,SEX,INF_ANAM,STENOK_AN,FK_STENOK,IBS_POST,IBS_NASL,GB,SIM_GIPERT,...,JELUD_TAH,FIBR_JELUD,A_V_BLOK,OTEK_LANC,RAZRIV,DRESSLER,ZSN,REC_IM,P_IM_STEN,LET_IS
0,1,77,1,2,1,1.0,2.0,,3.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,2,55,1,1,0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,3,52,1,0,0,0.0,2.0,,2.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,4,68,0,0,0,0.0,2.0,,2.0,0.0,...,0,0,0,0,0,0,1,0,0,0
4,5,60,1,0,0,0.0,2.0,,3.0,0.0,...,0,0,0,0,0,0,0,0,0,0
5,6,64,1,0,1,2.0,1.0,,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
6,7,70,1,1,1,2.0,1.0,,2.0,0.0,...,0,0,0,0,0,0,1,0,0,0
7,8,65,1,0,1,1.0,2.0,,2.0,0.0,...,0,0,0,0,0,0,0,0,0,0
8,9,60,1,0,0,0.0,2.0,,2.0,0.0,...,0,0,0,0,0,0,0,0,0,0
9,10,77,0,2,0,0.0,0.0,,3.0,0.0,...,0,0,0,0,0,0,1,0,0,0


In [76]:
import os
path_to_project = '/content/drive/MyDrive/FairAlgorithm'
os.makedirs(path_to_project, exist_ok=True)

profile = ProfileReport(df, title='Myocardial Analysis Raw')
html_file = profile.to_file(path_to_project + '/data/reports/myocardial_analysis_raw.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

  df[duplicated_rows]
  df[duplicated_rows]
  df[duplicated_rows]
  df[duplicated_rows]
  df[duplicated_rows]
  df[duplicated_rows]
  df[duplicated_rows]
  df[duplicated_rows]
  df[duplicated_rows]
  df[duplicated_rows]
  df[duplicated_rows]
  df[duplicated_rows]
  df[duplicated_rows]
  df[duplicated_rows]
  df[duplicated_rows]
  df[duplicated_rows]
  df[duplicated_rows]
  df[duplicated_rows]
  df[duplicated_rows]
  df[duplicated_rows]
  df[duplicated_rows]
  df[duplicated_rows]
  df[duplicated_rows]
  df[duplicated_rows]
  df[duplicated_rows]


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [77]:
df.columns

Index(['ID', 'AGE', 'SEX', 'INF_ANAM', 'STENOK_AN', 'FK_STENOK', 'IBS_POST',
       'IBS_NASL', 'GB', 'SIM_GIPERT',
       ...
       'JELUD_TAH', 'FIBR_JELUD', 'A_V_BLOK', 'OTEK_LANC', 'RAZRIV',
       'DRESSLER', 'ZSN', 'REC_IM', 'P_IM_STEN', 'LET_IS'],
      dtype='object', length=124)

In [78]:
df['LET_IS_cat'] = df['LET_IS'].apply(lambda x: 0 if x == 0 else 1)

# Display the first rows to verify the transformation
print(df[['LET_IS_cat', 'LET_IS']].head())

In [79]:
value_counts = df['LET_IS'].value_counts()

print("Count of each unique value in LET_IS:")
print(value_counts)

In [80]:
value_counts = df['LET_IS_cat'].value_counts()

print("Count of each unique value in LET_IS_cat:")
print(value_counts)

In [81]:
rows, columns = df.shape
print(f"The DataFrame has {rows} rows and {columns} columns.")

## Raw probabilities calculus

In [82]:
# Calculate the total number of cases where LET_IS = 0 and LET_IS = 1
total_Y1 = df[df['LET_IS_cat'] == 1].shape[0]
total_Y0 = df[df['LET_IS_cat'] == 0].shape[0]

# Calculate Pr(Y=1) and Pr(Y=0)
pr_Y1 = total_Y1 / len(df) if len(df) > 0 else 0  # Handling division by zero
pr_Y0 = total_Y0 / len(df) if len(df) > 0 else 0  # Handling division by zero

# Print the results
print(f'Pr(Y=1): {pr_Y1:.9f}')
print(f'Pr(Y=0): {pr_Y0:.9f}')

In [83]:
# Filter the cases
cases_Y1_G1 = df[(df['LET_IS_cat'] == 1) & (df['SEX'] == 1)].shape[0]
cases_Y1_G0 = df[(df['LET_IS_cat'] == 1) & (df['SEX'] == 0)].shape[0]
cases_Y0_G1 = df[(df['LET_IS_cat'] == 0) & (df['SEX'] == 1)].shape[0]
cases_Y0_G0 = df[(df['LET_IS_cat'] == 0) & (df['SEX'] == 0)].shape[0]

In [84]:
total_G1 = df[df['SEX'] == 1].shape[0]
total_G0 = df[df['SEX'] == 0].shape[0]

pr_Y1_G1 = cases_Y1_G1 / total_G1 if total_G1 > 0 else 0  # Handling division by zero
pr_Y1_G0 = cases_Y1_G0 / total_G0 if total_G0 > 0 else 0  # Handling division by zero
pr_Y0_G1 = cases_Y0_G1 / total_G1 if total_G1 > 0 else 0  # Handling division by zero
pr_Y0_G0 = cases_Y0_G0 / total_G0 if total_G0 > 0 else 0  # Handling division by zero

pr_G1 = total_G1 / len(df) if len(df) > 0 else 0  # Handling division by zero
pr_G0 = total_G0 / len(df) if len(df) > 0 else 0  # Handling division by zero

# Print the results
print(f'Total G1: {total_G1}')
print(f'Total G0: {total_G0}')
print(f'Pr(Y=1 | G=1): {pr_Y1_G1:.9f}')
print(f'Pr(Y=1 | G=0): {pr_Y1_G0:.9f}')
print(f'Pr(Y=0 | G=1): {pr_Y0_G1:.9f}')
print(f'Pr(Y=0 | G=0): {pr_Y0_G0:.9f}')
print(f'Pr(G=1): {pr_G1:.9f}')
print(f'Pr(G=0): {pr_G0:.9f}')

## Missing value

In [85]:
# Replace '?' elements with NaN so that they can be detected as null
df.replace('?', np.nan, inplace=True)

# Count the null elements in each column
nulls_per_column = df.isnull().sum()

# Filter only the columns with nulls
nulls_per_column = nulls_per_column[nulls_per_column > 0]

# Display the results as a full string without truncation
print(nulls_per_column.to_string())

# Preprocessing

## Missing Values

In [86]:
# Remove columns with a high percentage of null values and identifiers
columns_to_remove = [
    'IBS_NASL', 'S_AD_KBRIG', 'D_AD_KBRIG', 'KFK_BLOOD', 'LET_IS', 'L_BLOOD', 'ritm_ecg_p_07', 'ritm_ecg_p_02', 'NOT_NA_3_n', 'DLIT_AG', 'ROE', 'n_p_ecg_p_12'
]
df = df.drop(columns=columns_to_remove)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1586 entries, 0 to 1585
Columns: 113 entries, ID to LET_IS_cat
dtypes: float64(96), int64(17)
memory usage: 1.4 MB


### Dimensionality Reduction

In [87]:
# Replace '?' with NaN
df = df.replace('?', pd.NA)

# Convert columns to numeric where possible
for col in df.columns:
    try:
        df[col] = pd.to_numeric(df[col])
    except ValueError:
        # Handle columns that cannot be converted, e.g., keep as strings
        pass
# Calculate the correlation of each column with the target 'LET_IS' and sort from highest to lowest
correlation_percentages = sorted([(column, correlation * 100) for column, correlation in zip(df.columns, df.corr()['LET_IS_cat'])], key=lambda x: x[1], reverse=True)

# Display the correlation percentage of each column with 'LET_IS', sorted from highest to lowest
for column, percentage in correlation_percentages:
    print(f"{column}: {percentage:.2f}%")

# Filter columns with a correlation greater than 10% with 'LET_IS', and add 'SEX' regardless of its correlation
features = [column for column, correlation in correlation_percentages if correlation > 10]  # 10% is equivalent to 0.1
if 'SEX' not in features:
    features.append('SEX')  # Ensure 'SEX' is included

# Update the DataFrame to include only the selected columns
df = df[features]


In [88]:
# Count the null elements in each column
nulls_per_column = df.isnull().sum()

# Filter only the columns with nulls
nulls_per_column = nulls_per_column[nulls_per_column > 0]

# Display the results as a full string without truncation
print(nulls_per_column.to_string())

In [89]:
# List of columns to use for dropping rows with NaN values
columns_to_dropna = ['ZSN_A', 'NITR_S', 'AGE', 'R_AB_3_n', 'nr_04', 'ant_im',
                     'MP_TP_POST', 'zab_leg_02', 'STENOK_AN']

# Drop rows with NaN values in any of the specified columns
df = df.dropna(subset=columns_to_dropna)


In [90]:
df = remove_missing_values(df)

In [91]:
# Count the null elements in each column
nulls_per_column = df.isnull().sum()

# Filter only the columns with nulls
nulls_per_column = nulls_per_column[nulls_per_column > 0]

# Display the results as a full string without truncation
print(nulls_per_column.to_string())

In [92]:
rows, columns = df.shape
print(f"The DataFrame has {rows} rows and {columns} columns.")

In [93]:
df.head(10)

Unnamed: 0,LET_IS_cat,ID,RAZRIV,K_SH_POST,AGE,ZSN_A,NITR_S,NA_R_1_n,R_AB_3_n,STENOK_AN,...,FK_STENOK,REC_IM,zab_leg_02,MP_TP_POST,lat_im,IBS_POST,nr_04,NOT_NA_2_n,R_AB_2_n,SEX
0,0,1,0,0.0,77,0.0,0.0,0.0,1.0,1,...,1.0,0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1
1,0,2,0,0.0,55,0.0,0.0,0.0,0.0,0,...,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
2,0,3,0,0.0,52,0.0,0.0,1.0,0.0,0,...,0.0,0,0.0,0.0,1.0,2.0,0.0,2.0,0.0,1
3,0,4,0,0.0,68,1.0,0.0,0.0,1.0,0,...,0.0,0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0
4,0,5,0,0.0,60,0.0,0.0,0.0,0.0,0,...,0.0,0,0.0,0.0,1.0,2.0,0.0,0.0,0.0,1
5,0,6,0,0.0,64,0.0,0.0,0.0,0.0,1,...,2.0,0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1
6,0,7,0,0.0,70,1.0,0.0,0.0,0.0,1,...,2.0,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1
7,0,8,0,0.0,65,0.0,0.0,0.0,0.0,1,...,1.0,0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1
8,0,9,0,0.0,60,0.0,0.0,1.0,0.0,0,...,0.0,0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1
9,0,10,0,0.0,77,1.0,0.0,0.0,0.0,0,...,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0


In [94]:
df.columns

Index(['LET_IS_cat', 'ID', 'RAZRIV', 'K_SH_POST', 'AGE', 'ZSN_A', 'NITR_S',
       'NA_R_1_n', 'R_AB_3_n', 'STENOK_AN', 'ant_im', 'R_AB_1_n', 'IM_PG_P',
       'O_L_POST', 'FK_STENOK', 'REC_IM', 'zab_leg_02', 'MP_TP_POST', 'lat_im',
       'IBS_POST', 'nr_04', 'NOT_NA_2_n', 'R_AB_2_n', 'SEX'],
      dtype='object')

In [95]:
# Shuffle data frame to avoid errors when training i.e. due to unbalanced datasets
df = df.sample(frac=1)
df.head()

Unnamed: 0,LET_IS_cat,ID,RAZRIV,K_SH_POST,AGE,ZSN_A,NITR_S,NA_R_1_n,R_AB_3_n,STENOK_AN,...,FK_STENOK,REC_IM,zab_leg_02,MP_TP_POST,lat_im,IBS_POST,nr_04,NOT_NA_2_n,R_AB_2_n,SEX
1078,0,1118,0,0.0,57,0.0,0.0,0.0,0.0,6,...,2.0,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1
684,0,705,0,0.0,44,0.0,0.0,0.0,0.0,0,...,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
1261,0,1329,0,0.0,59,0.0,0.0,0.0,0.0,6,...,2.0,1,0.0,0.0,1.0,2.0,0.0,0.0,0.0,1
31,0,32,0,0.0,65,0.0,0.0,0.0,0.0,0,...,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
269,0,278,0,0.0,74,1.0,0.0,0.0,0.0,0,...,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [96]:
# Save the preprocesses DataFrame back to a CSV file
saving_path = path_to_project + '/data/preprocessed/preprocessed-myocardial-infarction.csv' if IN_COLAB else 'data/preprocessed/preprocessed-myocardial-infarction.csv'
df.to_csv(saving_path, sep=',', index=False, encoding='utf-8')

# Probabilistic Analysis

In [97]:
df.head()

Unnamed: 0,LET_IS_cat,ID,RAZRIV,K_SH_POST,AGE,ZSN_A,NITR_S,NA_R_1_n,R_AB_3_n,STENOK_AN,...,FK_STENOK,REC_IM,zab_leg_02,MP_TP_POST,lat_im,IBS_POST,nr_04,NOT_NA_2_n,R_AB_2_n,SEX
1078,0,1118,0,0.0,57,0.0,0.0,0.0,0.0,6,...,2.0,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1
684,0,705,0,0.0,44,0.0,0.0,0.0,0.0,0,...,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
1261,0,1329,0,0.0,59,0.0,0.0,0.0,0.0,6,...,2.0,1,0.0,0.0,1.0,2.0,0.0,0.0,0.0,1
31,0,32,0,0.0,65,0.0,0.0,0.0,0.0,0,...,0.0,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1
269,0,278,0,0.0,74,1.0,0.0,0.0,0.0,0,...,0.0,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [98]:
# Filter the cases
cases_Y1_G1 = df[(df['LET_IS_cat'] == 1) & (df['SEX'] == 1)].shape[0]
cases_Y1_G0 = df[(df['LET_IS_cat'] == 1) & (df['SEX'] == 0)].shape[0]
cases_Y0_G1 = df[(df['LET_IS_cat'] == 0) & (df['SEX'] == 1)].shape[0]
cases_Y0_G0 = df[(df['LET_IS_cat'] == 0) & (df['SEX'] == 0)].shape[0]

In [99]:
# Calculate the total number of cases where LET_IS_cat = 0 and LET_IS_cat = 1
total_Y1 = df[df['LET_IS_cat'] == 1].shape[0]
total_Y0 = df[df['LET_IS_cat'] == 0].shape[0]

# Calculate Pr(Y=1) and Pr(Y=0)
pr_Y1 = total_Y1 / len(df) if len(df) > 0 else 0  # Handling division by zero
pr_Y0 = total_Y0 / len(df) if len(df) > 0 else 0  # Handling division by zero

# Print the results
print(f'Pr(Y=1): {pr_Y1:.9f}')
print(f'Pr(Y=0): {pr_Y0:.9f}')

In [100]:
total_G1 = df[df['SEX'] == 1].shape[0]
total_G0 = df[df['SEX'] == 0].shape[0]

pr_Y1_G1 = cases_Y1_G1 / total_G1 if total_G1 > 0 else 0  # Handling division by zero
pr_Y1_G0 = cases_Y1_G0 / total_G0 if total_G0 > 0 else 0  # Handling division by zero
pr_Y0_G1 = cases_Y0_G1 / total_G1 if total_G1 > 0 else 0  # Handling division by zero
pr_Y0_G0 = cases_Y0_G0 / total_G0 if total_G0 > 0 else 0  # Handling division by zero

pr_G1 = total_G1 / len(df) if len(df) > 0 else 0  # Handling division by zero
pr_G0 = total_G0 / len(df) if len(df) > 0 else 0  # Handling division by zero

# Print the results
print(f'Total G1: {total_G1}')
print(f'Total G0: {total_G0}')
print(f'Pr(Y=1 | G=1): {pr_Y1_G1:.9f}')
print(f'Pr(Y=1 | G=0): {pr_Y1_G0:.9f}')
print(f'Pr(Y=0 | G=1): {pr_Y0_G1:.9f}')
print(f'Pr(Y=0 | G=0): {pr_Y0_G0:.9f}')
print(f'Pr(G=1): {pr_G1:.9f}')
print(f'Pr(G=0): {pr_G0:.9f}')

# Report

In [102]:
import os
path_to_project = '/content/drive/MyDrive/FairAlgorithm'
os.makedirs(path_to_project, exist_ok=True)

profile = ProfileReport(df, title='Myocardial Analysis')
html_file = profile.to_file(path_to_project + '/data/reports/myocardial_analysis.html')

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]