# Predictions: Heart Diseases

Prediction of heart disease given medical informartion. 

## Libraries

In [1]:
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from ydata_profiling import ProfileReport

pd.set_option('display.max_colwidth', None)

In [2]:
MAX_DATASET_SAMPLES = 10000 # Datasets will the capped to this sample_size to keep experiments fast 

## Import Data

In [3]:
datasets = [
    {
        "name": "Heart Attack Prediction Dataset",
        "path": "data/heart.csv",
        "features_numerical": ['age', 'trtbps', 'chol', 'thalachh', 'oldpeak'],
        "features_categorical": ['sex', 'cp', 'fbs', 'restecg', 'exng', 'slp', 'caa', 'thall'],
        "target": "output",
    },

    {
        "name": "Heart Failure Prediction",
        "path": "data/heart(2).csv",
        "features_numerical": ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak' ],
        "features_categorical": ['Sex', 'ChestPainType', 'FastingBS', 'RestingECG', 'ExerciseAngina', 'ST_Slope'],
        "target": "HeartDisease",
    },

    {
        "name": "Heart Failure Prediction 2",
        "path": "data/heart_failure_clinical_records_dataset.csv",
        "features_numerical": ['age', 'creatinine_phosphokinase', 'ejection_fraction', 'platelets', 'serum_creatinine', 'serum_sodium', 'time' ],
        "features_categorical": ['anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking',],
        "target": "DEATH_EVENT",
    },

    {
        "name": "Cardiovascular Disease Dataset",
        "path": "data/cardio_train.csv",
        "features_numerical": ['age', 'height', 'weight', 'ap_hi', 'ap_lo' ],
        "features_categorical": ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active'],
        "target": "cardio",
    },

    {
        "name": "Indicators of Heart Disease",
        "path": "data/heart_2020_cleaned.csv",
        "features_numerical": ['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime',  ],
        "features_categorical": ['Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking', 'Sex', 'AgeCategory', 'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'Asthma', 'KidneyDisease', 'SkinCancer', ],
        "target": "HeartDisease",
    },

]


In [4]:
dataset_path = datasets[0]["path"]
df = pd.read_csv(dataset_path)
df

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


## EDA

In [5]:
def describe_dataframe(df:pd.DataFrame)->pd.DataFrame:
    df_describe = df.describe()
    df_describe.loc['dtype'] = df_describe.dtypes
    df_describe.loc['null_count'] = df_describe.isnull().sum()
    print("df.shape:", df.shape)
    return df_describe

describe_dataframe(df)

df.shape: (303, 14)


Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0
dtype,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64,float64
null_count,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [24]:
# Define a custom configuration
from ydata_profiling.config import Settings
custom_config = Settings(
    plot={
        "correlation": {
            "cmap": "magma",  # Change the colormap for the correlation heatmap
            "fontsize": 10       # Adjust font size
        },
        "missing": {
            "cmap": "viridis"    # Customize the colormap for missing values heatmap
        }
    }
)

df = pd.read_csv(datasets[0]['path'])
report = ProfileReport(df, minimal=False, config=custom_config)
report.to_file("reports/example_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

## EDAs

In [26]:
for dataset in datasets:
    name = dataset["name"]
    target = dataset["target"]
    output_file_name = f"reports/profiling_{name}_{target}.html"
    report_title = f"{name} - {target}"
    df = pd.read_csv(dataset['path'])
    if len(df)>MAX_DATASET_SAMPLES: # If too large, reduce Dataset size
        df = df.sample(n=MAX_DATASET_SAMPLES, replace=False, random_state=42)
    report = ProfileReport(df, title=report_title, minimal=False, config=None)
    report.to_file(output_file_name)


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]