In [303]:
import pandas as pd
import numpy as np

from tqdm import tqdm

import matplotlib.pyplot as plt
import seaborn as sns
# Apply matplotlib's 'fivethirtyeight' style sheet so every figure drawn
# after this cell shares the same look.
plt.style.use('fivethirtyeight')

In [304]:
# Location of the raw CSV, resolved relative to the notebook's working directory.
DATA_PATH = 'heart_failure_clinical_records_dataset.csv'

# Load the raw records and preview the first rows as the cell output.
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


# EDA

In [309]:
def print_general_data_info(df: pd.DataFrame) -> None:
    """Print a three-line overview of ``df``.

    The lines report, in order: the number of rows, the total of
    ``df.memory_usage()`` and the number of rows flagged by
    ``df.duplicated()``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to summarise. It is only read, never modified.
    """
    # Collect the report first, then emit it in one go; the text written to
    # stdout is one line per entry.
    summary_lines = [
        f"Total samples: {df.shape[0]}",
        f"Total Memory Used: {df.memory_usage().sum()}",
        f"Total Number of duplicate rows: {df.duplicated().sum()}",
    ]
    print(*summary_lines, sep="\n")


def get_data_statistics(df: pd.DataFrame, target_col: str) -> pd.DataFrame:
    """Build, display and return a per-column summary table for ``df``.

    The result has one row per column of ``df`` (in the original column
    order) and these columns:

    * ``index`` / ``dtypes`` - column name and dtype;
    * ``isnull_sum`` / ``nunique`` / ``mode`` - computed for every column;
    * ``corr_pearson`` / ``corr_spearman`` - correlation with ``target_col``
      (NaN for columns that cannot be correlated, e.g. strings);
    * ``min``, ``25%``, ``mean``, ``median``, ``75%``, ``max``, ``std`` -
      computed for numeric columns only (NaN elsewhere);
    * ``LLP`` / ``ULP`` - lower / upper fence, ``Q1 - 1.5*IQR`` and
      ``Q3 + 1.5*IQR``;
    * ``has_outliers`` - ``True`` when the column's min or max lies outside
      the fences, ``False`` when both lie within them (fences inclusive),
      missing for non-numeric columns.

    Parameters
    ----------
    df : pd.DataFrame
        Data to summarise. It is not modified.
    target_col : str
        Name of the numeric (or boolean) column every other column is
        correlated against.

    Returns
    -------
    pd.DataFrame
        The summary table described above. It is also shown with
        ``display`` so existing notebook cells keep their output.

    Raises
    ------
    KeyError
        If ``target_col`` is not a numeric/boolean column of ``df``.
    """
    # Select the numeric columns once instead of once per statistic.
    numeric = df.select_dtypes(include='number')

    # Correlate only columns that can be cast to float. Calling ``corr`` on
    # the whole frame raises on string columns in pandas >= 2.0, where
    # ``numeric_only`` defaults to False.
    correlatable = df.select_dtypes(include=['number', 'bool'])
    if target_col not in correlatable.columns:
        raise KeyError(
            f"target_col {target_col!r} must be a numeric or boolean column of df"
        )

    OPERATIONS = {
        'isnull_sum': df.isnull().sum(),
        'nunique': df.nunique(),
        'corr_pearson': correlatable.corr(method='pearson')[target_col],
        'corr_spearman': correlatable.corr(method='spearman')[target_col],
        # ``mode`` can return several rows (ties); keep the first one.
        'mode': df.mode(axis=0, dropna=False).T[0],
        'min': numeric.min(),
        '25%': numeric.quantile(0.25),
        'mean': numeric.mean(),
        'median': numeric.median(),
        '75%': numeric.quantile(0.75),
        'max': numeric.max(),
        'std': numeric.std(),
    }

    # Name both the value column and the key column explicitly. Relying on
    # ``reset_index`` to produce 'index' breaks when ``df.columns`` has a
    # name, and renaming "the last column" by its old label can hit the
    # wrong column on a label collision.
    result = df.dtypes.rename('dtypes').rename_axis('index').reset_index()

    for operation_name in tqdm(OPERATIONS):
        stat = (
            OPERATIONS[operation_name]
            .rename(operation_name)
            .rename_axis('index')
            .reset_index()
        )
        # A left merge keeps one row per column of ``df`` and leaves NaN where
        # a statistic is undefined for that column.
        result = result.merge(stat, how='left', on='index')

    iqr = result['75%'] - result['25%']

    result['LLP'] = result['25%'] - 1.5*iqr
    result['ULP'] = result['75%'] + 1.5*iqr

    # Start by flagging every column that has quartiles, then clear the flag
    # where the whole observed range sits within the fences. A value exactly
    # on a fence is not an outlier, hence the inclusive comparisons.
    result.loc[result['25%'].notna(), 'has_outliers'] = True
    within_fences = (result['min'] >= result['LLP']) & (result['max'] <= result['ULP'])
    result.loc[within_fences, 'has_outliers'] = False

    display(result)

    return result
    
def print_eda(df: pd.DataFrame, target_col: str) -> None:
    """Print the EDA report for ``df``.

    The report has two sections separated by a 100-character dashed rule:
    the output of ``print_general_data_info`` followed by the output of
    ``get_data_statistics``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to report on; passed unchanged to both helpers.
    target_col : str
        Forwarded to ``get_data_statistics`` as its ``target_col``.
    """
    section_rule = '-' * 100

    # Section 1: general overview.
    print("General Data info\n")
    print_general_data_info(df=df)

    print(section_rule)

    # Section 2: per-column statistics.
    print("Data Statistics\n")
    get_data_statistics(df=df, target_col=target_col)

In [310]:
# Column the EDA report correlates every other column against.
TARGET_COL = 'DEATH_EVENT'

print_eda(df=df, target_col=TARGET_COL)

General Data info

Total samples: 299
Total Memory Used: 31224
Total Number of duplicate rows: 0
----------------------------------------------------------------------------------------------------
Data Statistics



  0%|          | 0/12 [00:00<?, ?it/s]

100%|██████████| 12/12 [00:00<00:00, 631.06it/s]


Unnamed: 0,index,dtypes,isnull_sum,nunique,corr_pearson,corr_spearman,mode,min,25%,mean,median,75%,max,std,LLP,ULP,has_outliers
0,age,float64,0,47,0.253729,0.218125,60.0,40.0,51.0,60.833893,60.0,70.0,95.0,11.894809,22.5,98.5,False
1,anaemia,int64,0,2,0.06627,0.06627,0.0,0.0,0.0,0.431438,0.0,1.0,1.0,0.496107,-1.5,2.5,False
2,creatinine_phosphokinase,int64,0,208,0.062728,0.023616,582.0,23.0,116.5,581.839465,250.0,582.0,7861.0,970.287881,-581.75,1280.25,True
3,diabetes,int64,0,2,-0.001943,-0.001943,0.0,0.0,0.0,0.41806,0.0,1.0,1.0,0.494067,-1.5,2.5,False
4,ejection_fraction,int64,0,17,-0.268603,-0.286869,35.0,14.0,30.0,38.083612,38.0,45.0,80.0,11.834841,7.5,67.5,True
5,high_blood_pressure,int64,0,2,0.079351,0.079351,0.0,0.0,0.0,0.351171,0.0,1.0,1.0,0.478136,-1.5,2.5,False
6,platelets,float64,0,176,-0.049139,-0.0462,263358.03,25100.0,212500.0,263358.029264,262000.0,303500.0,850000.0,97804.236869,76000.0,440000.0,True
7,serum_creatinine,float64,0,40,0.294278,0.37063,1.0,0.5,0.9,1.39388,1.1,1.4,9.4,1.03451,0.15,2.15,True
8,serum_sodium,int64,0,27,-0.195204,-0.209837,136.0,113.0,134.0,136.625418,137.0,140.0,148.0,4.412477,125.0,149.0,True
9,sex,int64,0,2,-0.004316,-0.004316,1.0,0.0,0.0,0.648829,1.0,1.0,1.0,0.478136,-1.5,2.5,False
