### Heart Attack Prediction EDA
--------------------------------
- columns:
	- Age - Age of patient
	- Diabetes - 0 = No, 1 = Yes
	- Anaemia - 0 = No, 1 = Yes
	- High_blood_pressure - 0 = No, 1 = Yes
	- Smoking - 0 = No, 1 = Yes
	- DEATH_EVENT - 0 = No, 1 = Yes

In [1]:
import numpy as np
import pandas as pd

import os
# List every CSV file found under the parent directory.
# os.walk() expects a directory path and does not expand wildcards, so the
# pattern '../*.csv' names a directory that does not exist and the walk
# silently yields nothing. Walk '..' and filter on the extension instead.
for dirname, _, filenames in os.walk('..'):
    for filename in filenames:
        if filename.endswith('.csv'):
            print(os.path.join(dirname, filename))

In [2]:
import matplotlib.pyplot as plt
import warnings
import seaborn as sns
from colorama import Fore, Back, Style
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from mlxtend.plotting import plot_confusion_matrix
from plotly.offline import plot, iplot, init_notebook_mode
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly.express as px
from statsmodels.formula.api import ols
import plotly.figure_factory as ff

# Silence all warnings for the rest of the notebook.
# NOTE(review): this is a blanket filter, so pandas/plotly warnings raised by
# later cells will not be shown.
warnings.filterwarnings(action='ignore')

# Initialise plotly's offline notebook mode (connected=True).
init_notebook_mode(connected=True)

from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

import xgboost as xg
import lightgbm as lite
from catboost import CatBoostClassifier




In [3]:
# Load the heart-failure clinical records; HEART_DATA is the frame that the
# plotting cells below read from.
HEART_DATA = pd.read_csv(
    filepath_or_buffer='../heart_failure_clinical_records_dataset.csv',
)

# Keep the preview as the last expression so the notebook renders it.
HEART_DATA.head(n=5)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [4]:
# Age distribution of the whole cohort.
# create_distplot takes a list of samples plus one label per sample, so the
# single age array is wrapped in a one-element list.
hist_data = [HEART_DATA["age"].to_numpy()]
group_labels = ['age']

fig = ff.create_distplot(hist_data=hist_data, group_labels=group_labels)
fig.update_layout(title={'text': 'Age Distribution plot'})
fig.show()

Note: Ages between 40 and 80 show a wide spread; at the extremes (ages <= 40 and >= 80) the spread is much lower.
The spread of age for each sex is shown in the box plot below.

In [5]:
# Box plot of age for each sex, with every individual patient drawn as a
# point next to the box.
fig = px.box(
    data_frame=HEART_DATA,
    x='sex',
    y='age',
    points='all',
)
fig.update_layout(title={'text': "Gender-wise Age Spread, Male = 1 Female = 0"})
fig.show()

In [6]:
# Split the cohort by sex (the dataset encodes male as 1 and female as 0).
male = HEART_DATA[HEART_DATA["sex"] == 1]
female = HEART_DATA[HEART_DATA["sex"] == 0]

# Split each sex by outcome. The mask is built from the same frame it
# filters: indexing `male`/`female` with a mask taken from the full
# HEART_DATA frame forces pandas to reindex the mask and emits a
# "Boolean Series key will be reindexed" UserWarning.
male_survive = male[male["DEATH_EVENT"] == 0]
male_ded = male[male["DEATH_EVENT"] == 1]
female_survive = female[female["DEATH_EVENT"] == 0]
female_ded = female[female["DEATH_EVENT"] == 1]

# One pie slice per (sex, outcome) group; `values` follows the order of `labels`.
labels = ['Male - Survived', 'Male - ded', 'Female - Survived', 'Female - ded']
values = [len(male_survive), len(male_ded), len(female_survive), len(female_ded)]

# hole=0.4 turns the pie into a donut chart.
fig = go.Figure(data = [go.Pie(labels = labels, values = values, hole = 0.4)])
fig.update_layout(title_text = "Analysis of Gender-based Survival Data")
fig.show()
          

In [9]:
# Age-based survival analysis: compare the age distribution of patients who
# survived (DEATH_EVENT == 0) with that of patients who did not (DEATH_EVENT == 1).
survival = HEART_DATA.loc[HEART_DATA["DEATH_EVENT"] == 0, "age"]
not_surv = HEART_DATA.loc[HEART_DATA["DEATH_EVENT"] == 1, "age"]

# One sample per outcome group, labelled in the same order.
hist_data = [survival, not_surv]
group_labels = ["Survived", "Not Survived"]

fig = ff.create_distplot(
    hist_data=hist_data,
    group_labels=group_labels,
    bin_size=0.5,
)
fig.update_layout(title={'text': "Analysis of Age-based Survival Status"})
fig.show()


<center>Survival is highest in the 40 - 70 age range, whereas patients who did not survive are spread across all ages.</center>

In [10]:
# Violin plot of age for each sex, coloured by DEATH_EVENT, with an inner box
# plot and every patient drawn as a point. All columns are passed as hover data.
fig = px.violin(
    data_frame=HEART_DATA,
    x='sex',
    y='age',
    color='DEATH_EVENT',
    box=True,
    points="all",
    hover_data=HEART_DATA.columns,
)
fig.update_layout(title={'text': "Analysis in Age and Gender on Survival Status"})
fig.show()

In [11]:
# Effect of smoking on survival: violin plot of age for each smoking status,
# coloured by DEATH_EVENT, with an inner box plot and every patient drawn as a
# point. All columns are passed as hover data.
fig = px.violin(
    data_frame=HEART_DATA,
    x='smoking',
    y='age',
    color='DEATH_EVENT',
    box=True,
    points="all",
    hover_data=HEART_DATA.columns,
)
fig.update_layout(title={'text': "Analysis in Age and Smoking on Survival Status"})
fig.show()

In [12]:
# Violin plot of age for each diabetes status, coloured by DEATH_EVENT, with
# an inner box plot and every patient drawn as a point. All columns are passed
# as hover data.
fig = px.violin(
    data_frame=HEART_DATA,
    x='diabetes',
    y='age',
    color='DEATH_EVENT',
    box=True,
    points="all",
    hover_data=HEART_DATA.columns,
)
fig.update_layout(title={'text': "Analysis in Age and Diabetes on Survival Status"})
fig.show()

In [13]:
# Histogram of creatinine_phosphokinase coloured by DEATH_EVENT, with a violin
# plot as the marginal. All columns are passed as hover data.
fig = px.histogram(
    data_frame=HEART_DATA,
    x="creatinine_phosphokinase",
    color="DEATH_EVENT",
    marginal="violin",
    hover_data=HEART_DATA.columns,
)
fig.show()

In [14]:
# Histogram of ejection_fraction coloured by DEATH_EVENT, with a violin plot
# as the marginal. All columns are passed as hover data.
fig = px.histogram(
    data_frame=HEART_DATA,
    x="ejection_fraction",
    color="DEATH_EVENT",
    marginal="violin",
    hover_data=HEART_DATA.columns,
)
fig.show()

In [15]:
# Histogram of platelets coloured by DEATH_EVENT, with a violin plot as the
# marginal. All columns are passed as hover data.
fig = px.histogram(
    data_frame=HEART_DATA,
    x="platelets",
    color="DEATH_EVENT",
    marginal="violin",
    hover_data=HEART_DATA.columns,
)
fig.show()

In [16]:
# Histogram of serum_creatinine coloured by DEATH_EVENT, with a violin plot as
# the marginal. All columns are passed as hover data.
fig = px.histogram(
    data_frame=HEART_DATA,
    x="serum_creatinine",
    color="DEATH_EVENT",
    marginal="violin",
    hover_data=HEART_DATA.columns,
)
fig.show()

In [17]:
# Histogram of serum_sodium coloured by DEATH_EVENT, with a violin plot as the
# marginal. All columns are passed as hover data.
fig = px.histogram(
    data_frame=HEART_DATA,
    x="serum_sodium",
    color="DEATH_EVENT",
    marginal="violin",
    hover_data=HEART_DATA.columns,
)
fig.show()

In [18]:
# Compare the serum_sodium distribution of patients who survived
# (DEATH_EVENT == 0) with that of patients who did not (DEATH_EVENT == 1).
surv = HEART_DATA.loc[HEART_DATA['DEATH_EVENT'] == 0, 'serum_sodium']
not_surv = HEART_DATA.loc[HEART_DATA['DEATH_EVENT'] == 1, 'serum_sodium']

# One sample per outcome group, labelled in the same order.
hist_data = [surv, not_surv]
group_labels = ['Survived', 'Not Survived']

fig = ff.create_distplot(
    hist_data=hist_data,
    group_labels=group_labels,
    bin_size=0.5,
)
fig.update_layout(title={'text': "Analysis in Serum Sodium on Survival Status"})
fig.show()

In [21]:
# Compare the serum_creatinine distribution of patients who survived
# (DEATH_EVENT == 0) with that of patients who did not (DEATH_EVENT == 1).
surv = HEART_DATA.loc[HEART_DATA['DEATH_EVENT'] == 0, 'serum_creatinine']
not_surv = HEART_DATA.loc[HEART_DATA['DEATH_EVENT'] == 1, 'serum_creatinine']

# One sample per outcome group, labelled in the same order.
hist_data = [surv, not_surv]
group_labels = ['Survived', 'Not Survived']

fig = ff.create_distplot(
    hist_data=hist_data,
    group_labels=group_labels,
    bin_size=0.5,
)
fig.update_layout(title={'text': "Analysis in Serum Creatinine on Survival Status"})
fig.show()