In [1]:
from DataAnalysis import DataAnalysis
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.core.interchange.dataframe_protocol import DataFrame
from scipy import stats

## Load data and preprocess

In [2]:
df = pd.read_csv('heart_attack_prediction_dataset.csv')
df.head()

Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,...,Sedentary Hours Per Day,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk
0,BMW7812,67,Male,208,158/88,72,0,0,1,0,...,6.615001,261404,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0
1,CZE1114,21,Male,389,165/93,98,1,1,1,1,...,4.963459,285768,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0
2,BNI9906,21,Female,324,174/99,72,1,0,0,0,...,9.463426,235282,28.176571,587,4,4,France,Europe,Northern Hemisphere,0
3,JLN3497,84,Male,383,163/100,73,1,1,1,0,...,7.648981,125640,36.464704,378,3,4,Canada,North America,Northern Hemisphere,0
4,GFO8847,66,Male,318,91/88,93,1,1,1,1,...,1.514821,160555,21.809144,231,1,5,Thailand,Asia,Northern Hemisphere,0


In [3]:
df['Blood Pressure'] = df['Blood Pressure'].str.split('/').apply(tuple)
df['Blood Mean'] = df['Blood Pressure'].apply(lambda x: (int(x[0]) + int(x[1])) / 2)
df.head()

Unnamed: 0,Patient ID,Age,Sex,Cholesterol,Blood Pressure,Heart Rate,Diabetes,Family History,Smoking,Obesity,...,Income,BMI,Triglycerides,Physical Activity Days Per Week,Sleep Hours Per Day,Country,Continent,Hemisphere,Heart Attack Risk,Blood Mean
0,BMW7812,67,Male,208,"(158, 88)",72,0,0,1,0,...,261404,31.251233,286,0,6,Argentina,South America,Southern Hemisphere,0,123.0
1,CZE1114,21,Male,389,"(165, 93)",98,1,1,1,1,...,285768,27.194973,235,1,7,Canada,North America,Northern Hemisphere,0,129.0
2,BNI9906,21,Female,324,"(174, 99)",72,1,0,0,0,...,235282,28.176571,587,4,4,France,Europe,Northern Hemisphere,0,136.5
3,JLN3497,84,Male,383,"(163, 100)",73,1,1,1,0,...,125640,36.464704,378,3,4,Canada,North America,Northern Hemisphere,0,131.5
4,GFO8847,66,Male,318,"(91, 88)",93,1,1,1,1,...,160555,21.809144,231,1,5,Thailand,Asia,Northern Hemisphere,0,89.5


In [4]:
print(df.isna().sum())

Patient ID                         0
Age                                0
Sex                                0
Cholesterol                        0
Blood Pressure                     0
Heart Rate                         0
Diabetes                           0
Family History                     0
Smoking                            0
Obesity                            0
Alcohol Consumption                0
Exercise Hours Per Week            0
Diet                               0
Previous Heart Problems            0
Medication Use                     0
Stress Level                       0
Sedentary Hours Per Day            0
Income                             0
BMI                                0
Triglycerides                      0
Physical Activity Days Per Week    0
Sleep Hours Per Day                0
Country                            0
Continent                          0
Hemisphere                         0
Heart Attack Risk                  0
Blood Mean                         0
d

In [5]:
print(df.nunique())

Patient ID                         8763
Age                                  73
Sex                                   2
Cholesterol                         281
Blood Pressure                     3915
Heart Rate                           71
Diabetes                              2
Family History                        2
Smoking                               2
Obesity                               2
Alcohol Consumption                   2
Exercise Hours Per Week            8763
Diet                                  3
Previous Heart Problems               2
Medication Use                        2
Stress Level                         10
Sedentary Hours Per Day            8763
Income                             8615
BMI                                8763
Triglycerides                       771
Physical Activity Days Per Week       8
Sleep Hours Per Day                   7
Country                              20
Continent                             6
Hemisphere                            2


In [6]:
category_data = list()
interval_data = list()
for col in df.columns:
    if df[col].nunique() < 30:
        category_data.append(col)
    else:
        interval_data.append(col)
print(f'category: {category_data}\ninterval: {interval_data}')

category: ['Sex', 'Diabetes', 'Family History', 'Smoking', 'Obesity', 'Alcohol Consumption', 'Diet', 'Previous Heart Problems', 'Medication Use', 'Stress Level', 'Physical Activity Days Per Week', 'Sleep Hours Per Day', 'Country', 'Continent', 'Hemisphere', 'Heart Attack Risk']
interval: ['Patient ID', 'Age', 'Cholesterol', 'Blood Pressure', 'Heart Rate', 'Exercise Hours Per Week', 'Sedentary Hours Per Day', 'Income', 'BMI', 'Triglycerides', 'Blood Mean']


## Some anova possibility

01 Diet (Healthy, Average, Unhealthy) vs. Cholesterol

1. H0: There is no significant difference in the mean cholesterol level across different diet categories.
2. Ha: There is a significant difference in the mean cholesterol level across different diet categories.

Res: fail to reject the h0 

In [7]:
res = DataAnalysis.check_normality(df['Cholesterol'])
res

{'test': 'Anderson-Darling',
 'statistic': np.float64(91.34459292230349),
 'p_value': 0.0,
 'is_normal': False}

In [8]:
res = DataAnalysis.anova_kruskal_t_mann_test(df['Diet'].unique(), df['Cholesterol'], False)
res

{'test': 'Kruskal-Wallis',
 'statistic': np.float64(8762.000000000002),
 'p_value': np.float64(0.49799089453512324),
 'is_significant': False}

02 Diet (Healthy, Average, Unhealthy) vs. Blood Pressure

1. H0: There is no significant difference in the mean blood pressure across different diet categories.
2. Ha: There is a significant difference in the mean blood pressure across different diet categories.

Res: fail to reject the h0

In [9]:
res = DataAnalysis.check_normality(df['Blood Mean'])
res

{'test': 'Anderson-Darling',
 'statistic': np.float64(21.96986686017226),
 'p_value': 0.0,
 'is_normal': False}

In [10]:
res = DataAnalysis.anova_kruskal_t_mann_test(df['Diet'].unique(), df['Blood Mean'])
res

{'test': 'Kruskal-Wallis',
 'statistic': np.float64(8761.999999999998),
 'p_value': np.float64(0.4979908945351342),
 'is_significant': False}

03 Continent (Africa, Asia, Europe, North America, South America) vs. Heart Rate

1. H0: There is no significant difference in the mean heart rate across continents.
2. Ha: There is a significant difference in the mean heart rate across continents.

Res: reject the h0

In [11]:
res = DataAnalysis.check_normality(df['Heart Rate'])
res

{'test': 'Anderson-Darling',
 'statistic': np.float64(102.29183776167338),
 'p_value': 0.0,
 'is_normal': False}

In [12]:
res = DataAnalysis.anova_kruskal_t_mann_test(df['Continent'].unique(), df['BMI'])
res

{'test': 'Kruskal-Wallis',
 'statistic': np.float64(8762.0),
 'p_value': np.float64(0.49799089453512874),
 'is_significant': False}