# Exploration and Extraction
- Simplifying the dataset
- Identifying the numerical and categorical data
- Cleaning the data
- Preparing the data for visualisation and machine learning models

In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt 

In [2]:
# Read the raw survey export into a DataFrame and preview its first rows.
heart = pd.read_csv(filepath_or_buffer='Heart unclean.csv')
heart.head(n=5)

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,HeightInMeters,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos
0,Alabama,Female,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,,No,...,,,,No,No,Yes,No,"Yes, received tetanus shot but not sure what type",No,No
1,Alabama,Female,Excellent,0.0,0.0,,No,6.0,,No,...,1.6,68.04,26.57,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,No
2,Alabama,Female,Very good,2.0,3.0,Within past year (anytime less than 12 months ...,Yes,5.0,,No,...,1.57,63.5,25.61,No,No,No,No,,No,Yes
3,Alabama,Female,Excellent,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,7.0,,No,...,1.65,63.5,23.3,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No
4,Alabama,Female,Fair,2.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,,No,...,1.57,53.98,21.77,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,No


In [3]:
# Report the container type and the (rows, columns) shape of the loaded data.
for label, value in (("Data type : ", type(heart)), ("Data dims : ", heart.shape)):
    print(label, value)

Data type :  <class 'pandas.core.frame.DataFrame'>
Data dims :  (445132, 40)


In [4]:
# Information about the variables: prints each column's non-null count and
# dtype, which shows which columns have missing values and which are numeric.
heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445132 entries, 0 to 445131
Data columns (total 40 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   State                      445132 non-null  object 
 1   Sex                        445132 non-null  object 
 2   GeneralHealth              443934 non-null  object 
 3   PhysicalHealthDays         434205 non-null  float64
 4   MentalHealthDays           436065 non-null  float64
 5   LastCheckupTime            436824 non-null  object 
 6   PhysicalActivities         444039 non-null  object 
 7   SleepHours                 439679 non-null  float64
 8   RemovedTeeth               433772 non-null  object 
 9   HadHeartAttack             442067 non-null  object 
 10  HadAngina                  440727 non-null  object 
 11  HadStroke                  443575 non-null  object 
 12  HadAsthma                  443359 non-null  object 
 13  HadSkinCancer              44

In [5]:
# Count the missing entries in every column of the raw data.
heart.isna().sum()

State                            0
Sex                              0
GeneralHealth                 1198
PhysicalHealthDays           10927
MentalHealthDays              9067
LastCheckupTime               8308
PhysicalActivities            1093
SleepHours                    5453
RemovedTeeth                 11360
HadHeartAttack                3065
HadAngina                     4405
HadStroke                     1557
HadAsthma                     1773
HadSkinCancer                 3143
HadCOPD                       2219
HadDepressiveDisorder         2812
HadKidneyDisease              1926
HadArthritis                  2633
HadDiabetes                   1087
DeafOrHardOfHearing          20647
BlindOrVisionDifficulty      21564
DifficultyConcentrating      24240
DifficultyWalking            24012
DifficultyDressingBathing    23915
DifficultyErrands            25656
SmokerStatus                 35462
ECigaretteUsage              35660
ChestScan                    56046
RaceEthnicityCategor

# Exploratory Data Analysis 

## Simplifying the variables

In [6]:
# Collapse the healthcare-contact columns into a single Yes/No flag.
# A respondent is 'Yes' when a last check-up time is recorded (whatever its
# value) or when at least one of the listed indicators equals 'Yes'.
# A missing indicator never equals 'Yes', so it does not count as contact.
yes_no_indicators = ['ChestScan', 'HIVTesting', 'FluVaxLast12', 'PneumoVaxEver', 'CovidPos']
has_checkup_record = heart['LastCheckupTime'].notna()
any_indicator_yes = heart[yes_no_indicators].eq('Yes').any(axis=1)
heart['AccessedMedicalServices'] = np.where(has_checkup_record | any_indicator_yes, 'Yes', 'No')
heart.head()

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,WeightInKilograms,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos,AccessedMedicalServices
0,Alabama,Female,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,,No,...,,,No,No,Yes,No,"Yes, received tetanus shot but not sure what type",No,No,Yes
1,Alabama,Female,Excellent,0.0,0.0,,No,6.0,,No,...,68.04,26.57,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,No,No
2,Alabama,Female,Very good,2.0,3.0,Within past year (anytime less than 12 months ...,Yes,5.0,,No,...,63.5,25.61,No,No,No,No,,No,Yes,Yes
3,Alabama,Female,Excellent,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,7.0,,No,...,63.5,23.3,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No,Yes
4,Alabama,Female,Fair,2.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,,No,...,53.98,21.77,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,No,Yes


In [7]:
# Derive the label: 'Yes' when a heart attack or angina is reported, else 'No'.
# NOTE(review): a missing answer never equals 'Yes', so a respondent with no
# 'Yes' answer and one or both answers missing is labelled 'No' rather than
# left unknown — confirm this is the intended treatment.
reported_heart_attack = heart['HadHeartAttack'].eq('Yes')
reported_angina = heart['HadAngina'].eq('Yes')
heart['HaveCardiovascularDisease'] = np.where(reported_heart_attack | reported_angina, 'Yes', 'No')
heart.head()

Unnamed: 0,State,Sex,GeneralHealth,PhysicalHealthDays,MentalHealthDays,LastCheckupTime,PhysicalActivities,SleepHours,RemovedTeeth,HadHeartAttack,...,BMI,AlcoholDrinkers,HIVTesting,FluVaxLast12,PneumoVaxEver,TetanusLast10Tdap,HighRiskLastYear,CovidPos,AccessedMedicalServices,HaveCardiovascularDisease
0,Alabama,Female,Very good,0.0,0.0,Within past year (anytime less than 12 months ...,No,8.0,,No,...,,No,No,Yes,No,"Yes, received tetanus shot but not sure what type",No,No,Yes,No
1,Alabama,Female,Excellent,0.0,0.0,,No,6.0,,No,...,26.57,No,No,No,No,"No, did not receive any tetanus shot in the pa...",No,No,No,No
2,Alabama,Female,Very good,2.0,3.0,Within past year (anytime less than 12 months ...,Yes,5.0,,No,...,25.61,No,No,No,No,,No,Yes,Yes,No
3,Alabama,Female,Excellent,0.0,0.0,Within past year (anytime less than 12 months ...,Yes,7.0,,No,...,23.3,No,No,Yes,Yes,"No, did not receive any tetanus shot in the pa...",No,No,Yes,No
4,Alabama,Female,Fair,2.0,0.0,Within past year (anytime less than 12 months ...,Yes,9.0,,No,...,21.77,Yes,No,No,Yes,"No, did not receive any tetanus shot in the pa...",No,No,Yes,No


In [8]:
# Alternative BMI: 1.3 * weight / height ** 2.5.
heart['newBMI'] = heart['WeightInKilograms'].mul(1.3).div(heart['HeightInMeters'].pow(2.5))

In [9]:
# Conventional BMI: weight divided by the square of height.
heart['originalBMI'] = heart['WeightInKilograms'].div(
    heart['HeightInMeters'].mul(heart['HeightInMeters'])
)

In [10]:
# Drop columns that are not used further, including the inputs that were
# already folded into the derived BMI and HaveCardiovascularDisease columns.
unused_columns = [
    'RemovedTeeth', 'TetanusLast10Tdap', 'WeightInKilograms', 'HeightInMeters',
    'BMI', 'HighRiskLastYear', 'HadHeartAttack', 'HadAngina',
]
heart = heart.drop(columns=unused_columns)

In [11]:
# Drop the columns that were merged into AccessedMedicalServices.
merged_columns = [
    'LastCheckupTime', 'ChestScan', 'HIVTesting',
    'FluVaxLast12', 'PneumoVaxEver', 'CovidPos',
]
heart = heart.drop(columns=merged_columns)

In [12]:
# Re-count the missing entries per column after the column drops.
heart.isna().sum()

State                            0
Sex                              0
GeneralHealth                 1198
PhysicalHealthDays           10927
MentalHealthDays              9067
PhysicalActivities            1093
SleepHours                    5453
HadStroke                     1557
HadAsthma                     1773
HadSkinCancer                 3143
HadCOPD                       2219
HadDepressiveDisorder         2812
HadKidneyDisease              1926
HadArthritis                  2633
HadDiabetes                   1087
DeafOrHardOfHearing          20647
BlindOrVisionDifficulty      21564
DifficultyConcentrating      24240
DifficultyWalking            24012
DifficultyDressingBathing    23915
DifficultyErrands            25656
SmokerStatus                 35462
ECigaretteUsage              35660
RaceEthnicityCategory        14057
AgeCategory                   9079
AlcoholDrinkers              46574
AccessedMedicalServices          0
HaveCardiovascularDisease        0
newBMI              

# Exploratory Analysis on Numerical Data

In [13]:
# Gather the numeric variables, together with the HaveCardiovascularDisease
# label, into their own frame.
numeric_columns = [
    "originalBMI", "newBMI", "PhysicalHealthDays",
    "MentalHealthDays", "SleepHours", "HaveCardiovascularDisease",
]
numeric_data = heart[numeric_columns].copy()

# Summary statistics rounded to two decimals; describe() reports only the
# numeric columns here, so the Yes/No label does not appear in the table.
numeric_data.describe().round(2)

Unnamed: 0,originalBMI,newBMI,PhysicalHealthDays,MentalHealthDays,SleepHours
count,398564.0,398564.0,434205.0,436065.0,439679.0
mean,28.53,28.46,4.35,4.38,7.02
std,6.61,6.78,8.69,8.39,1.5
min,6.77,5.99,0.0,0.0,1.0
25%,24.11,23.97,0.0,0.0,6.0
50%,27.35,27.32,0.0,0.0,7.0
75%,31.8,31.54,3.0,5.0,8.0
max,235.53,320.97,30.0,30.0,24.0


## Cleaning the data for newBMI
The lowest and highest recorded BMI values are 13.6 and 204 respectively; values outside this range are treated as outliers and removed.
https://www.bbc.com/news/uk-england-leeds-44488822

In [14]:
# Accepted BMI range (inclusive); the same limits are used for both BMI variants.
min_newBMI, max_newBMI = 13.6, 204
min_originalBMI, max_originalBMI = 13.6, 204


def _bmi_within_limits(frame):
    """Return a boolean mask of the rows of ``frame`` with acceptable BMI values.

    A row is kept only when both ``newBMI`` and ``originalBMI`` lie inside
    their inclusive [min, max] limits. A missing BMI is never inside the
    limits, so rows without a BMI are excluded as well.
    """
    return (
        frame['newBMI'].between(min_newBMI, max_newBMI)
        & frame['originalBMI'].between(min_originalBMI, max_originalBMI)
    )


# Count what is about to be dropped, split by reason. The filter below removes
# rows with a missing BMI as well as out-of-range rows, so both are reported
# rather than only the outliers.
keep_numeric = _bmi_within_limits(numeric_data)
out_of_range = (
    (numeric_data['newBMI'] < min_newBMI) | (numeric_data['newBMI'] > max_newBMI)
    | (numeric_data['originalBMI'] < min_originalBMI) | (numeric_data['originalBMI'] > max_originalBMI)
)
outliers_count = int(out_of_range.sum())
# Every rejected row that is not out of range was rejected for a missing BMI.
missing_bmi_count = int((~keep_numeric).sum()) - outliers_count

rows_removed = outliers_count
print(f"Number of outlier rows removed: {rows_removed}")
print(f"Number of rows removed because BMI is missing: {missing_bmi_count}")

# Apply the same rule to both frames so they stay row-for-row consistent.
numeric_data = numeric_data[keep_numeric]
heart = heart[_bmi_within_limits(heart)]

rows_after_removal = numeric_data.shape[0]
print(f"Number of rows after removing outliers: {rows_after_removal}")

Number of outlier rows removed: 341
Number of rows after removing outliers: 398223


In [15]:
# Re-check the summary statistics after the BMI filter; rows with an
# out-of-range or missing BMI are no longer present in numeric_data.
numeric_data.describe()

Unnamed: 0,originalBMI,newBMI,PhysicalHealthDays,MentalHealthDays,SleepHours
count,398223.0,398223.0,389284.0,390863.0,394099.0
mean,28.544699,28.47516,4.375168,4.425001,7.024123
std,6.58529,6.743217,8.700598,8.398869,1.491042
min,13.653979,13.613761,0.0,0.0,1.0
25%,24.107143,23.96543,0.0,0.0,6.0
50%,27.37565,27.315409,0.0,0.0,7.0
75%,31.807159,31.540693,4.0,5.0,8.0
max,173.798748,200.487885,30.0,30.0,24.0


# Exploratory analysis on Categorical data

In [16]:
# Gather the categorical variables of interest and summarise them
# (count, number of distinct values, most frequent value and its frequency).
categorical_columns = [
    "State", "HaveCardiovascularDisease", "GeneralHealth", "HadDepressiveDisorder",
    "RaceEthnicityCategory", "AgeCategory", "AlcoholDrinkers",
]
categorical_data = heart[categorical_columns].copy()
categorical_data.describe()

Unnamed: 0,State,HaveCardiovascularDisease,GeneralHealth,HadDepressiveDisorder,RaceEthnicityCategory,AgeCategory,AlcoholDrinkers
count,398223,398223,397332,396074,387936,393576,374011
unique,54,2,5,2,5,13,2
top,Washington,No,Very good,No,"White only, Non-Hispanic",Age 65 to 69,Yes
freq,22978,361726,134332,312481,291164,42769,200743


In [17]:
# Persist the cleaned frame. The row index is written too, as the first
# (unnamed) column of the CSV.
heart.to_csv('cleaned-heart.csv', index=True)