## Key Indicators of Heart Disease 2022

In [1]:
#Imported libraries for the EDA process
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats



In [2]:
# Read the 2022 heart-disease survey extract into a DataFrame and echo it.
# For a frame this large, print() shows a truncated head/tail view.
heart_df = pd.read_csv(filepath_or_buffer='heart_2022.csv')
print(heart_df)

                 State     Sex GeneralHealth  PhysicalHealthDays  \
0              Alabama  Female     Very good                 4.0   
1              Alabama    Male     Very good                 0.0   
2              Alabama    Male     Very good                 0.0   
3              Alabama  Female          Fair                 5.0   
4              Alabama  Female          Good                 3.0   
...                ...     ...           ...                 ...   
246017  Virgin Islands    Male     Very good                 0.0   
246018  Virgin Islands  Female          Fair                 0.0   
246019  Virgin Islands    Male          Good                 0.0   
246020  Virgin Islands  Female     Excellent                 2.0   
246021  Virgin Islands    Male     Very good                 0.0   

        MentalHealthDays                                    LastCheckupTime  \
0                    0.0  Within past year (anytime less than 12 months ...   
1                    0.0 

In [3]:
# Summarise the raw frame (column names, non-null counts, dtypes) before cleaning.
heart_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246022 entries, 0 to 246021
Data columns (total 40 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   State                      246022 non-null  object 
 1   Sex                        246022 non-null  object 
 2   GeneralHealth              246022 non-null  object 
 3   PhysicalHealthDays         246022 non-null  float64
 4   MentalHealthDays           246022 non-null  float64
 5   LastCheckupTime            246022 non-null  object 
 6   PhysicalActivities         246022 non-null  object 
 7   SleepHours                 246022 non-null  float64
 8   RemovedTeeth               246022 non-null  object 
 9   HadHeartAttack             246022 non-null  object 
 10  HadAngina                  246022 non-null  object 
 11  HadStroke                  246022 non-null  object 
 12  HadAsthma                  246022 non-null  object 
 13  HadSkinCancer              24

In [4]:
#Renames the "Had<Condition>" columns to just the condition itself.
#NOTE(review): 'HadAsthma' is not part of this mapping, so it keeps its
#original name in the cleaned frame - confirm whether that is intentional.
heart_df = heart_df.rename(columns={'HadHeartAttack' : 'HeartAttack',
                                    'HadAngina' : 'Angina',
                                    'HadStroke' : 'Stroke',
                                    'HadSkinCancer' : 'SkinCancer',
                                    'HadCOPD' : 'COPD',
                                    'HadDepressiveDisorder' : 'DepressiveDisorder',
                                    'HadKidneyDisease' : 'KidneyDisease',
                                    'HadArthritis' : 'Arthritis',
                                    'HadDiabetes' : 'Diabetes'})


##ASSIGNING NUMERIC VALUES TO CATEGORICAL DATA##

#All encodings are applied in one non-inplace chain:
#  1. every exact 'Yes' / 'No' observation, in any column, becomes a number.
#     Mind the direction of this encoding: 'Yes' -> 0 and 'No' -> 1.
#  2. the remaining categorical columns get the per-column codes listed below
#     (values that are not listed are left untouched).
#  3. infer_objects() explicitly converts the now all-integer object columns
#     to an integer dtype, instead of relying on replace() to downcast them
#     silently (that implicit downcasting is deprecated in recent pandas).
heart_df = (
    heart_df
    .replace({'Yes' : 0,
              'No' : 1})
    .replace({
        #General health on a 1-5 scale (1 = Poor ... 5 = Excellent)
        'GeneralHealth' : {'Poor' : 1,
                           'Fair' : 2,
                           'Good' : 3,
                           'Very good' : 4,
                           'Excellent' : 5},

        #Time since the last checkup on a 1-4 scale (1 = most recent)
        'LastCheckupTime' : {"Within past year (anytime less than 12 months ago)" : 1,
                             "Within past 2 years (1 year but less than 2 years ago)" : 2,
                             "Within past 5 years (2 years but less than 5 years ago)" : 3,
                             "5 or more years ago" : 4},

        #Number of removed teeth on a 1-4 scale
        'RemovedTeeth' : {'None of them' : 1,
                          '1 to 5' : 2,
                          '6 or more, but not all' : 3,
                          'All' : 4},

        #Diabetes answers other than plain 'Yes' (0) / 'No' (1), handled above
        'Diabetes' : {'Yes, but only during pregnancy (female)' : 2,
                      'No, pre-diabetes or borderline diabetes' : 3},

        #Smoker status on a 1-4 scale
        'SmokerStatus' : {'Never smoked' : 1,
                          'Former smoker' : 2,
                          'Current smoker - now smokes some days' : 3,
                          'Current smoker - now smokes every day' : 4},

        #E-cigarette usage on a 1-4 scale
        'ECigaretteUsage' : {'Never used e-cigarettes in my entire life' : 1,
                             'Not at all (right now)' : 2,
                             'Use them some days' : 3,
                             'Use them every day' : 4},

        #Race / ethnicity category codes (nominal labels, not an ordering)
        'RaceEthnicityCategory' : {'White only, Non-Hispanic' : 1,
                                   'Black only, Non-Hispanic' : 2,
                                   'Multiracial, Non-Hispanic' : 3,
                                   'Hispanic' : 4,
                                   'Other race only, Non-Hispanic' : 5},

        #Age range of the respondent, 1 (18 to 24) through 13 (80 or older)
        'AgeCategory' : {'Age 18 to 24' : 1,
                         'Age 25 to 29' : 2,
                         'Age 30 to 34' : 3,
                         'Age 35 to 39' : 4,
                         'Age 40 to 44' : 5,
                         'Age 45 to 49' : 6,
                         'Age 50 to 54' : 7,
                         'Age 55 to 59' : 8,
                         'Age 60 to 64' : 9,
                         'Age 65 to 69' : 10,
                         'Age 70 to 74' : 11,
                         'Age 75 to 79' : 12,
                         'Age 80 or older' : 13},

        #Tetanus / Tdap vaccination status on a 1-4 scale
        'TetanusLast10Tdap' : {'No, did not receive any tetanus shot in the past 10 years' : 1,
                               'Yes, received tetanus shot but not sure what type' : 2,
                               'Yes, received tetanus shot, but not Tdap' : 3,
                               'Yes, received Tdap' : 4},

        #CovidPos answer other than plain 'Yes' (0) / 'No' (1), handled above
        'CovidPos' : {'Tested positive using home test without a health professional' : 2},
    })
    .infer_objects()
)

##APPROPRIATE VARIABLE TYPES##

#SleepHours is stored as float in the raw file; cast it to an integer column.
#PhysicalHealthDays and MentalHealthDays are left as float64.
heart_df['SleepHours'] = heart_df['SleepHours'].astype('int')


##DROP VALUES##

#Removes rows that report an average of 24 hours of sleep.
#The original row labels are kept (the index is not reset).
heart_df = heart_df[heart_df['SleepHours'] != 24]


#info() prints its summary itself and returns None, so it is not wrapped in print()
heart_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 246009 entries, 0 to 246021
Data columns (total 40 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   State                      246009 non-null  object 
 1   Sex                        246009 non-null  object 
 2   GeneralHealth              246009 non-null  int64  
 3   PhysicalHealthDays         246009 non-null  float64
 4   MentalHealthDays           246009 non-null  float64
 5   LastCheckupTime            246009 non-null  int64  
 6   PhysicalActivities         246009 non-null  int64  
 7   SleepHours                 246009 non-null  int64  
 8   RemovedTeeth               246009 non-null  int64  
 9   HeartAttack                246009 non-null  int64  
 10  Angina                     246009 non-null  int64  
 11  Stroke                     246009 non-null  int64  
 12  HadAsthma                  246009 non-null  int64  
 13  SkinCancer                 246009 

In [5]:
# Show the first ten and the last ten rows of the cleaned frame, one after the other.
print(heart_df.head(10), heart_df.tail(10), sep='\n')

     State     Sex  GeneralHealth  PhysicalHealthDays  MentalHealthDays  \
0  Alabama  Female              4                 4.0               0.0   
1  Alabama    Male              4                 0.0               0.0   
2  Alabama    Male              4                 0.0               0.0   
3  Alabama  Female              2                 5.0               0.0   
4  Alabama  Female              3                 3.0              15.0   
5  Alabama    Male              3                 0.0               0.0   
6  Alabama  Female              3                 3.0               0.0   
7  Alabama    Male              2                 5.0               0.0   
8  Alabama    Male              3                 2.0               0.0   
9  Alabama  Female              4                 0.0               0.0   

   LastCheckupTime  PhysicalActivities  SleepHours  RemovedTeeth  HeartAttack  \
0                1                   0           9             1            1   
1           

In [6]:
# Write the cleaned frame to a new CSV file; index=False leaves the row index out of the file.
heart_df.to_csv(path_or_buf='heart_2022_cleaned.csv', index=False)