In [1]:
import pandas as pd

In [2]:
# Load the raw data from heart_2020.csv (path is relative to the working directory)
# and preview the first rows.
df_heart_2020 = pd.read_csv('heart_2020.csv')
df_heart_2020.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [3]:
# Check for missing data and data types: info() lists the non-null count and dtype of every column.
df_heart_2020.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 319795 entries, 0 to 319794
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   HeartDisease      319795 non-null  object 
 1   BMI               319795 non-null  float64
 2   Smoking           319795 non-null  object 
 3   AlcoholDrinking   319795 non-null  object 
 4   Stroke            319795 non-null  object 
 5   PhysicalHealth    319795 non-null  float64
 6   MentalHealth      319795 non-null  float64
 7   DiffWalking       319795 non-null  object 
 8   Sex               319795 non-null  object 
 9   AgeCategory       319795 non-null  object 
 10  Race              319795 non-null  object 
 11  Diabetic          319795 non-null  object 
 12  PhysicalActivity  319795 non-null  object 
 13  GenHealth         319795 non-null  object 
 14  SleepTime         319795 non-null  float64
 15  Asthma            319795 non-null  object 
 16  KidneyDisease     31

In [4]:
# Recode the two-level text columns as 0/1 integers.
# Every column is listed once next to its code table instead of repeating the
# same statement per column.
yes_no_codes = {'No': 0, 'Yes': 1}
yes_no_columns = ['HeartDisease', 'Smoking', 'AlcoholDrinking', 'Stroke', 'DiffWalking',
                  'PhysicalActivity', 'Asthma', 'KidneyDisease', 'SkinCancer']

column_codes = {column: yes_no_codes for column in yes_no_columns}
column_codes['Sex'] = {'Female': 0, 'Male': 1}

# Diabetic has four levels; split them into yes and no:
# borderline counts as 0, diabetes during pregnancy counts as 1.
# might look into a different approach down the line
column_codes['Diabetic'] = {'No': 0, 'No, borderline diabetes': 0,
                            'Yes (during pregnancy)': 1, 'Yes': 1}

for column, codes in column_codes.items():
    # Cast explicitly rather than relying on replace() to downcast object -> int:
    # that implicit downcast is deprecated in pandas 2.2, and without it the
    # column would stay object dtype and be left out of describe().
    # The cast also fails loudly if a value outside the code table is present.
    # Values that are already 0/1 are not matched by the string keys, so
    # re-running this cell leaves them as they are.
    df_heart_2020[column] = df_heart_2020[column].replace(codes).astype(int)

df_heart_2020.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,16.6,1,0,0,3.0,30.0,0,0,55-59,White,1,1,Very good,5.0,1,0,1
1,0,20.34,0,0,1,0.0,0.0,0,0,80 or older,White,0,1,Very good,7.0,0,0,0
2,0,26.58,1,0,0,20.0,30.0,0,1,65-69,White,1,1,Fair,8.0,1,0,0
3,0,24.21,0,0,0,0.0,0.0,0,0,75-79,White,0,0,Good,6.0,0,0,1
4,0,23.71,0,0,0,28.0,0.0,1,0,40-44,White,0,1,Very good,8.0,0,0,0


In [6]:
# Encode the 13 age categories as ordinal integers: 0 (youngest) to 12 (oldest).
# The brackets are listed in ascending order, so each bracket's position in the
# list is its code.
age_brackets = ['18-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54',
                '55-59', '60-64', '65-69', '70-74', '75-79', '80 or older']
age_codes = {bracket: code for code, bracket in enumerate(age_brackets)}

# Cast explicitly rather than relying on replace() to downcast object -> int:
# that implicit downcast is deprecated in pandas 2.2, and without it the column
# would stay object dtype and be left out of describe().
df_heart_2020['AgeCategory'] = df_heart_2020['AgeCategory'].replace(age_codes).astype(int)



In [9]:
# Cast the columns below to int, one at a time and in the order listed.
# SleepTime, MentalHealth and PhysicalHealth are read in as float64.
# NOTE(review): astype(int) drops any fractional part; presumably these
# columns only hold whole numbers. Confirm before relying on it.
for int_column in ('Diabetic', 'SleepTime', 'DiffWalking', 'MentalHealth', 'PhysicalHealth'):
    df_heart_2020[int_column] = df_heart_2020[int_column].astype(int)

In [10]:
# Check for outliers or unusual data via summary statistics of the numeric columns.
# Data is within reason; the widest ranges in the output are BMI (12.02 to 94.85)
# and SleepTime (1 to 24).
df_heart_2020.describe()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Diabetic,PhysicalActivity,SleepTime,Asthma,KidneyDisease,SkinCancer
count,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0,319795.0
mean,0.085595,28.325399,0.412477,0.068097,0.03774,3.37171,3.898366,0.13887,0.475273,6.514536,0.13559,0.775362,7.097075,0.134061,0.036833,0.093244
std,0.279766,6.3561,0.492281,0.251912,0.190567,7.95085,7.955235,0.345812,0.499389,3.564759,0.342353,0.417344,1.436007,0.340718,0.188352,0.290775
min,0.0,12.02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
25%,0.0,24.03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,1.0,6.0,0.0,0.0,0.0
50%,0.0,27.34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0,1.0,7.0,0.0,0.0,0.0
75%,0.0,31.42,1.0,0.0,0.0,2.0,3.0,0.0,1.0,9.0,0.0,1.0,8.0,0.0,0.0,0.0
max,1.0,94.85,1.0,1.0,1.0,30.0,30.0,1.0,1.0,12.0,1.0,1.0,24.0,1.0,1.0,1.0


In [11]:
# Write the cleaned frame to a new CSV file, leaving out the row index.
df_heart_2020.to_csv('heart_2020_cleaned.csv', index=False)