In [98]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from patsy import dmatrix
from sklearn.metrics import mean_squared_error
from pyearth import Earth

In [99]:
import warnings

# Install a blanket "ignore" filter: no message or module pattern is given,
# so every warning of category Warning (i.e. all of them) is suppressed for
# the remainder of the kernel session.
warnings.filterwarnings(action = 'ignore', category = Warning)

# Data Cleaning / Exploration

In [100]:
# Number of rows to keep: the rest of the notebook works on a random subset
# of the CSV rather than on every row.
sample_size = 20000
# Fixed seed for the row sampler. Without it each Restart & Run All draws a
# different subset, so every downstream table, plot and model fit changes.
sample_seed = 42

path = 'data/heart_2020_cleaned.csv'
df = pd.read_csv(path)
# Sampling without replacement; the original row labels are kept as the index.
df = df.sample(n = sample_size, random_state = sample_seed)
df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
226378,No,23.48,Yes,No,No,28.0,30.0,Yes,Male,70-74,Other,No,Yes,Poor,4.0,Yes,Yes,No
58573,No,23.11,Yes,No,No,0.0,0.0,No,Male,35-39,Black,No,Yes,Very good,8.0,No,No,No
33272,No,19.53,Yes,Yes,No,0.0,0.0,No,Female,65-69,White,No,Yes,Excellent,9.0,No,No,Yes
263387,No,29.95,No,No,No,0.0,0.0,No,Male,25-29,Asian,No,No,Fair,7.0,No,No,No
237727,No,30.41,Yes,No,No,0.0,0.0,No,Female,75-79,White,No,No,Very good,6.0,No,No,Yes


In [101]:
# Count missing entries per column; the cell displays the resulting Series.
missing_per_column = df.isna().sum()
missing_per_column

HeartDisease        0
BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64

In [102]:
# Report how many distinct values each text (object-dtype) column holds, to
# tell two-level flags apart from multi-level categoricals.
for col, values in df.items():
    if values.dtype != 'object':
        continue  # numeric column: nothing to report
    n_unique = len(values.unique())
    print(f"Column '{col}' has {n_unique} unique values")

Column 'HeartDisease' has 2 unique values
Column 'Smoking' has 2 unique values
Column 'AlcoholDrinking' has 2 unique values
Column 'Stroke' has 2 unique values
Column 'DiffWalking' has 2 unique values
Column 'Sex' has 2 unique values
Column 'AgeCategory' has 13 unique values
Column 'Race' has 6 unique values
Column 'Diabetic' has 4 unique values
Column 'PhysicalActivity' has 2 unique values
Column 'GenHealth' has 5 unique values
Column 'Asthma' has 2 unique values
Column 'KidneyDisease' has 2 unique values
Column 'SkinCancer' has 2 unique values


In [103]:
# Two-level flag columns to turn into 0/1 integers. 'Yes' becomes 1 and any
# other value becomes 0.
binary_cols = ['HeartDisease', 'Smoking', 'AlcoholDrinking',
              'Stroke', 'DiffWalking', 'PhysicalActivity',
              'Asthma', 'KidneyDisease', 'SkinCancer']

for col in binary_cols:
    print(f"Working on column: {col}")
    # Idempotency guard: once a column has been encoded it is int64, and
    # comparing those integers against 'Yes' on a re-run of this cell would
    # silently reset every value to 0.
    if df[col].dtype != 'int64':
        # Vectorised comparison on the column itself instead of a row-wise
        # DataFrame.apply, which builds a Series for every single row.
        df[col] = (df[col] == 'Yes').astype('int64')
    
df.head()

Working on column: HeartDisease
Working on column: Smoking
Working on column: AlcoholDrinking
Working on column: Stroke
Working on column: DiffWalking
Working on column: PhysicalActivity
Working on column: Asthma
Working on column: KidneyDisease
Working on column: SkinCancer


Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
226378,0,23.48,1,0,0,28.0,30.0,1,Male,70-74,Other,No,1,Poor,4.0,1,1,0
58573,0,23.11,1,0,0,0.0,0.0,0,Male,35-39,Black,No,1,Very good,8.0,0,0,0
33272,0,19.53,1,1,0,0.0,0.0,0,Female,65-69,White,No,1,Excellent,9.0,0,0,1
263387,0,29.95,0,0,0,0.0,0.0,0,Male,25-29,Asian,No,0,Fair,7.0,0,0,0
237727,0,30.41,1,0,0,0.0,0.0,0,Female,75-79,White,No,0,Very good,6.0,0,0,1


In [109]:
# Replace spaces with underscores in the labels of every remaining text
# column (e.g. 'Very good' -> 'Very_good'), so the dummy-column names built
# from these labels contain no spaces.
for col in list(df.columns):
    if df[col].dtype == 'object':
        # Vectorised, literal (non-regex) replacement on the column itself.
        # The previous row-wise DataFrame.apply materialised a Series for
        # every row just to edit a single cell.
        df[col] = df[col].str.replace(' ', '_', regex = False)

In [110]:
# One-hot encode every remaining text (object-dtype) column in one call.
# The untouched columns keep their order and the indicator columns are
# appended after them, named '<column>_<label>'; the first level of each
# column is dropped as the reference category.
# NOTE(review): some labels keep commas/parentheses (e.g.
# 'No,_borderline_diabetes'), so the resulting column names are not plain
# identifiers -- confirm that downstream formula code quotes them.
object_cols = [col for col in df.columns if df[col].dtype == 'object']

df = pd.get_dummies(df, columns = object_cols, drop_first = True)

df.head()

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,PhysicalActivity,SleepTime,...,Race_Hispanic,Race_Other,Race_White,"Diabetic_No,_borderline_diabetes",Diabetic_Yes,Diabetic_Yes_(during_pregnancy),GenHealth_Fair,GenHealth_Good,GenHealth_Poor,GenHealth_Very_good
226378,0,23.48,1,0,0,28.0,30.0,1,1,4.0,...,0,1,0,0,0,0,0,0,1,0
58573,0,23.11,1,0,0,0.0,0.0,0,1,8.0,...,0,0,0,0,0,0,0,0,0,1
33272,0,19.53,1,1,0,0.0,0.0,0,1,9.0,...,0,0,1,0,0,0,0,0,0,0
263387,0,29.95,0,0,0,0.0,0.0,0,0,7.0,...,0,0,0,0,0,0,1,0,0,0
237727,0,30.41,1,0,0,0.0,0.0,0,0,6.0,...,0,0,1,0,0,0,0,0,0,1


In [112]:
# Persist the encoded sample. The row index is written as well (the pandas
# default), so the file gains a leading unnamed index column.
df.to_csv(path_or_buf = 'data/heart_2020_cleaned_v2.csv', index = True)

# EDA