In [1]:
import logging
import pandas as pd
import wandb
import joblib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import fbeta_score, precision_score, recall_score, accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn_pandas import DataFrameMapper
import matplotlib.pyplot as plt

import os
from pathlib import Path

In [2]:
def get_numerical_categorical_features(data: pd.DataFrame):
    """Partition the column names of ``data`` into numerical and categorical.

    A column is reported as numerical when it has a floating-point dtype of
    any width (``float16``/``float32``/``float64`` or a pandas nullable
    float). Every other column -- strings, integers, booleans, categories --
    is reported as categorical, so integer-coded columns are still routed to
    the encoders rather than to the scalers.

    Parameters
    ----------
    data : pd.DataFrame
        Frame whose columns are classified. It is not modified.

    Returns
    -------
    tuple[list, list]
        ``(numerical_features, categorical_features)``; each list keeps the
        original column order.
    """
    numerical_features = []
    categorical_features = []
    # Iterating over ``dtypes`` avoids one column lookup per name and also
    # works when column labels are duplicated.
    for column, dtype in data.dtypes.items():
        if pd.api.types.is_float_dtype(dtype):
            numerical_features.append(column)
        else:
            categorical_features.append(column)
    return numerical_features, categorical_features

def split_categorical_binary_features(data: pd.DataFrame):
    """Separate two-valued columns from columns with any other cardinality.

    Missing values (NaN/None) are not counted as a value: a Yes/No column
    with gaps is still binary, and a single-valued column with gaps is not.

    Parameters
    ----------
    data : pd.DataFrame
        Frame whose columns are classified. It is not modified.

    Returns
    -------
    tuple[list, list]
        ``(binary_features, categorical_features)``; each list keeps the
        original column order.
    """
    binary_features = []
    categorical_features = []
    for column in data.columns:
        # nunique() drops missing values by default, unlike len(unique()).
        if data[column].nunique() == 2:
            binary_features.append(column)
        else:
            categorical_features.append(column)
    return binary_features, categorical_features

def treat_numerical_data(data: pd.DataFrame, scaler: int):
    """Scale every column of ``data`` and return the result as a new frame.

    Parameters
    ----------
    data : pd.DataFrame
        Numeric columns to scale. The input frame is not modified.
    scaler : int
        Scaler selector: ``0`` uses ``StandardScaler``; ``1`` uses
        ``MinMaxScaler`` with range ``(0, 1)``; any other value uses
        ``MinMaxScaler`` with range ``(-1, 1)``.

    Returns
    -------
    pd.DataFrame
        The scaled values, carrying the index and column labels of ``data``.
        An empty input (no rows or no columns) is returned as an unscaled
        copy.
    """
    # scikit-learn scalers raise on input with zero rows or zero features;
    # returning the empty frame keeps a later pd.concat with the encoded
    # categorical columns working when there is nothing to scale.
    if data.empty:
        return data.copy()

    if scaler == 0:
        scaler_model = StandardScaler()
    elif scaler == 1:
        scaler_model = MinMaxScaler(feature_range=(0, 1))
    else:
        scaler_model = MinMaxScaler(feature_range=(-1, 1))

    # The scaler is fitted on all columns at once; it returns a plain array,
    # so the original labels are re-attached below.
    scaled_values = scaler_model.fit_transform(data.to_numpy())
    return pd.DataFrame(scaled_values, index=data.index, columns=data.columns)

def treat_categorical_data(data: pd.DataFrame, encoder):
    """Encode every column of ``data`` numerically.

    Parameters
    ----------
    data : pd.DataFrame
        Columns to encode. The input frame is not modified.
    encoder
        ``0`` one-hot encodes each column via ``pd.get_dummies`` (result
        columns are named ``<column>_<value>``); any other value replaces
        each column with integer codes from a ``LabelEncoder`` fitted on
        that column alone.

    Returns
    -------
    pd.DataFrame
        The encoded frame, indexed like ``data``. A frame without columns is
        returned as an empty copy.
    """
    # Nothing to encode; skip both encoders for a column-less frame.
    if data.shape[1] == 0:
        return data.copy()

    if encoder == 0:
        # Passing ``columns`` forces every column to be encoded. Without it,
        # get_dummies only encodes object/string/category columns, and an
        # explicit prefix list of all column names then fails its length
        # check as soon as the frame holds e.g. an integer column.
        return pd.get_dummies(data, columns=list(data.columns))

    encoded = data.copy()
    for column in data.columns:
        # A fresh encoder per column: codes are assigned independently.
        encoded[column] = LabelEncoder().fit_transform(data[column])
    return encoded


In [3]:
# The CSV is read from the "data" folder that sits beside the current
# working directory, i.e. <parent of cwd>/data/heart_2020_cleaned.csv.
file_dir = os.path.join(Path().resolve().parent, 'data', 'heart_2020_cleaned.csv')
data = pd.read_csv(file_dir)

In [4]:
# Row count of the least frequent HeartDisease class.
int(data['HeartDisease'].value_counts().min())

27373

In [5]:
# Undersample: draw from every HeartDisease class as many rows as the rarest
# class has, instead of hard-coding that count. The seed makes the draw
# repeatable across kernel restarts.
# NOTE(review): the balanced frame is only displayed, not assigned -- the
# cells below keep working on the full, unbalanced `data`. Confirm whether
# that is intended.
minority_count = int(data['HeartDisease'].value_counts().min())
(
    data.groupby('HeartDisease')
    .sample(n=minority_count, random_state=42)
    .reset_index(drop=True)
)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,23.23,No,No,No,0.0,0.0,No,Female,70-74,White,No,Yes,Very good,9.0,No,No,No
1,No,29.60,No,No,No,0.0,0.0,No,Male,65-69,White,No,Yes,Very good,8.0,No,No,No
2,No,25.09,No,No,No,7.0,0.0,No,Male,80 or older,White,No,No,Good,8.0,No,Yes,Yes
3,No,31.18,Yes,No,No,0.0,25.0,No,Female,30-34,White,No,Yes,Good,7.0,No,No,No
4,No,44.48,Yes,No,No,0.0,0.0,No,Female,60-64,White,No,No,Excellent,7.0,No,No,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54741,Yes,25.39,No,No,No,0.0,0.0,No,Female,80 or older,White,No,Yes,Good,9.0,No,No,No
54742,Yes,24.19,Yes,No,No,10.0,0.0,Yes,Female,75-79,Black,No,No,Very good,5.0,No,Yes,No
54743,Yes,26.30,Yes,No,No,0.0,0.0,No,Male,75-79,White,No,Yes,Fair,6.0,Yes,No,No
54744,Yes,29.45,No,No,No,7.0,0.0,No,Female,65-69,White,Yes,No,Fair,9.0,No,No,No


In [6]:
# Column names split by dtype: `num` feeds the scaler, `cat` the encoder.
num, cat = get_numerical_categorical_features(data)

In [7]:
# Column-wise views of the raw frame, one per feature kind.
numerical_data = data[num]
categorical_data = data[cat]

In [8]:
# 0 selects the StandardScaler branch of treat_numerical_data.
scaler = 0
num_data = treat_numerical_data(data=numerical_data, scaler=scaler)

In [9]:
# Any non-zero value selects the LabelEncoder branch of treat_categorical_data.
encoder = 1
cate_data = treat_categorical_data(data=categorical_data, encoder=encoder)

In [10]:
# Put the scaled numeric columns and the encoded categorical columns side by
# side; rows are aligned on the shared index.
treat_data = pd.concat([num_data, cate_data], axis='columns')

In [11]:
# Display the combined frame: scaled numeric columns first, then the encoded
# categorical columns.
treat_data

Unnamed: 0,BMI,PhysicalHealth,MentalHealth,SleepTime,HeartDisease,Smoking,AlcoholDrinking,Stroke,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,Asthma,KidneyDisease,SkinCancer
0,-1.844750,-0.046751,3.281069,-1.460354,0,1,0,0,0,0,7,5,2,1,4,1,0,1
1,-1.256338,-0.424070,-0.490039,-0.067601,0,0,0,1,0,0,12,5,0,1,4,0,0,0
2,-0.274603,2.091388,3.281069,0.628776,0,1,0,0,0,1,9,5,2,1,1,1,0,0
3,-0.647473,-0.424070,-0.490039,-0.763977,0,0,0,0,0,0,11,5,0,0,2,0,0,1
4,-0.726138,3.097572,-0.490039,0.628776,0,0,0,0,1,0,4,5,0,1,4,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,-0.144019,0.456341,-0.490039,-0.763977,1,1,0,0,1,1,8,3,2,0,1,1,0,0
319791,0.238291,-0.424070,-0.490039,-1.460354,0,1,0,0,0,1,3,3,0,1,4,1,0,0
319792,-0.642753,-0.424070,-0.490039,-0.763977,0,0,0,0,0,0,5,3,0,1,2,0,0,0
319793,0.705560,-0.424070,-0.490039,3.414282,0,0,0,0,0,0,1,3,0,0,2,0,0,0
