In [0]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Load Data

### Initialize Path Constants

In [0]:
# Directories (relative to this notebook) for the raw inputs and the processed outputs.
RAW_DATA_PATH, PROCESSED_DATA_PATH = '../data/raw', '../data/processed'

### Load CSV Files

In [0]:
# Read the raw train and test splits from RAW_DATA_PATH, in that order.
train_df, test_df = map(
    pd.read_csv,
    (
        '{}/tubes2_HeartDisease_train.csv'.format(RAW_DATA_PATH),
        '{}/tubes2_HeartDisease_test.csv'.format(RAW_DATA_PATH),
    ),
)

## Rename Column Names & Convert '?' to NaN

In [0]:
from copy import deepcopy

# Readable names for the generic 'Column1'..'Column13' headers used by both CSV files.
# NOTE: the spellings (e.g. 'serum_cholestrol', 'excercise_induced_angina') are kept
# as-is because later cells reference the columns by these exact names.
test_columns_replacement = {
    'Column1': 'age',
    'Column2': 'sex',
    'Column3': 'chest_pain_type',
    'Column4': 'resting_blood_pressure',
    'Column5': 'serum_cholestrol',
    'Column6': 'fasting_blood_sugar',
    'Column7': 'resting_ECG',
    'Column8': 'max_heart_rate_achieved',
    'Column9': 'excercise_induced_angina',
    'Column10': 'ST_depression',
    'Column11': 'peak_exercise_ST_segment',
    'Column12': 'num_of_major_vessels',
    'Column13': 'thal',
}

# The training mapping is the same plus one extra column for the target label.
train_columns_replacement = test_columns_replacement.copy()
train_columns_replacement['Column14'] = 'heart_disease_diagnosis'

# Rename the columns and turn every '?' cell into a real NaN.
# np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0.
train_df = train_df.rename(columns=train_columns_replacement).replace('?', np.nan)
test_df = test_df.rename(columns=test_columns_replacement).replace('?', np.nan)

combine = [train_df,test_df]

In [0]:
# Preview the first five rows of the renamed training frame.
train_df.head(n=5)

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,serum_cholestrol,fasting_blood_sugar,resting_ECG,max_heart_rate_achieved,excercise_induced_angina,ST_depression,peak_exercise_ST_segment,num_of_major_vessels,thal,heart_disease_diagnosis
0,54,1,4,125,216,0,0,140,0,0.0,,,,1
1,55,1,4,158,217,0,0,110,1,2.5,2.0,,,1
2,54,0,3,135,304,1,0,170,0,0.0,1.0,0.0,3.0,0
3,48,0,3,120,195,0,0,125,0,0.0,,,,0
4,50,1,4,120,0,0,1,156,1,0.0,1.0,,6.0,3


In [0]:
# Preview the first five rows of the renamed test frame.
test_df.head(n=5)

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,serum_cholestrol,fasting_blood_sugar,resting_ECG,max_heart_rate_achieved,excercise_induced_angina,ST_depression,peak_exercise_ST_segment,num_of_major_vessels,thal
0,60,1,2,160,267,1,1,157,0,0.5,2,,
1,61,1,4,148,203,0,0,161,0,0.0,1,1.0,7.0
2,54,1,4,130,242,0,0,91,1,1.0,2,,
3,48,1,4,120,260,0,0,115,0,2.0,2,,
4,57,0,1,130,308,0,0,98,0,1.0,2,,


## Data Preprocessing

### Impute Null Data

To keep it simple, we impute null values with the median (numerical features) and the mode (categorical features).

In [0]:
def impute_categorical_data(df, colnames):
    """Fill missing values in each listed column with that column's mode.

    ``df`` is modified in place; nothing is returned.

    Args:
        df: DataFrame whose columns are imputed.
        colnames: Iterable of column names in ``df`` to impute.

    Notes:
        When several values tie for the mode, the first value returned by
        ``Series.mode()`` is used. A column with no non-null values has no
        mode and is left unchanged.
    """
    for colname in colnames:
        modes = df[colname].mode()
        if modes.empty:
            # All values are null: there is nothing to impute with.
            continue
        # Assign the filled column back instead of calling fillna(inplace=True)
        # on df[colname]; that chained in-place update does not modify df
        # under pandas Copy-on-Write.
        df[colname] = df[colname].fillna(modes.iloc[0])

def impute_numerical_data(df,colnames):
    """Fill missing values in each listed column with that column's median.

    ``df`` is modified in place; nothing is returned.

    Args:
        df: DataFrame whose columns are imputed.
        colnames: Iterable of column names in ``df`` to impute.

    Notes:
        The median is computed on a numeric view of the column
        (``pd.to_numeric``), so columns that hold numbers as strings are
        handled too. The column itself is not converted; only its nulls are
        filled. A column with no non-null values is left unchanged.
    """
    for colname in colnames:
        # median() already skips NaN, so no explicit null filter is needed
        # (comparing against NaN with != is always True and filters nothing).
        median = pd.to_numeric(df[colname]).median()
        if pd.isna(median):
            # All values are null: there is nothing to impute with.
            continue
        # Assign the filled column back instead of calling fillna(inplace=True)
        # on df[colname]; that chained in-place update does not modify df
        # under pandas Copy-on-Write.
        df[colname] = df[colname].fillna(median)

In [0]:
# Columns holding category codes are filled with the mode; numeric
# measurements are filled with the median.
# fasting_blood_sugar and resting_ECG only take small integer codes in the
# rows previewed above (0/1 and 0/1/2), so they are treated as categorical:
# a median could produce a value that is not a valid code.
# NOTE(review): confirm the categorical/numerical split against the dataset description.
categorical_data_colname = ['peak_exercise_ST_segment',
                            'excercise_induced_angina',
                            'thal',
                            'fasting_blood_sugar',
                            'resting_ECG']
numerical_data_colname = ['max_heart_rate_achieved',
                          'ST_depression',
                          'num_of_major_vessels',
                          'resting_blood_pressure',
                          'serum_cholestrol']

impute_categorical_data(train_df, categorical_data_colname)
impute_numerical_data(train_df, numerical_data_colname)

# Show a sample of the result rather than dumping the whole frame.
train_df.head(10)

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,serum_cholestrol,fasting_blood_sugar,resting_ECG,max_heart_rate_achieved,excercise_induced_angina,ST_depression,peak_exercise_ST_segment,num_of_major_vessels,thal,heart_disease_diagnosis
0,54,1,4,125,216,0,0,140,0,0,2,0,3,1
1,55,1,4,158,217,0,0,110,1,2.5,2,0,3,1
2,54,0,3,135,304,1,0,170,0,0,1,0,3,0
3,48,0,3,120,195,0,0,125,0,0,2,0,3,0
4,50,1,4,120,0,0,1,156,1,0,1,0,6,3
5,64,0,4,130,303,0,0,122,0,2,2,2,3,0
6,63,1,4,130,308,0,0,138,1,2,2,0,3,2
7,58,1,2,130,251,0,0,110,0,0,2,0,3,0
8,42,1,2,150,268,0,0,136,0,0,2,0,3,0
9,54,1,3,120,258,0,2,147,0,4,2,0,7,0


### Encode Categorical Data

General rules:
- Ordinal categorical type -> label encoding
- Nominal categorical type -> one-hot encoding

#### One-Hot Encoding

## Save Processed Data

Save the processed data to the `../data/processed` folder.

In [0]:
# The output directory may be missing (e.g. on a fresh checkout); create it
# first, because to_csv does not create parent directories.
os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)
train_df.to_csv('{}/processed_data.csv'.format(PROCESSED_DATA_PATH),index=False)