In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Load Data

### Initialize Path Constants

In [3]:
RAW_DATA_PATH = '../data/raw'
PROCESSED_DATA_PATH = '../data/processed'

### Load CSV Files

In [22]:
train_df = pd.read_csv('{}/tubes2_HeartDisease_train.csv'.format(RAW_DATA_PATH))
test_df = pd.read_csv('{}/tubes2_HeartDisease_test.csv'.format(RAW_DATA_PATH))

## Rename Column Names & Convert '?' to NaN

In [23]:
from copy import deepcopy

test_columns_replacement = {
    'Column1': 'age',
    'Column2': 'sex',
    'Column3': 'chest_pain_type',
    'Column4': 'resting_blood_pressure',
    'Column5': 'serum_cholestrol',
    'Column6': 'fasting_blood_sugar',
    'Column7': 'resting_ECG',
    'Column8': 'max_heart_rate_achieved',
    'Column9': 'excercise_induced_angina',
    'Column10': 'ST_depression',
    'Column11': 'peak_exercise_ST_segment',
    'Column12': 'num_of_major_vessels',
    'Column13': 'thal',
}

train_columns_replacement = test_columns_replacement.copy()
train_columns_replacement['Column14'] = 'heart_disease_diagnosis'

train_df = train_df.rename(columns=train_columns_replacement).replace('?',np.NaN)
test_df = test_df.rename(columns=test_columns_replacement).replace('?',np.NaN)

combine = [train_df,test_df]

In [13]:
train_df.head()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,serum_cholestrol,fasting_blood_sugar,resting_ECG,max_heart_rate_achieved,excercise_induced_angina,ST_depression,peak_exercise_ST_segment,num_of_major_vessels,thal,heart_disease_diagnosis
0,54,1,4,125,216,0,0,140,0,0.0,,,,1
1,55,1,4,158,217,0,0,110,1,2.5,2.0,,,1
2,54,0,3,135,304,1,0,170,0,0.0,1.0,0.0,3.0,0
3,48,0,3,120,195,0,0,125,0,0.0,,,,0
4,50,1,4,120,0,0,1,156,1,0.0,1.0,,6.0,3


In [14]:
test_df.head()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,serum_cholestrol,fasting_blood_sugar,resting_ECG,max_heart_rate_achieved,excercise_induced_angina,ST_depression,peak_exercise_ST_segment,num_of_major_vessels,thal
0,60,1,2,160,267,1,1,157,0,0.5,2,,
1,61,1,4,148,203,0,0,161,0,0.0,1,1.0,7.0
2,54,1,4,130,242,0,0,91,1,1.0,2,,
3,48,1,4,120,260,0,0,115,0,2.0,2,,
4,57,0,1,130,308,0,0,98,0,1.0,2,,


## Data Preprocessing

### Impute Null Data

to make it simple, we encode null values with the median (numerical features) and mode (categorical features)

In [24]:
def impute_categorical_data(df, colnames):
    for colname in colnames:
        df[colname].fillna(df[colname].mode().iloc[0],inplace=True)

def impute_numerical_data(df,colnames):
    for colname in colnames:
        df[colname].fillna(df.loc[df[colname]!=np.NaN][colname].median(),inplace=True)

In [26]:
categorical_data_colname = ['peak_exercise_ST_segment','excercise_induced_angina','thal']
numerical_data_colname = ['max_heart_rate_achieved',
                          'ST_depression',
                          'num_of_major_vessels',
                          'resting_blood_pressure',
                          'fasting_blood_sugar',
                          'serum_cholestrol',
                          'resting_ECG']

impute_categorical_data(train_df, categorical_data_colname)
impute_numerical_data(train_df, numerical_data_colname)

train_df.head()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,serum_cholestrol,fasting_blood_sugar,resting_ECG,max_heart_rate_achieved,excercise_induced_angina,ST_depression,peak_exercise_ST_segment,num_of_major_vessels,thal,heart_disease_diagnosis
0,54,1,4,125,216,0,0,140,0,0.0,2,0,3,1
1,55,1,4,158,217,0,0,110,1,2.5,2,0,3,1
2,54,0,3,135,304,1,0,170,0,0.0,1,0,3,0
3,48,0,3,120,195,0,0,125,0,0.0,2,0,3,0
4,50,1,4,120,0,0,1,156,1,0.0,1,0,6,3


### Encode Categorical Data

General rules:
- Ordinal categorical type -> label encoding
- Nominal categorical type -> one-hot encoding

## One-Hot Encoding

<h5>Column :</h5>
<ul>
    <li>Peak exercise ST segment (peak_exercise_ST_segment)</li>
</ul>

In [27]:
# one hot column
one_hot = pd.get_dummies(train_df['peak_exercise_ST_segment'])

# drop column
train_df = train_df.drop('peak_exercise_ST_segment', axis=1)
train_df = train_df.join(one_hot)

train_df.head()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,serum_cholestrol,fasting_blood_sugar,resting_ECG,max_heart_rate_achieved,excercise_induced_angina,ST_depression,num_of_major_vessels,thal,heart_disease_diagnosis,1,2,3
0,54,1,4,125,216,0,0,140,0,0.0,0,3,1,0,1,0
1,55,1,4,158,217,0,0,110,1,2.5,0,3,1,0,1,0
2,54,0,3,135,304,1,0,170,0,0.0,0,3,0,1,0,0
3,48,0,3,120,195,0,0,125,0,0.0,0,3,0,0,1,0
4,50,1,4,120,0,0,1,156,1,0.0,0,6,3,1,0,0


## Label Encoding

<h5>Column :</h5>
<ul>
    <li>chest_pain_type (4-1)</li>
    <li>resting_ECG (0-2)</li>
    <li>thal (3,7,6)</li>
</ul>

In [35]:
train_df_encoded = train_df.copy()

# label encoding 
replacement_one = {
    "chest_pain_type" : {
        4 : "one",
        3 : "two",
        2 : "three",
        1 : "four"
    },
    "thal" : {
        3 : "three",
        7 : "six",
        6 : "seven"
    }
}

replacement_two = {
    "chest_pain_type" : {
        "one" : 1,
        "two" : 2,
        "three" : 3,
        "four" : 4
    },
    "thal" : {
        "three" : 3,
        "six" : 6,
        "seven" : 7
    }
}

train_df_encoded.replace(replacement_one, inplace=True)
train_df_encoded.replace(replacement_one, inplace=True)

train_df_encoded.head()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,serum_cholestrol,fasting_blood_sugar,resting_ECG,max_heart_rate_achieved,excercise_induced_angina,ST_depression,num_of_major_vessels,thal,heart_disease_diagnosis,1,2,3
0,54,1,one,125,216,0,0,140,0,0.0,0,3,1,0,1,0
1,55,1,one,158,217,0,0,110,1,2.5,0,3,1,0,1,0
2,54,0,two,135,304,1,0,170,0,0.0,0,3,0,1,0,0
3,48,0,two,120,195,0,0,125,0,0.0,0,3,0,0,1,0
4,50,1,one,120,0,0,1,156,1,0.0,0,6,3,1,0,0


## Save Processed Data

Save the processed data to the ../raw/processed folder

In [0]:
train_df.to_csv('{}/processed_data.csv'.format(PROCESSED_DATA_PATH),index=False)