In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Load Data

In [2]:
!ls ../data/raw

TBSC2-20181112T130549Z-001.zip [31mtubes2_HeartDisease_test.csv[m[m
[31mdescription.xlsx[m[m               [31mtubes2_HeartDisease_train.csv[m[m


### Initialize Path Constants

In [4]:
# Directory (relative path) that holds the raw train/test CSV files read below.
RAW_DATA_PATH = '../data/raw'

### Load CSV File

In [16]:
# Read the raw train and test CSVs (in that order) from RAW_DATA_PATH.
# Each template is filled with the data directory before being handed to pandas.
train_df, test_df = (
    pd.read_csv(path_template.format(RAW_DATA_PATH))
    for path_template in (
        '{}/tubes2_HeartDisease_train.csv',
        '{}/tubes2_HeartDisease_test.csv',
    )
)

In [17]:
train_df.head()

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,Column11,Column12,Column13,Column14
0,54,1,4,125,216,0,0,140,0,0.0,?,?,?,1
1,55,1,4,158,217,0,0,110,1,2.5,2,?,?,1
2,54,0,3,135,304,1,0,170,0,0.0,1,0,3,0
3,48,0,3,120,195,0,0,125,0,0.0,?,?,?,0
4,50,1,4,120,0,0,1,156,1,0.0,1,?,6,3


In [18]:
test_df.head()

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,Column11,Column12,Column13
0,60,1,2,160,267,1,1,157,0,0.5,2,?,?
1,61,1,4,148,203,0,0,161,0,0.0,1,1,7
2,54,1,4,130,242,0,0,91,1,1.0,2,?,?
3,48,1,4,120,260,0,0,115,0,2.0,2,?,?
4,57,0,1,130,308,0,0,98,0,1.0,2,?,?


## Rename Column Names

Rename the generic `ColumnN` headers to descriptive names so the data is easier to read.

In [19]:
# NOTE(review): deepcopy is not used in this cell; the import is kept in case later cells rely on it.
from copy import deepcopy

# Descriptive names for the 13 generic feature headers present in both CSV files.
# NOTE(review): the spellings 'serum_cholestrol' and 'excercise_induced_angina' are left
# unchanged because the displayed outputs below use exactly these column names.
test_columns_replacement = {
    'Column1': 'age',
    'Column2': 'sex',
    'Column3': 'chest_pain_type',
    'Column4': 'resting_blood_pressure',
    'Column5': 'serum_cholestrol',
    'Column6': 'fasting_blood_sugar',
    'Column7': 'resting_ECG',
    'Column8': 'max_heart_rate_achieved',
    'Column9': 'excercise_induced_angina',
    'Column10': 'ST_depression',
    'Column11': 'peak_exercise_ST_segment',
    'Column12': 'num_of_major_vessels',
    'Column13': 'thal',
}

# Column14 only exists in the training CSV, so the training mapping is the shared
# mapping plus that one extra entry (built on a copy to leave the test mapping intact).
train_columns_replacement = test_columns_replacement.copy()
train_columns_replacement['Column14'] = 'heart_disease_diagnosis'

# Derive the renamed frames from the raw frames loaded by pd.read_csv above.
# The raw train_df/test_df stay untouched, and the train_data/test_data names that
# every following cell reads are actually defined here, so the notebook also runs
# top-to-bottom on a fresh kernel.
train_data = train_df.rename(columns=train_columns_replacement)

test_data = test_df.rename(columns=test_columns_replacement)

In [33]:
train_data.head()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,serum_cholestrol,fasting_blood_sugar,resting_ECG,max_heart_rate_achieved,excercise_induced_angina,ST_depression,peak_exercise_ST_segment,num_of_major_vessels,thal,heart_disease_diagnosis
0,54,1,4,125,216,0,0,140,0,0.0,?,?,?,1
1,55,1,4,158,217,0,0,110,1,2.5,2,?,?,1
2,54,0,3,135,304,1,0,170,0,0.0,1,0,3,0
3,48,0,3,120,195,0,0,125,0,0.0,?,?,?,0
4,50,1,4,120,0,0,1,156,1,0.0,1,?,6,3


In [34]:
test_data.head()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,serum_cholestrol,fasting_blood_sugar,resting_ECG,max_heart_rate_achieved,excercise_induced_angina,ST_depression,peak_exercise_ST_segment,num_of_major_vessels,thal
0,60,1,2,160,267,1,1,157,0,0.5,2,?,?
1,61,1,4,148,203,0,0,161,0,0.0,1,1,7
2,54,1,4,130,242,0,0,91,1,1.0,2,?,?
3,48,1,4,120,260,0,0,115,0,2.0,2,?,?
4,57,0,1,130,308,0,0,98,0,1.0,2,?,?


## Exploratory Data Analysis

### Check For Null Values

#### NaN values

In [35]:
train_data.isnull().sum()

age                         0
sex                         0
chest_pain_type             0
resting_blood_pressure      0
serum_cholestrol            0
fasting_blood_sugar         0
resting_ECG                 1
max_heart_rate_achieved     0
excercise_induced_angina    0
ST_depression               0
peak_exercise_ST_segment    0
num_of_major_vessels        0
thal                        0
heart_disease_diagnosis     0
dtype: int64

In [36]:
test_data.isnull().sum()

age                         0
sex                         0
chest_pain_type             0
resting_blood_pressure      0
serum_cholestrol            0
fasting_blood_sugar         0
resting_ECG                 0
max_heart_rate_achieved     0
excercise_induced_angina    0
ST_depression               0
peak_exercise_ST_segment    0
num_of_major_vessels        0
thal                        0
dtype: int64

#### Dataset specific null type

Cells containing the value '?' in the dataset are null values as well.

In [43]:
def pad_text(text, target_length):
    """Right-pad ``text`` with spaces until it is ``target_length`` characters long.

    Raises:
        AssertionError: if ``text`` is already longer than ``target_length``.
    """
    assert not len(text) > target_length
    return text.ljust(target_length)

def print_data_null_encoded(data, null_marker='?', label_width=25):
    """Print, per column, how many entries equal the encoded-null marker.

    One line is printed for every column of ``data`` in the form
    ``<column name>: <count> (<percentage> %)``, where the percentage is the
    count relative to the number of rows in ``data``.

    Args:
        data: pandas DataFrame to inspect.
        null_marker: value that encodes a missing entry (default ``'?'``).
        label_width: column names are left-justified to this many characters
            (default 25); longer names are printed in full without padding.

    Returns:
        None. The report is written to standard output.
    """
    total_rows = data.shape[0]
    for column in data.columns:
        # Vectorised comparison, counted once and reused for count and percentage.
        null_count = int(data[column].eq(null_marker).sum())
        # A frame without rows has nothing to count; report 0.0 instead of dividing by zero.
        null_percent = null_count * 100 / total_rows if total_rows else 0.0
        print('{}: {} ({} %)'.format(str(column).ljust(label_width),
                                     null_count,
                                     null_percent))

In [44]:
# Report the per-column count of '?' placeholders for the training frame.
print('==== Train Data ====')
print_data_null_encoded(train_data)

# print() appends its own newline, so this leaves three blank lines between the two reports.
print('\n\n')

# Same report for the test frame.
print('==== Test Data ====')
print_data_null_encoded(test_data)

==== Train Data ====
age                      : 0 (0.0 %)
sex                      : 0 (0.0 %)
chest_pain_type          : 0 (0.0 %)
resting_blood_pressure   : 47 (6.033376123234916 %)
serum_cholestrol         : 24 (3.0808729139922977 %)
fasting_blood_sugar      : 78 (10.012836970474968 %)
resting_ECG              : 1 (0.12836970474967907 %)
max_heart_rate_achieved  : 44 (5.648267008985879 %)
excercise_induced_angina : 44 (5.648267008985879 %)
ST_depression            : 49 (6.290115532734275 %)
peak_exercise_ST_segment : 262 (33.632862644415916 %)
num_of_major_vessels     : 514 (65.98202824133504 %)
thal                     : 408 (52.374839537869065 %)
heart_disease_diagnosis  : 0 (0.0 %)



==== Test Data ====
age                      : 0 (0.0 %)
sex                      : 0 (0.0 %)
chest_pain_type          : 0 (0.0 %)
resting_blood_pressure   : 12 (8.51063829787234 %)
serum_cholestrol         : 6 (4.25531914893617 %)
fasting_blood_sugar      : 12 (8.51063829787234 %)
resting_ECG      