In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Allow up to 1000 columns to be shown when a frame is displayed.
pd.set_option('display.max_columns', 1000)

%matplotlib inline

## Load Data

### Initialize Path Constants

In [19]:
# Relative directory the raw train/test CSV files are read from.
RAW_DATA_PATH = '../data/raw'
# Relative directory the processed CSV files are written to.
PROCESSED_DATA_PATH = '../data/processed'

### Load CSV Files

In [20]:
# Resolve both CSV locations under RAW_DATA_PATH, then read each into a frame.
train_csv_path = '{}/tubes2_HeartDisease_train.csv'.format(RAW_DATA_PATH)
test_csv_path = '{}/tubes2_HeartDisease_test.csv'.format(RAW_DATA_PATH)

train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)

## Rename Column Names & Convert '?' to NaN

The '?' placeholder is replaced with `np.nan` so that columns containing missing values can later be converted to a numeric dtype instead of remaining `object` columns.

In [21]:
# NOTE(review): deepcopy is not used anywhere in the visible notebook.
from copy import deepcopy

# Mapping from the generic CSV headers to descriptive feature names.
# The test file only carries the 13 feature columns.
test_columns_replacement = {
    'Column1': 'age',
    'Column2': 'sex',
    'Column3': 'chest_pain_type',
    'Column4': 'resting_blood_pressure',
    'Column5': 'serum_cholestrol',
    'Column6': 'fasting_blood_sugar',
    'Column7': 'resting_ECG',
    'Column8': 'max_heart_rate_achieved',
    'Column9': 'excercise_induced_angina',
    'Column10': 'ST_depression',
    'Column11': 'peak_exercise_ST_segment',
    'Column12': 'num_of_major_vessels',
    'Column13': 'thal',
}

# The training file additionally contains the target in Column14.
train_columns_replacement = test_columns_replacement.copy()
train_columns_replacement['Column14'] = 'heart_disease_diagnosis'

# Use np.nan (lowercase): the np.NaN alias was removed in NumPy 2.0 and
# raises AttributeError there, while np.nan works on every NumPy version.
train_df = train_df.rename(columns=train_columns_replacement).replace('?', np.nan)
test_df = test_df.rename(columns=test_columns_replacement).replace('?', np.nan)

combine = [train_df, test_df]

In [22]:
train_df.head()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,serum_cholestrol,fasting_blood_sugar,resting_ECG,max_heart_rate_achieved,excercise_induced_angina,ST_depression,peak_exercise_ST_segment,num_of_major_vessels,thal,heart_disease_diagnosis
0,54,1,4,125,216,0,0,140,0,0.0,,,,1
1,55,1,4,158,217,0,0,110,1,2.5,2.0,,,1
2,54,0,3,135,304,1,0,170,0,0.0,1.0,0.0,3.0,0
3,48,0,3,120,195,0,0,125,0,0.0,,,,0
4,50,1,4,120,0,0,1,156,1,0.0,1.0,,6.0,3


In [23]:
test_df.head()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,serum_cholestrol,fasting_blood_sugar,resting_ECG,max_heart_rate_achieved,excercise_induced_angina,ST_depression,peak_exercise_ST_segment,num_of_major_vessels,thal
0,60,1,2,160,267,1,1,157,0,0.5,2,,
1,61,1,4,148,203,0,0,161,0,0.0,1,1.0,7.0
2,54,1,4,130,242,0,0,91,1,1.0,2,,
3,48,1,4,120,260,0,0,115,0,2.0,2,,
4,57,0,1,130,308,0,0,98,0,1.0,2,,


## Data Preprocessing

## Impute Null Data

### To keep it simple, we impute null values with the median (numerical features)

In [24]:
def stringify_categorical_data(df, colnames):
    """Cast the given columns of ``df`` to ``str``, modifying ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are overwritten.
    colnames : iterable of str
        Names of the columns to convert.

    Returns
    -------
    None
    """
    for name in colnames:
        as_text = df[name].astype(str)
        df[name] = as_text

def impute_numerical_data(df, colnames):
    """Convert the given columns to float64 and fill missing values with the median.

    The frame is modified in place. Each column is first cast to ``float64``
    so that the median is computed on numbers even when the column arrived as
    ``object`` (e.g. numeric strings mixed with NaN). The median is taken from
    the column of the frame passed in, ignoring missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are overwritten.
    colnames : iterable of str
        Names of the numerical columns to impute.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If a column holds a value that cannot be converted to float.
    """
    for colname in colnames:
        # Cast before computing the median: the previous `!= np.NaN` filter was
        # always True (NaN never compares equal) and the median of an object
        # column of strings fails on current pandas.
        numeric_col = df[colname].astype(np.float64)
        # Assign the result back instead of fillna(inplace=True) on a column
        # selection, which is chained assignment and is not guaranteed to
        # update the parent frame.
        df[colname] = numeric_col.fillna(numeric_col.median())

### Categorical data :
Categorical (qualitative) data is a measurement expressed not in terms of numbers, but rather by means of a natural language description. In statistics, "qualitative" data is often used interchangeably with "categorical" data. Categorical data represent characteristics such as a person’s gender, marital status, hometown, or the types of movies they like. Categorical data can take on numerical values (such as “1” indicating male and “2” indicating female), but those numbers don’t have mathematical meaning. You couldn’t add them together, for example.

### Fields considered categorical in our problem:
- peak_exercise_ST_segment
- excercise_induced_angina (spelled this way in the renamed columns)
- thal

### Numerical data :
Numerical (quantitative) data is a measurement expressed not by means of a natural language description, but rather in terms of numbers. These data have meaning as a measurement, such as a person’s height, weight, IQ, or blood pressure; or they’re a count, such as the number of stock shares a person owns, how many teeth a dog has, or how many pages you can read of your favorite book before you fall asleep.

### Fields considered numerical in our problem:
- max_heart_rate_achieved
- ST_depression
- num_of_major_vessels
- resting_blood_pressure
- fasting_blood_sugar
- serum_cholestrol
- resting_ECG


In [25]:
# Columns cast to str so that they are one-hot encoded later on.
categorical_data_colname = [
    'peak_exercise_ST_segment',
    'excercise_induced_angina',
    'thal',
]

# Columns converted to float64 and median-imputed.
numerical_data_colname = [
    'max_heart_rate_achieved',
    'ST_depression',
    'num_of_major_vessels',
    'resting_blood_pressure',
    'fasting_blood_sugar',
    'serum_cholestrol',
    'resting_ECG',
]

# Both helpers modify the frame they receive, so train and test are updated
# in place; each frame is imputed with its own column medians.
for frame in (train_df, test_df):
    impute_numerical_data(frame, numerical_data_colname)
    stringify_categorical_data(frame, categorical_data_colname)

train_df.head()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,serum_cholestrol,fasting_blood_sugar,resting_ECG,max_heart_rate_achieved,excercise_induced_angina,ST_depression,peak_exercise_ST_segment,num_of_major_vessels,thal,heart_disease_diagnosis
0,54,1,4,125.0,216.0,0.0,0.0,140.0,0,0.0,,0.0,,1
1,55,1,4,158.0,217.0,0.0,0.0,110.0,1,2.5,2.0,0.0,,1
2,54,0,3,135.0,304.0,1.0,0.0,170.0,0,0.0,1.0,0.0,3.0,0
3,48,0,3,120.0,195.0,0.0,0.0,125.0,0,0.0,,0.0,,0
4,50,1,4,120.0,0.0,0.0,1.0,156.0,1,0.0,1.0,0.0,6.0,3


In [26]:
test_df.head()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,serum_cholestrol,fasting_blood_sugar,resting_ECG,max_heart_rate_achieved,excercise_induced_angina,ST_depression,peak_exercise_ST_segment,num_of_major_vessels,thal
0,60,1,2,160.0,267.0,1.0,1.0,157.0,0,0.5,2,0.0,
1,61,1,4,148.0,203.0,0.0,0.0,161.0,0,0.0,1,1.0,7.0
2,54,1,4,130.0,242.0,0.0,0.0,91.0,1,1.0,2,0.0,
3,48,1,4,120.0,260.0,0.0,0.0,115.0,0,2.0,2,0.0,
4,57,0,1,130.0,308.0,0.0,0.0,98.0,0,1.0,2,0.0,


In [27]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 779 entries, 0 to 778
Data columns (total 14 columns):
age                         779 non-null int64
sex                         779 non-null int64
chest_pain_type             779 non-null int64
resting_blood_pressure      779 non-null float64
serum_cholestrol            779 non-null float64
fasting_blood_sugar         779 non-null float64
resting_ECG                 779 non-null float64
max_heart_rate_achieved     779 non-null float64
excercise_induced_angina    779 non-null object
ST_depression               779 non-null float64
peak_exercise_ST_segment    779 non-null object
num_of_major_vessels        779 non-null float64
thal                        779 non-null object
heart_disease_diagnosis     779 non-null int64
dtypes: float64(7), int64(4), object(3)
memory usage: 85.3+ KB


## One-Hot Encoding

In [28]:
# One-hot encode the string-typed (categorical) columns of both frames.
# dtype is pinned to uint8 so the indicator columns are written as 0/1 on
# every pandas version; since pandas 2.0 the default dummy dtype is bool,
# which would store True/False in the processed CSV files instead.
train_df = pd.get_dummies(train_df, dtype=np.uint8)
test_df = pd.get_dummies(test_df, dtype=np.uint8)

In [29]:
train_df.head()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,serum_cholestrol,fasting_blood_sugar,resting_ECG,max_heart_rate_achieved,ST_depression,num_of_major_vessels,heart_disease_diagnosis,excercise_induced_angina_0,excercise_induced_angina_1,excercise_induced_angina_nan,peak_exercise_ST_segment_1,peak_exercise_ST_segment_2,peak_exercise_ST_segment_3,peak_exercise_ST_segment_nan,thal_3,thal_6,thal_7,thal_nan
0,54,1,4,125.0,216.0,0.0,0.0,140.0,0.0,0.0,1,1,0,0,0,0,0,1,0,0,0,1
1,55,1,4,158.0,217.0,0.0,0.0,110.0,2.5,0.0,1,0,1,0,0,1,0,0,0,0,0,1
2,54,0,3,135.0,304.0,1.0,0.0,170.0,0.0,0.0,0,1,0,0,1,0,0,0,1,0,0,0
3,48,0,3,120.0,195.0,0.0,0.0,125.0,0.0,0.0,0,1,0,0,0,0,0,1,0,0,0,1
4,50,1,4,120.0,0.0,0.0,1.0,156.0,0.0,0.0,3,0,1,0,1,0,0,0,0,1,0,0


In [30]:
train_df.values.shape

(779, 22)

In [31]:
test_df.head()

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,serum_cholestrol,fasting_blood_sugar,resting_ECG,max_heart_rate_achieved,ST_depression,num_of_major_vessels,excercise_induced_angina_0,excercise_induced_angina_1,excercise_induced_angina_nan,peak_exercise_ST_segment_1,peak_exercise_ST_segment_2,peak_exercise_ST_segment_3,peak_exercise_ST_segment_nan,thal_3,thal_6,thal_7,thal_nan
0,60,1,2,160.0,267.0,1.0,1.0,157.0,0.5,0.0,1,0,0,0,1,0,0,0,0,0,1
1,61,1,4,148.0,203.0,0.0,0.0,161.0,0.0,1.0,1,0,0,1,0,0,0,0,0,1,0
2,54,1,4,130.0,242.0,0.0,0.0,91.0,1.0,0.0,0,1,0,0,1,0,0,0,0,0,1
3,48,1,4,120.0,260.0,0.0,0.0,115.0,2.0,0.0,1,0,0,0,1,0,0,0,0,0,1
4,57,0,1,130.0,308.0,0.0,0.0,98.0,1.0,0.0,1,0,0,0,1,0,0,0,0,0,1


In [32]:
test_df.shape

(141, 21)

In [39]:
# Align the test columns with the training features (target excluded).
# reindex keeps the training column order and adds an all-zero column for any
# dummy category that never occurred in the test set, where plain column
# selection would raise KeyError. Test-only columns are dropped, as before.
feature_columns = train_df.drop('heart_disease_diagnosis', axis=1).columns
test_df = test_df.reindex(columns=feature_columns, fill_value=0)

## Save Processed Data

Save the processed data to the ../data/processed folder

In [53]:
# Write both processed frames under PROCESSED_DATA_PATH without the row index.
processed_train_path = '{}/processed_train_data.csv'.format(PROCESSED_DATA_PATH)
processed_test_path = '{}/processed_test_data.csv'.format(PROCESSED_DATA_PATH)

train_df.to_csv(processed_train_path, index=False)
test_df.to_csv(processed_test_path, index=False)

## Reference

1. http://scaryscientist.blogspot.com/2015/02/classification-of-data-types.html