In [31]:
import pandas as pd
import numpy as np
import os
import pickle
from sklearn import __version__ as sklearn_version
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from library.sb_utils import save_file

In [32]:
# Load the cleaned patient records from the project data directory and preview
# the first rows to confirm the expected columns are present.
patient_data = pd.read_csv('../data/patient_data_cleaned.csv')
patient_data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289.0,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180.0,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283.0,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214.0,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195.0,0,Normal,122,N,0.0,Up,0


## Train Test Split

In [33]:
# Extract the HeartDisease column; it is passed to train_test_split below as the
# target (y) that the remaining columns are split alongside.
HD = patient_data['HeartDisease']

In [34]:
# Keep only the feature columns by removing the target from the frame.
# errors='ignore' mirrors the original mask-based selection, which is a no-op
# when the column is already absent (e.g. when this cell is re-run).
patient_data = patient_data.drop(columns='HeartDisease', errors='ignore')
patient_data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,M,ATA,140,289.0,0,Normal,172,N,0.0,Up
1,49,F,NAP,160,180.0,0,Normal,156,N,1.0,Flat
2,37,M,ATA,130,283.0,0,ST,98,N,0.0,Up
3,48,F,ASY,138,214.0,0,Normal,108,Y,1.5,Flat
4,54,M,NAP,150,195.0,0,Normal,122,N,0.0,Up


In [35]:
# Hold out 25% of the rows as a test set; the fixed random_state keeps the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    patient_data,
    HD,
    random_state=42,
    test_size=0.25,
)

In [36]:
# Row/column counts of the feature splits (test_size=0.25 was requested above).
X_train.shape, X_test.shape

((687, 11), (230, 11))

In [37]:
# The target splits should have the same number of rows as their feature splits.
y_train.shape, y_test.shape

((687,), (230,))

In [38]:
# Check the data types to see which columns are object-typed (and so need
# one-hot encoding) and which are numeric.
X_train.dtypes

Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol       float64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
Oldpeak           float64
ST_Slope           object
dtype: object

In [39]:
# Same dtype check for the test split.
X_test.dtypes

Age                 int64
Sex                object
ChestPainType      object
RestingBP           int64
Cholesterol       float64
FastingBS           int64
RestingECG         object
MaxHR               int64
ExerciseAngina     object
Oldpeak           float64
ST_Slope           object
dtype: object

## One-hot encoding categorical variables

In [40]:
# One-hot encode the object-typed features of the training split.
# drop_first=True removes the first level of each categorical, so a feature with
# k levels becomes k-1 indicator columns (no redundant column).
X_train = pd.get_dummies(X_train, drop_first = True)
X_train.shape

(687, 15)

In [41]:
# One-hot encode the test split and align it to the training feature layout.
# Encoding each split independently with drop_first=True only yields matching
# columns when every category level occurs in both splits: a level missing from
# the test rows would drop a column, and a missing *first* level would make
# drop_first discard a different level than it did for the training data.
# Instead, encode every level present and reindex to the already-encoded
# X_train columns: the level dropped in training is dropped here too, levels
# absent from the test rows become all-zero columns, and levels never seen in
# training are discarded.
X_test = pd.get_dummies(X_test).reindex(columns=X_train.columns, fill_value=0)
X_test.shape

(230, 15)

In [42]:
# Verifying data types: after encoding, no object-typed columns should remain.
X_train.dtypes

Age                    int64
RestingBP              int64
Cholesterol          float64
FastingBS              int64
MaxHR                  int64
Oldpeak              float64
Sex_M                  uint8
ChestPainType_ATA      uint8
ChestPainType_NAP      uint8
ChestPainType_TA       uint8
RestingECG_Normal      uint8
RestingECG_ST          uint8
ExerciseAngina_Y       uint8
ST_Slope_Flat          uint8
ST_Slope_Up            uint8
dtype: object

In [43]:
# The test split should show the same columns and dtypes as the training split.
X_test.dtypes

Age                    int64
RestingBP              int64
Cholesterol          float64
FastingBS              int64
MaxHR                  int64
Oldpeak              float64
Sex_M                  uint8
ChestPainType_ATA      uint8
ChestPainType_NAP      uint8
ChestPainType_TA       uint8
RestingECG_Normal      uint8
RestingECG_ST          uint8
ExerciseAngina_Y       uint8
ST_Slope_Flat          uint8
ST_Slope_Up            uint8
dtype: object

In [44]:
# Preview the encoded training features before scaling.
X_train.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
155,56,155,342.0,1,150,3.0,1,0,0,0,1,0,1,1,0
362,56,155,199.016358,0,99,0.0,1,0,1,0,0,1,0,1,0
868,59,150,212.0,1,157,1.6,1,0,1,0,1,0,0,0,1
101,51,130,179.0,0,100,0.0,1,0,0,0,1,0,0,0,1
199,57,130,308.0,0,98,1.0,0,0,0,1,1,0,0,1,0


## Scaling/standardizing numeric features

In [45]:
# Columns to rescale. FastingBS is numeric but is left out of this list.
# NOTE(review): it looks like a 0/1 flag in the previewed rows — confirm.
num_col = ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak'] 

In [46]:
# Instantiate both scalers.
# NOTE(review): only mm_scaler is fitted/applied in the cells that follow;
# `scaler` (StandardScaler) is not used in any visible cell.
scaler = StandardScaler()
mm_scaler = MinMaxScaler()

In [47]:
# Scale numerical data, using MinMax to preserve the shape of each distribution.
# The scaler is fitted on the training split only, so the test rows do not
# influence the learned min/max values.
mm_scaler.fit(X_train[num_col])

MinMaxScaler()

In [48]:
# Replace the numeric training columns with their scaled values.
# This overwrites X_train in place, so re-running the cell would transform the
# already-scaled values a second time.
X_train[num_col] = mm_scaler.transform(X_train[num_col])

In [49]:
# Preview the training features after scaling the numeric columns.
X_train.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
155,0.5625,0.575472,0.496139,1,0.685039,0.483871,1,0,0,0,1,0,1,1,0
362,0.5625,0.575472,0.220109,0,0.283465,0.0,1,0,1,0,0,1,0,1,0
868,0.625,0.528302,0.245174,1,0.740157,0.258065,1,0,1,0,1,0,0,0,1
101,0.458333,0.339623,0.181467,0,0.291339,0.0,1,0,0,0,1,0,0,0,1
199,0.583333,0.339623,0.430502,0,0.275591,0.16129,0,0,0,1,1,0,0,1,0


In [50]:
# Apply the scaler fitted on the training split to the test split (no re-fit).
# Test values beyond the training min/max will therefore fall outside [0, 1].
# As with the training cell, re-running this would scale the values twice.
X_test[num_col] = mm_scaler.transform(X_test[num_col])

In [51]:
# Preview the test features after scaling the numeric columns.
X_test.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA,RestingECG_Normal,RestingECG_ST,ExerciseAngina_Y,ST_Slope_Flat,ST_Slope_Up
668,0.333333,0.339623,0.287645,0,0.88189,0.096774,0,1,0,0,0,0,0,1,0
30,0.5,0.481132,0.835907,0,0.527559,0.0,1,0,1,0,1,0,0,1,0
377,0.75,0.622642,0.220109,1,0.464567,0.193548,1,0,0,0,0,1,0,1,0
535,0.6875,0.367925,0.220109,1,0.440945,0.193548,1,0,1,0,0,1,1,1,0
806,0.520833,0.132075,0.432432,0,0.732283,0.0,1,1,0,0,1,0,0,0,1


In [53]:
# Save the preprocessed data to a new csv file.
# `patient_data` still holds the raw feature frame (not encoded, not scaled, and
# without the target), so saving it would persist none of the preprocessing.
# Reattach the target to each processed split and stack the splits back into a
# single frame, sorted by the original row index.
# NOTE: the MinMax scaler was fitted on the training rows only. Splitting this
# saved frame again with test_size=0.25 and random_state=42 reproduces the same
# train/test membership, because the shuffle depends only on the row count and
# the seed.
datapath = '../data'
patient_data_preprocessed = pd.concat([
    X_train.assign(HeartDisease=y_train),
    X_test.assign(HeartDisease=y_test),
]).sort_index()
save_file(patient_data_preprocessed, 'patient_data_preprocessed.csv', datapath)

Writing file.  "../data/patient_data_preprocessed.csv"
