# 4 Pre-Processing

## 4.1 Import Modules and Data

In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Read the CSV stored in the "Step 2 - Data Wrangling" folder into a DataFrame.
filename = '../Step 2 - Data Wrangling/heart_cleveland_original.csv'
df = pd.read_csv(filepath_or_buffer=filename)

# Preview the first five rows to confirm the file loaded as expected.
df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,condition
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0,1
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0,0


## 4.2 Creating Feature and Target Variables

In [3]:
# Target: the 'condition' column. The list selector keeps it as a
# single-column DataFrame (2-D) rather than a Series.
y = df.loc[:, ['condition']]

# Features: every remaining column.
X = df.drop(columns=['condition'])

# Report both shapes, one per line, then preview the feature frame.
print(X.shape, y.shape, sep='\n')
X.head()

(297, 13)
(297, 1)


Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,69,1,0,160,234,1,2,131,0,0.1,1,1,0
1,69,0,0,140,239,0,0,151,0,1.8,0,2,0
2,66,0,0,150,226,0,0,114,0,2.6,2,0,0
3,65,1,0,138,282,1,2,174,0,1.4,1,1,0
4,64,1,0,110,211,0,2,144,1,1.8,1,0,0


We start by creating our X and y variables for our features and our target. Our target variable is the `condition` column, which tells us whether a patient has heart disease or not.

## 4.3 Dummy Features

In [4]:
# Text labels for each integer-coded categorical column. Converting the codes
# to strings lets pd.get_dummies recognise these columns as categorical.
# NOTE(review): the code-to-label assignments are carried over unchanged from
# the original cell; confirm them against the dataset's data dictionary, since
# they only affect the names of the generated dummy columns.
category_labels = {
    'cp': {
        0: 'Asymptomatic',
        1: 'Atypical Angina',
        2: 'Non-Anginal Pain',
        3: 'Typical Angina',
    },
    'restecg': {
        0: 'Left Ventricular Hypertrophy',
        1: 'Normal',
        2: 'ST-T Wave Abnormality',
    },
    'slope': {
        0: 'Downsloping',
        1: 'Flat',
        2: 'Upsloping',
    },
    'thal': {
        1: 'Fixed Defect',
        2: 'Normal',
        0: 'Reversable Defect',
    },
}

# Work on a copy so the numeric feature frame X stays untouched.
Xcat = X.copy()
for column, labels in category_labels.items():
    Xcat[column] = X[column].replace(labels)

Xcat.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,69,1,Asymptomatic,160,234,1,ST-T Wave Abnormality,131,0,0.1,Flat,1,Reversable Defect
1,69,0,Asymptomatic,140,239,0,Left Ventricular Hypertrophy,151,0,1.8,Downsloping,2,Reversable Defect
2,66,0,Asymptomatic,150,226,0,Left Ventricular Hypertrophy,114,0,2.6,Upsloping,0,Reversable Defect
3,65,1,Asymptomatic,138,282,1,ST-T Wave Abnormality,174,0,1.4,Flat,1,Reversable Defect
4,64,1,Asymptomatic,110,211,0,ST-T Wave Abnormality,144,1,1.8,Flat,0,Reversable Defect


In [5]:
# One-hot encode the text-labelled columns of Xcat; the numeric columns are
# carried through as they are.
dummyX = pd.get_dummies(data=Xcat)
dummyX.head(n=15)

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,ca,cp_Asymptomatic,...,cp_Typical Angina,restecg_Left Ventricular Hypertrophy,restecg_Normal,restecg_ST-T Wave Abnormality,slope_Downsloping,slope_Flat,slope_Upsloping,thal_Fixed Defect,thal_Normal,thal_Reversable Defect
0,69,1,160,234,1,131,0,0.1,1,1,...,0,0,0,1,0,1,0,0,0,1
1,69,0,140,239,0,151,0,1.8,2,1,...,0,1,0,0,1,0,0,0,0,1
2,66,0,150,226,0,114,0,2.6,0,1,...,0,1,0,0,0,0,1,0,0,1
3,65,1,138,282,1,174,0,1.4,1,1,...,0,0,0,1,0,1,0,0,0,1
4,64,1,110,211,0,144,1,1.8,0,1,...,0,0,0,1,0,1,0,0,0,1
5,64,1,170,227,0,155,0,0.6,0,1,...,0,0,0,1,0,1,0,0,1,0
6,63,1,145,233,1,150,0,2.3,0,1,...,0,0,0,1,0,0,1,1,0,0
7,61,1,134,234,0,145,0,2.6,2,1,...,0,1,0,0,0,1,0,0,0,1
8,60,0,150,240,0,171,0,0.9,0,1,...,0,1,0,0,1,0,0,0,0,1
9,59,1,178,270,0,145,0,4.2,0,1,...,0,0,0,1,0,0,1,0,1,0


To create dummy features, we first need to identify our categorical columns. We convert them from numeric codes to text labels so that the get_dummies method can detect them and create dummy features. Features such as sex and exang are already binary, so I left them out: encoding them would create redundant columns (for example, one each for male and female) when a single column will suffice.

## 4.4 Scaling Data

In [6]:
def _standardize(frame):
    """Fit a StandardScaler on `frame` and return (scaler, scaled DataFrame).

    The scaled values are wrapped back into a DataFrame that carries the
    column names of `frame`.
    """
    scaler = StandardScaler()
    scaled = pd.DataFrame(scaler.fit_transform(frame), columns=frame.columns)
    return scaler, scaled


# NOTE(review): both scalers are fit on every row, before the train/test split
# performed later in this notebook, so the held-out rows contribute to the
# mean and variance used for scaling.
scalerDummy, scaledDummy = _standardize(dummyX)  # dummy-encoded features
scalerReg, scaledReg = _standardize(X)           # original integer-coded features

scaledDummy.head()

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,ca,cp_Asymptomatic,...,cp_Typical Angina,restecg_Left Ventricular Hypertrophy,restecg_Normal,restecg_ST-T Wave Abnormality,slope_Downsloping,slope_Flat,slope_Upsloping,thal_Fixed Defect,thal_Normal,thal_Reversable Defect
0,1.600302,0.691095,1.596266,-0.257179,2.430427,-0.812095,-0.696419,-0.820813,0.344824,3.451528,...,-0.957146,-0.989949,-0.116841,1.016979,-0.937948,1.080686,-0.275839,-0.254,-0.794901,0.900542
1,1.600302,-1.44698,0.468418,-0.160859,-0.41145,0.061157,-0.696419,0.63947,1.411625,3.451528,...,-0.957146,1.010153,-0.116841,-0.983304,1.066157,-0.925338,-0.275839,-0.254,-0.794901,0.900542
2,1.268242,-1.44698,1.032342,-0.411292,-0.41145,-1.554358,-0.696419,1.326662,-0.721976,3.451528,...,-0.957146,1.010153,-0.116841,-0.983304,-0.937948,-0.925338,3.625308,-0.254,-0.794901,0.900542
3,1.157555,0.691095,0.355633,0.667499,2.430427,1.065396,-0.696419,0.295874,0.344824,3.451528,...,-0.957146,-0.989949,-0.116841,1.016979,-0.937948,1.080686,-0.275839,-0.254,-0.794901,0.900542
4,1.046868,0.691095,-1.223355,-0.700254,-0.41145,-0.244481,1.435916,0.63947,-0.721976,3.451528,...,-0.957146,-0.989949,-0.116841,1.016979,-0.937948,1.080686,-0.275839,-0.254,-0.794901,0.900542


In [7]:
# Preview the standardized version of the original (non-dummy) features.
scaledReg.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,1.600302,0.691095,-2.240629,1.596266,-0.257179,2.430427,1.010199,-0.812095,-0.696419,-0.820813,0.643781,0.344824,-0.874292
1,1.600302,-1.44698,-2.240629,0.468418,-0.160859,-0.41145,-1.003419,0.061157,-0.696419,0.63947,-0.976583,1.411625,-0.874292
2,1.268242,-1.44698,-2.240629,1.032342,-0.411292,-0.41145,-1.003419,-1.554358,-0.696419,1.326662,2.264145,-0.721976,-0.874292
3,1.157555,0.691095,-2.240629,0.355633,0.667499,2.430427,1.010199,1.065396,-0.696419,0.295874,0.643781,0.344824,-0.874292
4,1.046868,0.691095,-2.240629,-1.223355,-0.700254,-0.41145,1.010199,-0.244481,1.435916,0.63947,0.643781,-0.721976,-0.874292


After creating dummy features, we standardize our data without the target variable. We also scale our regular data without dummy features to see how this performs. Since all of our categorical data is already encoded numerically, creating dummy variables is not strictly necessary.

## 4.5 Train Test Split

In [8]:
# Fixed seed so the 70/30 split is identical on every Restart & Run All.
# Both feature frames have the same number of rows, so passing the same seed
# to both calls also selects the same rows for each train/test set, which
# keeps the dummy and non-dummy variants directly comparable.
SPLIT_SEED = 42

# Split of the scaled dummy-encoded features.
X_train70_dummy, X_test30_dummy, y_train70_dummy, y_test30_dummy = train_test_split(
    scaledDummy, y, test_size=0.3, random_state=SPLIT_SEED
)

# Split of the scaled original (integer-coded) features.
X_train70, X_test30, y_train70, y_test30 = train_test_split(
    scaledReg, y, test_size=0.3, random_state=SPLIT_SEED
)

# Sanity check: training feature/target shapes for each variant.
print(X_train70_dummy.shape,y_train70_dummy.shape)
print(X_train70.shape,y_train70.shape)

(207, 22) (207, 1)
(207, 13) (207, 1)


We create a train/test split for each version of the features (with and without dummy features) since we don't know which may provide us with the best results. Both splits are created now so that either can be used later on without needing to create it then.

## 4.6 Export Data

In [9]:
# Each output file maps to the (features, target) pair stored in it.
pickle_exports = {
    'Train70Data.p': (X_train70, y_train70),
    'Test30Data.p': (X_test30, y_test30),
    'Train70Data_dummy.p': (X_train70_dummy, y_train70_dummy),
    'Test30Data_dummy.p': (X_test30_dummy, y_test30_dummy),
}

# The `with` block guarantees every file is flushed and closed, even if a dump
# fails. The original cell closed Train70_dummy twice and never closed the
# Test30_dummy handle.
for pickle_path, payload in pickle_exports.items():
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(payload, pickle_file)

All the data created will be exported in case any portion of it needs to be referenced later on, without needing to come back and export it then. All the train/test splits that were created on the scaled dummy feature data are exported. The full scaled data and the unscaled dummy feature data are also being exported. Lastly, the target variables and the feature values are being exported so we have the original unaltered data in case something else needs to be performed later on. Pickle is being used to export this data so that the objects are preserved across projects without needing to convert to CSV or other file formats.