1. Load heart disease dataset in pandas dataframe
2. Remove outliers using Z score. Usual guideline is to remove anything that has Z score > 3 formula or Z score < -3
3. Convert text columns to numbers using label encoding and one hot encoding
4. Apply scaling
5. Build a classification model using various methods (SVM, logistic regression, random forest) and check which model gives you the best accuracy
6. Now use PCA to reduce dimensions, retrain your model and see what impact it has on your model in terms of accuracy. Keep in mind that many times doing PCA reduces the accuracy but computation is much lighter and that's the trade off you need to consider while building models in real life

In [379]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [380]:
df = pd.read_csv("heart_disease.csv")
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [381]:
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [382]:
df.isna().any()

Age               False
Sex               False
ChestPainType     False
RestingBP         False
Cholesterol       False
FastingBS         False
RestingECG        False
MaxHR             False
ExerciseAngina    False
Oldpeak           False
ST_Slope          False
HeartDisease      False
dtype: bool

In [383]:
df.nunique()

Age                50
Sex                 2
ChestPainType       4
RestingBP          67
Cholesterol       222
FastingBS           2
RestingECG          3
MaxHR             119
ExerciseAngina      2
Oldpeak            53
ST_Slope            3
HeartDisease        2
dtype: int64

In [384]:
df.shape

(918, 12)

Outliers are the value with Z score > 3 or Z score < -3, So the following five Cells are selecting the data in between the range i.e., -3 < Z score < 3

In [385]:
df = df[(df.RestingBP>=df.RestingBP.mean()-3*df.RestingBP.std())]
df = df[(df.RestingBP<=df.RestingBP.mean()+3*df.RestingBP.std())]
df.shape

(910, 12)

In [386]:
df = df[(df.Cholesterol>=df.Cholesterol.mean()-3*df.Cholesterol.std())]
df = df[(df.Cholesterol<=df.Cholesterol.mean()+3*df.Cholesterol.std())]
df.shape

(907, 12)

In [387]:
df = df[(df.FastingBS>=df.FastingBS.mean()-3*df.FastingBS.std())]
df = df[(df.FastingBS<=df.FastingBS.mean()+3*df.FastingBS.std())]
df.shape

(907, 12)

In [388]:
df = df[(df.MaxHR>=df.MaxHR.mean()-3*df.MaxHR.std())]
df = df[(df.MaxHR<=df.MaxHR.mean()+3*df.MaxHR.std())]
df.shape

(906, 12)

In [389]:
df = df[(df.Oldpeak>=df.Oldpeak.mean()-3*df.Oldpeak.std())]
df = df[(df.Oldpeak<=df.Oldpeak.mean()+3*df.Oldpeak.std())]
df.shape

(899, 12)

In [390]:
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [391]:
df.nunique()

Age                50
Sex                 2
ChestPainType       4
RestingBP          63
Cholesterol       219
FastingBS           2
RestingECG          3
MaxHR             116
ExerciseAngina      2
Oldpeak            47
ST_Slope            3
HeartDisease        2
dtype: int64

In [392]:
df.Sex.unique()

array(['M', 'F'], dtype=object)

In [393]:
df.ChestPainType.unique()

array(['ATA', 'NAP', 'ASY', 'TA'], dtype=object)

In [394]:
df.RestingECG.unique()

array(['Normal', 'ST', 'LVH'], dtype=object)

In [395]:
df.ExerciseAngina.unique()

array(['N', 'Y'], dtype=object)

In [396]:
df.ST_Slope.unique()

array(['Up', 'Flat', 'Down'], dtype=object)

In [397]:
df.ExerciseAngina.replace({
    'N':0,
    'Y':1
}, inplace=True)
df.RestingECG.replace({
    'Normal':1,
    'ST':2,
    'LVH':3
}, inplace=True)
df.ST_Slope.replace({
    'Down':1,
    'Flat':2,
    'Up':3
}, inplace=True)
df


Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,1,172,0,0.0,3,0
1,49,F,NAP,160,180,0,1,156,0,1.0,2,1
2,37,M,ATA,130,283,0,2,98,0,0.0,3,0
3,48,F,ASY,138,214,0,1,108,1,1.5,2,1
4,54,M,NAP,150,195,0,1,122,0,0.0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,1,132,0,1.2,2,1
914,68,M,ASY,144,193,1,1,141,0,3.4,2,1
915,57,M,ASY,130,131,0,1,115,1,1.2,2,1
916,57,F,ATA,130,236,0,3,174,0,0.0,2,1


In [398]:
df = pd.get_dummies(df, drop_first=True)
df

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA
0,40,140,289,0,1,172,0,0.0,3,0,1,1,0,0
1,49,160,180,0,1,156,0,1.0,2,1,0,0,1,0
2,37,130,283,0,2,98,0,0.0,3,0,1,1,0,0
3,48,138,214,0,1,108,1,1.5,2,1,0,0,0,0
4,54,150,195,0,1,122,0,0.0,3,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,110,264,0,1,132,0,1.2,2,1,1,0,0,1
914,68,144,193,1,1,141,0,3.4,2,1,1,0,0,0
915,57,130,131,0,1,115,1,1.2,2,1,1,0,0,0
916,57,130,236,0,3,174,0,0.0,2,1,0,1,0,0


In [399]:
Y = df.HeartDisease.values


In [400]:
X=df.drop('HeartDisease', axis=1).values
X.shape

(899, 13)

Building Various models to apply on data without PCA and Scaling

In [401]:
from sklearn.model_selection import train_test_split
x_train, x_test,y_train,y_test = train_test_split(X,Y, test_size=0.2, random_state=42)

In [402]:
x_test.shape

(180, 13)

In [403]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

In [404]:
lr=LogisticRegression(max_iter=1000)
lr.fit(x_train,y_train)
lr.score(x_test,y_test)

0.8555555555555555

In [405]:
svm = SVC()
svm.fit(x_train, y_train)
svm.score(x_test,y_test)

0.7277777777777777

In [406]:
rf = RandomForestClassifier()
rf.fit(x_train,y_train)
rf.score(x_test,y_test)

0.8888888888888888

In [407]:
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
knn.score(x_test,y_test)

0.7333333333333333

In [408]:
dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)
dt.score(x_test,y_test)

0.8277777777777777

In [409]:
gauss = GaussianNB()
gauss.fit(x_train, y_train)
gauss.score(x_test,y_test)

0.8055555555555556

Now Scaling the Data, Then applying gridSearch for best parameters for the models

In [410]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)
scaled_X

array([[-1.42815446,  0.46590022,  0.84963584, ...,  2.06332497,
        -0.5349047 , -0.22955001],
       [-0.47585532,  1.63471366, -0.16812204, ..., -0.48465463,
         1.86949191, -0.22955001],
       [-1.7455875 , -0.1185065 ,  0.79361247, ...,  2.06332497,
        -0.5349047 , -0.22955001],
       ...,
       [ 0.3706328 , -0.1185065 , -0.62564622, ..., -0.48465463,
        -0.5349047 , -0.22955001],
       [ 0.3706328 , -0.1185065 ,  0.35476274, ...,  2.06332497,
        -0.5349047 , -0.22955001],
       [-1.63977649,  0.34901888, -0.21480818, ..., -0.48465463,
         1.86949191, -0.22955001]])

In [411]:
model_params = {
    'svm':{
        'model' : SVC(gamma='auto'),
        'param' : {
            'C' : [i for i in range(2,21)],
            'kernel':['rbf','linear']
        }
    },
    'random_forest':{
        'model':RandomForestClassifier(),
        'param':{
            'n_estimators':[i for i in range(1,21)],
            'criterion':['gini','entropy']
        }
    },
    'logistic_regression':{
        'model':LogisticRegression(max_iter=100000),
        'param':{
            'C':[i for i in range(2,31)]
        }
    },
    'descision_tree':{
        'model':DecisionTreeClassifier(),
        'param':{
            'criterion':['gini','entropy']
        }
    },
    'knn':{
        'model':KNeighborsClassifier(),
        'param':{
            'n_neighbors':[i for i in range(2,21)]
        }
    }
    
}

In [412]:
from sklearn.model_selection import GridSearchCV
scores=[]

for model_name, mp in model_params.items():
    clf=GridSearchCV(mp['model'],mp['param'], cv=5)
    clf.fit(scaled_X,Y)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params':clf.best_params_
    })
df1 =pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df1

Unnamed: 0,model,best_score,best_params
0,svm,0.817517,"{'C': 2, 'kernel': 'rbf'}"
1,random_forest,0.82419,"{'criterion': 'entropy', 'n_estimators': 15}"
2,logistic_regression,0.807486,{'C': 2}
3,descision_tree,0.737381,{'criterion': 'entropy'}
4,knn,0.830875,{'n_neighbors': 9}


Now Applying PCA to reduce some of the Parameters and repeating the above procedure again

In [413]:
from sklearn.decomposition import PCA
pca = PCA(0.99)
X_pca = pca.fit_transform(scaled_X)
X_pca.shape

(899, 13)

In [414]:
X_pca

array([[-2.8680575 ,  0.21333313, -1.41257566, ...,  0.33593755,
        -0.26942022, -0.04392056],
       [-0.74450526,  0.8093094 ,  1.41678279, ...,  0.93086295,
         0.10779614,  0.16513668],
       [-1.84086917, -0.12689666, -1.53849555, ...,  1.23493619,
         0.56995239, -1.82354081],
       ...,
       [ 1.54257718, -0.60111701, -1.03441609, ..., -0.26438583,
        -0.21511655,  0.26067565],
       [-1.88202652,  1.72745849, -0.15308986, ...,  0.91548106,
        -0.97638834,  0.64645111],
       [-2.14875312, -0.81980931,  1.29769061, ...,  0.12458723,
         0.01801302,  0.01515971]])

In [415]:
pca.explained_variance_ratio_

array([0.22235374, 0.10994919, 0.09805708, 0.09123109, 0.08238096,
       0.0696049 , 0.06700943, 0.06319378, 0.04792169, 0.04652268,
       0.03819182, 0.0328761 , 0.03070755])

In [416]:
x_train_pca,x_test_pca, y_train, y_test = train_test_split(X_pca, Y, test_size=0.2, random_state=42)

In [417]:
lr=LogisticRegression(max_iter=1000)
lr.fit(x_train_pca,y_train)
lr.score(x_test_pca,y_test)

0.8444444444444444

In [418]:
svm = SVC()
svm.fit(x_train_pca, y_train)
svm.score(x_test_pca,y_test)

0.8722222222222222

In [419]:
rf = RandomForestClassifier()
rf.fit(x_train_pca,y_train)
rf.score(x_test_pca,y_test)

0.8222222222222222

In [420]:
knn = KNeighborsClassifier()
knn.fit(x_train_pca, y_train)
knn.score(x_test_pca,y_test)

0.8555555555555555

In [421]:
dt = DecisionTreeClassifier()
dt.fit(x_train_pca, y_train)
dt.score(x_test_pca,y_test)

0.7444444444444445

In [422]:
gauss = GaussianNB()
gauss.fit(x_train_pca, y_train)
gauss.score(x_test_pca,y_test)

0.8388888888888889

In [423]:
scores=[]

for model_name, mp in model_params.items():
    clf1=GridSearchCV(mp['model'],mp['param'], cv=5)
    clf1.fit(X_pca,Y)
    scores.append({
        'model': model_name,
        'best_score': clf1.best_score_,
        'best_params':clf1.best_params_
    })
df1 =pd.DataFrame(scores, columns=['model', 'best_score', 'best_params'])
df1

Unnamed: 0,model,best_score,best_params
0,svm,0.817517,"{'C': 2, 'kernel': 'rbf'}"
1,random_forest,0.826412,"{'criterion': 'entropy', 'n_estimators': 15}"
2,logistic_regression,0.807486,{'C': 2}
3,descision_tree,0.754091,{'criterion': 'entropy'}
4,knn,0.830875,{'n_neighbors': 9}
