In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [3]:
df = pd.read_csv('heart.csv')
df['ST_Slope']

0        Up
1      Flat
2        Up
3      Flat
4        Up
       ... 
913    Flat
914    Flat
915    Flat
916    Flat
917      Up
Name: ST_Slope, Length: 918, dtype: object

In [4]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])
df['ExerciseAngina'] = le.fit_transform(df['ExerciseAngina'])
df.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,ATA,140,289,0,Normal,172,0,0.0,Up,0
1,49,0,NAP,160,180,0,Normal,156,0,1.0,Flat,1
2,37,1,ATA,130,283,0,ST,98,0,0.0,Up,0
3,48,0,ASY,138,214,0,Normal,108,1,1.5,Flat,1
4,54,1,NAP,150,195,0,Normal,122,0,0.0,Up,0


In [5]:
dummies1 = pd.get_dummies(df['ChestPainType'],dtype=int)
dummies2 = pd.get_dummies(df['RestingECG'],dtype=int)
dummies3 = pd.get_dummies(df['ST_Slope'],dtype=int)
df_dummies = pd.concat([df,dummies1,dummies2,dummies3],axis='columns')
dummies3

Unnamed: 0,Down,Flat,Up
0,0,0,1
1,0,1,0
2,0,0,1
3,0,1,0
4,0,0,1
...,...,...,...
913,0,1,0
914,0,1,0
915,0,1,0
916,0,1,0


In [6]:
x = df_dummies.drop(['ChestPainType','RestingECG','ST_Slope','TA','ST','Up'],axis='columns')
x.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ASY,ATA,NAP,LVH,Normal,Down,Flat
0,40,1,140,289,0,172,0,0.0,0,0,1,0,0,1,0,0
1,49,0,160,180,0,156,0,1.0,1,0,0,1,0,1,0,1
2,37,1,130,283,0,98,0,0.0,0,0,1,0,0,0,0,0
3,48,0,138,214,0,108,1,1.5,1,1,0,0,0,1,0,1
4,54,1,150,195,0,122,0,0.0,0,0,0,1,0,1,0,0


In [7]:
x.shape

(918, 16)

In [8]:
from scipy.stats import zscore

z_scores = np.abs(zscore(x))
threshold = 3

df_cleaned = x[(z_scores<threshold).all(axis=1)]

df_cleaned.shape

(842, 16)

In [9]:
df_cleaned.head()

Unnamed: 0,Age,Sex,RestingBP,Cholesterol,FastingBS,MaxHR,ExerciseAngina,Oldpeak,HeartDisease,ASY,ATA,NAP,LVH,Normal,Down,Flat
0,40,1,140,289,0,172,0,0.0,0,0,1,0,0,1,0,0
1,49,0,160,180,0,156,0,1.0,1,0,0,1,0,1,0,1
2,37,1,130,283,0,98,0,0.0,0,0,1,0,0,0,0,0
3,48,0,138,214,0,108,1,1.5,1,1,0,0,0,1,0,1
4,54,1,150,195,0,122,0,0.0,0,0,0,1,0,1,0,0


In [10]:
x = df_cleaned.drop('HeartDisease',axis='columns')
y = df_cleaned['HeartDisease']

In [11]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
x = scaler.fit_transform(x)


In [12]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

In [13]:
from sklearn.linear_model import LogisticRegression

model1 = LogisticRegression()
model1.fit(x_train,y_train)
model1.score(x_test,y_test)

0.9230769230769231

In [14]:
from sklearn.svm import SVC
model2 = SVC()
model2.fit(x_train,y_train)
model2.score(x_test,y_test)

0.9289940828402367

In [15]:
from sklearn.ensemble import RandomForestClassifier
model3 = RandomForestClassifier()
model3.fit(x_train,y_train)
model3.score(x_test,y_test)

0.9112426035502958

In [24]:
from sklearn.model_selection import GridSearchCV

model_params = {
    'SVM': {
        'model': SVC(),
        'params': {
            'kernel': ['linear', 'rbf'],
            'C': [0.1, 1, 10]
        }
    },
    'Logistic Regression': {
        'model': LogisticRegression(),
        'params': {
            'C': [0.1, 1, 10]
        }
    },
    'Random Forest': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [10, 100, 200],
            'max_depth': [None, 5, 10]
        }
    }
}
best_estimater = {}
for key,value in model_params.items():
    grid = GridSearchCV(value['model'],value['params'],cv=5,scoring='accuracy',n_jobs=-1)
    grid.fit(x_train,y_train)
    best_estimater[key] = grid.best_estimator_

In [23]:
type(model_params)

dict

In [18]:
best_estimater

{'SVM': SVC(C=0.1),
 'Logistic Regression': LogisticRegression(C=0.1),
 'Random Forest': RandomForestClassifier(max_depth=10, n_estimators=200)}

In [19]:
best_model = max(best_estimater,key = lambda x:best_estimater[x].score(x_test,y_test))
x.shape

(842, 15)

In [20]:
from sklearn.decomposition import PCA
pca = PCA(0.90)
x = pca.fit_transform(x)
x.shape

(842, 8)

In [21]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.2)

In [22]:
model4 = RandomForestClassifier()
model4.fit(x_train,y_train)
model4.score(x_test,y_test)

0.8994082840236687