In [1]:
# Import basic libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# Import models
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier 

# Not show warning
import warnings
warnings.filterwarnings("ignore",)


In [2]:
# Load data and show first 5 rows
data = pd.read_csv('heart.csv')
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
# Check rows and columns
data.shape

(303, 14)

In [4]:
# Check the null column
data.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [5]:
# Check frequency on target's column
data['target'].value_counts()

1    165
0    138
Name: target, dtype: int64

In [6]:
# Divide features and target
X = data.drop(columns='target', axis = 1)
y = data['target']

In [7]:
# Show features
print(X)

     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0     63    1   3       145   233    1        0      150      0      2.3   
1     37    1   2       130   250    0        1      187      0      3.5   
2     41    0   1       130   204    0        0      172      0      1.4   
3     56    1   1       120   236    0        1      178      0      0.8   
4     57    0   0       120   354    0        1      163      1      0.6   
..   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
298   57    0   0       140   241    0        1      123      1      0.2   
299   45    1   3       110   264    0        1      132      0      1.2   
300   68    1   0       144   193    1        1      141      0      3.4   
301   57    1   0       130   131    0        1      115      1      1.2   
302   57    0   1       130   236    0        0      174      0      0.0   

     slope  ca  thal  
0        0   0     1  
1        0   0     2  
2        2   0    

In [8]:
# Show target
print(y)

0      1
1      1
2      1
3      1
4      1
      ..
298    0
299    0
300    0
301    0
302    0
Name: target, Length: 303, dtype: int64


In [9]:
# Split data into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 2)

In [10]:
# Show features dan target before and after split.
print('Train data shape:')
print(X.shape)
print(X_train.shape)
print(X_test.shape)

print('\nTest data shape:') 
print(y.shape)
print(y_train.shape)
print(y_test.shape)


Train data shape:
(303, 13)
(242, 13)
(61, 13)

Test data shape:
(303,)
(242,)
(61,)


In [11]:
# Collect all models into a list
models = [LogisticRegression(), SVC(kernel='linear'), KNeighborsClassifier(), RandomForestClassifier()]

In [14]:
# Create function to compare each models
def compare_models():
    for model in models:
        model.fit(X_train, y_train)
        test_prediction = model.predict(X_test)
        accuracy_scores = round(accuracy_score(y_test, test_prediction)*100, 2)
        print(f'Accuracy score on {model}: {accuracy_scores}%')

In [15]:
# Show all models performance
compare_models()

Accuracy score on LogisticRegression(): 90.1639344262295%
Accuracy score on SVC(kernel='linear'): 86.88524590163934%
Accuracy score on KNeighborsClassifier(): 70.49180327868852%
Accuracy score on RandomForestClassifier(): 90.1639344262295%


In [18]:
models = [LogisticRegression(), SVC(kernel='linear'), KNeighborsClassifier(), RandomForestClassifier()]

In [25]:
# Create function to compare each models score
def compare_crossVal_models():
    for model in models:
        cv_score = cross_val_score(model, X, y, cv = 5)
        mean_accuracy = round((sum(cv_score)/len(cv_score))*100, 2)
        print(f'Cross validation on {model}: {cv_score}')
        print(f'Mean Accuracy on {model}: {mean_accuracy}\n')    

In [26]:
# Show all models performance
compare_crossVal_models()

Cross validation on LogisticRegression(): [0.81967213 0.86885246 0.85245902 0.85       0.75      ]
Mean Accuracy on LogisticRegression(): 82.82

Cross validation on SVC(kernel='linear'): [0.81967213 0.8852459  0.80327869 0.86666667 0.76666667]
Mean Accuracy on SVC(kernel='linear'): 82.83

Cross validation on KNeighborsClassifier(): [0.60655738 0.6557377  0.57377049 0.73333333 0.65      ]
Mean Accuracy on KNeighborsClassifier(): 64.39

Cross validation on RandomForestClassifier(): [0.81967213 0.85245902 0.81967213 0.88333333 0.76666667]
Mean Accuracy on RandomForestClassifier(): 82.84

