In [70]:
import sys
import numpy as np
import pandas as pd
from time import time
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier

data = pd.read_csv(r'C:\Users\Guo Beiting\Downloads\heart.csv')
data

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [71]:
pd.set_option('display.max_columns', None)
data = pd.concat([data, pd.get_dummies(data['Sex']), pd.get_dummies(data['ChestPainType']), 
           pd.get_dummies(data['RestingECG']), pd.get_dummies(data['ExerciseAngina']), pd.get_dummies(data['ST_Slope'])], axis=1).reindex(data.index)

data = data.drop(['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'], axis=1)
data

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,F,M,ASY,ATA,NAP,TA,LVH,Normal,ST,N,Y,Down,Flat,Up
0,40,140,289,0,172,0.0,0,0,1,0,1,0,0,0,1,0,1,0,0,0,1
1,49,160,180,0,156,1.0,1,1,0,0,0,1,0,0,1,0,1,0,0,1,0
2,37,130,283,0,98,0.0,0,0,1,0,1,0,0,0,0,1,1,0,0,0,1
3,48,138,214,0,108,1.5,1,1,0,1,0,0,0,0,1,0,0,1,0,1,0
4,54,150,195,0,122,0.0,0,0,1,0,0,1,0,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,110,264,0,132,1.2,1,0,1,0,0,0,1,0,1,0,1,0,0,1,0
914,68,144,193,1,141,3.4,1,0,1,1,0,0,0,0,1,0,1,0,0,1,0
915,57,130,131,0,115,1.2,1,0,1,1,0,0,0,0,1,0,0,1,0,1,0
916,57,130,236,0,174,0.0,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0


In [72]:
def split_train_test(data, test_ratio):
    shuffled_indices = np.random.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices] 

data_mean = data.mean()
data_std = data.std()

# Split the training data and test data
Train, Test = split_train_test(data,0.2)
Train_y = Train['HeartDisease'].reset_index(drop=True)
Test_y = Test['HeartDisease'].reset_index(drop=True)

Train = Train.drop(['HeartDisease'], axis=1)
Test = Test.drop(['HeartDisease'], axis=1)
# remove label, normalization 
Train = (Train-data_mean[np.arange(21)!=6])/data_std[np.arange(21)!=6]
Train = Train.fillna(0).reset_index(drop=True)

Test = (Test-data_mean[np.arange(21)!=6])/data_std[np.arange(21)!=6]
Test = Test.fillna(0).reset_index(drop=True)
Train_y

0      1
1      0
2      1
3      1
4      0
      ..
730    0
731    1
732    1
733    0
734    1
Name: HeartDisease, Length: 735, dtype: int64

In [73]:
def dimensionality_reduction_LDA(n_components, X_train, y_train):
    print("Extracting the top %d features from %d data"% (n_components, X_train.shape[0]))
    t0 = time()
    pca = PCA(n_components=n_components).fit(X_train)

    lda = LDA().fit(pca.transform(X_train), y_train)
    print("done in %0.3fs" % (time() - t0))

    return lda, pca

def train_text_transform_LDA(lda, pca, X_train, X_test):
    print("Projecting the input data on the eigen orthonormal basis")
    t0 = time()
    X_train_lda = lda.transform(pca.transform(X_train))
    X_test_lda = lda.transform(pca.transform(X_test))
    print("done in %0.3fs" % (time() - t0))

    return X_train_lda, X_test_lda

In [79]:
n_components = 4

lda, pca = dimensionality_reduction_LDA(n_components, Train, Train_y)

X_train_lda, X_test_lda = train_text_transform_LDA(lda, pca, Train, Test)
print(X_train_lda.shape, X_test_lda.shape)
print(Train_y.shape, Test_y.shape)

Extracting the top 4 features from 735 data
done in 0.008s
Projecting the input data on the eigenfaces orthonormal basis
done in 0.002s
(735, 1) (183, 1)
(735,) (183,)


In [80]:
forest = RandomForestClassifier(n_estimators=10, 
                                random_state=42, 
                                max_depth=8, 
                                max_features=5, 
                                min_samples_leaf=5)

forest.fit(X_train_lda, Train_y)
y_pred = forest.predict(X_test_lda)

In [81]:
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, confusion_matrix


print('\nConfusion Matrix: \n', confusion_matrix(Test_y, y_pred))
print('\nAccuracy Score: ', accuracy_score(Test_y, y_pred))
print('\nClassification Report: \n', classification_report(Test_y, y_pred))


Confusion Matrix: 
 [[69 16]
 [10 88]]

Accuracy Score:  0.8579234972677595

Classification Report: 
               precision    recall  f1-score   support

           0       0.87      0.81      0.84        85
           1       0.85      0.90      0.87        98

    accuracy                           0.86       183
   macro avg       0.86      0.85      0.86       183
weighted avg       0.86      0.86      0.86       183

