# CAD
This data set is taken from [here](https://ieee-dataport.org/open-access/heart-disease-dataset-comprehensive)

# Initializing 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score,accuracy_score,precision_score,recall_score
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import StackingClassifier,RandomForestClassifier,VotingClassifier 
from sklearn.svm import SVC
from sklearn import tree
from sklearn.model_selection import KFold

In [2]:
# calculate different performance metric for the model clf
def clf_score(clf, X_test, y_test):
    y_pred = clf.predict(X_test)
    ac = accuracy_score(y_test,y_pred)
    f1 = f1_score(y_test,y_pred)
    pr = precision_score(y_test,y_pred)
    re  = recall_score(y_test,y_pred)
   
    return [ac,f1,pr,re]

In [3]:
# normalsie the input base on the norm parametrs, if norm is none, no normalisation will happen
def normalisation(X_train,X_test, norm ):
    if norm != None:
        scaler1 = norm
        
        X_train  =  scaler1.fit_transform(X_train)
        X_test  =  scaler1.transform(X_test)

    #X_train =  scaler1.fit_transform(X_train)
    #X_test =  scaler1.transform(X_test)
    return X_train, X_test

In [4]:
# calculate mean and std for kfold results
def show_statics(arr):
    
    ac = np.mean([ x[0] for x in arr]), np.std([ x[0] for x in arr])
    f1 = np.mean([ x[1] for x in arr]), np.std([ x[1] for x in arr])
    pr = np.mean([ x[2] for x in arr]), np.std([ x[2] for x in arr])
    re = np.mean([ x[3] for x in arr]), np.std([ x[3] for x in arr])
    return [ac,f1,pr,re]

In [5]:
# train a model and 
def train_clf(clf, X,y,norm,  print_ind = True):
    Accs = []
    for i, (train_index, valid_index) in enumerate(kf.split(X)):
            
        X_train = X[train_index]
        y_train = y[train_index]
        X_valid = X[valid_index]
        y_valid = y[valid_index]
        X_train , X_valid = normalisation(X_train , X_valid,norm)
        
        clf.fit(X_train, y_train)

        res = clf_score(clf,X_train,y_train)
        if (print_ind): print('Train: ', res)

        res = clf_score(clf,X_valid,y_valid)
        if (print_ind): print('Valid: ', res)
        Accs.append(res)
    #print(show_statics( Accs))
    return show_statics( Accs)    


# Working on dataset

## Loading dataset 

In [6]:
df1 = pd.read_csv("data.csv")
df1 = df1.drop_duplicates()
y = df1["target"].to_numpy()
X = df1.drop("target", axis=1).to_numpy()



In [7]:
print("Sample after removing duplicates: N %d, P %d" %(len(y[y==0]),len(y[y==1])))

Sample after removing duplicates: N 410, P 508


In [8]:
X_train, X_test,y_train, y_test = train_test_split(X,y , 
                                   random_state=10,  
                                   test_size=0.2,  
                                   shuffle=True)


kf = KFold(n_splits=10, random_state=0, shuffle = True)

 

In [9]:
len(y_test[y_test==0]),len(y_test[y_test==1])

(86, 98)

## Aplying Machine learning model

In [10]:
norm =  StandardScaler()
X_train,X_test = normalisation(X_train,X_test, norm)

### KNN 

In [11]:
n_neigh = [1,2,3,4,5,6,7,8,9,10]
ps = [1,1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,1.9,2]

best_valid = 0
best_test = 0
best_n = 0
best_p = 0
for n in n_neigh:
    for p in ps:
        #print('------',norm, n , p)
        clf_knn_org = KNeighborsClassifier(n_neighbors=n,p = p )
        ac,f1,pr,re = train_clf(clf_knn_org,X_train,y_train,norm,False)
        #print('Valid :')
        #print(ac,f1,pr,re)
    
        #print('Test :')
        #print(n , p)
        ac_t,f1_t,pr_t,re_t = clf_score( clf_knn_org,X_test, y_test)
        if (ac[0] > best_valid):
            best_valid = ac[0]
            best_test  = ac_t
            best_n = n
            best_p = p
        #print('--------')

print('******************')
print(norm, best_valid , best_test, best_n,best_p)

******************
StandardScaler() 0.8803221029248427 0.8315217391304348 7 1.4


In [12]:
 
clf_knn_org = KNeighborsClassifier(n_neighbors=best_n,p = best_p )
x1 , x2 = normalisation(X_train, X_test,StandardScaler())
clf_knn_org.fit(x1, y_train)
print(norm)
print(clf_score( clf_knn_org,x2, y_test))


StandardScaler()
[0.8315217391304348, 0.8410256410256411, 0.845360824742268, 0.8367346938775511]


### SVM

In [13]:
 
Cs = [1,51,101]#,151,201,251]
gammas = [0.01,0.1]#,0.2,0.3,0.4]
kernels = ['rbf', 'linear']
best_valid = 0
best_test = 0
best_C = 0
best_gamma = 0


for k in kernels:
    for c in Cs:
        for g in gammas:
        
            #print('------',norm, c , g, k)
            clf_svm_org = svm.SVC(C=c,gamma=g,kernel=k)
            ac,f1,pr,re = train_clf(clf_svm_org,X_train,y_train,None,False)
            #print('Valid :')
            #print(ac,f1,pr,re)
        
            #print('Test :')
            #print(n , p)
            ac_t,f1_t,pr_t,re_t = clf_score( clf_svm_org,X_test, y_test)
            if (ac[0] >= best_valid):
                best_valid = ac[0]
                best_test  = ac_t
                best_C = c
                best_gamma = g
                best_kernel = k
            #print('--------')

    #print('******************')
print(norm, best_valid , best_test, best_C,best_gamma,best_kernel)

StandardScaler() 0.8693631988152536 0.8315217391304348 1 0.01 rbf


In [14]:
clf_svm_org =  svm.SVC(C=best_C,gamma=best_gamma,kernel=best_kernel,probability=True)
clf_svm_org.fit(X_train, y_train)
print(norm)
print(clf_score( clf_svm_org,X_test, y_test))


StandardScaler()
[0.842391304347826, 0.8542713567839196, 0.8415841584158416, 0.8673469387755102]


### Random Forest

In [None]:
ns = np.arange(50,250,10)
max_depths = np.arange(1,10,1)
min_samples_leaves =  np.arange(1,10,1)
best_valid = 0
best_test = 0
best_n = 0
best_max_depth = 0
best_min_samples_leaf = 0

for n in ns:
    for min_samples_leaf in min_samples_leaves:
        for max_depth in max_depths:
                 
                clf_rf_new = RandomForestClassifier(n_estimators=n, min_samples_leaf=min_samples_leaf ,max_depth= max_depth, bootstrap=True ,   random_state=0 )
                ac,f1,pr,re = train_clf(clf_rf_new,X_train,y_train,None,False)

                ac_t,f1_t,pr_t,re_t = clf_score( clf_svm_org,X_test, y_test)
                if (ac[0] >= best_valid):
                    best_valid = ac[0]
                    best_test  = ac_t
                    best_n = n
                    best_max_depth = max_depth
                    best_min_samples_leaf = min_samples_leaf
                    print(norm, best_valid , best_test, best_n,best_max_depth) 
            #print('--------')

print('******************')
print(norm, best_valid , best_test, best_n,best_max_depth) 

StandardScaler() 0.8312476860422064 0.842391304347826 50 1
StandardScaler() 0.8529803776379119 0.842391304347826 50 2
StandardScaler() 0.8570899666790078 0.842391304347826 50 3
StandardScaler() 0.866604961125509 0.842391304347826 50 4
StandardScaler() 0.8694557571269901 0.842391304347826 50 7
StandardScaler() 0.8748056275453535 0.842391304347826 50 9
StandardScaler() 0.8775268419104034 0.842391304347826 60 9


In [None]:
 
clf_rf_new =  RandomForestClassifier(n_estimators=n,bootstrap=True ,   max_depth= max_depth, min_samples_leaf=best_min_samples_leaf , random_state=0 )

 
clf_rf_new.fit(X_train, y_train)
print(norm)
print(clf_score( clf_rf_new,X_test, y_test))


### Ensemble models

In [None]:

model_tuples1 = [('clf_svm',  clf_svm_org),('clf_knn_org', clf_knn_org), ('clf_rf',clf_rf_new)  ]
model_tuples2 = [('clf_svm1',  clf_svm_org),('clf_knn1', clf_knn_org), ('clf_rf1',clf_rf_new)  ]
model_tuples3 = [('clf_svm2',  clf_svm_org),('clf_knn2', clf_knn_org), ('clf_rf2',clf_rf_new)  ]

best_m1 = None
best_m2 = None
best_m3 = None  
best_valid = 0
best_test = 0
for m1 in model_tuples1:
    for m2 in model_tuples2:
        for m3 in model_tuples3:
            estimator = [] 
            estimator.append(m1)
            estimator.append(m2)
            estimator.append(m3)
          
            stacked_model = VotingClassifier(estimators = estimator, voting ='soft') 
           
            ac,f1,pr,re = train_clf(stacked_model,X_train,y_train,None,False)

            ac_t,f1_t,pr_t,re_t = clf_score( stacked_model,X_test, y_test)
            if (ac[0] >= best_valid):
                best_valid = ac[0]
                best_test  = ac_t
                best_m1 = m1
                best_m2 = m2
                best_m3 = m3
                print(best_m1[0], best_m2[0], best_m3[0], best_valid, best_test)



In [None]:
estimator = []  
estimator.append(best_m1)
estimator.append(best_m2)
estimator.append(best_m3)
stacked_model = VotingClassifier(estimators = estimator, voting ='soft') 
 
stacked_model.fit(X_train, y_train)
print(clf_score( stacked_model,X_test, y_test))


In [None]:
stacked_model = StackingClassifier(
    estimators=[ best_m1,best_m2,best_m3 ],
    final_estimator=SVC(C=100,gamma=0.01)
)
 
stacked_model.fit(X_train, y_train)
#95.04
 
print(clf_score( stacked_model,X_test, y_test))