In [None]:
! pip install cloudmesh-common -U

In [None]:
# Import Libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,roc_curve,classification_report

# Algorithms
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

from sklearn.cluster import KMeans
from sklearn.cluster import MeanShift, estimate_bandwidth
from itertools import cycle # Meanshift plotting
from sklearn.cluster import SpectralClustering

# Clustering preprocessing 
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

# Clustering postprocessing
from scipy.stats import mode

# Benchmarking 
from cloudmesh.common.StopWatch import StopWatch

In [None]:
# Load the two datasets.
# Both CSV files have to be retrieved manually from Kaggle and renamed to the file names used below:
#   dav: https://www.kaggle.com/johnsmith88/heart-disease-dataset
#   sav: https://www.kaggle.com/sulianova/cardiovascular-disease-dataset
# The "Total" timer is started here and is only stopped in the last cell of the notebook.
StopWatch.start("Total")
StopWatch.start("Data Preprocessing")

# The dav file is read with pandas' default separator.
data_dav = pd.read_csv('dav_heart_disease_dataset.csv')

# The sav file is read with ';' as the field separator.
data_sav = pd.read_csv('sve_cardio_dataset.csv', sep=';')

In [None]:
# Build feature matrices and target vectors, hold out 20% for testing, then standardize.
# The scalers are fitted on the training rows only and re-applied to the test rows.

# --- dav dataset (label column: 'target') ---
y_dav = data_dav['target']
x_dav = data_dav.drop(columns=['target'])
xtrn_dav, xtst_dav, ytrn_dav, ytst_dav = train_test_split(
    x_dav, y_dav, test_size=0.20, random_state=0
)
dav_scaler = StandardScaler().fit(xtrn_dav)
xtrn_dav_n = dav_scaler.transform(xtrn_dav)
xtst_dav_n = dav_scaler.transform(xtst_dav)

# --- sav dataset (label column: 'cardio') ---
# NOTE(review): if this CSV carries a row-identifier column (e.g. `id`), it is currently
# kept as a feature - confirm against the file and drop it if so.
y_sav = data_sav['cardio']
x_sav = data_sav.drop(columns=['cardio'])
xtrn_sav, xtst_sav, ytrn_sav, ytst_sav = train_test_split(
    x_sav, y_sav, test_size=0.20, random_state=0
)
sav_scaler = StandardScaler().fit(xtrn_sav)
xtrn_sav_n = sav_scaler.transform(xtrn_sav)
xtst_sav_n = sav_scaler.transform(xtst_sav)

StopWatch.stop("Data Preprocessing")
StopWatch.status("Data Preprocessing", True)

In [None]:
# Support Vector Classifier (default hyper-parameters).
# Fitted on the standardized training split and scored on the standardized test split.
# Each dataset has its own timer; the timed region also covers printing the report.

# --- dav dataset ---
StopWatch.start("dav SVC")
svc_dav = SVC().fit(xtrn_dav_n, ytrn_dav)
svc_dav_predict = svc_dav.predict(xtst_dav_n)
svc_dav_acc = accuracy_score(ytst_dav, svc_dav_predict)
svc_dav_conf_mat = confusion_matrix(ytst_dav, svc_dav_predict)

print("Dav set")
print("Confusion Matrix\n{}\n\nAccuracy\n{}\n".format(svc_dav_conf_mat, svc_dav_acc * 100))
print(classification_report(ytst_dav, svc_dav_predict))
StopWatch.stop("dav SVC")
StopWatch.status("dav SVC", True)

# --- sav dataset ---
StopWatch.start("sav SVC")
svc_sav = SVC().fit(xtrn_sav_n, ytrn_sav)
svc_sav_predict = svc_sav.predict(xtst_sav_n)
svc_sav_acc = accuracy_score(ytst_sav, svc_sav_predict)
svc_sav_conf_mat = confusion_matrix(ytst_sav, svc_sav_predict)

print("Sav set")
print("Confusion Matrix\n{}\n\nAccuracy\n{}\n".format(svc_sav_conf_mat, svc_sav_acc * 100))
print(classification_report(ytst_sav, svc_sav_predict))
StopWatch.stop("sav SVC")
StopWatch.status("sav SVC", True)

In [None]:
# K-nearest-neighbours classifier using 10 neighbours.
# Fitted on the standardized training split and scored on the standardized test split.
# Each dataset has its own timer; the timed region also covers printing the report.

# --- dav dataset ---
StopWatch.start("dav knn")
knn_dav = KNeighborsClassifier(n_neighbors=10).fit(xtrn_dav_n, ytrn_dav)
knn_dav_predict = knn_dav.predict(xtst_dav_n)
knn_dav_acc = accuracy_score(ytst_dav, knn_dav_predict)
knn_dav_conf_mat = confusion_matrix(ytst_dav, knn_dav_predict)

print("Dav set")
print("Confusion Matrix\n{}\n\nAccuracy\n{}\n".format(knn_dav_conf_mat, knn_dav_acc * 100))
print(classification_report(ytst_dav, knn_dav_predict))
StopWatch.stop("dav knn")
StopWatch.status("dav knn", True)

# --- sav dataset ---
StopWatch.start("sav knn")
knn_sav = KNeighborsClassifier(n_neighbors=10).fit(xtrn_sav_n, ytrn_sav)
knn_sav_predict = knn_sav.predict(xtst_sav_n)
knn_sav_acc = accuracy_score(ytst_sav, knn_sav_predict)
knn_sav_conf_mat = confusion_matrix(ytst_sav, knn_sav_predict)

print("Sav set")
print("Confusion Matrix\n{}\n\nAccuracy\n{}\n".format(knn_sav_conf_mat, knn_sav_acc * 100))
print(classification_report(ytst_sav, knn_sav_predict))
StopWatch.stop("sav knn")
StopWatch.status("sav knn", True)

In [None]:
# Gaussian Naive Bayes classifier (default hyper-parameters).
# Fitted on the standardized training split and scored on the standardized test split.
# Each dataset has its own timer; the timed region also covers printing the report.

# --- dav dataset ---
StopWatch.start("dav GaussNBay")
gnb_dav = GaussianNB().fit(xtrn_dav_n, ytrn_dav)
gnb_dav_predict = gnb_dav.predict(xtst_dav_n)
gnb_dav_acc = accuracy_score(ytst_dav, gnb_dav_predict)
gnb_dav_conf_mat = confusion_matrix(ytst_dav, gnb_dav_predict)

print("Dav set")
print("Confusion Matrix\n{}\n\nAccuracy\n{}\n".format(gnb_dav_conf_mat, gnb_dav_acc * 100))
print(classification_report(ytst_dav, gnb_dav_predict))
StopWatch.stop("dav GaussNBay")
StopWatch.status("dav GaussNBay", True)

# --- sav dataset ---
StopWatch.start("sav GaussNBay")
gnb_sav = GaussianNB().fit(xtrn_sav_n, ytrn_sav)
gnb_sav_predict = gnb_sav.predict(xtst_sav_n)
gnb_sav_acc = accuracy_score(ytst_sav, gnb_sav_predict)
gnb_sav_conf_mat = confusion_matrix(ytst_sav, gnb_sav_predict)

print("Sav set")
print("Confusion Matrix\n{}\n\nAccuracy\n{}\n".format(gnb_sav_conf_mat, gnb_sav_acc * 100))
print(classification_report(ytst_sav, gnb_sav_predict))
StopWatch.stop("sav GaussNBay")
StopWatch.status("sav GaussNBay", True)

In [None]:
# Decision tree classifier: entropy split criterion, depth capped at 6, fixed seed.
# Fitted on the standardized training split and scored on the standardized test split.
# Each dataset has its own timer; the timed region also covers printing the report.

# --- dav dataset ---
StopWatch.start("dav decisionTree")
dtre_dav = DecisionTreeClassifier(criterion='entropy', random_state=0, max_depth=6).fit(xtrn_dav_n, ytrn_dav)
dtre_dav_predict = dtre_dav.predict(xtst_dav_n)
dtre_dav_acc = accuracy_score(ytst_dav, dtre_dav_predict)
dtre_dav_conf_mat = confusion_matrix(ytst_dav, dtre_dav_predict)

print("Dav set")
print("Confusion Matrix\n{}\n\nAccuracy\n{}\n".format(dtre_dav_conf_mat, dtre_dav_acc * 100))
print(classification_report(ytst_dav, dtre_dav_predict))
StopWatch.stop("dav decisionTree")
StopWatch.status("dav decisionTree", True)

# --- sav dataset ---
StopWatch.start("sav decisionTree")
dtre_sav = DecisionTreeClassifier(criterion='entropy', random_state=0, max_depth=6).fit(xtrn_sav_n, ytrn_sav)
dtre_sav_predict = dtre_sav.predict(xtst_sav_n)
dtre_sav_acc = accuracy_score(ytst_sav, dtre_sav_predict)
dtre_sav_conf_mat = confusion_matrix(ytst_sav, dtre_sav_predict)

print("Sav set")
print("Confusion Matrix\n{}\n\nAccuracy\n{}\n".format(dtre_sav_conf_mat, dtre_sav_acc * 100))
print(classification_report(ytst_sav, dtre_sav_predict))
StopWatch.stop("sav decisionTree")
StopWatch.status("sav decisionTree", True)

In [None]:
# Clustering preprocessing
# Standardize the full feature matrices and project them with PCA so the
# clustering cells can plot the first two components.
StopWatch.start("cluster preprocessing")
dav_cluster = scale(x_dav)
sav_cluster = scale(x_sav)

# The number of PCA components is set to the number of distinct target classes.
n_comp_dav = len(np.unique(y_dav))
n_comp_sav = len(np.unique(y_sav))

# random_state is pinned because PCA's default solver selection may pick the
# randomized SVD solver (depending on data shape and scikit-learn version),
# which would otherwise make the projection differ between runs.
pca_dav = PCA(n_components=n_comp_dav, random_state=0).fit_transform(dav_cluster)
pca_sav = PCA(n_components=n_comp_sav, random_state=0).fit_transform(sav_cluster)
StopWatch.stop("cluster preprocessing")
StopWatch.status("cluster preprocessing", True)

In [None]:
# K Means
# The code was rewritten as a function to reduce the workload associated with visualizing the clustering algorithms
def kMeans(data, labels):
    """Cluster `data` with 2-cluster K-Means and score the clusters against `labels`.

    The rows are split 70/30. K-Means is fitted on the training part only and
    the test part is assigned to the learned centroids. Cluster ids are
    arbitrary (cluster 0 is not necessarily class 0), so each cluster is
    relabelled with the most frequent true label among its training members
    before the metrics are computed. Without that step the reported accuracy
    depends on the arbitrary numbering of the clusters.

    Parameters
    ----------
    data : array of shape (n_samples, n_features)
        Feature matrix. Columns 0 and 1 are used for the scatter plots, so it
        must support NumPy-style `[:, i]` indexing and have at least two columns.
    labels : array-like of shape (n_samples,)
        Ground-truth class of every row.

    Returns
    -------
    None
        Shows two scatter plots (true labels, predicted labels) and prints the
        confusion matrix, the accuracy in percent and the classification report.
    """
    Xtrain, Xtest, ytrain, ytest = train_test_split(data, labels, test_size=0.30, random_state=0)
    # K-Means is unsupervised: the labels are intentionally not passed to fit().
    KModel = KMeans(n_clusters = 2, random_state = 9).fit(Xtrain)

    # Translate cluster ids into class labels by majority vote on the training split.
    ytrain_arr = np.asarray(ytrain)
    classes, class_counts = np.unique(ytrain_arr, return_counts=True)
    fallback_label = classes[class_counts.argmax()]  # used only for a cluster without training members
    cluster_to_label = {}
    for cluster in np.unique(KModel.labels_):
        member_labels, member_counts = np.unique(ytrain_arr[KModel.labels_ == cluster], return_counts=True)
        cluster_to_label[cluster] = member_labels[member_counts.argmax()]

    predictions = np.array([cluster_to_label.get(cluster, fallback_label) for cluster in KModel.predict(Xtest)])

    plt.title('Actual Data')
    plt.scatter(Xtest[:,0], Xtest[:,1], marker = 'o', s = 10, c = ytest)
    plt.show()

    plt.title('Predicted')
    plt.scatter(Xtest[:,0], Xtest[:,1], marker = 'o', s = 10, c = predictions)
    plt.show()

    print("Confusion matrix\n", confusion_matrix(ytest, predictions))
    print("Accuracy\n", accuracy_score(ytest, predictions)*100)
    print(classification_report(ytest, predictions))
    
# Run and time the K-Means experiment once per dataset (dav first, then sav).
for _timer, _features, _target in (
    ("dav kmeans", pca_dav, y_dav),
    ("sav kmeans", pca_sav, y_sav),
):
    StopWatch.start(_timer)
    kMeans(_features, _target)
    StopWatch.stop(_timer)
    StopWatch.status(_timer, True)

In [None]:
# Mean-Shift - Implementation in progress
def meanShift(data, labels):
    """Cluster `data` with Mean-Shift and score the clusters against `labels`.

    The rows are split 70/30. The bandwidth is estimated from the training part
    (quantile 0.5), Mean-Shift is fitted on the training part and the test part
    is assigned to the learned cluster centres. Mean-Shift chooses the number of
    clusters itself and numbers them arbitrarily, so each cluster is relabelled
    with the most frequent true label among its training members before the
    metrics are computed. Comparing raw cluster ids with class labels would
    otherwise be meaningless.

    Parameters
    ----------
    data : array of shape (n_samples, n_features)
        Feature matrix. Columns 0 and 1 are used for the scatter plots, so it
        must support NumPy-style `[:, i]` indexing and have at least two columns.
    labels : array-like of shape (n_samples,)
        Ground-truth class of every row.

    Returns
    -------
    None
        Shows two scatter plots (true labels, predicted labels) and prints the
        confusion matrix, the accuracy in percent and the classification report.
    """
    Xtrain, Xtest, ytrain, ytest = train_test_split(data, labels, test_size=0.30, random_state=0)
    # NOTE(review): estimate_bandwidth is documented as at least quadratic in the number
    # of samples; consider its n_samples argument if this is slow on the larger dataset.
    bandwidth = estimate_bandwidth(Xtrain, quantile=0.5)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(Xtrain)

    # Translate cluster ids into class labels by majority vote on the training split.
    ytrain_arr = np.asarray(ytrain)
    classes, class_counts = np.unique(ytrain_arr, return_counts=True)
    fallback_label = classes[class_counts.argmax()]  # used only for a cluster without training members
    cluster_to_label = {}
    for cluster in np.unique(ms.labels_):
        member_labels, member_counts = np.unique(ytrain_arr[ms.labels_ == cluster], return_counts=True)
        cluster_to_label[cluster] = member_labels[member_counts.argmax()]

    predictions = np.array([cluster_to_label.get(cluster, fallback_label) for cluster in ms.predict(Xtest)])

    plt.title('Actual Data')
    plt.scatter(Xtest[:,0], Xtest[:,1], marker = 'o', s = 10, c = ytest)
    plt.show()

    plt.title('Predicted')
    plt.scatter(Xtest[:,0], Xtest[:,1], marker = 'o', s = 10, c = predictions)
    plt.show()

    print("Confusion matrix\n", confusion_matrix(ytest, predictions))
    print("Accuracy\n", accuracy_score(ytest, predictions)*100)
    print(classification_report(ytest, predictions))
    
# Run and time the Mean-Shift experiment once per dataset (dav first, then sav).
for _timer, _features, _target in (
    ("dav meanShift", pca_dav, y_dav),
    ("sav meanShift", pca_sav, y_sav),
):
    StopWatch.start(_timer)
    meanShift(_features, _target)
    StopWatch.stop(_timer)
    StopWatch.status(_timer, True)

In [None]:
# Spectral Clustering
def spectralClust(data, labels):
    """Cluster the held-out 30% of `data` with spectral clustering and score it against `labels`.

    The estimator is used through fit_predict(), which fits on exactly the
    samples it labels, so a model fitted on the training split would be thrown
    away by that call. The split is kept only so that the same test rows are
    evaluated, and the clustering is fitted once, directly on those rows.

    Cluster ids are arbitrary, so each cluster is relabelled with the most
    frequent true label among its members before the metrics are computed.
    Because the true test labels are used to name the clusters, the reported
    numbers are a best-case cluster/class agreement, not a predictive accuracy.

    Parameters
    ----------
    data : array of shape (n_samples, n_features)
        Feature matrix. Columns 0 and 1 are used for the scatter plots, so it
        must support NumPy-style `[:, i]` indexing and have at least two columns.
    labels : array-like of shape (n_samples,)
        Ground-truth class of every row.

    Returns
    -------
    None
        Shows two scatter plots (true labels, predicted labels) and prints the
        confusion matrix, the accuracy in percent and the classification report.
    """
    _, Xtest, _, ytest = train_test_split(data, labels, test_size=0.30, random_state=0)
    # random_state pins the k-means label-assignment step so repeated runs agree.
    specCl = SpectralClustering(n_clusters = 2, affinity='nearest_neighbors', assign_labels='kmeans', random_state = 0)
    cluster_ids = specCl.fit_predict(Xtest)

    # Translate cluster ids into class labels by majority vote within each cluster.
    ytest_arr = np.asarray(ytest)
    cluster_to_label = {}
    for cluster in np.unique(cluster_ids):
        member_labels, member_counts = np.unique(ytest_arr[cluster_ids == cluster], return_counts=True)
        cluster_to_label[cluster] = member_labels[member_counts.argmax()]

    predictions = np.array([cluster_to_label[cluster] for cluster in cluster_ids])

    plt.title('Actual Data')
    plt.scatter(Xtest[:,0], Xtest[:,1], marker = 'o', s = 10, c = ytest)
    plt.show()

    plt.title('Predicted')
    plt.scatter(Xtest[:,0], Xtest[:,1], marker = 'o', s = 10, c = predictions)
    plt.show()

    print("Confusion matrix\n", confusion_matrix(ytest, predictions))
    print("Accuracy\n", accuracy_score(ytest, predictions)*100)
    print(classification_report(ytest, predictions))
    
# Run and time the spectral-clustering experiment once per dataset (dav first, then sav).
for _timer, _features, _target in (
    ("dav spectralClust", pca_dav, y_dav),
    ("sav spectralClust", pca_sav, y_sav),
):
    StopWatch.start(_timer)
    spectralClust(_features, _target)
    StopWatch.stop(_timer)
    StopWatch.status(_timer, True)

In [None]:
# Stop the "Total" timer that was started in the data-import cell and report it.
StopWatch.stop("Total")
StopWatch.status("Total", True)
# Presumably emits the summary of all timers recorded above - confirm against the cloudmesh StopWatch docs.
StopWatch.benchmark()