<a href="https://colab.research.google.com/github/charlierettig7/Personal/blob/master/HeartDisease.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pylab as pl
import sklearn


In [3]:
from ucimlrepo import fetch_ucirepo

# Fetch the UCI dataset with repository id 45.
# NOTE: `fetch_ucirepo` comes from the third-party `ucimlrepo` package, which must be
# installed in the kernel before this cell can run.
heart_disease = fetch_ucirepo(id=45)

# Features and targets as exposed by the fetched object (pandas dataframes)
Features, Target = heart_disease.data.features, heart_disease.data.targets

Xdf, ydf = pd.DataFrame(Features), pd.DataFrame(Target)

# Put features and target side by side, then discard every row that has a NaN anywhere
full_data = pd.concat([Xdf, ydf], axis=1).dropna()

# Dataset-level metadata
print(heart_disease.metadata)

# Per-variable information
print(heart_disease.variables)

ModuleNotFoundError: No module named 'ucimlrepo'

In [None]:
# Exploratory overview: first rows and shape of the table, then the distinct
# values of the target column 'num' and how many rows carry each value.
print(full_data.head(), full_data.shape, sep='\n')
print(full_data['num'].unique(), full_data.groupby('num').size(), sep='\n')

In [None]:
# Bar chart of the number of rows per value of the target column 'num'
sns.countplot(data=full_data, x='num')
plt.show()

In [None]:
# One box plot per input column (target 'num' excluded), each with its own axes,
# arranged on a 5x3 grid; the figure is saved to disk before being displayed.
box_plot_options = dict(
    kind='box',
    subplots=True,
    layout=(5, 3),
    sharex=False,
    sharey=False,
    figsize=(9, 9),
    title='Box Plot for each input variable',
)
full_data.drop(columns='num').plot(**box_plot_options)
plt.savefig('Heart_full')
plt.show()

In [None]:
# 30-bin histogram of every input column (target 'num' excluded); saved, then shown
input_only = full_data.drop(columns='num')
input_only.hist(bins=30, figsize=(9, 9))
plt.suptitle("Histogram for each numeric input variable")
plt.savefig('Heart_full_hist')
plt.show()

In [None]:
from pandas.plotting import scatter_matrix
from matplotlib import cm
import matplotlib.pyplot as plt

feature_names = list(Xdf.columns.values)

X = full_data[feature_names]
y = full_data['num']

# Set up the colormap.
# `plt.get_cmap` is used instead of `matplotlib.cm.get_cmap`, which was deprecated in
# Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap('gnuplot')

# The colours depend only on the target, so compute them once instead of once per panel.
# Dividing by at least 1 keeps the normalisation finite even if every target value is 0.
colors = cmap(y / max(y.max(), 1))

# Create the scatter matrix (histograms on the diagonal)
scatter = scatter_matrix(X, marker='o', s=40, hist_kwds={'bins': 15}, figsize=(9, 9), diagonal='hist')

# Overlay target-coloured points on every off-diagonal panel
n_features = len(X.columns)
for i, ax in enumerate(scatter.ravel()):
    row, col = divmod(i, n_features)
    if row != col:  # Avoid coloring the histogram diagonals
        ax.scatter(X.iloc[:, col], X.iloc[:, row], c=colors, marker='o', s=10)

plt.suptitle('Scatter-matrix for each input variable')
plt.savefig('heart_scatter_matrix')
plt.show()


In [None]:
from pandas.plotting import scatter_matrix
from matplotlib import cm
import matplotlib.pyplot as plt

feature_names = list(Xdf.columns.values)

X = full_data[feature_names]
# Binary target: 1 when 'num' is non-zero, 0 otherwise
y2 = np.where(full_data['num'] != 0, 1, 0)

# Set up the colormap.
# `plt.get_cmap` is used instead of `matplotlib.cm.get_cmap`, which was deprecated in
# Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap('gnuplot')

# Colour by the BINARY target built above (the multiclass `y` was used here by mistake).
# The cast to float matters: a colormap treats integer input as lookup-table indices,
# whereas floats are read as positions in [0, 1].
colors = cmap(y2.astype(float))

# Create the scatter matrix (histograms on the diagonal)
scatter = scatter_matrix(X, marker='o', s=40, hist_kwds={'bins': 15}, figsize=(9, 9), diagonal='hist')

# Overlay target-coloured points on every off-diagonal panel
n_features = len(X.columns)
for i, ax in enumerate(scatter.ravel()):
    row, col = divmod(i, n_features)
    if row != col:  # Avoid coloring the histogram diagonals
        ax.scatter(X.iloc[:, col], X.iloc[:, row], c=colors, marker='o', s=10)

plt.suptitle('Scatter-matrix for each input variable (binary target)')
# Saved under its own name so the multiclass figure 'heart_scatter_matrix' is not overwritten
plt.savefig('heart_scatter_matrix_binary')
plt.show()


In [None]:
# Pairwise correlations between all columns of the cleaned table
corr_matrix = full_data.corr()

# Open a 10x8 figure; the heatmap below is drawn on its current axes
plt.figure(figsize=(10, 8))

# Annotated heatmap: two-decimal cell labels, square cells, thin separating lines
heatmap_options = dict(annot=True, fmt=".2f", cmap="coolwarm", square=True, cbar=True, linewidths=0.5)
heatmap_ax = sns.heatmap(corr_matrix, **heatmap_options)

# Title on the axes the heatmap was drawn on
heatmap_ax.set_title("Correlation Heatmap of Variables")

plt.show()


In [None]:
# Generate the pairplot.
# The marker list is sized to the number of distinct target values rather than being
# hard-coded to three entries: with more hue levels than markers, seaborn either
# rejects the list or reuses markers across classes, which makes the classes
# indistinguishable by shape.
marker_pool = ["o", "s", "D", "^", "v", "P", "X", "*"]
n_hue_levels = full_data['num'].nunique()
hue_markers = [marker_pool[k % len(marker_pool)] for k in range(n_hue_levels)]

sns.pairplot(full_data, hue='num', palette='viridis', markers=hue_markers)

# Add a figure-level title (plt.title would only label a single subplot);
# y > 1 places it just above the grid.
plt.suptitle("Pairplot of Features Colored by Target Variable", y=1.02)

# Show the plot
plt.show()


In [None]:

# Descriptive statistics of the feature table, one column per feature
desc_stats = X.describe()

# One bar chart per feature: the x axis lists the statistic names (the index of
# `desc_stats`), the bar heights are that feature's values for each statistic.
for column, stats in desc_stats.items():
    plt.figure(figsize=(8, 4))
    plt.bar(desc_stats.index, stats, color="skyblue")
    plt.title(f'Summary Statistics for {column}')
    plt.xlabel('Statistic')
    plt.ylabel('Value')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler



# Train/test split for the multiclass target, seeded so the partition is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Learn the min-max range from the training rows only, then rescale both partitions with it
scaler = MinMaxScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler



# Train/test split for the binary target `y2`, using the same seed as the multiclass split
binary_split = train_test_split(X, y2, random_state=0)
X_train, X_test, y2_train, y2_test = binary_split

# Fit the min-max scaler on the training rows, then apply it to both partitions
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

Multiclass KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbours classifier with default settings, fitted on the multiclass labels
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

# Mean accuracy on the rows the model was fitted on, then on the held-out rows
print(f'Accuracy of K-NN classifier on training set: {knn.score(X_train, y_train):.2f}')
print(f'Accuracy of K-NN classifier on test set: {knn.score(X_test, y_test):.2f}')

# Show the hyper-parameters that were actually used
print(knn.get_params())

Binary KNN below

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbours classifier with default settings, fitted on the binary labels
knn2 = KNeighborsClassifier()
knn2.fit(X_train, y2_train)

# Report mean accuracy for the training partition first, then the test partition
for split_name, split_X, split_y in (('training', X_train, y2_train), ('test', X_test, y2_test)):
    print('Accuracy of K-NN classifier on {} set: {:.2f}'.format(split_name, knn2.score(split_X, split_y)))

# Show the hyper-parameters that were actually used
print(knn2.get_params())

Multiclass Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree fitted on the multiclass labels.
# random_state is fixed (matching the seed used for the train/test split) because tree
# construction is otherwise randomised and the reported accuracies would change from
# one run of the notebook to the next.
clf = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
print('Accuracy of Decision Tree classifier on training set: {:.2f}' .format(clf.score(X_train, y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}' .format(clf.score(X_test, y_test)))

Binary Decision Tree below

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree fitted on the binary labels.
# random_state is fixed (matching the seed used for the train/test split) because tree
# construction is otherwise randomised and the reported accuracies would change from
# one run of the notebook to the next.
clf2 = DecisionTreeClassifier(random_state=0).fit(X_train, y2_train)
print('Accuracy of Decision Tree classifier on training set: {:.2f}' .format(clf2.score(X_train, y2_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}' .format(clf2.score(X_test, y2_test)))

Multi-Class LDA

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Linear discriminant analysis with default settings, fitted on the multiclass labels
lda = LinearDiscriminantAnalysis().fit(X_train, y_train)

lda_train_accuracy = lda.score(X_train, y_train)
print(f'Accuracy of LDA classifier on training set: {lda_train_accuracy:.2f}')
lda_test_accuracy = lda.score(X_test, y_test)
print(f'Accuracy of LDA classifier on test set: {lda_test_accuracy:.2f}')

Binary LDA

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Linear discriminant analysis with default settings, fitted on the binary labels
lda2 = LinearDiscriminantAnalysis().fit(X_train, y2_train)

# Report mean accuracy for the training partition first, then the test partition
for split_name, split_X, split_y in (('training', X_train, y2_train), ('test', X_test, y2_test)):
    print('Accuracy of LDA classifier on {} set: {:.2f}'.format(split_name, lda2.score(split_X, split_y)))

Multiclass SVC

In [None]:
from sklearn.svm import SVC
# Support vector classifier with default settings, fitted on the multiclass labels
svm = SVC().fit(X_train, y_train)

svm_train_accuracy = svm.score(X_train, y_train)
print(f'Accuracy of SVM classifier on training set: {svm_train_accuracy:.2f}')
svm_test_accuracy = svm.score(X_test, y_test)
print(f'Accuracy of SVM classifier on test set: {svm_test_accuracy:.2f}')

Binary SVC

In [None]:
from sklearn.svm import SVC
# Support vector classifier with default settings, fitted on the binary labels
svm2 = SVC().fit(X_train, y2_train)

# Report mean accuracy for the training partition first, then the test partition
for split_name, split_X, split_y in (('training', X_train, y2_train), ('test', X_test, y2_test)):
    print('Accuracy of SVM classifier on {} set: {:.2f}'.format(split_name, svm2.score(split_X, split_y)))

In [None]:

# Test-set predictions of the three multiclass models
pred_KNN, pred_LDA, pred_SVM = (model.predict(X_test) for model in (knn, lda, svm))

# Test-set predictions of the three binary models
pred_KNN_bi, pred_LDA_bi, pred_SVM_bi = (model.predict(X_test) for model in (knn2, lda2, svm2))



In [None]:
from sklearn.metrics import accuracy_score, recall_score, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import numpy as np

# Accuracy / macro sensitivity / mean specificity for a multiclass classifier
def get_multiclass_metrics(y_test, y_pred):
    """Summarise predictions with three scores.

    Parameters
    ----------
    y_test : true labels.
    y_pred : predicted labels, aligned with ``y_test``.

    Returns
    -------
    tuple
        ``(accuracy, sensitivity, specificity)`` where sensitivity is the macro-averaged
        recall and specificity is the unweighted mean over classes of the one-vs-rest
        value TN / (TN + FP) read off the confusion matrix.
    """
    accuracy = accuracy_score(y_test, y_pred)
    sensitivity = recall_score(y_test, y_pred, average='macro')

    cm = confusion_matrix(y_test, y_pred)
    diagonal = np.diag(cm)
    row_totals = cm.sum(axis=1)   # samples whose true label is each class
    col_totals = cm.sum(axis=0)   # samples predicted as each class

    # Per class: predicted as the class but belonging to another one
    false_pos = col_totals - diagonal
    # Per class: everything lying neither in the class's row nor in its column
    true_neg = cm.sum() - (row_totals + col_totals - diagonal)

    specificity = np.mean(true_neg / (true_neg + false_pos))

    return accuracy, sensitivity, specificity

## Multiclass

# Score each multiclass model against the held-out multiclass labels
(acc_knn, sens_knn, spec_knn), (acc_lda, sens_lda, spec_lda), (acc_svm, sens_svm, spec_svm) = (
    get_multiclass_metrics(y_test, predictions)
    for predictions in (pred_KNN, pred_LDA, pred_SVM)
)

# Regroup the scores by metric: one list per metric, ordered like `classifiers`
classifiers = ['KNN', 'LDA', 'SVM']
accuracy = [acc_knn, acc_lda, acc_svm]
sensitivity = [sens_knn, sens_lda, sens_svm]
specificity = [spec_knn, spec_lda, spec_svm]

metrics = ['Accuracy', 'Sensitivity', 'Specificity']
data = [accuracy, sensitivity, specificity]

# One group of bars per classifier; within a group each metric is shifted by one bar width
x = np.arange(len(classifiers))
bar_width = 0.25

fig, ax = plt.subplots(figsize=(10, 6))

for i, (metric, scores) in enumerate(zip(metrics, data)):
    ax.bar(x + i * bar_width, scores, width=bar_width, label=metric)

ax.set(
    xlabel='Classifier',
    ylabel='Metric Score',
    title='Performance Comparison of KNN, LDA, and SVM (Multiclass)',
)
# Put the tick under the middle bar of each three-bar group
ax.set_xticks(x + bar_width)
ax.set_xticklabels(classifiers)
ax.legend()

plt.tight_layout()
plt.show()

# Raw confusion matrices behind the scores above
for model_name, predictions in (('KNN', pred_KNN), ('LDA', pred_LDA), ('SVM', pred_SVM)):
    print(f"Confusion Matrix {model_name}")
    print(confusion_matrix(y_test, predictions))


In [None]:
## Binary (target is 1 when 'num' is non-zero, else 0)

def _binary_metrics(y_true, y_pred):
    """Return (accuracy, sensitivity, specificity) for 0/1 labels, with 1 as the positive class.

    The macro-averaged multiclass helper is not used here: with only two classes its
    "sensitivity" and "specificity" both reduce to the mean of the two per-class
    recalls, so the two bars would always be identical.
    A ratio whose denominator is zero is reported as NaN.
    """
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    sensitivity = tp / (tp + fn) if (tp + fn) else float('nan')
    specificity = tn / (tn + fp) if (tn + fp) else float('nan')
    return accuracy_score(y_true, y_pred), sensitivity, specificity

# Calculate metrics for each binary classifier
acc_knn, sens_knn, spec_knn = _binary_metrics(y2_test, pred_KNN_bi)
acc_lda, sens_lda, spec_lda = _binary_metrics(y2_test, pred_LDA_bi)
acc_svm, sens_svm, spec_svm = _binary_metrics(y2_test, pred_SVM_bi)

# Prepare data for grouped bar chart
classifiers = ['KNN', 'LDA', 'SVM']
accuracy = [acc_knn, acc_lda, acc_svm]
sensitivity = [sens_knn, sens_lda, sens_svm]
specificity = [spec_knn, spec_lda, spec_svm]

# Set up the bar chart
metrics = ['Accuracy', 'Sensitivity', 'Specificity']
data = [accuracy, sensitivity, specificity]

x = np.arange(len(classifiers))  # Label locations
bar_width = 0.25

fig, ax = plt.subplots(figsize=(10, 6))

# Plot each metric, shifted by one bar width within every classifier group
for i, metric in enumerate(metrics):
    ax.bar(x + i * bar_width, data[i], width=bar_width, label=metric)

# Add labels and titles (this chart shows the binary models, so it is labelled as such)
ax.set_xlabel('Classifier')
ax.set_ylabel('Metric Score')
ax.set_title('Performance Comparison of KNN, LDA, and SVM (Binary)')
ax.set_xticks(x + bar_width)
ax.set_xticklabels(classifiers)
ax.legend()

plt.tight_layout()
plt.show()

# Confusion matrices of the BINARY predictions against the BINARY labels
print("Confusion Matrix KNN")
print(confusion_matrix(y2_test, pred_KNN_bi))
print("Confusion Matrix LDA")
print(confusion_matrix(y2_test, pred_LDA_bi))
print("Confusion Matrix SVM")
print(confusion_matrix(y2_test, pred_SVM_bi))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Confusion matrices of the multiclass test-set predictions, one per classifier
cm_knn, cm_lda, cm_svm = (
    confusion_matrix(y_test, predictions)
    for predictions in (pred_KNN, pred_LDA, pred_SVM)
)

# Heatmap rendering of a single confusion matrix
def plot_confusion_matrix(cm, title='Confusion Matrix'):
    """Draw ``cm`` as an annotated heatmap in a new 6x5 figure and display it.

    Parameters
    ----------
    cm : matrix of counts to draw; cells are annotated in integer format.
    title : text placed above the heatmap.
    """
    plt.figure(figsize=(6, 5))
    heatmap_options = dict(annot=True, fmt="d", cmap="Blues", cbar=False)
    heatmap_ax = sns.heatmap(cm, **heatmap_options)
    heatmap_ax.set_title(title)
    heatmap_ax.set_xlabel('Predicted Label')
    heatmap_ax.set_ylabel('True Label')
    plt.show()

# Render one heatmap per classifier, in the order KNN, LDA, SVM
for model_name, model_cm in (('KNN', cm_knn), ('LDA', cm_lda), ('SVM', cm_svm)):
    plot_confusion_matrix(model_cm, title=f'Confusion Matrix for {model_name}')


In [None]:

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Confusion matrices of the binary test-set predictions, one per classifier
cm_knn = confusion_matrix(y2_test, pred_KNN_bi)
cm_lda = confusion_matrix(y2_test, pred_LDA_bi)
cm_svm = confusion_matrix(y2_test, pred_SVM_bi)

# `plot_confusion_matrix` is already defined in the multiclass confusion-matrix cell
# above; it is reused here rather than being defined a second time with an identical body.

# Plot confusion matrix for each classifier; titles are marked "(Binary)" so these
# figures can be told apart from the multiclass ones.
plot_confusion_matrix(cm_knn, title='Confusion Matrix for KNN (Binary)')
plot_confusion_matrix(cm_lda, title='Confusion Matrix for LDA (Binary)')
plot_confusion_matrix(cm_svm, title='Confusion Matrix for SVM (Binary)')



In [None]:
import numpy as np
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import inspect


def roc_curver_2D_better(variation, label=None, X_eval=None, y_eval=None):
    """Add the ROC curve of a fitted binary classifier to the current matplotlib figure.

    Parameters
    ----------
    variation : fitted binary classifier exposing ``classes_`` and either
        ``predict_proba`` or ``decision_function``.
    label : legend text. When omitted, the name of the caller's variable that refers to
        ``variation`` is used, falling back to the estimator's class name if no such
        variable is found.
    X_eval : features to score. Defaults to the notebook-level ``X_test``.
    y_eval : true 0/1 labels. Defaults to the notebook-level ``y2_test`` (the binary
        target); the multiclass ``y_test`` does not match the binary models.

    The function only draws; the caller is responsible for ``plt.figure()`` and
    ``plt.show()``, so several curves can share one figure.
    """
    if label is None:
        # Recover the caller's variable name for the legend entry
        caller = inspect.currentframe().f_back
        candidates = [name for name, val in caller.f_locals.items() if val is variation]
        del caller  # do not keep a reference to the caller's frame alive
        label = candidates[0] if candidates else type(variation).__name__

    features = X_test if X_eval is None else X_eval
    y_true = np.asarray(y2_test if y_eval is None else y_eval).ravel()

    print("Classes in the model:", variation.classes_)

    # One continuous score per sample is needed. Use the probability of the second
    # (positive) class when available; otherwise (e.g. an SVC fitted without
    # probability=True) fall back to the signed decision value.
    if hasattr(variation, "predict_proba"):
        variation_scores = variation.predict_proba(features)[:, 1]
    else:
        variation_scores = variation.decision_function(features)

    # ROC curve and area under it
    fps, tps, _ = roc_curve(y_true, variation_scores)
    roc_auc = auc(fps, tps)

    plt.plot(fps, tps, lw=2, label=f'{label} (AUC = {roc_auc:.2f})')

    plt.plot([0, 1], [0, 1], 'k--')  # Diagonal line
    plt.xlabel('False Positive Percent')
    plt.ylabel('True Positive Percent')
    plt.title('ROC Curve')
    plt.legend(loc="lower right")

# Overlay the ROC curves of the four models fitted on the binary target in one figure.
# Each model is passed as a bare notebook-level variable: the helper looks the argument
# up in the calling frame and uses the matching variable name as the legend label.
plt.figure()
roc_curver_2D_better(lda2)
roc_curver_2D_better(knn2)
roc_curver_2D_better(clf2)
roc_curver_2D_better(svm2)
plt.show()