In [1]:
import pandas as pd

# Load the dataset
file_path = "./Cardiovascular_Disease_Dataset.csv"
df = pd.read_csv(file_path)



In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Rename columns for consistency
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

# Define features and target variable
X = df.drop(columns=["target"])
y = df["target"]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define models
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Support Vector Machine": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Naive Bayes": GaussianNB()
}

# Train and evaluate models
accuracy_results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_results[name] = accuracy

# Display results
accuracy_results


{'Logistic Regression': 0.975,
 'Random Forest': 0.985,
 'Gradient Boosting': 0.99,
 'Support Vector Machine': 0.965,
 'K-Nearest Neighbors': 0.91,
 'Decision Tree': 0.97,
 'Naive Bayes': 0.945}

In [3]:
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier

# Additional models
extra_models = {
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
    "AdaBoost": AdaBoostClassifier(),
    "Extra Trees": ExtraTreesClassifier()
}

# Train and evaluate additional models
for name, model in extra_models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_results[name] = accuracy

# Display updated results
accuracy_results


Parameters: { "use_label_encoder" } are not used.



{'Logistic Regression': 0.975,
 'Random Forest': 0.985,
 'Gradient Boosting': 0.99,
 'Support Vector Machine': 0.965,
 'K-Nearest Neighbors': 0.91,
 'Decision Tree': 0.97,
 'Naive Bayes': 0.945,
 'XGBoost': 0.99,
 'AdaBoost': 0.975,
 'Extra Trees': 0.985}

In [4]:
# Sort accuracy results in descending order
sorted_results = sorted(accuracy_results.items(), key=lambda item: item[1], reverse=True)

# Print results line by line
for name, accuracy in sorted_results:
    print(f"{name}: {accuracy}")


Gradient Boosting: 0.99
XGBoost: 0.99
Random Forest: 0.985
Extra Trees: 0.985
Logistic Regression: 0.975
AdaBoost: 0.975
Decision Tree: 0.97
Support Vector Machine: 0.965
Naive Bayes: 0.945
K-Nearest Neighbors: 0.91


In [5]:
import warnings
warnings.filterwarnings("ignore")

In [6]:
import numpy as np

# Number of iterations
num_iterations = 20

# Dictionary to store cumulative accuracy
average_accuracy_results = {name: [] for name in models.keys()}
average_accuracy_results.update({name: [] for name in extra_models.keys()})

# Run the models multiple times
for _ in range(num_iterations):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None, stratify=y)

    # Standardize the features
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train and evaluate models
    for name, model in {**models, **extra_models}.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        average_accuracy_results[name].append(accuracy)

# Compute the average accuracy
final_avg_results = {name: np.mean(acc_list) for name, acc_list in average_accuracy_results.items()}

# Sort and display results
sorted_avg_results = sorted(final_avg_results.items(), key=lambda item: item[1], reverse=True)
for name, avg_acc in sorted_avg_results:
    print(f"{name}: {avg_acc}")


Gradient Boosting: 0.97625
AdaBoost: 0.9754999999999997
Random Forest: 0.97425
Extra Trees: 0.9730000000000001
XGBoost: 0.9727499999999998
Logistic Regression: 0.9639999999999999
Support Vector Machine: 0.9637500000000001
Decision Tree: 0.9612499999999997
Naive Bayes: 0.9422500000000001
K-Nearest Neighbors: 0.9385
