In [9]:
import pandas as pd

# Load the dataset
file_path = "./heart_1025.csv"
df = pd.read_csv(file_path)



In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Rename columns for consistency
df.columns = df.columns.str.strip().str.lower().str.replace(" ", "_")

# Define features and target variable
X = df.drop(columns=["target"])
y = df["target"]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define models
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Support Vector Machine": SVC(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Naive Bayes": GaussianNB()
}

# Train and evaluate models
accuracy_results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_results[name] = accuracy

# Display results
accuracy_results


{'Logistic Regression': 0.8097560975609757,
 'Random Forest': 1.0,
 'Gradient Boosting': 0.975609756097561,
 'Support Vector Machine': 0.926829268292683,
 'K-Nearest Neighbors': 0.8634146341463415,
 'Decision Tree': 0.9853658536585366,
 'Naive Bayes': 0.8292682926829268}

In [11]:
from xgboost import XGBClassifier
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier

# Additional models
extra_models = {
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="logloss"),
    "AdaBoost": AdaBoostClassifier(),
    "Extra Trees": ExtraTreesClassifier()
}

# Train and evaluate additional models
for name, model in extra_models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_results[name] = accuracy

# Display updated results
accuracy_results


{'Logistic Regression': 0.8097560975609757,
 'Random Forest': 1.0,
 'Gradient Boosting': 0.975609756097561,
 'Support Vector Machine': 0.926829268292683,
 'K-Nearest Neighbors': 0.8634146341463415,
 'Decision Tree': 0.9853658536585366,
 'Naive Bayes': 0.8292682926829268,
 'XGBoost': 1.0,
 'AdaBoost': 0.8829268292682927,
 'Extra Trees': 1.0}

In [12]:
# Sort accuracy results in descending order
sorted_results = sorted(accuracy_results.items(), key=lambda item: item[1], reverse=True)

# Print results line by line
for name, accuracy in sorted_results:
    print(f"{name}: {accuracy}")


Random Forest: 1.0
XGBoost: 1.0
Extra Trees: 1.0
Decision Tree: 0.9853658536585366
Gradient Boosting: 0.975609756097561
Support Vector Machine: 0.926829268292683
AdaBoost: 0.8829268292682927
K-Nearest Neighbors: 0.8634146341463415
Naive Bayes: 0.8292682926829268
Logistic Regression: 0.8097560975609757


In [13]:
import warnings
warnings.filterwarnings("ignore")

In [14]:
import numpy as np

# Number of iterations
num_iterations = 20

# Dictionary to store cumulative accuracy
average_accuracy_results = {name: [] for name in models.keys()}
average_accuracy_results.update({name: [] for name in extra_models.keys()})

# Run the models multiple times
for _ in range(num_iterations):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=None, stratify=y)

    # Standardize the features
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train and evaluate models
    for name, model in {**models, **extra_models}.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        average_accuracy_results[name].append(accuracy)

# Compute the average accuracy
final_avg_results = {name: np.mean(acc_list) for name, acc_list in average_accuracy_results.items()}

# Sort and display results
sorted_avg_results = sorted(final_avg_results.items(), key=lambda item: item[1], reverse=True)
for name, avg_acc in sorted_avg_results:
    print(f"{name}: {avg_acc}")


Decision Tree: 0.9946341463414633
XGBoost: 0.9946341463414633
Extra Trees: 0.9946341463414633
Random Forest: 0.9939024390243901
Gradient Boosting: 0.9651219512195123
Support Vector Machine: 0.9143902439024391
AdaBoost: 0.8929268292682929
Logistic Regression: 0.8475609756097562
K-Nearest Neighbors: 0.8446341463414635
Naive Bayes: 0.8187804878048782
