<a href="https://colab.research.google.com/github/eneribnk/ML-Assignment/blob/main/ml_assignment_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Load and inspect the database

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
file_path = 'heart_disease_dataset.csv'
data = pd.read_csv(file_path, sep=';')

X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Convert target y to binary (0 for no disease, 1 for disease presence)
y = y.apply(lambda x: 1 if x > 0 else 0)

#check for missing values
missing_values_X = data.isnull().sum()
print("\nMissing values in X:\n", missing_values_X)

missing_values_y = y.isnull().sum()
print("\nMissing values in y:\n", missing_values_y)


nan_values_X = X.isna().sum()
print("\nNaN values in X per column:")
print(nan_values_X)

# Optional: Highlight only columns with NaN values
columns_with_nan = nan_values_X[nan_values_X > 0]
if not columns_with_nan.empty:
    print("\nColumns with NaN values:")
    print(columns_with_nan)
else:
    print("\nNo columns in X contain NaN values.")

nan_values_y = y.isna().sum()
print("\nNaN values in y:\n", nan_values_y)

if missing_values_X.sum() > 0 or nan_values_X.sum() > 0:
    X = X.fillna(X.mean())

if missing_values_y.sum() > 0 or nan_values_y > 0:
    y = y.fillna(y.mode()[0])





Missing values in X:
 age             0
sex             0
cp              0
trestbps        0
chol            0
fbs             0
restecg         0
thalach         0
exang           0
oldpeak         0
slope           0
ca              4
thal            2
HeartDisease    0
dtype: int64

Missing values in y:
 0

NaN values in X per column:
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          4
thal        2
dtype: int64

Columns with NaN values:
ca      4
thal    2
dtype: int64

NaN values in y:
 0


Plot the dataset

In [3]:
import plotly.graph_objects as go
fig = go.Figure(data=[go.Table(
    header=dict(
        values=list(data.columns),
        fill_color='lightblue',
        align='center',
        font=dict(color='black', size=12)
    ),
    cells=dict(
        values=[data[col] for col in data.columns],
        fill_color='lightgrey',
        align='left'
    )
)])

fig.update_layout(title="Heart Disease Dataset Table")
fig.show()

Handle Missing Values

In [4]:
# Confirm that missing and NaN values are handled
missing_values_X_after = X.isnull().sum()
nan_values_X_after = X.isna().sum()
print("\nMissing values in X after handling:\n", missing_values_X_after)
print("\nNaN values in X after handling:\n", nan_values_X_after)

missing_values_y_after = y.isnull().sum()
nan_values_y_after = y.isna().sum()
print("\nMissing values in y after handling:\n", missing_values_y_after)
print("\nNaN values in y after handling:\n", nan_values_y_after)


Missing values in X after handling:
 age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
dtype: int64

NaN values in X after handling:
 age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
dtype: int64

Missing values in y after handling:
 0

NaN values in y after handling:
 0


Show Correlations

In [5]:
import numpy as np
correlation_matrix = X.corr()

# Create a heatmap using Plotly
fig = go.Figure(data=go.Heatmap(
    z=correlation_matrix.values,
    x=correlation_matrix.columns,
    y=correlation_matrix.columns,
    colorscale='RdBu',
    zmin=-1, zmax=1,
    colorbar=dict(title='Correlation Coefficient'),
))

fig.update_layout(
    title="Correlation Matrix",
    xaxis_title="Features",
    yaxis_title="Features",
    xaxis=dict(tickmode='array', tickvals=np.arange(len(correlation_matrix.columns))),
    yaxis=dict(tickmode='array', tickvals=np.arange(len(correlation_matrix.columns))),
)

fig.show()

In [6]:
# i choose 0.7 as a threshold and pairs with factor >0.7 will be excluded
threshold = 0.7
upper_triangle = correlation_matrix.where(abs(correlation_matrix) > threshold)
features_to_drop = set() #initialize the empty set that will store the features to be excluded

for i in range(len(upper_triangle.columns)):
    for j in range(i):
        if abs(upper_triangle.iloc[i, j]) > threshold:  # If correlation is above threshold
            colname = upper_triangle.columns[i]
            features_to_drop.add(colname)
# 4. Remove one feature from each correlated pair
print("\nFeatures to drop due to high correlation:", features_to_drop)
X = X.drop(columns=features_to_drop)
print("\nShape of X after removing correlated features:", X.shape)


Features to drop due to high correlation: set()

Shape of X after removing correlated features: (303, 13)


Histograms of features

In [7]:
import plotly.express as px
for column in X.columns:
    fig = px.histogram(X, x=column, title=f"Histogram of {column}", labels={column: column})
    fig.update_layout(xaxis_title=column, yaxis_title="Frequency")
    fig.show()

Split for train - test (80% - 20%) randomly

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=41)
print(f"Shape of X_train: {X_train.shape}, Shape of X_test: {X_test.shape}, Shape of y_train: {y_train.shape}, Shape of y_test: {y_test.shape}")


Shape of X_train: (242, 13), Shape of X_test: (61, 13), Shape of y_train: (242,), Shape of y_test: (61,)


Z-score normalization

In [9]:
import json
stats = {}
for column in X_train.columns:
    # Compute mean and std for the training set only
    mean, std = X_train[column].mean(), X_train[column].std()

    # Apply Z-normalization
    X_train[column] = (X_train[column] - mean) / (std + 1e-10)  # 1e-10 to avoid division by 0

    # Store statistics for test set normalization
    stats[column] = {'mean': mean, 'std': std}

for column in X_test.columns:
    mean, std = stats[column]['mean'], stats[column]['std']
    X_test[column] = (X_test[column] - mean) / (std + 1e-10)

# Write statistics to a JSON file for future use
with open('column_stats.json', 'w') as f:
    json.dump(stats, f)

In [10]:
# Verify the scaling by printing the means and stds of the scaled features
print("\nMeans of scaled features in training set:", X_train.mean())
print("Standard deviations of scaled features in training set:", X_train.std())


Means of scaled features in training set: age         1.486414e-16
sex         8.533119e-17
cp         -1.238679e-16
trestbps    4.807908e-16
chol        3.303143e-17
fbs         3.303143e-17
restecg     6.973302e-17
thalach     5.321730e-16
exang       7.340318e-17
oldpeak     7.432071e-17
slope      -1.486414e-16
ca          5.046468e-18
thal       -2.358077e-16
dtype: float64
Standard deviations of scaled features in training set: age         1.0
sex         1.0
cp          1.0
trestbps    1.0
chol        1.0
fbs         1.0
restecg     1.0
thalach     1.0
exang       1.0
oldpeak     1.0
slope       1.0
ca          1.0
thal        1.0
dtype: float64


Simple Training

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
import time
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


def ml_classifiers(X_train, X_test, y_train, y_test):
    classifiers = {
        "Logistic Regression": LogisticRegression(),
        "Support Vector Machine": SVC(probability=True),
        "Random Forest": RandomForestClassifier(),
        "K-Nearest Neighbors": KNeighborsClassifier(),
        "AdaBoost": AdaBoostClassifier(),
        "Decision Tree": DecisionTreeClassifier() }

    results = {}
     # Lists to store training times and prediction times
    training_times = []
    prediction_times = []
    classifiers_list = []  # List to store the classifier names for later use
    for name, clf in classifiers.items():


        classifiers_list.append(name)  # Add the classifier name to the list
        t1 = time.time_ns() / (10 ** 9)
        clf.fit(X_train, y_train)
        t2 = time.time_ns() / (10 ** 9)
        y_pred = clf.predict(X_test)
        t3 = time.time_ns() / (10 ** 9)
        accuracy = accuracy_score(y_test, y_pred)

        # Only call predict_proba if it's available
        if hasattr(clf, 'predict_proba'):
            y_prob = clf.predict_proba(X_test)[:, 1]  # Probabilities for AUC-ROC
            auc_roc = roc_auc_score(y_test, y_prob)
        else:
            auc_roc = None  # If not available, set AUC to None

        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        # Store training and prediction times in the lists
        training_times.append(t2 - t1)
        prediction_times.append(t3 - t2)

        results[name] = {
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1-Score': f1,
            'AUC-ROC': auc_roc
        }


        # Print evaluation metrics
        print("-" * 40)
        print(f"{name}:")
        print(f"  Accuracy: {accuracy:.4f}")
        print(f"  Precision: {precision:.4f}")
        print(f"  Recall: {recall:.4f}")
        print(f"  F1-Score: {f1:.4f}")
        print(f"  AUC-ROC: {auc_roc:.4f}")
        print("-" * 40)

    # Convert the results dictionary into a pandas DataFrame for easier plotting
    results_df = pd.DataFrame(results).T
    # Plot each metric (Accuracy, Precision, Recall, F1, AUC-ROC) as a bar trace
    metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC-ROC']
    fig = go.Figure()
    for metric in metrics:

        fig.add_trace(go.Bar(
        x=results_df.index,  # Classifiers (index of the DataFrame)
        y=results_df[metric],  # Values for the metric
        name=metric,
        text=results_df[metric].round(4),  # Show rounded values as text on bars
        textposition='auto',  # Automatically position the text
        hovertemplate='%{x}: %{y:.4f}<br>%{text}',  # Show detailed hover info
        marker=dict(line=dict(width=1.5))  # Optional: add border to the bars
        ))
        # Customize layout
        fig.update_layout(
         title='Comparison of Classifiers Across Different Metrics',
          barmode='group',  # Group the bars for each classifier side by side
          xaxis_title='Classifier',
          yaxis_title='Score',
          xaxis=dict(tickangle=45),  # Rotate x-axis labels for readability
          legend_title='Metrics',
          template='plotly_dark',  # Dark theme for better visuals
          hovermode='closest'  # Show detailed hover info
        )

    # Show the plot
    fig.show()





    # Create a figure for execution times (training and prediction times)
    fig_times = go.Figure()

    # Plot training times
    fig_times.add_trace(go.Bar(x=classifiers_list, y=training_times, name="Training Time",
                              text=[f"{time:.4f}" for time in training_times],
                              textposition='auto', hovertemplate='%{x}: %{y:.4f} seconds'))

    # Plot prediction times
    fig_times.add_trace(go.Bar(x=classifiers_list, y=prediction_times, name="Prediction Time",
                              text=[f"{time:.4f}" for time in prediction_times],
                              textposition='auto', hovertemplate='%{x}: %{y:.4f} seconds'))

    # Customize the layout of the execution times figure
    fig_times.update_layout(
        title='Execution Times for Each Classifier (Training and Prediction)',
        barmode='group',  # Group bars for each time type side by side
        xaxis_title='Classifier',
        yaxis_title='Time (Seconds)',
        legend_title='Time Type',
        template='plotly_dark',  # Optional: Change theme to dark for better visuals
        xaxis=dict(tickangle=45),  # Rotate x-axis labels for readability
        hovermode='closest'
    )

    # Show the execution time plot
    fig_times.show()
    return results


results = ml_classifiers(X_train, X_test, y_train, y_test)


----------------------------------------
Logistic Regression:
  Accuracy: 0.8689
  Precision: 0.8095
  Recall: 0.8095
  F1-Score: 0.8095
  AUC-ROC: 0.9095
----------------------------------------
----------------------------------------
Support Vector Machine:
  Accuracy: 0.8361
  Precision: 0.7391
  Recall: 0.8095
  F1-Score: 0.7727
  AUC-ROC: 0.9167
----------------------------------------
----------------------------------------
Random Forest:
  Accuracy: 0.8525
  Precision: 0.7727
  Recall: 0.8095
  F1-Score: 0.7907
  AUC-ROC: 0.9179
----------------------------------------
----------------------------------------
K-Nearest Neighbors:
  Accuracy: 0.8525
  Precision: 0.7727
  Recall: 0.8095
  F1-Score: 0.7907
  AUC-ROC: 0.9375
----------------------------------------
----------------------------------------
AdaBoost:
  Accuracy: 0.8689
  Precision: 0.7826
  Recall: 0.8571
  F1-Score: 0.8182
  AUC-ROC: 0.9167
----------------------------------------
----------------------------------

Training with Reapeated Kfold and Hyperparameter tune for SVM

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RepeatedKFold, cross_validate, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import time

#Grid Definition For SVM
svm_param_grid = {
    'C': [0.1, 0.5, 1, 10],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': [1, 0.1, 0.01, 0.001]
    }

#Find the best hyperparameters for the svm before the training
svm_grid_search = GridSearchCV(SVC(probability=True), svm_param_grid, cv=5, scoring='accuracy', n_jobs=-1)
best_params = svm_grid_search.fit(X_train, y_train).best_params_
print("Best Hyperparameters for SVM:", best_params)

def ml_classifiers2(X_train, X_test, y_train, y_test):
    classifiers = {
        "Logistic Regression": LogisticRegression(),
        "Support Vector Machine": svm_grid_search,
        "Random Forest": RandomForestClassifier(),
        "K-Nearest Neighbors": KNeighborsClassifier(),
        "AdaBoost": AdaBoostClassifier(),
        "Decision Tree": DecisionTreeClassifier()
    }

    results = {}
    # Lists to store training times and prediction times
    training_times = []
    prediction_times = []
    classifiers_list = []  # List to store the classifier names for later use

    # Define RepeatedKFold cross-validation strategy
    cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=42)

    for name, clf in classifiers.items():
        classifiers_list.append(name)  # Add the classifier name to the list

        # Perform cross-validation to get the metrics
        start_time = time.time()
        scoring = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

        # Cross-validation
        scores = cross_validate(clf, X_train, y_train, cv=cv, scoring=scoring, n_jobs=-1)

        # Calculate the mean and standard deviation for each metric
        accuracy_mean = np.mean(scores['test_accuracy'])
        accuracy_std = np.std(scores['test_accuracy'])

        precision_mean = np.mean(scores['test_precision'])
        precision_std = np.std(scores['test_precision'])

        recall_mean = np.mean(scores['test_recall'])
        recall_std = np.std(scores['test_recall'])

        f1_mean = np.mean(scores['test_f1'])
        f1_std = np.std(scores['test_f1'])

        auc_roc_mean = np.mean(scores['test_roc_auc'])
        auc_roc_std = np.std(scores['test_roc_auc'])

        # Store results
        results[name] = {
            'Accuracy': (accuracy_mean, accuracy_std),
            'Precision': (precision_mean, precision_std),
            'Recall': (recall_mean, recall_std),
            'F1-Score': (f1_mean, f1_std),
            'AUC-ROC': (auc_roc_mean, auc_roc_std)
        }

        # Store the times
        end_time = time.time()
        training_times.append(end_time - start_time)
        prediction_times.append(end_time - start_time)  # You can refine this if needed

        # Print evaluation metrics
        print("-" * 40)
        print(f"{name}:")
        print(f"  Accuracy: {accuracy_mean:.4f} ± {accuracy_std:.4f}")
        print(f"  Precision: {precision_mean:.4f} ± {precision_std:.4f}")
        print(f"  Recall: {recall_mean:.4f} ± {recall_std:.4f}")
        print(f"  F1-Score: {f1_mean:.4f} ± {f1_std:.4f}")
        print(f"  AUC-ROC: {auc_roc_mean:.4f} ± {auc_roc_std:.4f}")
        print("-" * 40)

    # Convert the results dictionary into a pandas DataFrame for easier plotting
    results_df = pd.DataFrame(results).T

    # Plot each metric (Accuracy, Precision, Recall, F1, AUC-ROC) as a bar trace
    metrics = ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC-ROC']
    fig = go.Figure()
    for metric in metrics:
        mean_values = [results_df.loc[classifier, metric][0] for classifier in results_df.index]  # Means
        std_values = [results_df.loc[classifier, metric][1] for classifier in results_df.index]  # Stds

        fig.add_trace(go.Bar(
            x=results_df.index,  # Classifiers (index of the DataFrame)
            y=mean_values,  # Values for the metric
            name=f'{metric} Mean',
            text=[f"{mean:.4f}" for mean in mean_values],  # Show rounded values as text on bars
            textposition='auto',
            hovertemplate='%{x}: %{y:.4f}<br>Mean: %{text}',
            marker=dict(line=dict(width=1.5))
        ))

        # Add error bars to show std deviation
        fig.add_trace(go.Bar(
            x=results_df.index,
            y=std_values,
            name=f'{metric} Std',
            text=[f"{std:.4f}" for std in std_values],
            textposition='auto',
            hovertemplate='%{x}: ±%{y:.4f}<br>Std: %{text}',
            marker=dict(line=dict(width=1.5)),
            opacity=0.5
        ))

    # Customize layout
    fig.update_layout(
        title='Comparison of Classifiers Across Different Metrics (with Std)',
        barmode='group',  # Group the bars for each classifier side by side
        xaxis_title='Classifier',
        yaxis_title='Score',
        xaxis=dict(tickangle=45),  # Rotate x-axis labels for readability
        legend_title='Metrics',
        template='plotly_dark',
        hovermode='closest'
    )

    # Show the plot
    fig.show()

    # Create a figure for execution times (training and prediction times)
    fig_times = go.Figure()

    # Plot training times
    fig_times.add_trace(go.Bar(x=classifiers_list, y=training_times, name="Training Time",
                              text=[f"{time:.4f}" for time in training_times],
                              textposition='auto', hovertemplate='%{x}: %{y:.4f} seconds'))

    # Plot prediction times
    fig_times.add_trace(go.Bar(x=classifiers_list, y=prediction_times, name="Prediction Time",
                              text=[f"{time:.4f}" for time in prediction_times],
                              textposition='auto', hovertemplate='%{x}: %{y:.4f} seconds'))

    # Customize the layout of the execution times figure
    fig_times.update_layout(
        title='Execution Times for Each Classifier (Training and Prediction)',
        barmode='group',  # Group bars for each time type side by side
        xaxis_title='Classifier',
        yaxis_title='Time (Seconds)',
        legend_title='Time Type',
        template='plotly_dark',  # Optional: Change theme to dark for better visuals
        xaxis=dict(tickangle=45),  # Rotate x-axis labels for readability
        hovermode='closest'
    )

    # Show the execution time plot
    fig_times.show()

    return results


# run the training
results = ml_classifiers2(X_train, X_test, y_train, y_test)


Best Hyperparameters for SVM: {'C': 0.1, 'gamma': 0.01, 'kernel': 'rbf'}
----------------------------------------
Logistic Regression:
  Accuracy: 0.8078 ± 0.0480
  Precision: 0.8126 ± 0.0804
  Recall: 0.7922 ± 0.0867
  F1-Score: 0.7976 ± 0.0572
  AUC-ROC: 0.8929 ± 0.0436
----------------------------------------
----------------------------------------
Support Vector Machine:
  Accuracy: 0.8062 ± 0.0545
  Precision: 0.8292 ± 0.0819
  Recall: 0.7637 ± 0.0871
  F1-Score: 0.7906 ± 0.0630
  AUC-ROC: 0.8872 ± 0.0495
----------------------------------------
----------------------------------------
Random Forest:
  Accuracy: 0.7963 ± 0.0590
  Precision: 0.8063 ± 0.0844
  Recall: 0.7706 ± 0.0920
  F1-Score: 0.7837 ± 0.0678
  AUC-ROC: 0.8887 ± 0.0458
----------------------------------------
----------------------------------------
K-Nearest Neighbors:
  Accuracy: 0.8025 ± 0.0491
  Precision: 0.7992 ± 0.0817
  Recall: 0.7981 ± 0.0830
  F1-Score: 0.7941 ± 0.0578
  AUC-ROC: 0.8763 ± 0.0455
-------