# 1. Preparing the tools

- pandas for data analysis.
- NumPy for numerical operations.
- Matplotlib/seaborn for plotting or data visualization.
- Scikit-Learn for machine learning modelling and evaluation.

In [None]:
# Regular EDA and plotting libraries
import numpy as np # np is short for numpy

import pandas as pd # pandas is so commonly used, it's shortened to pd

import matplotlib
import matplotlib.pyplot as plt

import seaborn as sns # seaborn gets shortened to sns, TK - can seaborn be removed for matplotlib (simpler)?

## Models
import sklearn 
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier # For ANN
from sklearn.svm import SVC  # For SVM (Support Vector Machine)
from sklearn.naive_bayes import GaussianNB  # For Naive Bayes
from sklearn.naive_bayes import BernoulliNB  # For binary classification


## Model evaluators
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import GradientBoostingClassifier
# from sklearn.metrics import plot_roc_curve # note: this was changed in Scikit-Learn 1.2+ to be "RocCurveDisplay" (see below)
from sklearn.metrics import RocCurveDisplay # new in Scikit-Learn 1.2+

# Print last updated
import datetime
print(f"Notebook last updated: {datetime.datetime.now()}\n")

# Print versions of libraries we're using (as long as yours are equal or greater than these, your code should work)
print(f"NumPy version: {np.__version__}")
print(f"pandas version: {pd.__version__}")
print(f"matplotlib version: {matplotlib.__version__}")
# seaborn is imported above and used for the correlation heatmap, so report it too
print(f"seaborn version: {sns.__version__}")
print(f"Scikit-Learn version: {sklearn.__version__}")

# 2. Loading Data

In [None]:
import os
import sys

# Project root = parent of the current working directory
# (assumes the notebook is run from a direct sub-directory of the project - TODO confirm)
project_dir = os.path.abspath("..")

# Make `src` importable. Guarded so that re-running this cell does not keep
# appending the same entry to sys.path.
if project_dir not in sys.path:
    sys.path.append(project_dir)

from src import config

# Build the CSV location from the project root instead of the current working directory
df = pd.read_csv(os.path.join(project_dir, config.HEART_DATA_PATH))
df.shape # (rows,columns)

# 3. Data Exploration (exploratory data analysis or EDA)

In [None]:
# Check the head of our DataFrame 
df.head()

In [None]:
# And the top 10
df.head(10)

In [None]:
# Number of positive (1) and negative (0) samples in our DataFrame
df.target.value_counts()

In [None]:
df.isnull().sum()

In [None]:
# Normalized value counts
df.target.value_counts(normalize=True)

In [None]:
# Plot the value counts with a bar graph
df.target.value_counts().plot(kind="bar", color=["salmon", "lightblue"]);

In [None]:
df.info()

In [None]:
df.describe()

### 3.1 Comparing one feature to another

In [None]:
# For sex, 1=male, 0=female 
df.sex.value_counts()

In [None]:
# Compare target column with sex column
pd.crosstab(index=df.target, columns=df.sex)

### 3.2 Making our comparison visual

In [None]:

# Cross-tabulate disease status against sex and draw the counts as grouped bars
(
    pd.crosstab(df.target, df.sex)
    .plot(kind="bar", figsize=(10, 6), color=["salmon", "lightblue"])
)

# Label the chart; legend entries follow the crosstab's column order (sex 0, then sex 1)
plt.title("Heart Disease Frequency vs Sex")
plt.ylabel("Amount")
plt.xlabel("0 = No Disease, 1 = Disease")
plt.legend(["Female", "Male"])
plt.xticks(rotation=0); # rotation=0 keeps the x-axis tick labels horizontal

### 3.3 Comparing age and maximum heart rate
Let's combine a couple of independent variables, such as age and thalach (maximum heart rate), and then compare them to our target variable, heart disease.

In [None]:
# Start a fresh 10x6 figure for the scatter plot
plt.figure(figsize=(10, 6))

# Rows with target == 1: age on the x-axis, thalach (max heart rate) on the y-axis
plt.scatter(df.loc[df.target == 1, "age"],
            df.loc[df.target == 1, "thalach"],
            c="salmon")

# Rows with target == 0, drawn onto the same axes by calling plt.scatter again
plt.scatter(df.loc[df.target == 0, "age"],
            df.loc[df.target == 0, "thalach"],
            c="lightblue")

# Title, axis labels and legend (legend order follows the order of the scatter calls)
plt.title("Heart Disease in function of Age and Max Heart Rate")
plt.xlabel("Age")
plt.ylabel("Max Heart Rate")
plt.legend(["Disease", "No Disease"]);

In [None]:
#Histogram to check the distribution of the variable age
df.age.plot.hist(edgecolor='white', color='#4878CF');

In [None]:
df.sex.plot.hist(edgecolor='white', color='#4878CF');

In [None]:
df.cp.plot.hist(edgecolor='white', color='#4878CF');

In [None]:
df.trestbps.plot.hist(edgecolor='white', color='#4878CF');

In [None]:
df.chol.plot.hist(edgecolor='white', color='#4878CF');

In [None]:
df.fbs.plot.hist(edgecolor='white', color='#4878CF');

In [None]:
df.restecg.plot.hist(edgecolor='white', color='#4878CF');

In [None]:
df.thalach.plot.hist(edgecolor='white', color='#4878CF');

In [None]:
df.exang.plot.hist(edgecolor='white', color='#4878CF'); 

In [None]:
df.oldpeak.plot.hist(edgecolor='white', color='#4878CF');

In [None]:
df.slope.plot.hist(edgecolor='white', color='#4878CF');

In [None]:
df.ca.plot.hist(edgecolor='white', color='#4878CF');

In [None]:
df.thal.plot.hist(edgecolor='white', color='#4878CF');

In [None]:
df.target.plot.hist(edgecolor='white', color='#4878CF');

### 3.4 Comparing heart disease frequency and chest pain type

In [None]:
pd.crosstab(index=df.cp, columns=df.target)

In [None]:

# Create a new crosstab and base plot
pd.crosstab(df.cp, df.target).plot(kind="bar", 
                                   figsize=(10,6), 
                                   color=["lightblue", "salmon"])

# Add attributes to the plot to make it more readable
plt.title("Heart Disease Frequency Per Chest Pain Type")
plt.xlabel("Chest Pain Type")
plt.ylabel("Frequency")
plt.legend(["No Disease", "Disease"])
plt.xticks(rotation = 0);

### 3.5 Correlation between independent variables

In [None]:
# Pairwise correlation between all columns of df
# (df.corr() runs on the whole frame, so the `target` column is included as well)
corr_matrix = df.corr()
corr_matrix 

In [None]:
# Draw the pairwise correlations as an annotated heatmap (values shown to 2 decimals)
corr_matrix = df.corr()
plt.figure(figsize=(15, 10))
sns.heatmap(data=corr_matrix, cmap="YlGnBu", annot=True, fmt=".2f", linewidths=0.5);

# 4. Modeling
We will now predict our target variable using all of the other variables. To do this, we will split the target variable from the rest.

In [33]:
# Feature matrix: every column of df apart from "target"
X = df.drop(columns="target")

# Label vector: the "target" column converted to a NumPy array
y = df["target"].to_numpy()

In [None]:
# Independent variables (no target column)
X.head()

In [None]:
# Targets (in the form of a NumPy array)
y, type(y)

### 4.1 Creating a training and test split

Now, we will split our data into a training set and a test set. To split our data into a training and test set, we can use Scikit-Learn's sklearn.model_selection.train_test_split() and feed it our independent and dependent variables (X & y).

In [36]:
# Seed NumPy's global RNG (kept so later cells that draw from it start from a known state)
np.random.seed(42)

# Split into train & test set.
# random_state is passed explicitly so the split itself is reproducible on its own,
# without depending on the state of NumPy's global RNG at the moment this cell runs.
X_train, X_test, y_train, y_test = train_test_split(X, # independent variables
                                                    y, # dependent variable
                                                    test_size=0.2, # fraction of data to use for test set
                                                    random_state=42)

In [None]:
# Training data of independent variable
X_train.head()

In [None]:
# Training data of dependent variable
y_train, len(y_train)

In [None]:
# Testing data of independent variable
X_test.head()

In [None]:
# Testing data of dependent variable
y_test, len(y_test)

### 4.2 Choosing a model
We'll start by trying the following models and comparing their results.

1. Logistic Regression - sklearn.linear_model.LogisticRegression()
2. K-Nearest Neighbors - sklearn.neighbors.KNeighborsClassifier()
3. RandomForest - sklearn.ensemble.RandomForestClassifier()
4. Decision Tree: sklearn.tree.DecisionTreeClassifier()
5. SVC: sklearn.svm.SVC()
6. ANN: sklearn.neural_network.MLPClassifier(max_iter=1000)
7. Naive Bayes (Gaussian): sklearn.naive_bayes.GaussianNB()
8. Naive Bayes (Bernoulli): sklearn.naive_bayes.BernoulliNB()

In [41]:
# Baseline estimators to compare, keyed by the display name used in the later score dicts.
# Only max_iter is set explicitly (on LogisticRegression and MLPClassifier);
# every other hyperparameter is left at the library default.
models = {"KNN": KNeighborsClassifier(),
          "Logistic Regression": LogisticRegression(max_iter=100), # Note: if you see a warning about "convergence not reached", you can increase `max_iter` until convergence is reached
          "Random Forest": RandomForestClassifier(),
          "Decision Tree": DecisionTreeClassifier(),
          "SVC": SVC(),
          "ANN": MLPClassifier(max_iter=1000),
          "Naive Bayes (Gaussian)": GaussianNB(),
          "Naive Bayes (Bernoulli)": BernoulliNB()}

# Create function to fit and score models
def fit_and_score(models, X_train, X_test, y_train, y_test):
    """Fit every model on the training data and score it on the test data.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit(X, y)`` and
        ``score(X, y)``. The estimators are fitted in place.
    X_train, y_train
        Training features and the labels associated with them.
    X_test, y_test
        Test features and the labels associated with them.

    Returns
    -------
    dict
        Maps each name in ``models`` to the value returned by that
        estimator's ``score(X_test, y_test)``, in the same order as ``models``.
    """
    # Reset NumPy's global RNG so repeated calls start from the same random state
    np.random.seed(42)

    results = {}
    for label in models:
        estimator = models[label]
        estimator.fit(X_train, y_train)
        results[label] = estimator.score(X_test, y_test)
    return results

In [None]:
model_scores = fit_and_score(models=models, 
                             X_train=X_train, 
                             X_test=X_test, 
                             y_train=y_train, 
                             y_test=y_test)
model_scores

In [None]:
# Recall (sensitivity) of the positive class (target == 1) for every fitted model.
# average="weighted" is deliberately not used: recall weighted by class support is
# mathematically identical to accuracy, so it would only repeat the accuracy
# figures already stored in `model_scores`.
recall_scores = {
    name: recall_score(y_test, model.predict(X_test))
    for name, model in models.items()
}

recall_scores

### 4.3 Comparing the results of several models

In [None]:
# Bar chart of every model's test-set accuracy (one bar per model)
model_compare = pd.DataFrame(model_scores, index=['accuracy'])
model_compare.T.plot.bar();

# Bar chart of every model's recall. Note that this rebinds `model_compare`:
# after this cell the name refers to the recall frame, not the accuracy one.
model_compare = pd.DataFrame(recall_scores, index=['Recall'])
model_compare.T.plot.bar();

### Some more evaluations

In [64]:
# Support-weighted precision on the test set for every model in `models`
# (the models are expected to be fitted already; predict() is called on each)
precision_scores = {
    name: precision_score(y_test, model.predict(X_test), average='weighted')
    for name, model in models.items()
}

In [65]:
# Support-weighted F1 score on the test set for every model in `models`
f1_scores = {
    name: f1_score(y_test, model.predict(X_test), average='weighted')
    for name, model in models.items()
}

In [66]:
# Specificity (true-negative rate) of every fitted model, reading the negatives
# from row 0 of the confusion matrix.
specificity_scores = {}
for name, model in models.items():
    cm = confusion_matrix(y_test, model.predict(X_test))
    tn = cm[0][0]  # row 0, column 0
    fp = cm[0][1]  # row 0, column 1
    # The value is stored under its own name rather than `specificity`, so that
    # re-running this cell cannot overwrite the `specificity()` helper function
    # defined further down the notebook.
    model_specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    specificity_scores[name] = model_specificity

In [67]:
# Classification error per model = 1 - accuracy, derived from `model_scores`
classification_error_scores = {
    name: 1 - accuracy
    for name, accuracy in model_scores.items()
}

In [None]:
# One row per model, one column per metric, every value expressed as a percentage
model_comparison = pd.DataFrame(
    data={
        "Accuracy (%)": [100 * value for value in model_scores.values()],
        "Recall (%)": [100 * value for value in recall_scores.values()],
        "Precision (%)": [100 * value for value in precision_scores.values()],
        "F1 Score (%)": [100 * value for value in f1_scores.values()],
        "Specificity (%)": [100 * value for value in specificity_scores.values()],
        "Classification Error (%)": [100 * value for value in classification_error_scores.values()],
    },
    index=model_scores.keys(),
)

# Plain-text dump of the table
print(model_comparison)

# Rich display with every column rounded to two decimals
model_comparison.style.format(
    {column: "{:.2f}" for column in model_comparison.columns}
)

# 5. Hyperparameter tuning and cross-validation

### 5.1 Tune KNeighborsClassifier (K-Nearest Neighbors or KNN) by hand

In [69]:
# Candidate values for n_neighbors
neighbors = range(1, 21) # 1 to 20

# Scores for each candidate, stored in the same order as `neighbors`
train_scores = []
test_scores = []

# A single estimator is reused; only n_neighbors changes between fits
knn = KNeighborsClassifier()

for k in neighbors:
    # Reconfigure and refit on the training data
    knn.set_params(n_neighbors=k)
    knn.fit(X_train, y_train)

    # Record the score on the data the model was fitted on and on the held-out data
    train_scores.append(knn.score(X_train, y_train))
    test_scores.append(knn.score(X_test, y_test))

In [None]:
# KNN's train scores
train_scores

In [None]:
# Plot KNN train and test scores against the number of neighbors
plt.plot(neighbors, train_scores, label="Train score")
plt.plot(neighbors, test_scores, label="Test score")
plt.xticks(np.arange(1, 21, 1)) # one tick per candidate value (1 to 20)
plt.xlabel("Number of neighbors")
plt.ylabel("Model score")
plt.legend()

# Best score reached on the test data across all tried values of n_neighbors
print(f"Maximum KNN score on the test data: {max(test_scores)*100:.2f}%")

### 5.2 Tuning models with RandomizedSearchCV

In [72]:
# Hyperparameter search spaces, one dict per estimator. Keys are constructor
# argument names of the estimator, values are the candidate settings to sample from.

# Different LogisticRegression hyperparameters
# (20 values of C, log-spaced between 1e-4 and 1e4, with a single solver)
log_reg_grid = {"C": np.logspace(-4, 4, 20),
                "solver": ["liblinear"]}

# Different RandomForestClassifier hyperparameters
rf_grid = {"n_estimators": np.arange(10, 1000, 50),
           "max_depth": [None, 3, 5, 10],
           "min_samples_split": np.arange(2, 20, 2),
           "min_samples_leaf": np.arange(1, 20, 2)}

# Define DecisionTreeClassifier hyperparameter
dt_grid = {
    "max_depth": [None, 3, 5, 10, 20],
    "min_samples_split": np.arange(2, 20, 2),
    "min_samples_leaf": np.arange(1, 20, 2),
    "max_features": [None, "sqrt", "log2"]
}

# Define SVC hyperparameters
# NOTE(review): the only search over svc_grid in the visible notebook is commented out
svc_grid = {
    "C": np.logspace(-4, 4, 20),
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
    "gamma": ["scale", "auto"]
}

# Different KNN hyperparameters
knn_grid = {
    "n_neighbors": np.arange(1, 21),
    "weights": ["uniform", "distance"],
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    "leaf_size": np.arange(20, 60, 5),
    "p": [1, 2]
}

# Define ANN hyperparameters
# NOTE(review): no RandomizedSearchCV in the visible notebook consumes this dict;
# the name is rebound to a smaller grid before the ANN GridSearchCV - confirm it is still needed
ann_grid = {
    "hidden_layer_sizes": [(50,), (100,), (50, 50), (100, 100)],  # Different layer sizes
    "activation": ["tanh", "relu"],  # Activation functions
    "solver": ["adam", "sgd"],  # Optimization solvers
    "alpha": np.logspace(-4, 4, 20),  # Regularization term
    "learning_rate": ["constant", "adaptive"],  # Learning rate strategy
    "max_iter": [500, 1000]  # Max iterations for convergence
}

# Define Naive Bayes (Gaussian) hyperparameters
# (a single parameter with 10 candidate values, so the whole space has 10 combinations)
nb_gaussian_grid = {
    "var_smoothing": np.logspace(-9, 0, 10)  # Smoothing parameter
}

# Define Naive Bayes (Bernoulli) hyperparameters
nb_bernoulli_grid = {
    "alpha": np.logspace(-4, 4, 20),  # Additive smoothing parameter
    "binarize": [0.0, 0.1, 0.2, 0.3],  # Threshold for binarizing input
}

In [None]:
%%time 

# Setup random seed
# (the search below is given no random_state, so this global seed is what makes
# the sampled candidates repeatable)
np.random.seed(42)

# Setup random hyperparameter search for LogisticRegression:
# 20 sampled candidates from log_reg_grid, each evaluated with 5-fold cross-validation
rs_log_reg = RandomizedSearchCV(LogisticRegression(),
                                param_distributions=log_reg_grid,
                                cv=5,
                                n_iter=20,
                                verbose=True)

# Fit random hyperparameter search model
rs_log_reg.fit(X_train, y_train);

In [None]:
rs_log_reg.best_params_

In [None]:
rs_log_reg.score(X_test, y_test)

In [None]:
%%time 

# Setup random seed
# (neither the search nor the forest is given a random_state, so both rely on
# this global seed for repeatable results)
np.random.seed(42)

# Setup random hyperparameter search for RandomForestClassifier:
# 20 sampled candidates from rf_grid, each evaluated with 5-fold cross-validation
rs_rf = RandomizedSearchCV(RandomForestClassifier(),
                           param_distributions=rf_grid,
                           cv=5,
                           n_iter=20,
                           verbose=True)

# Fit random hyperparameter search model
rs_rf.fit(X_train, y_train);

In [None]:
# Find the best parameters
rs_rf.best_params_

In [None]:
# Evaluate the randomized search random forest model
rs_rf.score(X_test, y_test)

In [None]:
%%time

# Setup random seed
# (the tree itself has no random_state, so it draws from this global seed;
# the candidate sampling is controlled separately by random_state=42 below)
np.random.seed(42)

# Setup random hyperparameter search for DecisionTreeClassifier:
# 20 sampled candidates from dt_grid, each evaluated with 5-fold cross-validation
rs_dt = RandomizedSearchCV(
    DecisionTreeClassifier(),
    param_distributions=dt_grid,
    cv=5,
    n_iter=20,
    verbose=True,
    random_state=42
)

# Fit the random hyperparameter search model
rs_dt.fit(X_train, y_train)

In [None]:
# Find the best parameters
rs_dt.best_params_

In [None]:
# Evaluate the randomized search Decision Tree model
rs_dt.score(X_test, y_test)

In [83]:
# %%time 

# # Setup random seed
# np.random.seed(42)

# # Setup random hyperparameter search for SVC
# rs_svc = RandomizedSearchCV(SVC(),
#                             param_distributions=svc_grid,
#                             cv=5,
#                             n_iter=20,
#                             verbose=True)

# # Fit random hyperparameter search model
# rs_svc.fit(X_train, y_train)

In [84]:
# # Find the best parameters
# rs_svc.best_params_

In [85]:
# # Evaluate the randomized search SVC model
# rs_svc.score(X_test, y_test)

In [None]:
%%time 

# Setup random seed
# (the search below is given no random_state, so this global seed is what makes
# the sampled candidates repeatable)
np.random.seed(42)

# Setup random hyperparameter search for KNN:
# 20 sampled candidates from knn_grid, each evaluated with 5-fold cross-validation
rs_knn = RandomizedSearchCV(KNeighborsClassifier(),
                            param_distributions=knn_grid,
                            cv=5,
                            n_iter=20,
                            verbose=True)

# Fit random hyperparameter search model
rs_knn.fit(X_train, y_train)

In [None]:
# Find the best parameters
rs_knn.best_params_

In [None]:
# Evaluate the randomized search KNN model
rs_knn.score(X_test, y_test)

In [None]:
%%time

# Setup random hyperparameter search for GaussianNB.
# nb_gaussian_grid holds a single parameter with only 10 candidate values, so a
# fixed n_iter=20 asks for more candidates than exist (scikit-learn warns about
# this). n_iter is therefore capped at the size of the search space.
rs_nb_gaussian = RandomizedSearchCV(GaussianNB(),
                                    param_distributions=nb_gaussian_grid,
                                    cv=5,
                                    n_iter=min(20, len(nb_gaussian_grid["var_smoothing"])),
                                    verbose=True,
                                    random_state=42)

# Fit random hyperparameter search model
rs_nb_gaussian.fit(X_train, y_train)

In [None]:
# Find the best parameters for Naive Bayes (Gaussian)
print("Best parameters for Gaussian Naive Bayes:", rs_nb_gaussian.best_params_)

In [None]:
# Evaluate the model on the test set
print("Naive Bayes (Gaussian) model test score:", rs_nb_gaussian.score(X_test, y_test))

In [None]:
%%time 
# Setup random hyperparameter search for BernoulliNB:
# 20 candidates sampled from nb_bernoulli_grid (20 alpha values x 4 binarize thresholds),
# each evaluated with 5-fold cross-validation; random_state fixes which candidates are drawn
rs_nb_bernoulli = RandomizedSearchCV(BernoulliNB(), 
                                     param_distributions=nb_bernoulli_grid, 
                                     cv=5, 
                                     n_iter=20, 
                                     verbose=True, 
                                     random_state=42)

# Fit random hyperparameter search model
rs_nb_bernoulli.fit(X_train, y_train)

In [None]:
# Find the best parameters for Naive Bayes (Bernoulli)
print("Best parameters for Bernoulli Naive Bayes:", rs_nb_bernoulli.best_params_)

In [None]:
# Evaluate the model on the test set
print("Naive Bayes (Bernoulli) model test score:", rs_nb_bernoulli.score(X_test, y_test))

In [None]:
# Define function to calculate classification error and specificity
def classification_error(y_true, y_pred):
    """Return the fraction of misclassified samples, i.e. ``1 - accuracy``.

    ``y_true`` are the reference labels and ``y_pred`` the predicted labels;
    both are passed straight to ``accuracy_score``.
    """
    accuracy = accuracy_score(y_true, y_pred)
    return 1 - accuracy

def specificity(y_true, y_pred):
    """Return the specificity (true-negative rate) averaged over all classes.

    Each class is treated in turn as the "positive" class (one-vs-rest) and its
    specificity ``TN / (TN + FP)`` is computed from the confusion matrix; the
    unweighted mean over the classes is returned.

    Note that for a two-class problem this is the mean of the true-negative
    rates of both classes, not the single ``TN / (TN + FP)`` figure obtained by
    fixing one class as the negative class.

    Parameters
    ----------
    y_true : array-like
        Reference labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    float
        Mean per-class specificity.
    """
    # Rows are true labels, columns are predicted labels
    cm = confusion_matrix(y_true, y_pred)
    tp = cm.diagonal()
    fp = cm.sum(axis=0) - tp  # predicted as the class but actually another class
    fn = cm.sum(axis=1) - tp  # actually the class but predicted as another class
    # Everything that neither belongs to the class nor was predicted as it.
    # (The previous version used the row sum minus the diagonal here, which is
    # the false-negative count, not the true-negative count.)
    tn = cm.sum() - tp - fp - fn
    specificity_per_class = tn / (tn + fp + 1e-10)  # Avoid division by zero
    return specificity_per_class.mean()  # Average specificity across all classes

# Tuned searches to compare, keyed by the row label used in the results table
tuned_searches = {
    "Logistic Regression": rs_log_reg,
    "KNN": rs_knn,
    "Random Forest": rs_rf,
    "Decision Tree": rs_dt,
    "Naive Bayes (GaussianNB)": rs_nb_gaussian,
    "Naive Bayes (BernoulliNB)": rs_nb_bernoulli,
}

tuned_model_scores = {}
for name, search in tuned_searches.items():
    # Predict once per model and reuse the predictions for every metric
    # (previously predict() was called six times per model)
    y_pred = search.predict(X_test)
    tuned_model_scores[name] = {
        "Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred, average="weighted"),
        # Sensitivity is the recall of the positive class (target == 1).
        # average="weighted" is not used here because support-weighted recall is
        # mathematically identical to accuracy and would duplicate that column.
        "Recall (Sensitivity)": recall_score(y_test, y_pred),
        "Classification Error": classification_error(y_test, y_pred),
        "Specificity": specificity(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred, average="weighted"),
    }

# Convert results to a DataFrame (one row per model)
tuned_model_comparison = pd.DataFrame(tuned_model_scores).T * 100  # Convert to percentages

# Display the DataFrame
print(tuned_model_comparison)

# Optional: Pretty-print the table
tuned_model_comparison.style.format({"Accuracy": "{:.2f}%",
    "Precision": "{:.2f}%",
    "Recall (Sensitivity)": "{:.2f}%",
    "Classification Error": "{:.2f}%",
    "Specificity": "{:.2f}%",
    "F1 Score": "{:.2f}%"})

### Tuning K-Nearest Neighbors (KNN) with VotingClassifier

In [None]:

# Define other classifiers for VotingClassifier
log_reg = LogisticRegression(max_iter=100)
rf = RandomForestClassifier(n_estimators=100)
svc = SVC()

# Setup VotingClassifier with KNN
# NOTE(review): despite the heading and the variable name, the ensemble below is
# built from log_reg, rf and svc only - no KNeighborsClassifier is included.
# Confirm whether KNN was meant to be one of the voters.
voting_knn = VotingClassifier(estimators=[('log_reg', log_reg), ('rf', rf), ('svc', svc)], voting='hard')

# Setup random hyperparameter search for VotingClassifier
# (parameter names use the "<estimator name>__<parameter>" form to reach into each voter)
rs_voting_knn = RandomizedSearchCV(voting_knn,
                                   param_distributions={'log_reg__C': np.logspace(-4, 4, 20),
                                                        'svc__C': np.logspace(-4, 4, 20),
                                                        'rf__max_depth': [None, 5, 10, 15],
                                                        'rf__min_samples_split': np.arange(2, 20, 2)},
                                   cv=5,
                                   n_iter=20,
                                   verbose=True,
                                   random_state=42)

# Fit the RandomizedSearchCV model
rs_voting_knn.fit(X_train, y_train)

# Best hyperparameters and model score
print("Best parameters for VotingClassifier with KNN:", rs_voting_knn.best_params_)
print("VotingClassifier (KNN) model test score:", rs_voting_knn.score(X_test, y_test))

### Tuning DecisionTreeClassifier with RandomizedSearchCV

In [None]:
# Hyperparameter grid for DecisionTreeClassifier.
# This rebinds `dt_grid` from section 5.2; the only difference is the added `criterion` entry.
dt_grid = {
    "max_depth": [None, 3, 5, 10, 20],  # Vary depth of the tree
    "min_samples_split": np.arange(2, 20, 2),  # Minimum samples required to split a node
    "min_samples_leaf": np.arange(1, 20, 2),  # Minimum samples required to be at a leaf node
    "max_features": [None, "sqrt", "log2"],  # Features to consider for best split
    "criterion": ["gini", "entropy"],  # Split quality criteria
}

# Randomized search over the grid above. Only a DecisionTreeClassifier is tuned here
# (no RandomForestClassifier is involved), and `rs_dt` from section 5.2 is overwritten.
rs_dt = RandomizedSearchCV(DecisionTreeClassifier(), param_distributions=dt_grid, n_iter=20, cv=5, verbose=True, random_state=42)

# Fit the model
rs_dt.fit(X_train, y_train)

# Best parameters for DecisionTreeClassifier
print("Best parameters for DecisionTreeClassifier:", rs_dt.best_params_)

# Evaluate the tuned DecisionTreeClassifier
print("DecisionTreeClassifier test score:", rs_dt.score(X_test, y_test))

###  Tuning Logistic Regression with VotingClassifier 

In [None]:
# Define classifiers for VotingClassifier
log_reg = LogisticRegression(max_iter=100)
svc = SVC()
rf = RandomForestClassifier(n_estimators=100)

# Setup VotingClassifier with Logistic Regression
# NOTE(review): `log_reg` is created above but is not passed to `estimators`; the
# ensemble only contains svc and rf. Confirm whether Logistic Regression was meant
# to be a voter. With two voters, hard voting can also end in a tie - confirm how
# ties should be resolved.
voting_log_reg = VotingClassifier(estimators=[('svc', svc), ('rf', rf)], voting='hard')

# Setup random hyperparameter search for VotingClassifier with Logistic Regression
# (parameter names use the "<estimator name>__<parameter>" form to reach into each voter)
rs_voting_log_reg = RandomizedSearchCV(voting_log_reg,
                                      param_distributions={'svc__C': np.logspace(-4, 4, 20),
                                                           'rf__max_depth': [None, 5, 10],
                                                           'rf__min_samples_split': np.arange(2, 20, 2)},
                                      cv=5,
                                      n_iter=20,
                                      verbose=True,
                                      random_state=42)

# Fit the RandomizedSearchCV model
rs_voting_log_reg.fit(X_train, y_train)

# Best hyperparameters and model score
print("Best parameters for VotingClassifier with Logistic Regression:", rs_voting_log_reg.best_params_)
print("VotingClassifier (Logistic Regression) model test score:", rs_voting_log_reg.score(X_test, y_test))

### Tuning Support Vector Classifier (SVC) with VotingClassifier

In [None]:
# Define classifiers for VotingClassifier
log_reg = LogisticRegression(max_iter=100)
rf = RandomForestClassifier(n_estimators=100)
svc = SVC()

# Setup VotingClassifier with SVC
# NOTE(review): `svc` is created above but is not passed to `estimators`; the
# ensemble only contains log_reg and rf. Confirm whether SVC was meant to be a
# voter. With two voters, hard voting can also end in a tie - confirm how ties
# should be resolved.
voting_svc = VotingClassifier(estimators=[('log_reg', log_reg), ('rf', rf)], voting='hard')

# Setup random hyperparameter search for VotingClassifier with SVC
# (parameter names use the "<estimator name>__<parameter>" form to reach into each voter)
rs_voting_svc = RandomizedSearchCV(voting_svc,
                                   param_distributions={'log_reg__C': np.logspace(-4, 4, 20),
                                                        'rf__max_depth': [None, 5, 10],
                                                        'rf__min_samples_split': np.arange(2, 20, 2)},
                                   cv=5,
                                   n_iter=20,
                                   verbose=True,
                                   random_state=42)

# Fit the RandomizedSearchCV model
rs_voting_svc.fit(X_train, y_train)

# Best hyperparameters and model score
print("Best parameters for VotingClassifier with SVC:", rs_voting_svc.best_params_)
print("VotingClassifier (SVC) model test score:", rs_voting_svc.score(X_test, y_test))

<!-- ### Tuning SVC with BaggingClassifier -->

In [101]:
# # Define hyperparameters for SVC
# svc_grid = {
#     "base_estimator__C": np.logspace(-4, 4, 20),  # C parameter for SVC
#     "base_estimator__kernel": ["linear", "poly", "rbf", "sigmoid"],  # Kernel for SVC
#     "base_estimator__gamma": ["scale", "auto"],  # Gamma for SVC
#     "n_estimators": np.arange(10, 200, 20),  # Number of estimators in BaggingClassifier
# }

# # Setup BaggingClassifier with SVC as the base estimator
# bagging_svc = BaggingClassifier(base_estimator=SVC(), random_state=42)

# # Setup random hyperparameter search for BaggingClassifier with SVC
# rs_bagging_svc = RandomizedSearchCV(bagging_svc,
#                                     param_distributions=svc_grid,
#                                     cv=5,
#                                     n_iter=20,
#                                     verbose=True,
#                                     random_state=42)

# # Fit the RandomizedSearchCV model
# rs_bagging_svc.fit(X_train, y_train)

# # Best hyperparameters and model score
# print("Best parameters for BaggingClassifier with SVC:", rs_bagging_svc.best_params_)
# print("BaggingClassifier (SVC) model test score:", rs_bagging_svc.score(X_test, y_test))

### 5.3 Tuning models with GridSearchCV

The difference between RandomizedSearchCV and GridSearchCV is:

- sklearn.model_selection.RandomizedSearchCV searches over a grid of hyperparameters performing n_iter combinations (e.g. will explore random combinations of the hyperparameters for a defined number of iterations).
- sklearn.model_selection.GridSearchCV will test every single possible combination of hyperparameters in the grid (this is a thorough test but can take quite a long time).

In [None]:
%%time

# Setup random seed
np.random.seed(42) 

# Different LogisticRegression hyperparameters
# (same values as the log_reg_grid defined in section 5.2; 20 combinations in total)
log_reg_grid = {"C": np.logspace(-4, 4, 20),
                "solver": ["liblinear"]}

# Setup grid hyperparameter search for LogisticRegression:
# every combination in log_reg_grid is evaluated with 5-fold cross-validation
gs_log_reg = GridSearchCV(LogisticRegression(),
                          param_grid=log_reg_grid,
                          cv=5,
                          verbose=True)

# Fit grid hyperparameter search model
gs_log_reg.fit(X_train, y_train);

In [None]:
# Check the best parameters
gs_log_reg.best_params_

In [None]:
# Evaluate the model
gs_log_reg.score(X_test, y_test)

In [None]:
%%time

# Define hyperparameters for KNN
# The full grid has 20 * 2 * 4 * 8 * 2 = 2560 combinations, each fitted 5 times (cv=5).
# NOTE(review): `algorithm` and `leaf_size` presumably only change how neighbours are
# looked up, not which neighbours are found - confirm whether they need to be searched.
knn_grid = {
    "n_neighbors": np.arange(1, 21),  # Number of neighbors to use
    "weights": ["uniform", "distance"],  # Weighting function for neighbors
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],  # Algorithm for computing nearest neighbors
    "leaf_size": np.arange(20, 60, 5),  # Leaf size for tree algorithms
    "p": [1, 2]  # Power parameter for Minkowski distance (1=Manhattan, 2=Euclidean)
}

# Setup grid hyperparameter search for KNN
gs_knn = GridSearchCV(KNeighborsClassifier(),
                      param_grid=knn_grid,
                      cv=5,  # 5-fold cross-validation
                      verbose=True)

# Fit grid hyperparameter search for KNN
gs_knn.fit(X_train, y_train)

# Best hyperparameters and model evaluation
print("Best parameters for KNN:", gs_knn.best_params_)
knn_best_model = gs_knn.best_estimator_
print("KNN test score:", knn_best_model.score(X_test, y_test))

# Evaluate KNN model
# (knn_accuracy and knn_recall are only assigned here; nothing in this cell displays them)
knn_predictions = knn_best_model.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_predictions)
knn_recall = recall_score(y_test, knn_predictions, average="weighted")

In [None]:
%%time

# Define hyperparameters for RandomForestClassifier
# (rebinds `rf_grid` from section 5.2 to a much smaller grid: 2 * 3 * 2 * 2 * 1 = 24 combinations)
rf_grid = {
    "n_estimators": [100, 200],  # Number of trees in the forest
    "max_depth": [None, 10, 20],  # Maximum depth of the tree
    "min_samples_split": [2, 10],  # Minimum number of samples required to split a node
    "min_samples_leaf": [1, 4],  # Minimum number of samples required at each leaf node
    "bootstrap": [True]  # Whether bootstrap samples are used
}

# Setup grid hyperparameter search for RandomForestClassifier
# (the forest gets a fixed random_state, so no global seed is needed in this cell)
gs_rf = GridSearchCV(RandomForestClassifier(random_state=42),
                     param_grid=rf_grid,
                     cv=5,  # 5-fold cross-validation
                     verbose=True)

# Fit grid hyperparameter search for RandomForestClassifier
gs_rf.fit(X_train, y_train)

# Best hyperparameters and model evaluation
# (rf_best_model is stored for later use; the score below is taken from gs_rf itself)
print("Best parameters for RandomForestClassifier:", gs_rf.best_params_)
rf_best_model = gs_rf.best_estimator_
print("RandomForestClassifier test score:", gs_rf.score(X_test, y_test))

In [None]:
%%time

# Setup random seed
# (MLPClassifier below is created without a random_state, so it relies on this global seed)
np.random.seed(42)

# Define hyperparameters for ANN (MLPClassifier)
# (rebinds `ann_grid` from section 5.2 to a reduced grid: 2 layer sizes x 3 alpha values = 6 combinations)
ann_grid = {
    "hidden_layer_sizes": [(50,), (100,)],  # Different layer sizes
    "activation": ["relu"],  # Activation functions
    "solver": ["adam"],  # Optimization solvers
    "alpha":  [0.001, 0.01, 0.1],  # Focused range
    "learning_rate": ["adaptive"],  # Learning rate strategy
    "max_iter": [500]  # Max iterations for convergence
}

# Setup grid hyperparameter search for ANN (MLPClassifier)
gs_ann = GridSearchCV(MLPClassifier(),
                      param_grid=ann_grid,
                      cv=5,
                      verbose=True)

# Fit grid hyperparameter search for ANN
gs_ann.fit(X_train, y_train)

# Report the winning combination and its score on the held-out test data
print("Best parameters for ANN:", gs_ann.best_params_)
print("ANN test score:", gs_ann.score(X_test, y_test))


In [None]:
%%time

# Define hyperparameters for Naive Bayes (GaussianNB)
# (same 10 candidate values as the nb_gaussian_grid defined in section 5.2)
nb_gaussian_grid = {
    "var_smoothing": np.logspace(-9, 0, 10)  # Smoothing parameter
}

# Setup grid hyperparameter search for Naive Bayes (GaussianNB):
# every value in the grid is evaluated with 5-fold cross-validation
gs_nb_gaussian = GridSearchCV(GaussianNB(),
                              param_grid=nb_gaussian_grid,
                              cv=5,
                              verbose=True)

# Fit grid hyperparameter search for Naive Bayes (GaussianNB)
gs_nb_gaussian.fit(X_train, y_train)

# Report the winning value and its score on the held-out test data
print("Best parameters for Naive Bayes (GaussianNB):", gs_nb_gaussian.best_params_)
print("Naive Bayes (GaussianNB) test score:", gs_nb_gaussian.score(X_test, y_test))


In [None]:
%%time

# Define hyperparameters for Naive Bayes (BernoulliNB)
# (same values as the nb_bernoulli_grid defined in section 5.2: 20 * 4 = 80 combinations)
nb_bernoulli_grid = {
    "alpha": np.logspace(-4, 4, 20),  # Additive smoothing parameter
    "binarize": [0.0, 0.1, 0.2, 0.3],  # Threshold for binarizing input
}

# Setup grid hyperparameter search for Naive Bayes (BernoulliNB):
# every combination in the grid is evaluated with 5-fold cross-validation
gs_nb_bernoulli = GridSearchCV(BernoulliNB(),
                               param_grid=nb_bernoulli_grid,
                               cv=5,
                               verbose=True)

# Fit grid hyperparameter search for Naive Bayes (BernoulliNB)
gs_nb_bernoulli.fit(X_train, y_train)

# Report the winning combination and its score on the held-out test data
print("Best parameters for Naive Bayes (BernoulliNB):", gs_nb_bernoulli.best_params_)
print("Naive Bayes (BernoulliNB) test score:", gs_nb_bernoulli.score(X_test, y_test))

In [None]:
# Define a function to calculate classification error
def classification_error(y_true, y_pred):
    """Return the misclassification rate, i.e. ``1 - accuracy``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    float
        Fraction of samples whose prediction differs from the true label.
    """
    accuracy = accuracy_score(y_true, y_pred)
    return 1 - accuracy

# Define a function to calculate specificity
def specificity(y_true, y_pred):
    """Return the macro-averaged specificity (true-negative rate).

    Each class is treated one-vs-rest: its specificity is ``TN / (TN + FP)``,
    and the unweighted mean over all classes is returned.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_true``.

    Returns
    -------
    float
        Mean per-class specificity in ``[0, 1]``. A class with no negative
        samples at all (``TN + FP == 0``) contributes 0.
    """
    # scikit-learn layout: rows are true classes, columns are predicted classes
    cm = confusion_matrix(y_true, y_pred)
    tp = cm.diagonal()
    fp = cm.sum(axis=0) - tp  # predicted as the class but actually another class
    fn = cm.sum(axis=1) - tp  # actually the class but predicted as another class
    # Everything that is neither in the class's row nor in its column.
    # (Row sum minus the diagonal is FN, not TN.)
    tn = cm.sum() - (tp + fp + fn)
    negatives = tn + fp
    # Safe division: leave 0 where a class has no negatives instead of dividing by zero
    specificity_per_class = np.divide(
        tn,
        negatives,
        out=np.zeros(len(tp), dtype=float),
        where=negatives > 0,
    )
    return specificity_per_class.mean()  # Average specificity across all classes

def _evaluate_tuned_model(model, X_eval, y_eval):
    """Score one fitted model on held-out data.

    The model predicts once and every metric is derived from that single
    prediction vector, instead of calling ``predict`` again for each metric.

    Parameters
    ----------
    model : fitted estimator or search object exposing ``predict``
    X_eval : features to predict on
    y_eval : true labels aligned with ``X_eval``

    Returns
    -------
    dict
        Metric name -> value (as a fraction, not a percentage), in the column
        order used by the comparison table.
    """
    y_pred = model.predict(X_eval)
    return {
        "Accuracy": accuracy_score(y_eval, y_pred),
        "Precision": precision_score(y_eval, y_pred, average="weighted"),
        "Sensitivity (Recall)": recall_score(y_eval, y_pred, average="weighted"),
        "Classification Error": classification_error(y_eval, y_pred),
        "Specificity": specificity(y_eval, y_pred),
        "F1 Measure": f1_score(y_eval, y_pred, average="weighted"),
    }


# Tuned search objects, keyed by the row label used in the comparison table
tuned_models = {
    "Logistic Regression": gs_log_reg,
    "KNN": gs_knn,
    "Random Forest": rs_rf,
    "Decision Tree": rs_dt,
    "Naive Bayes (GaussianNB)": gs_nb_gaussian,
    "Naive Bayes (BernoulliNB)": gs_nb_bernoulli,
    "ANN": gs_ann,
}

# Evaluate every tuned model on the test split
tuned_model_scores = {
    model_name: _evaluate_tuned_model(model, X_test, y_test)
    for model_name, model in tuned_models.items()
}

# Convert results to a DataFrame (one row per model) and scale to percentages
tuned_model_comparison = pd.DataFrame(tuned_model_scores).T * 100

# Display the DataFrame
print(tuned_model_comparison)

# Optional: Pretty-print the table with every metric shown as a 2-decimal percentage
tuned_model_comparison.style.format(
    {metric: "{:.2f}%" for metric in tuned_model_comparison.columns}
)

# 6. Evaluating a classification model, beyond accuracy

In [111]:
# Make predictions on the test data with the tuned logistic regression search object
y_preds = gs_log_reg.predict(X_test)

In [None]:
# Inspect the predicted labels for the test split
y_preds

In [None]:
# Inspect the true test labels for comparison with y_preds
y_test

### 6.1 ROC Curve and AUC Scores

In [None]:
from sklearn.metrics import RocCurveDisplay 

# Plot the ROC curve of the tuned logistic regression on the test split
# (from_estimator() takes the fitted model plus the data to evaluate it on)
RocCurveDisplay.from_estimator(gs_log_reg, X_test, y_test);

### 6.2 Creating a confusion matrix

In [None]:
# Display confusion matrix (scikit-learn layout: rows = true labels, columns = predicted labels)
print(confusion_matrix(y_test, y_preds))

In [None]:
import seaborn as sns
# Scale seaborn's fonts up by 1.5x before drawing the confusion-matrix heatmap
sns.set(font_scale=1.5) # Increase font size

def plot_conf_mat(y_test, y_preds):
    """
    Plots a confusion matrix using Seaborn's heatmap().

    The matrix comes from sklearn's confusion_matrix(y_test, y_preds), whose
    rows are the true labels and whose columns are the predicted labels, so
    the heatmap's y-axis is labelled "true label" and its x-axis
    "predicted label".

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_preds : array-like
        Predicted labels, aligned with ``y_test``.

    Returns
    -------
    None
        The figure is drawn as a side effect.
    """
    fig, ax = plt.subplots(figsize=(3, 3))
    sns.heatmap(confusion_matrix(y_test, y_preds),
                annot=True, # Annotate the boxes
                fmt="d",    # Counts are integers; avoid scientific notation for large cells
                cbar=False,
                ax=ax)      # Draw on the axes created above rather than the implicit current axes
    # Heatmap rows map to the y-axis (true labels), columns to the x-axis (predictions)
    ax.set_xlabel("predicted label")
    ax.set_ylabel("true label")
    
# Draw the confusion matrix for the test-split predictions
plot_conf_mat(y_test, y_preds)

### 6.3 Classification report

In [None]:
# Show classification report for the test-split predictions
# (computed on a single train/test split, not cross-validated)
print(classification_report(y_test, y_preds))

In [None]:
# Check best hyperparameters found by the logistic regression grid search
gs_log_reg.best_params_

In [119]:
# Import cross_val_score
from sklearn.model_selection import cross_val_score

# Instantiate best model with best hyperparameters (found with GridSearchCV)
# NOTE(review): C and solver are hard-coded here, presumably copied from
# gs_log_reg.best_params_ - re-check them whenever the grid search is re-run,
# or consider LogisticRegression(**gs_log_reg.best_params_) to keep them in sync.
clf = LogisticRegression(C=0.23357214690901212,
                         solver="liblinear")

In [None]:
%%time

# Accuracy of clf on each fold of a 5-fold cross-validation over the full X, y
# (cv=5 is also scikit-learn's default)
cv_acc = cross_val_score(estimator=clf, X=X, y=y, scoring="accuracy", cv=5)
cv_acc

In [None]:
# Collapse the per-fold accuracies into their mean.
# Note: this rebinds cv_acc from the array of fold scores to a single number.
cv_acc = np.mean(cv_acc)
cv_acc

In [None]:
# Mean precision of clf over a 5-fold cross-validation on the full X, y
cv_precision = cross_val_score(clf, X, y, scoring="precision", cv=5).mean()
cv_precision

In [None]:
# Mean recall of clf over a 5-fold cross-validation on the full X, y
cv_recall = cross_val_score(clf, X, y, scoring="recall", cv=5).mean()
cv_recall

In [None]:
# Mean F1 score of clf over a 5-fold cross-validation on the full X, y
cv_f1 = cross_val_score(clf, X, y, scoring="f1", cv=5).mean()
cv_f1

In [None]:
# Gather the four cross-validated means into a one-row table...
cv_metrics = pd.DataFrame(
    {
        "Accuracy": cv_acc,
        "Precision": cv_precision,
        "Recall": cv_recall,
        "F1": cv_f1,
    },
    index=[0],
)

# ...then transpose so each metric becomes its own bar
cv_metrics.T.plot.bar(title="Cross-Validated Metrics", legend=False);

# 7. Feature importance

In [126]:
# Fit an instance of LogisticRegression (taken from above) on the training split
# so its coefficients can be inspected; the trailing ";" hides the estimator repr
clf.fit(X_train, y_train);

In [None]:
# Check coef_ (the fitted coefficients, one per input feature)
clf.coef_

In [None]:
# Match features to columns
# Prefer the feature names recorded on the fitted model (scikit-learn sets
# feature_names_in_ when it is fit on a DataFrame), so each coefficient is paired
# with the column it was trained on. df.columns also holds non-feature columns
# such as "target" and only lines up with coef_ if those happen to come last, so
# it is kept purely as a fallback.
feature_names = getattr(clf, "feature_names_in_", df.columns)
features_dict = dict(zip(feature_names, list(clf.coef_[0])))
features_dict

In [None]:
# Visualize feature importance: one-row table of coefficients, transposed so
# each feature gets its own bar
features_df = pd.DataFrame(features_dict, index=[0])
features_df.T.plot.bar(title="Feature Importance", legend=False);

In [None]:
# Count rows for each combination of sex and target
pd.crosstab(df["sex"], df["target"])

In [None]:
# Contrast slope (positive coefficient) with target:
# count rows for each combination of slope value and target
pd.crosstab(df["slope"], df["target"])