In [7]:
# Import required libraries
import os
import sys
import numpy as np
import pandas as pd
from IPython.display import display

# Resolve the project layout relative to the notebook's working directory:
# the project root is the parent of the cwd, and the sources live in <root>/src.
cwd = os.getcwd()
project_root = os.path.abspath(os.path.join(cwd, os.pardir))
src_path = os.path.join(project_root, 'src')

# Register src/ on the import path only once, so re-running this cell
# does not add duplicate entries.
if sys.path.count(src_path) == 0:
    sys.path.append(src_path)

In [8]:
# Import Naïve Bayes and Logistic Regression functions
from nbayes import nbayes
from logreg import logreg

# Dataset locations and the hyperparameters used by the evaluation cells
data_path = os.path.join(project_root, '440data')

# Each dataset is addressed by a sub-directory of data_path carrying its name
datasets = {
    name: os.path.join(data_path, name)
    for name in ('volcanoes', 'spam', 'voting')
}

numb_of_bins = 10     # forwarded to nbayes as numb_of_bins
nb_m_estimate = 0.01  # forwarded to nbayes as m
lr_lambda = 0.01      # forwarded to logreg as reg_lambda

In [9]:
# Evaluate both classifiers on the volcanoes dataset and collect one
# metrics record (accuracy, precision, recall, AUC) per model.
results_volcanoes = []

# Naïve Bayes for Volcanoes
print("Naïve Bayes on Volcanoes:")
nb_accuracy, nb_precision, nb_recall, nb_auc = nbayes(
    data_path=datasets['volcanoes'],
    # Cross-validation is turned OFF for this run (the flag is False).
    # NOTE(review): the voting cell passes True — confirm the difference
    # between datasets is intentional.
    use_cross_validation=False,
    numb_of_bins=numb_of_bins,
    m=nb_m_estimate,
)
results_volcanoes.append({
    "Dataset": "volcanoes",
    "Model": "Naïve Bayes",
    "Accuracy": nb_accuracy,
    "Precision": nb_precision,
    "Recall": nb_recall,
    "AUC": nb_auc,
})

# Logistic Regression for Volcanoes
print("\nLogistic Regression on Volcanoes:")
lr_accuracy, lr_precision, lr_recall, lr_auc = logreg(
    data_path=datasets['volcanoes'],
    # Cross-validation is turned OFF here as well, matching the
    # Naïve Bayes run above.
    use_cross_validation=False,
    reg_lambda=lr_lambda,
)
results_volcanoes.append({
    "Dataset": "volcanoes",
    "Model": "Logistic Regression",
    "Accuracy": lr_accuracy,
    "Precision": lr_precision,
    "Recall": lr_recall,
    "AUC": lr_auc,
})

# Display Volcanoes results as a table, one row per model
results_df_volcanoes = pd.DataFrame(results_volcanoes)
display(results_df_volcanoes)


Naïve Bayes on Volcanoes:
Accuracy: 0.633 0.000
Precision: 0.466 0.000
Recall: 0.833 0.000
Area under ROC: 0.766

Logistic Regression on Volcanoes:
Accuracy: 0.820 0.000
Precision: 0.679 0.000
Recall: 0.854 0.000
Area under ROC: 0.913


Unnamed: 0,Dataset,Model,Accuracy,Precision,Recall,AUC
0,volcanoes,Naïve Bayes,0.6329,0.466309,0.833105,0.766455
1,volcanoes,Logistic Regression,0.819812,0.678999,0.853625,0.91279


In [10]:
# Spam dataset: run both classifiers and gather one metrics row per model
results_spam = []

# --- Naïve Bayes ---
print("Naïve Bayes on Spam:")
nb_accuracy, nb_precision, nb_recall, nb_auc = nbayes(
    data_path=datasets['spam'],
    use_cross_validation=False,
    numb_of_bins=numb_of_bins,
    m=nb_m_estimate,
)
results_spam.append(dict(
    Dataset="spam",
    Model="Naïve Bayes",
    Accuracy=nb_accuracy,
    Precision=nb_precision,
    Recall=nb_recall,
    AUC=nb_auc,
))

# --- Logistic Regression ---
print("\nLogistic Regression on Spam:")
lr_accuracy, lr_precision, lr_recall, lr_auc = logreg(
    data_path=datasets['spam'],
    use_cross_validation=False,
    reg_lambda=lr_lambda,
)
results_spam.append(dict(
    Dataset="spam",
    Model="Logistic Regression",
    Accuracy=lr_accuracy,
    Precision=lr_precision,
    Recall=lr_recall,
    AUC=lr_auc,
))

# Tabulate and show the two rows collected above
results_df_spam = pd.DataFrame(data=results_spam)
display(results_df_spam)


Naïve Bayes on Spam:
Accuracy: 0.706 0.000
Precision: 0.743 0.000
Recall: 0.810 0.000
Area under ROC: 0.539

Logistic Regression on Spam:
Accuracy: 0.693 0.000
Precision: 0.730 0.000
Recall: 0.808 0.000
Area under ROC: 0.741


Unnamed: 0,Dataset,Model,Accuracy,Precision,Recall,AUC
0,spam,Naïve Bayes,0.7063,0.743244,0.810287,0.538927
1,spam,Logistic Regression,0.69332,0.730478,0.807591,0.741435


In [11]:
# Voting dataset: run both classifiers (with cross-validation requested)
# and gather one metrics row per model
results_voting = []

# Naïve Bayes on the voting data
print("Naïve Bayes on Voting:")
nb_accuracy, nb_precision, nb_recall, nb_auc = nbayes(
    data_path=datasets['voting'],
    use_cross_validation=True,
    numb_of_bins=numb_of_bins,
    m=nb_m_estimate,
)
# Pair the column names with the values just returned
results_voting.append(dict(zip(
    ("Dataset", "Model", "Accuracy", "Precision", "Recall", "AUC"),
    ("voting", "Naïve Bayes", nb_accuracy, nb_precision, nb_recall, nb_auc),
)))

# Logistic Regression on the voting data
print("\nLogistic Regression on Voting:")
lr_accuracy, lr_precision, lr_recall, lr_auc = logreg(
    data_path=datasets['voting'],
    use_cross_validation=True,
    reg_lambda=lr_lambda,
)
results_voting.append(dict(zip(
    ("Dataset", "Model", "Accuracy", "Precision", "Recall", "AUC"),
    ("voting", "Logistic Regression", lr_accuracy, lr_precision, lr_recall, lr_auc),
)))

# Render the collected rows as a table
results_df_voting = pd.DataFrame(data=results_voting)
display(results_df_voting)


Naïve Bayes on Voting:
Accuracy: 0.977 0.016
Precision: 0.971 0.035
Recall: 0.979 0.019
Area under ROC: 0.963

Logistic Regression on Voting:
Accuracy: 0.984 0.005
Precision: 1.000 0.000
Recall: 0.964 0.012
Area under ROC: 0.996


Unnamed: 0,Dataset,Model,Accuracy,Precision,Recall,AUC
0,voting,Naïve Bayes,0.977245,0.971412,0.979487,0.962916
1,voting,Logistic Regression,0.984116,1.0,0.964359,0.995557
