# Evaluation

In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix
)

In [13]:
X_train = pd.read_csv("../2_data/X_train.csv")
X_test = pd.read_csv("../2_data/X_test.csv")
y_train = pd.read_csv("../2_data/y_train.csv")
y_test = pd.read_csv("../2_data/y_test.csv")

In [14]:
# Example paths for predicted results and true labels
files = {
    "Logistic Regression": "../2_data/evaluation_metrics/LogisticRegression.csv",
    "Random Forest": "../2_data/evaluation_metrics/DecisionTree.csv",
    "XGBoost": "../2_data/evaluation_metrics/XGBoost.csv",
    "Baseline": "../2_data/evaluation_metrics/Baseline.csv",
    "KNN": "../2_data/evaluation_metrics/KNN.csv",
    "Naive Bayes": "../2_data/evaluation_metrics/NaiveBayes.csv"
}

# Read predictions into a dictionary of DataFrames
results = {
    model: pd.read_csv(file)
    for model, file in files.items()
}

In [15]:
# Initialize a dictionary to store the average scores for each model
average_scores = {}

# Calculate the average score for each model
for model, df in results.items():
    scores = {
        'accuracy': df['accuracy'].mean(),
        'precision': df['precision'].mean(),
        'recall': df['recall'].mean(),
        'f1': df['f1'].mean(),
        'roc_auc': df['roc_auc'].mean() if 'roc_auc' in df.columns else 0
    }
    average_scores[model] = np.mean(list(scores.values()))

# Determine the best model based on the highest average score
best_model = max(average_scores, key=average_scores.get)
print(f"The best model is {best_model} with an average score of {average_scores[best_model]:.4f}")




The best model is XGBoost with an average score of 0.9393


In [16]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
import pandas as pd

# Load your dataset
X = X_test
y = y_test

# Define the models
models = {
    'Naive Bayes': GaussianNB(),
    'Logistic Regression': LogisticRegression(),
    'KNN': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'SVM': SVC(),
    'MLP': MLPClassifier(),
    'XGBoost': XGBClassifier()
}

# Apply k-fold cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
results = {}

for model_name, model in models.items():
    cv_results = cross_val_score(model, X, y, cv=kfold, scoring='accuracy')
    results[model_name] = cv_results.mean()
    print(f"{model_name}: {cv_results.mean():.4f} (+/- {cv_results.std():.4f})")

# Determine the best model based on cross-validation results
best_model = max(results, key=results.get)
print(f"The best model is {best_model} with an average accuracy of {results[best_model]:.4f}")

Naive Bayes: 0.8112 (+/- 0.0139)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(

Logistic Regression: 0.7977 (+/- 0.0363)
KNN: 0.7452 (+/- 0.0258)
Decision Tree: 0.9262 (+/- 0.0077)


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Random Forest: 0.9517 (+/- 0.0095)
SVM: 0.7346 (+/- 0.0326)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


MLP: 0.7338 (+/- 0.0531)
XGBoost: 0.9595 (+/- 0.0066)
The best model is XGBoost with an average accuracy of 0.9595
