# Imports

In [33]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV


# Load data

In [34]:
# Read the three CSV variants in a fixed order (raw, standardized, normalized).
# NOTE(review): judging by the file names these are the same dataset after
# different preprocessing steps -- confirm against the preprocessing notebook.
raw_data, std_data, norm_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./raw_filled_data.csv",
        "./standardized_data.csv",
        "./normalized_data.csv",
    )
)

# Test classifiers

In [35]:
def prepare_data(data, *, test_size=0.2, random_state=42, stratify=False):
    """Split a table into train/test features and targets.

    The last column of ``data`` is taken as the target; all preceding
    columns are taken as features.

    Args:
        data: Table supporting positional ``.iloc`` indexing (the callers in
            this file pass DataFrames returned by ``pd.read_csv``).
        test_size: Forwarded to ``train_test_split``. Defaults to 0.2.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible. Defaults to 42.
        stratify: When true, the split is stratified on the target column so
            each partition keeps the overall class proportions. Defaults to
            False, which gives the plain shuffled split.

    Returns:
        The ``(X_train, X_test, y_train, y_test)`` sequence produced by
        ``train_test_split``.
    """
    X = data.iloc[:, :-1]
    y = data.iloc[:, -1]
    return train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        # ``None`` is train_test_split's own default (no stratification).
        stratify=y if stratify else None,
    )

def train_and_evaluate(X_train, X_test, y_train, y_test, data_type):
    """Fit a Gaussian Naive Bayes and a Decision Tree and print their scores.

    Both classifiers are trained on the training split with default
    hyperparameters (the tree is seeded with ``random_state=42``) and
    evaluated on the test split. Accuracies and per-class classification
    reports are printed under a header containing ``data_type``.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Training targets.
        y_test: Test targets.
        data_type: Label used in the printed results header.

    Returns:
        Tuple ``(nb, dt, nb_accuracy, dt_accuracy)``: the two fitted
        classifiers followed by their test-set accuracies.
    """

    def _fit_predict_score(classifier):
        # Train on the training split, then score predictions on the test split.
        classifier.fit(X_train, y_train)
        predicted = classifier.predict(X_test)
        return classifier, predicted, accuracy_score(y_test, predicted)

    print(f"\n=== Results for {data_type} ===")

    nb, nb_pred, nb_accuracy = _fit_predict_score(GaussianNB())
    dt, dt_pred, dt_accuracy = _fit_predict_score(
        DecisionTreeClassifier(random_state=42)
    )

    # Headline accuracies first, detailed per-class reports afterwards.
    print(f"Naive Bayes Accuracy: {nb_accuracy:.4f}")
    print(f"Decision Tree Accuracy: {dt_accuracy:.4f}")

    print("\nNaive Bayes Classification Report:")
    print(classification_report(y_test, nb_pred))

    print("\nDecision Tree Classification Report:")
    print(classification_report(y_test, dt_pred))

    return nb, dt, nb_accuracy, dt_accuracy

# Dataset variants to compare, keyed by the label shown in the printed output.
datasets = {}
datasets['Raw Data'] = raw_data
datasets['Standardized Data'] = std_data
datasets['Normalized Data'] = norm_data

# Field names for the tuple returned by train_and_evaluate, in return order.
result_fields = ('nb_model', 'dt_model', 'nb_accuracy', 'dt_accuracy')

results = {}
for name, data in datasets.items():
    X_train, X_test, y_train, y_test = prepare_data(data)
    outcome = train_and_evaluate(X_train, X_test, y_train, y_test, name)
    results[name] = dict(zip(result_fields, outcome))

# Summary comparison
print("\n=== SUMMARY COMPARISON ===")
for name, result in results.items():
    # One print call; sep="\n" puts each part on its own line.
    print(
        f"{name}:",
        f"  Naive Bayes: {result['nb_accuracy']:.4f}",
        f"  Decision Tree: {result['dt_accuracy']:.4f}",
        sep="\n",
    )


=== Results for Raw Data ===
Naive Bayes Accuracy: 0.6268
Decision Tree Accuracy: 0.7840

Naive Bayes Classification Report:
              precision    recall  f1-score   support

           1       0.73      0.49      0.59        73
           2       0.81      0.64      0.72       114
           3       0.30      0.54      0.39        13
           4       0.62      0.84      0.71        19
           5       0.30      0.82      0.44        17
           6       0.86      0.70      0.77        63
           7       0.79      0.63      0.70        54
           8       0.56      0.94      0.70        16
           9       0.31      0.83      0.45        12
          10       0.47      0.40      0.43        45

    accuracy                           0.63       426
   macro avg       0.58      0.68      0.59       426
weighted avg       0.70      0.63      0.64       426


Decision Tree Classification Report:
              precision    recall  f1-score   support

           1       0.7

We can see that the Decision Tree Classifier (DT) performs better than the Naive Bayes Classifier (NB) - one of the reasons might be that some features in the dataset are not normally distributed, which violates the Gaussian assumption that GaussianNB makes for each feature.<br>

The class imbalance also has an impact on NB's ability to predict - with many more healthy cases, the algorithm develops a strong bias toward predicting the majority class. The statistics show it - dangerous cases (CLASS values 8-10) are predicted noticeably worse than the more common cases. DTs can be more robust here because they focus on local patterns within subsets of the data. However, there is a risk that the DT will start overfitting.<br>

NB often shows higher recall than precision, especially for minority classes. This suggests it has a lower decision threshold - it's more willing to predict a class, leading to more false positives.<br>
DT generally maintains better precision-recall balance, indicating more conservative and accurate predictions.


Another reason why NB is doing worse is the feature interactions in the CTG dataset. NB assumes feature independence, so it can't capture these interactions directly. It treats each feature separately when calculating probabilities.

There are no big differences between the datasets, since the methods used to preprocess the data (standardization, normalization) only rescale the features and do not change the shape of their distributions. Using PCA or feature selection would probably improve the results for NB.

# Hyperparameters finetuning

In [36]:
# Search space for Gaussian Naive Bayes: 100 log-spaced smoothing values
# from 1e0 down to 1e-9.
nb_param_grid = {
    'var_smoothing': np.logspace(0, -9, num=100)
}

# Search space for the Decision Tree.
dt_param_grid = {
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2', None]
}

# One entry per algorithm, keyed by the short code used in ``results``:
# (summary name, section heading, estimator factory, parameter grid).
# Factories are used so every search starts from a fresh estimator.
ALGORITHMS = {
    'NB': ('Naive Bayes', 'Gaussian Naive Bayes', GaussianNB, nb_param_grid),
    'DT': (
        'Decision Tree',
        'Decision Tree',
        lambda: DecisionTreeClassifier(random_state=42),
        dt_param_grid,
    ),
}


def _run_grid_search(estimator, param_grid, X_train, X_test, y_train, y_test):
    """Tune ``estimator`` with 5-fold CV accuracy and score it on the test split.

    Prints the best parameters, the best CV score and the test accuracy of
    the refitted best estimator.

    Returns:
        Dict with keys ``best_params``, ``cv_score``, ``test_score`` and
        ``model`` (the refitted best estimator).
    """
    search = GridSearchCV(
        estimator=estimator,
        param_grid=param_grid,
        cv=5,
        scoring='accuracy',
        n_jobs=-1
    )
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    test_score = best_model.score(X_test, y_test)

    print(f"Best parameters: {search.best_params_}")
    print(f"Best CV score: {search.best_score_:.4f}")
    print(f"Test accuracy: {test_score:.4f}")

    return {
        'best_params': search.best_params_,
        'cv_score': search.best_score_,
        'test_score': test_score,
        'model': best_model
    }


# results[dataset_name][algo] -> dict returned by _run_grid_search.
results = {}

for dataset_name, data in datasets.items():
    print(f"\n{'='*20} {dataset_name} {'='*20}")

    X_train, X_test, y_train, y_test = prepare_data(data)

    results[dataset_name] = {}

    for algo, (_, heading, make_estimator, param_grid) in ALGORITHMS.items():
        print(f"\n--- {heading} on {dataset_name} ---")
        results[dataset_name][algo] = _run_grid_search(
            make_estimator(), param_grid, X_train, X_test, y_train, y_test
        )


print(f"\n{'='*60}")
print("SUMMARY COMPARISON")
print(f"{'='*60}")

print(f"{'Dataset':<20} {'Algorithm':<15} {'CV Score':<10} {'Test Score':<10}")
print("-" * 60)

for dataset_name in datasets:
    for algo, (algo_name, *_) in ALGORITHMS.items():
        cv_score = results[dataset_name][algo]['cv_score']
        test_score = results[dataset_name][algo]['test_score']
        print(f"{dataset_name:<20} {algo_name:<15} {cv_score:<10.4f} {test_score:<10.4f}")

# Pick the winner by cross-validation score, not by test accuracy: choosing
# on the test split would use the held-out data for model selection and make
# the reported test accuracy optimistically biased. The test score printed
# below is the winner's score on data that played no part in the choice.
# Ties keep the first candidate in iteration order.
candidates = [
    (dataset_name, algo) for dataset_name in datasets for algo in ALGORITHMS
]
best_combo = max(
    candidates,
    key=lambda combo: results[combo[0]][combo[1]]['cv_score'],
    default=None,  # no datasets -> nothing to report
)
best_score = 0

if best_combo:
    dataset_name, algo = best_combo
    algo_name = ALGORITHMS[algo][0]
    best_score = results[dataset_name][algo]['test_score']
    print(f"\nBest combination: {algo_name} on {dataset_name}")
    print(f"Test accuracy: {best_score:.4f}")
    print(f"Best parameters: {results[dataset_name][algo]['best_params']}")



--- Gaussian Naive Bayes on Raw Data ---


Best parameters: {'var_smoothing': np.float64(1e-09)}
Best CV score: 0.5947
Test accuracy: 0.6268

--- Decision Tree on Raw Data ---
Best parameters: {'criterion': 'entropy', 'max_depth': 7, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 2}
Best CV score: 0.8041
Test accuracy: 0.7911


--- Gaussian Naive Bayes on Standardized Data ---
Best parameters: {'var_smoothing': np.float64(0.004328761281083057)}
Best CV score: 0.6994
Test accuracy: 0.7136

--- Decision Tree on Standardized Data ---
Best parameters: {'criterion': 'entropy', 'max_depth': 7, 'max_features': None, 'min_samples_leaf': 2, 'min_samples_split': 2}
Best CV score: 0.8059
Test accuracy: 0.7911


--- Gaussian Naive Bayes on Normalized Data ---
Best parameters: {'var_smoothing': np.float64(0.004328761281083057)}
Best CV score: 0.7088
Test accuracy: 0.7183

--- Decision Tree on Normalized Data ---
Best parameters: {'criterion': 'entropy', 'max_depth': 7, 'max_features': None, 'min_samples_leaf': 2, 'min_sam

Fine-tuning the smoothing hyperparameter increased the NB accuracy by a few percentage points.<br>
Fine-tuning the DT parameters didn't improve the accuracy significantly.<br>
For each dataset the best DT parameters stay the same - that's because DTs are scale-invariant.<br>