In [1]:
from src.utils.load_processed_data import load_wine, load_glass, load_pima_diabetes
from src.naive_bayes import GaussianNaiveBayes, NaiveBayes, MixedNaiveBayes
from src.cross_validation import CrossValidator
from src.evaluators import MetricsEvaluator
from src.digitisers import (DataTransformer, DiscreteEncoder, 
                            RoundDigitizer, KMeansDigitizer, 
                            MultipleAttributeTransformer)
import time

In [21]:
import pandas as pd
import numpy as np
import tqdm

# Loading data

In [3]:
data = load_wine()
X, Y = data
X.shape
len(set(Y))

3

In [4]:
nb_repetitions = 100
nb_folds = 10

## Classical NB - no Discretisation

In [6]:
enc = MultipleAttributeTransformer({i: DiscreteEncoder() for i in range(X.shape[1])})
enc.fit(X)
x = enc.transform(X)

In [7]:
tic = time.time()
metrics_evals = []
for _ in range(nb_repetitions):
    cv = CrossValidator(x,Y, lambda: NaiveBayes(zero_frequency_fill=True))
    metrics_evals += cv.kfold_cross_validation(nb_folds, shuffle=True)
toc = time.time()
base_time = toc-tic

In [8]:
sum([meval.accuracy() for meval in metrics_evals])/nb_folds/nb_repetitions

0.74489215686274701

## Round Digitisation

In [9]:
round_digitisers = {0 : RoundDigitizer(40),
                    1 : RoundDigitizer(40),
                    2 : RoundDigitizer(30),
                    3 : RoundDigitizer(20),
                    4 : RoundDigitizer(20),
                    5 : RoundDigitizer(35),
                    6 : RoundDigitizer(45),
                    7 : RoundDigitizer(15),
                    8 : RoundDigitizer(30),
                    9 : RoundDigitizer(45),
                    10 : RoundDigitizer(25),
                    11 : RoundDigitizer(40),
                    12 : RoundDigitizer(40)}

In [10]:
tic = time.time()
round_metrics_evals = []
for _ in range(nb_repetitions):
    round_cv = CrossValidator(X, Y, 
                              lambda: NaiveBayes(zero_frequency_fill=True),
                              lambda: MultipleAttributeTransformer(round_digitisers))

    round_metrics_evals += round_cv.kfold_cross_validation(nb_folds, shuffle=True)
toc = time.time()
round_time = toc-tic

In [11]:
sum([meval.accuracy() for meval in round_metrics_evals])/nb_folds/nb_repetitions

0.9474379084967407

## K-Means Digitisation - 1/3

In [25]:
third_kmean_digitisers = {0 : KMeansDigitizer(40),
                          1 : KMeansDigitizer(40),
                          2 : KMeansDigitizer(30),
                          3 : KMeansDigitizer(20),
                          4 : KMeansDigitizer(20),
                          5 : KMeansDigitizer(35),
                          6 : KMeansDigitizer(45),
                          7 : KMeansDigitizer(15),
                          8 : KMeansDigitizer(30),
                          9 : KMeansDigitizer(45),
                          10 : KMeansDigitizer(25),
                          11 : KMeansDigitizer(40),
                          12 : KMeansDigitizer(40)}

In [26]:
tic = time.time()
third_kmeans_metrics_evals = []
for _ in tqdm.tqdm(range(nb_repetitions)):
    third_kmeans_cv = CrossValidator(X, Y, 
                              lambda: NaiveBayes(zero_frequency_fill=True),
                              lambda: MultipleAttributeTransformer(third_kmean_digitisers))

    third_kmeans_metrics_evals += third_kmeans_cv.kfold_cross_validation(nb_folds, shuffle=True)
toc = time.time()
third_kmeans_time = toc-tic

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [06:52<00:00,  4.13s/it]


In [27]:
sum([meval.accuracy() for meval in third_kmeans_metrics_evals])/nb_folds/nb_repetitions

0.94234313725491059

## K-Means Digitisation - 1/6

In [22]:
kmean_digitisers = {0 : KMeansDigitizer(20),
                    1 : KMeansDigitizer(20),
                    2 : KMeansDigitizer(15),
                    3 : KMeansDigitizer(10),
                    4 : KMeansDigitizer(10),
                    5 : KMeansDigitizer(20),
                    6 : KMeansDigitizer(25),
                    7 : KMeansDigitizer(10),
                    8 : KMeansDigitizer(15),
                    9 : KMeansDigitizer(25),
                    10 : KMeansDigitizer(10),
                    11 : KMeansDigitizer(20),
                    12 : KMeansDigitizer(20)}

In [23]:
tic = time.time()
kmeans_metrics_evals = []
for _ in tqdm.tqdm(range(nb_repetitions)):
    kmeans_cv = CrossValidator(X, Y, 
                              lambda: NaiveBayes(zero_frequency_fill=True),
                              lambda: MultipleAttributeTransformer(kmean_digitisers))

    kmeans_metrics_evals += kmeans_cv.kfold_cross_validation(nb_folds, shuffle=True)
toc = time.time()
kmeans_time = toc-tic

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [03:56<00:00,  2.37s/it]


In [24]:
sum([meval.accuracy() for meval in kmeans_metrics_evals])/nb_folds/nb_repetitions

0.96658496732026744

## Gaussian NB

In [28]:
tic = time.time()
gauss_metrics_evals = []
for _ in tqdm.tqdm(range(nb_repetitions)):
    gauss_cv = CrossValidator(X,Y, lambda: GaussianNaiveBayes())
    gauss_metrics_evals += gauss_cv.kfold_cross_validation(10, shuffle=True)
toc = time.time()
gauss_time = toc-tic

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:07<00:00, 12.95it/s]


In [15]:
sum([meval.accuracy() for meval in gauss_metrics_evals])/nb_folds/nb_repetitions

0.97390196078431812

# Results sum up

In [29]:
res = pd.DataFrame({'Metoda Dyskretyzacji': ['Brak', 'Zaokrąglanie','K-Means 1/3','K-Means 1/6', "Gauss'owski NB"]})
res = res.set_index(res['Metoda Dyskretyzacji'])
res = res.drop('Metoda Dyskretyzacji', axis=1)
res

Brak
Zaokrąglanie
K-Means 1/3
K-Means 1/6
Gauss'owski NB


In [30]:
def calculate_metrics_avgs_stds(func_name, 
                                metrics_lists=[metrics_evals, 
                                               round_metrics_evals,
                                               third_kmeans_metrics_evals,
                                               kmeans_metrics_evals,
                                               gauss_metrics_evals]):
    means = []
    stds = []
    for mevals in metrics_lists:
        metric = np.asarray([eval('m.'+func_name+'()') for m in mevals])
        means.append(metric.mean())
        stds.append(metric.std())
    return means, stds

In [31]:
acc = calculate_metrics_avgs_stds('accuracy')
res['Accuracy'] = acc[0]
res['Accuracy Std'] = acc[1]

prec = calculate_metrics_avgs_stds('precision')
res['Precision'] = prec[0]
res['Precision Std'] = prec[1]

rec = calculate_metrics_avgs_stds('recall')
res['Recall'] = rec[0]
res['Recall Std'] = rec[1]

fmeas = calculate_metrics_avgs_stds('f_measure')
res['F-measure'] = fmeas[0]
res['F-measure Std'] = fmeas[1]

res['Średni czas kroswalidacji'] = np.asarray([base_time, round_time, third_kmeans_time, kmeans_time, gauss_time])/nb_repetitions

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


ValueError: Length of values does not match length of index

In [33]:
res

Unnamed: 0_level_0,Accuracy,Accuracy Std,Precision,Precision Std,Recall,Recall Std,F-measure,F-measure Std,Średni czas kroswalidacji
Metoda Dyskretyzacji,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Brak,0.744892,0.105998,0.80797,0.085135,0.744892,0.105998,0.739884,0.109496,3.015564
Zaokrąglanie,0.947438,0.052406,0.958549,0.038231,0.947438,0.052406,0.947713,0.051404,0.728339
K-Means 1/3,0.942343,0.054373,0.953394,0.042945,0.942343,0.054373,0.942779,0.053621,4.129686
K-Means 1/6,0.966585,0.041575,0.972956,0.033089,0.966585,0.041575,0.966759,0.041346,2.366775
Gauss'owski NB,0.973422,0.038674,0.978356,0.031757,0.973422,0.038674,0.973653,0.03845,0.07738


In [34]:
res.transpose()

Metoda Dyskretyzacji,Brak,Zaokrąglanie,K-Means 1/3,K-Means 1/6,Gauss'owski NB
Accuracy,0.744892,0.947438,0.942343,0.966585,0.973422
Accuracy Std,0.105998,0.052406,0.054373,0.041575,0.038674
Precision,0.80797,0.958549,0.953394,0.972956,0.978356
Precision Std,0.085135,0.038231,0.042945,0.033089,0.031757
Recall,0.744892,0.947438,0.942343,0.966585,0.973422
Recall Std,0.105998,0.052406,0.054373,0.041575,0.038674
F-measure,0.739884,0.947713,0.942779,0.966759,0.973653
F-measure Std,0.109496,0.051404,0.053621,0.041346,0.03845
Średni czas kroswalidacji,3.015564,0.728339,4.129686,2.366775,0.07738


In [35]:
print(res.transpose().to_latex())

\begin{tabular}{lrrrrr}
\toprule
Metoda Dyskretyzacji &      Brak &  Zaokrąglanie &  K-Means 1/3 &  K-Means 1/6 &  Gauss'owski NB \\
\midrule
Accuracy                  &  0.744892 &      0.947438 &     0.942343 &     0.966585 &        0.973422 \\
Accuracy Std              &  0.105998 &      0.052406 &     0.054373 &     0.041575 &        0.038674 \\
Precision                 &  0.807970 &      0.958549 &     0.953394 &     0.972956 &        0.978356 \\
Precision Std             &  0.085135 &      0.038231 &     0.042945 &     0.033089 &        0.031757 \\
Recall                    &  0.744892 &      0.947438 &     0.942343 &     0.966585 &        0.973422 \\
Recall Std                &  0.105998 &      0.052406 &     0.054373 &     0.041575 &        0.038674 \\
F-measure                 &  0.739884 &      0.947713 &     0.942779 &     0.966759 &        0.973653 \\
F-measure Std             &  0.109496 &      0.051404 &     0.053621 &     0.041346 &        0.038450 \\
Średni czas kroswa

In [37]:
print(res.to_latex(float_format=lambda x: '{:.4f}'.format(x)))

\begin{tabular}{lrrrrrrrrr}
\toprule
{} &  Accuracy &  Accuracy Std &  Precision &  Precision Std &  Recall &  Recall Std &  F-measure &  F-measure Std &  Średni czas kroswalidacji \\
Metoda Dyskretyzacji &           &               &            &                &         &             &            &                &                            \\
\midrule
Brak                 &    0.7449 &        0.1060 &     0.8080 &         0.0851 &  0.7449 &      0.1060 &     0.7399 &         0.1095 &                     3.0156 \\
Zaokrąglanie         &    0.9474 &        0.0524 &     0.9585 &         0.0382 &  0.9474 &      0.0524 &     0.9477 &         0.0514 &                     0.7283 \\
K-Means 1/3          &    0.9423 &        0.0544 &     0.9534 &         0.0429 &  0.9423 &      0.0544 &     0.9428 &         0.0536 &                     4.1297 \\
K-Means 1/6          &    0.9666 &        0.0416 &     0.9730 &         0.0331 &  0.9666 &      0.0416 &     0.9668 &         0.0413 &             

# Stage 2

In [58]:
data = load_pima_diabetes()
X, Y = data
X.shape

(768, 8)

## Gaussian

In [101]:
tic = time.time()
gaussian_metrics_evals = []
for _ in range(nb_repetitions):
    gaussian_cv = CrossValidator(X,Y, lambda: GaussianNaiveBayes())
    gaussian_metrics_evals += gaussian_cv.kfold_cross_validation(10, shuffle=True)
toc = time.time()
gaussian_time = toc-tic

In [102]:
sum([meval.accuracy() for meval in gaussian_metrics_evals])/nb_folds/nb_repetitions

0.75543250170881804

## Mixed

In [131]:
digitisers = {0 : KMeansDigitizer(10),
              1 : None,
              2 : None,
              3 : KMeansDigitizer(12),
              4 : KMeansDigitizer(20),
              5 : KMeansDigitizer(20),
              6 : None, # BMI
              7 : None,
              8 : KMeansDigitizer(10)}

transformer_initialiser = lambda: MultipleAttributeTransformer(digitisers)

In [132]:
model_initialiser = lambda: MixedNaiveBayes([1,2,6,7], zero_frequency_fill=True)

In [133]:
tic = time.time()
mixed_metrics_evals = []
for _ in range(nb_repetitions):
    mixed_cv = CrossValidator(X,Y, model_initialiser,transformer_initialiser)
    mixed_metrics_evals += mixed_cv.kfold_cross_validation(10, shuffle=True)
toc = time.time()
mixed_time = toc-tic

In [134]:
sum([meval.accuracy() for meval in mixed_metrics_evals])/nb_folds/nb_repetitions

0.70939747095010275

In [137]:
data = load_glass()
X,Y = data
X.shape

(214, 9)

In [138]:
tic = time.time()
gaussian_metrics_evals = []
for _ in range(nb_repetitions):
    gaussian_cv = CrossValidator(X,Y, lambda: GaussianNaiveBayes())
    gaussian_metrics_evals += gaussian_cv.kfold_cross_validation(10, shuffle=True)
toc = time.time()
gaussian_time = toc-tic

In [139]:
sum([meval.accuracy() for meval in gaussian_metrics_evals])/nb_folds/nb_repetitions

0.45953463203463246

In [150]:
digitisers = {0 : None,
              1 : None,
              2 : None,
              3 : None,
              4 : None,
              5 : None,#KMeansDigitizer(20),
              6 : None,
              7 : None,#KMeansDigitizer(10),
              8 : None}#KMeansDigitizer(10)}

transformer_initialiser = lambda: MultipleAttributeTransformer(digitisers)

In [151]:
model_initialiser = lambda: MixedNaiveBayes([0,1,2,3,4,6], zero_frequency_fill=True)

In [152]:
tic = time.time()
mixed_metrics_evals = []
for _ in range(nb_repetitions):
    mixed_cv = CrossValidator(X,Y, model_initialiser,transformer_initialiser)
    mixed_metrics_evals += mixed_cv.kfold_cross_validation(10, shuffle=True)
toc = time.time()
mixed_time = toc-tic

In [153]:
sum([meval.accuracy() for meval in mixed_metrics_evals])/nb_folds/nb_repetitions

0.26583982683982782