# Naive Bayes

## Setup

In [1]:
# Imports
import sklearn
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import pipeline
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.dpi'] = 160
colors = sns.color_palette("viridis", 10)

import utilities as utils
from utilities import *

In [2]:
# Pamateters
data_dir = "../data/preprocessed"
data_source_type = "gdansk" # ["gdansk", "physionet"]
splice_type = "constant" # ["complete", "constant", "random"]
label_type = "classification" # ["classification", "regression"]
simulation = True # [True, False]

max_size_cv = 80_000
max_size_fit = 160_000

In [3]:
# Auto adjustment

data_type = "simulated" if simulation else "original"

if simulation:
    splice_type = "constant"
    data_source_type = "gdansk"
    N = 96
    simulated = 100
else:
    N = 48
    simulated = None
    
read_path = f"{data_dir}/{data_type}_{data_source_type}_{splice_type}_features_{label_type}_milliseconds_"
y_name = "label" if label_type == "classification" else "age"

if label_type != "classification":
    raise Exception("label_type can only be classification.")
    
no_classes = 7 if data_source_type == "gdansk" else 6
classification = label_type=="classification"

relative_folder_dir = "../report/img/learning/"
basic_path = f"{data_type}_{data_source_type}_naive_bayes_{label_type}_{splice_type}_"
error_distribution_save_path = relative_folder_dir + basic_path + "error_distribution.png"
error_distribution_unbiased_save_path = relative_folder_dir + basic_path + "error_distribution_unbiased.png"

metrics_save_path = f"../report/results/{basic_path}metrics.txt"

## Data

In [4]:
%%time
train = pd.read_csv(read_path + "train.csv", index_col=0)
val = pd.read_csv(read_path + "val.csv", index_col=0)
test = pd.read_csv(read_path + "test.csv", index_col=0)

train = pd.concat([train, val])
del val

if train.shape[0] > max_size_cv:
    print(f"Reducing samples for cross validation from {train.shape[0]} to {max_size_cv}.")
    train_cv = train.sample(max_size_cv)
else:
    print(f"Keeping {train.shape[0]} samples for cross validation.")
    train_cv = train
    
if train.shape[0] > max_size_fit:
    print(f"Reducing samples for fitting from {train.shape[0]} to {max_size_fit}.")
    train_fit = train.sample(max_size_fit)
else:
    print(f"Keeping {train.shape[0]} samples for fitting.")
    train_fit = train
    
# Train CV
X_train_cv = train_cv.loc[:, train_cv.columns != y_name]
X_train_cv = X_train_cv.drop(columns=['tinn']) # Is all 'None'
Y_train_cv = train_cv[y_name]

# Train Fit
X_train_fit = train_fit.loc[:, train_fit.columns != y_name]
X_train_fit = X_train_fit.drop(columns=['tinn']) # Is all 'None'
Y_train_fit = train_fit[y_name]

# Train Eval
X_train = train.loc[:, train.columns != y_name]
X_train = X_train.drop(columns=['tinn']) # Is all 'None'
Y_train = train[y_name]
#Y_train = Y_train_fit

# Test
X_test = test.loc[:, test.columns != y_name]
X_test = X_test.drop(columns=['tinn']) # Is all 'None'
Y_test = test[y_name]
#Y_test = Y_test_fit

Reducing samples for cross validation from 1179400 to 80000.
Reducing samples for fitting from 1179400 to 160000.
CPU times: user 8.59 s, sys: 572 ms, total: 9.16 s
Wall time: 10.2 s


## Classifier

In [5]:
class_weights = utils.class_weights_from_path(read_path + f"train.csv", no_classes, y_name, inverse=False)
print(class_weights)

parameters = {
                'classifier__alpha': (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0)
             }

pipe = pipeline.Pipeline(steps = [
    ('classifier', MultinomialNB(class_prior=class_weights))
])

clf = GridSearchCV(pipe, parameters, cv = 3, n_jobs = -1, verbose = 10)
clf.fit(X_train_cv, Y_train_cv)

parameter = clf.best_params_

print(f"Best set of parameters is: {parameter}. Fitting now.")

clf = MultinomialNB(class_prior=class_weights, alpha=parameter['classifier__alpha'])

clf.fit(X_train_fit, Y_train_fit)

print("Fit is completed. Predicting now.")
Y_train_pred = clf.predict(X_train)
Y_test_pred = clf.predict(X_test)

[0.13648362 0.12664921 0.21875686 0.14967612 0.14967612 0.11513576
 0.1036223 ]
Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done  19 out of  30 | elapsed:    1.3s remaining:    0.7s
[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed:    1.4s remaining:    0.4s
[Parallel(n_jobs=-1)]: Done  27 out of  30 | elapsed:    1.4s remaining:    0.2s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:    1.4s finished


ValueError: Negative values in data passed to MultinomialNB (input X)

In [None]:
result = ""

if label_type == "classification" and splice_type == "complete":
    train_error_distribution = Y_train_pred - Y_train
    test_error_distribution = Y_test_pred - Y_test
    
    result += f"Accuracy Train: {sklearn.metrics.accuracy_score(Y_train, Y_train_pred)}\n"
    result += f"Accuracy Test: {sklearn.metrics.accuracy_score(Y_test, Y_test_pred)}\n"
    
elif label_type == "classification" and splice_type == "constant":
    train_error_distribution = Y_train_pred - Y_train
    test_error_distribution = Y_test_pred - Y_test
    
    result += f"Accuracy Train: {utils.accuracy_score_from_label_chunks(Y_train, Y_train_pred, N=N, simulated=simulated)}\n"
    result += f"Accuracy Test: {utils.accuracy_score_from_label_chunks(Y_test, Y_test_pred, N=N, simulated=simulated)}\n"
    
else:
    print("label_type and splice_type combination not supported.")

result += f"\nMax Size CV: Taken -> {min(train.shape[0], max_size_cv)}, cap -> {max_size_cv}, available -> {train.shape[0]}.\n"
result += f"Max Size Fit: Taken -> {min(train.shape[0], max_size_fit)}, cap -> {max_size_fit}, available -> {train.shape[0]}.\n"
result += f"Best Parameters: {parameter}\n\n"

In [None]:
bins = len(np.unique(test_error_distribution)) if classification else None
sns.set(style="white")
sns.distplot(test_error_distribution, color=sns.color_palette("viridis", 10)[5], label="Error Density", bins=bins)
plt.ylabel('Density');
plt.xlabel('Error');
plt.title(f'Naive Bayes Error Distribution ({label_type} | {splice_type})');
plt.legend();
plt.savefig(error_distribution_save_path);
plt.show();

In [None]:
with open(metrics_save_path, "w") as text_file:
    text_file.write(result)

In [None]:
print(result)