# Naive Bayesian classifier

## Implementation



### Function to load the datasets

In [1]:
import pandas as pd

def load_dataset(paths):
    '''
    Read every CSV file listed in paths, stack them row-wise and split the
    'cover_type' column off as the targets.

    Returns a (features, targets) tuple; the shape of the feature frame is
    printed as a quick sanity check.
    '''
    frames = [pd.read_csv(csv_path) for csv_path in paths]
    combined = pd.concat(frames)
    # the label column is kept apart for later comparison with predictions
    labels = combined['cover_type']
    # everything except the label column is a feature
    features = combined.drop(columns='cover_type')
    print("[INFO] Dataset shape: ", features.shape)
    return (features, labels)

### Function to perform the training

In [2]:
import os

from sklearn.naive_bayes import GaussianNB

# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone package and fall back only on very old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

def prepare_naive_model(X, y, path_save_model):
    '''
    Fit a Gaussian naive Bayes classifier and persist it with joblib.

    Parameters
    ----------
    X : feature matrix, passed unchanged to GaussianNB.fit.
    y : target labels, passed unchanged to GaussianNB.fit.
    path_save_model : destination handed to joblib.dump. When it is a
        filesystem path, missing parent directories are created first.

    Returns
    -------
    The fitted GaussianNB instance.
    '''
    # train model
    model = GaussianNB()
    print("[INFO] Fitting...")
    model.fit(X, y)
    # save it
    print("[INFO] Saving...")
    # joblib.dump does not create directories; make sure the target folder
    # exists so a finished fit is not lost to a FileNotFoundError.
    if isinstance(path_save_model, (str, os.PathLike)):
        save_dir = os.path.dirname(os.fspath(path_save_model))
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)
    joblib.dump(model, path_save_model)
    # return it
    print("[INFO] Done.")
    return model

### Function to perform normality tests

In [3]:
from scipy import stats
import numpy as np

def normality_test(dataset):
    '''
    Run two normality tests on the data and collect their p-values:
    Shapiro-Wilk (applied column by column) and D'Agostino's K^2
    (applied to the whole frame at once).

    Returns a dict with keys 'shapiro' (list with one p-value per column)
    and 'dagostino' (the p-values returned by scipy.stats.normaltest).

    Note: SciPy documents that the Shapiro-Wilk p-value may be inaccurate
    for samples larger than 5000 observations.
    '''
    shapiro_pvalues = []
    for column in dataset.columns:
        _statistic, p_value = stats.shapiro(dataset[column])
        shapiro_pvalues.append(p_value)
    _statistic, dagostino_pvalues = stats.normaltest(dataset)
    return {'shapiro': shapiro_pvalues, 'dagostino': dagostino_pvalues}

### Load the datasets

In [4]:
# normalised ("original") feature space: train / test splits
original_train_dataset, original_train_targets = load_dataset(
    ['../datasets/covertype_norm_train.csv']
)
original_test_dataset, original_test_targets = load_dataset(
    ['../datasets/covertype_norm_test.csv']
)
# LDA-projected feature space: train / test splits
lda_train_dataset, lda_train_targets = load_dataset(
    ['../datasets/covertype_lda_train.csv']
)
lda_test_dataset, lda_test_targets = load_dataset(
    ['../datasets/covertype_lda_test.csv']
)

[INFO] Dataset shape:  (14421, 54)
[INFO] Dataset shape:  (4808, 54)
[INFO] Dataset shape:  (14421, 6)
[INFO] Dataset shape:  (4808, 6)


### Perform normality tests

In [5]:
# run the tests on both training sets (original first, then LDA)
normality_original, normality_lda = (
    normality_test(train_frame)
    for train_frame in (original_train_dataset, lda_train_dataset)
)
# report the collected p-values
print(
    "[INFO] Normality tests for original: \n",
    normality_original,
)
print(
    "[INFO] Normality tests for LDA: \n",
    normality_lda,
)

[INFO] Normality tests for original: 
 {'shapiro': [0.0, 0.0, 2.802596928649634e-45, 0.0, 0.0, 0.0, 0.0, 0.0, 5.083340120300435e-30, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 'dagostino': array([0.00000000e+000, 0.00000000e+000, 2.76074249e-136, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       1.62863745e-061, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       5.09771520e-061, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       6.06404803e-001, 6.06404803e-001, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       6.06404803e-001, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00



### Perform training

In [6]:
# fit and save the model trained on the original features
original_model = prepare_naive_model(
    original_train_dataset,
    original_train_targets,
    '../models/naive_original.save',
)
# fit and save the model trained on the LDA-projected features
lda_model = prepare_naive_model(
    lda_train_dataset,
    lda_train_targets,
    '../models/naive_lda.save',
)

[INFO] Fitting...
[INFO] Saving...
[INFO] Done.
[INFO] Fitting...
[INFO] Saving...
[INFO] Done.


### Model evaluation

We're going to use the accuracy score with respect to the test set in order to evaluate the models.

#### Original

In [9]:
from sklearn.metrics import accuracy_score
# predict on the held-out test split and score against the true labels
original_preds = original_model.predict(original_test_dataset)
original_acc_score = accuracy_score(
    y_true=original_test_targets,
    y_pred=original_preds,
)
print("[INFO] Original acc score:", original_acc_score)

[INFO] Original acc score: 0.4317803660565724


#### LDA

In [10]:
# predict on the held-out LDA test split and score against the true labels
lda_preds = lda_model.predict(lda_test_dataset)
lda_acc_score = accuracy_score(
    y_true=lda_test_targets,
    y_pred=lda_preds,
)
print("[INFO] LDA acc score:", lda_acc_score)

[INFO] LDA acc score: 0.6356073211314476
