# Naive Bayesian classifier

## Implementation



### Function to load the datasets

In [8]:
import pandas as pd

def load_dataset(paths):
    '''
    Read every CSV file listed in paths, stack them into one frame and
    split off the 'cover_type' column.

    Returns a (data, targets) tuple: the stacked frame without the
    'cover_type' column, and that column on its own.
    '''
    # read each file, then stack the resulting frames row-wise
    frames = [pd.read_csv(csv_path) for csv_path in paths]
    combined = pd.concat(frames)
    # set the labels aside for later comparisons
    labels = combined['cover_type']
    # feature frame: everything except the label column
    features = combined.drop('cover_type', axis=1)
    # report the size of the feature frame
    print("[INFO] Dataset shape: ", features.shape)
    return (features, labels)

### Function to perform the training

In [9]:
from sklearn.naive_bayes import GaussianNB
from sklearn.externals import joblib

def prepare_naive_model(X, y, path_save_model):
    '''
    Fit a GaussianNB classifier on (X, y), dump it with joblib to
    path_save_model and return the fitted estimator.
    '''
    classifier = GaussianNB()

    # fit on the supplied features and labels
    print("[INFO] Fitting...")
    classifier.fit(X, y)

    # persist the fitted estimator at the requested location
    print("[INFO] Saving...")
    joblib.dump(classifier, path_save_model)

    # hand the estimator back to the caller
    print("[INFO] Done.")
    return classifier

### Function to perform normality tests

In [10]:
from scipy import stats
import numpy as np

def normality_test(dataset):
    '''
    Run two normality tests on the data and collect their p-values.

    Returns a dict with:
      'shapiro'   - list holding one Shapiro-Wilk p-value per column;
      'dagostino' - p-values returned by stats.normaltest (D'Agostino)
                    when called on the whole dataset.
    '''
    # Shapiro-Wilk runs column by column; element 1 of each result is the p-value
    shapiro_pvalues = []
    for column in dataset.columns:
        shapiro_pvalues.append(stats.shapiro(dataset[column])[1])
    # normaltest receives the full dataset in a single call
    dagostino_pvalues = stats.normaltest(dataset)[1]
    return {'shapiro': shapiro_pvalues, 'dagostino': dagostino_pvalues}

### Load the datasets

In [11]:
# "original" data (the covertype_norm CSVs): train and test splits
original_train_dataset, original_train_targets = load_dataset(
    ['../datasets/covertype_norm_train.csv']
)
original_test_dataset, original_test_targets = load_dataset(
    ['../datasets/covertype_norm_test.csv']
)

# LDA variant of the data (the covertype_lda CSVs): train and test splits
lda_train_dataset, lda_train_targets = load_dataset(
    ['../datasets/covertype_lda_train.csv']
)
lda_test_dataset, lda_test_targets = load_dataset(
    ['../datasets/covertype_lda_test.csv']
)

[INFO] Dataset shape:  (14421, 54)
[INFO] Dataset shape:  (4808, 54)
[INFO] Dataset shape:  (14421, 6)
[INFO] Dataset shape:  (4808, 6)


### Perform normality tests

In [12]:
# collect the normality-test p-values for both training sets
normality_original = normality_test(original_train_dataset)
normality_lda = normality_test(lda_train_dataset)

# report them one after the other, original first
for report_title, report in (
    ("[INFO] Normality tests for original: \n", normality_original),
    ("[INFO] Normality tests for LDA: \n", normality_lda),
):
    print(report_title, report)



[INFO] Normality tests for original: 
 {'shapiro': [0.0, 0.0, 2.802596928649634e-45, 0.0, 0.0, 0.0, 0.0, 0.0, 5.083340120300435e-30, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], 'dagostino': array([0.00000000e+000, 0.00000000e+000, 2.76074249e-136, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       1.62863745e-061, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       5.09771520e-061, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       6.06404803e-001, 6.06404803e-001, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       6.06404803e-001, 0.00000000e+000, 0.00000000e+000, 0.00000000e+000,
       0.00000000e+000, 0.00

### Perform training

In [13]:
# train on the original data and save the model
original_model = prepare_naive_model(
    original_train_dataset,
    original_train_targets,
    '../models/naive_original.save',
)
# train on the LDA data and save the model
lda_model = prepare_naive_model(
    lda_train_dataset,
    lda_train_targets,
    '../models/naive_lda.save',
)

[INFO] Fitting...
[INFO] Saving...
[INFO] Done.
[INFO] Fitting...
[INFO] Saving...
[INFO] Done.


### Model evaluation

We evaluate each model on its test set using the accuracy score, the confusion matrix, and the macro-averaged precision, recall and F-score.

#### Original

In [24]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support

# predict on the test split of the original data
original_preds = original_model.predict(original_test_dataset)

# overall accuracy against the test targets
original_acc_score = accuracy_score(original_test_targets, original_preds)
print("[INFO] Original acc score:", original_acc_score)

# confusion matrix: true targets passed first, predictions second
print("[INFO] Confusion matrix: ")
original_confusion = confusion_matrix(original_test_targets, original_preds)
print(original_confusion)

# macro-averaged precision / recall / F-score
print("[INFO] Precision, recall, fscore, support ")
original_prf = precision_recall_fscore_support(
    original_test_targets,
    original_preds,
    average='macro',
)
print(original_prf)

[INFO] Original acc score: 0.4317803660565724
[INFO] Confusion matrix: 
[[  9   9   4   0 104  38 522]
 [  0  68  64  11 198  62 284]
 [  0   0 246 438   1   2   0]
 [  0   0   0 687   0   0   0]
 [  0   1 159   0 322 134  71]
 [  0   0 215 389  15  68   0]
 [  0   0   2   0   9   0 676]]
[INFO] Precision, recall, fscore, support 
(0.5477038695404968, 0.4316933082831208, 0.343567904249623, None)


#### LDA

In [25]:
# predict on the test split of the LDA data
lda_preds = lda_model.predict(lda_test_dataset)

# overall accuracy against the test targets
lda_acc_score = accuracy_score(lda_test_targets, lda_preds)
print("[INFO] LDA acc score:", lda_acc_score)

# confusion matrix: true targets passed first, predictions second
print("[INFO] Confusion matrix: ")
lda_confusion = confusion_matrix(lda_test_targets, lda_preds)
print(lda_confusion)

# macro-averaged precision / recall / F-score
print("[INFO] Precision, recall, fscore, support ")
lda_prf = precision_recall_fscore_support(
    lda_test_targets,
    lda_preds,
    average='macro',
)
print(lda_prf)

[INFO] LDA acc score: 0.6356073211314476
[INFO] Confusion matrix: 
[[483  98   1   0  28   2  74]
 [210 288  20   0 132  29   8]
 [  0   1 319  86  23 258   0]
 [  0   0  63 566   0  58   0]
 [ 33 134  62   0 436  22   0]
 [  0  16 169  28  65 409   0]
 [129   1   2   0   0   0 555]]
[INFO] Precision, recall, fscore, support 
(0.6383655658669276, 0.6356215599152948, 0.6343086072006612, None)


## Correlation test

In [7]:
import pandas as pd

# original data: train and test files loaded together
original_dataset, original_targets = load_dataset([
    '../datasets/covertype_norm_train.csv',
    '../datasets/covertype_norm_test.csv',
])
# mean of each column of the pairwise correlation matrix
original_corr = original_dataset.corr()
print(original_corr.mean())

# LDA data: train and test files loaded together
lda_dataset, lda_targets = load_dataset([
    '../datasets/covertype_lda_train.csv',
    '../datasets/covertype_lda_test.csv',
])
# same summary for the LDA features
lda_corr = lda_dataset.corr()
print(lda_corr.mean())

[INFO] Dataset shape:  (19229, 54)
elevation           0.063029
aspect              0.026981
slope              -0.024508
horiz_dist_hydro    0.054084
vert_dist_hydro     0.033909
horiz_dist_road     0.051260
hillshade_9        -0.006286
hill_shade_noon     0.043517
hill_shade_15       0.031287
horiz_dist_fire     0.046283
wild_area_0         0.016658
wild_area_1         0.022170
wild_area_2         0.021860
wild_area_3        -0.047225
soil_type_0        -0.007470
soil_type_1        -0.006244
soil_type_2        -0.018280
soil_type_3        -0.006882
soil_type_4         0.001309
soil_type_5        -0.007912
soil_type_6              NaN
soil_type_7              NaN
soil_type_8         0.014967
soil_type_9        -0.040811
soil_type_10       -0.001757
soil_type_11        0.012847
soil_type_12        0.005309
soil_type_13       -0.001807
soil_type_14             NaN
soil_type_15        0.007805
soil_type_16       -0.013800
soil_type_17        0.013857
soil_type_18        0.015126
soil_typ