# Test: Our Naive Bayes Model vs. Scikit-learn Naive Bayes

## Using Random Data (of Random Size and Shape):

In [1]:
# Import libraries, packages, modules:

# Models (estimator classes):
from naive_bayes import NaiveBayesGaussian
from sklearn.naive_bayes import GaussianNB

# Other, for testing below:
import numpy as np
import pandas as pd
import random

In [2]:
# Test data: Make randomized data to test with:

test_num_features = random.randint(1, 51)
test_num_rows = test_num_features * 5
# OR: random.randint(0, 101) OR np.random.randint(0, 101)

test_feature_matrix = [[random.uniform(-10, 10) for feature in range(test_num_features)] for row in range(test_num_rows)]
# OR: np.random.uniform(low=-10, high=10, size=(test_num_rows, test_num_features))
test_target = [random.randint(0, 1) for row in range(test_num_rows)]
# OR: np.random.random_integers(low=0, high=1, size=(test_length,))

In [3]:
print(f"test_feature_matrix shape: {len(test_feature_matrix)} x {len(test_feature_matrix[0])}")

test_feature_matrix shape: 215 x 43


In [4]:
print(f"test_target length: {len(test_target)}")

test_target length: 215


### Predictions and Accuracy vs. Scikit-learn's NB Model:

Our model's predictions and accuracy score:

In [5]:
# Our model's predictions:
model_nb = NaiveBayesGaussian()
model_nb.fit(X_features=test_feature_matrix, y_target=test_target)
model_nb.predict(X_features=test_feature_matrix)

[0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0]

In [6]:
# Our model's accuracy score:
model_nb.score(X_features=test_feature_matrix, y_true=test_target)

0.7348837209302326

Scikit-learn Naive Bayes (Gaussian) model's predictions and accuracy score:

In [7]:
# Scikit-learn model's predictions:
from sklearn.naive_bayes import GaussianNB

sklearn_model_nb = GaussianNB()
sklearn_model_nb.fit(X=test_feature_matrix, y=test_target)
sklearn_predictions = sklearn_model_nb.predict(X=test_feature_matrix)
sklearn_predictions

array([0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0,
       0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1,
       0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0])

In [8]:
# Scikit-learn model's accuracy score:
from sklearn.metrics import accuracy_score

accuracy_score(y_true=test_target, y_pred=sklearn_predictions)

0.7348837209302326

## Using Real-world Dataset: Tanzania Water Pumps Data

### Setup: Import Pre-prepped Dataset:

In [9]:
# Import pre-prepped dataset: training data and inputs only for test data:
PATH_DATASET_TANZANIA = "./datasets_prepped/tanzania_water/"
tanzania_water_X_train = pd.read_csv(PATH_DATASET_TANZANIA + "X_train.csv", 
                                     index_col=0)
tanzania_water_y_train = pd.read_csv(PATH_DATASET_TANZANIA + "y_train.csv", 
                                     index_col=0)
tanzania_water_X_test = pd.read_csv(PATH_DATASET_TANZANIA + "X_test.csv", 
                                     index_col=0)

### Predictions and Accuracy vs. Scikit-learn's NB Model:

Our model's predictions and accuracy score:

In [13]:
# Import libraries, packages, modules needed:
import category_encoders as ce
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer

In [14]:
# Transform input data into numeric data before fitting models:

# Initiate class instances of transformers in a pipeline:
transformers = make_pipeline(
    ce.OrdinalEncoder(), 
    SimpleImputer(strategy='median'), 
    StandardScaler()
)

# Fit on training data:
X_train_transformed = transformers.fit_transform(tanzania_water_X_train)
X_test_transformed = transformers.transform(tanzania_water_X_test)

In [15]:
# Our model's predictions:
model_nb = NaiveBayesGaussian()
model_nb.fit(X_features=X_train_transformed, y_target=tanzania_water_y_train)
model_nb.predict(X_features=X_test_transformed)

['non functional',
 'functional',
 'functional',
 'functional',
 'functional',
 'functional',
 'functional',
 'non functional',
 'functional',
 'functional',
 'functional',
 'functional',
 'non functional',
 'functional',
 'non functional',
 'functional',
 'non functional',
 'functional',
 'functional',
 'functional',
 'functional',
 'functional',
 'functional',
 'functional',
 'functional',
 'functional',
 'functional',
 'non functional',
 'functional',
 'functional',
 'non functional',
 'functional',
 'functional',
 'non functional',
 'functional',
 'non functional',
 'non functional',
 'functional',
 'non functional',
 'non functional',
 'functional',
 'functional',
 'non functional',
 'functional needs repair',
 'functional',
 'non functional',
 'functional',
 'non functional',
 'functional',
 'functional',
 'non functional',
 'non functional',
 'non functional',
 'functional',
 'functional',
 'non functional',
 'functional',
 'functional',
 'functional',
 'functional',
 'non funct

In [16]:
# Get actual results to compare to and get our model's predictions' accuracy:
tanzania_water_y_test = pd.read_csv(PATH_DATASET_TANZANIA + "y_test.csv", 
                                     index_col=0)

In [17]:
# Our model's accuracy score:
model_nb.score(X_features=X_test_transformed, y_true=tanzania_water_y_test)

0.6536700336700336

Scikit-learn Naive Bayes (Gaussian) model's predictions and accuracy score:

In [18]:
# Scikit-learn model's predictions:
from sklearn.naive_bayes import GaussianNB

sklearn_model_nb = GaussianNB()
sklearn_model_nb.fit(X=X_train_transformed, y=tanzania_water_y_train)
sklearn_predictions = sklearn_model_nb.predict(X=X_test_transformed)
sklearn_predictions

array(['non functional', 'functional', 'functional', ...,
       'non functional', 'functional', 'non functional'], dtype='<U23')

In [19]:
# Scikit-learn model's accuracy score:
from sklearn.metrics import accuracy_score

accuracy_score(y_true=tanzania_water_y_test, y_pred=sklearn_predictions)

0.6536026936026936