### Test: Our Naive Bayes Model vs. Scikit-learn Naive Bayes, Using Random Data:

In [14]:
# Import libraries, packages, modules:

# Models (estimator classes):
from gaussian_nb import NaiveBayesGaussian
from sklearn.naive_bayes import GaussianNB

# Other, for testing below:
import numpy as np
import pandas as pd
import random

In [15]:
# Test data: Make randomized data to test with.

# Seed Python's `random` module so that "Restart Kernel -> Run All" regenerates
# exactly the same data set; change RANDOM_SEED to draw a different sample.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

test_num_features = 3  # OR: random.randint(1, 101)
test_num_rows = test_num_features * 5
# OR: random.randint(0, 101) OR np.random.randint(0, 101)

# Feature matrix: test_num_rows rows x test_num_features columns of floats
# drawn uniformly between -10 and 10.
test_feature_matrix = [
    [random.uniform(-10, 10) for _ in range(test_num_features)]
    for _ in range(test_num_rows)
]
# OR: np.random.uniform(low=-10, high=10, size=(test_num_rows, test_num_features))

# Binary target: one label per row; random.randint includes both endpoints (0 and 1).
test_target = [random.randint(0, 1) for _ in range(test_num_rows)]
# OR: np.random.randint(low=0, high=2, size=(test_num_rows,))
# (np.random.randint excludes `high`; np.random.random_integers is deprecated.)

In [16]:
# Display the generated feature matrix (a list of rows, each a list of floats).
test_feature_matrix

[[-7.403970413089076, -3.2709309900112338, -9.954860450160368],
 [3.3278403602726137, -5.3964929203232685, -4.586314516494365],
 [0.3220027589002523, -7.995692305358855, 0.21943335155485144],
 [-4.569679301514713, -8.658198618700641, -3.1342917623980497],
 [6.74697392536617, 3.5291024486759053, -2.578146715010652],
 [7.095008176805813, -9.607593770280207, -8.559249239382689],
 [-9.415069821806107, -8.78397558875023, 8.670796128063309],
 [-1.4428601747408099, 3.9860581459479256, -8.357931025649062],
 [-4.648839127580491, -0.9989160629221328, -6.583662667325935],
 [8.483248436978261, -1.7682553064144013, -9.668120928623352],
 [9.882925902564217, 4.058044383693369, -8.002108910054147],
 [-3.8100467594185012, 1.8525788528660279, -0.9278159191042548],
 [-3.3482737284326163, -7.785209273813061, 7.345678965098283],
 [-8.467816203826992, -5.753375814074706, 7.309682235498478],
 [-1.780175465799898, -7.067834301877958, -8.962009468070757]]

In [17]:
# Display the generated binary labels, one per row of test_feature_matrix.
test_target

[1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1]

In [18]:
# Test: fit our NaiveBayesGaussian on the random data, then predict labels for
# the very same rows; the cell displays whatever predict() returns.
# NOTE(review): GaussianNB is imported at the top of the notebook but is not
# fitted in any of the cells shown here, so the "vs. Scikit-learn" comparison
# promised by the heading still appears to be missing -- TODO add it.
model_nb = NaiveBayesGaussian()
model_nb.fit(X_features=test_feature_matrix, y_target=test_target)
model_nb.predict(X_features=test_feature_matrix)

[0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0]

In [19]:
# Score the model on the same rows it was fitted on, so this is a training-set
# score rather than a held-out estimate.
# NOTE(review): presumably score() is plain accuracy -- confirm in gaussian_nb.
model_nb.score(X_features=test_feature_matrix, y_target=test_target)

0.6

In [20]:
# Inspect the class labels stored on the fitted model.
model_nb.classes

[0, 1]

In [21]:
# Inspect the prior probability the fitted model stores for each class label.
model_nb.class_priors

{0: 0.5333333333333333, 1: 0.4666666666666667}

In [22]:
# Inspect the per-class, per-feature Gaussian parameters (mean and variance)
# stored on the fitted model.
# NOTE(review): "ccps" presumably stands for class-conditional probabilities --
# confirm in gaussian_nb.
model_nb.ccps_likelihoods

{0: [{'class': 0,
   'feature_number': 0,
   'mean': -1.2930471224930618,
   'variance': 28.179758893538228},
  {'class': 0,
   'feature_number': 1,
   'mean': -4.296260269457393,
   'variance': 19.15412179227826},
  {'class': 0,
   'feature_number': 2,
   'mean': -2.181940793542394,
   'variance': 43.84027613700075}],
 1: [{'class': 1,
   'feature_number': 0,
   'mean': 0.18794936351751687,
   'variance': 59.07225774278907},
  {'class': 1,
   'feature_number': 1,
   'mean': -2.7558012808120464,
   'variance': 34.88510138046505},
  {'class': 1,
   'feature_number': 2,
   'mean': -4.330484939102794,
   'variance': 44.68834075698772}]}

In [23]:
# Alias the random test data under the generic names used by the cells below.
X_features, y_target = test_feature_matrix, test_target

In [24]:
# Convert the nested list into a NumPy array so that whole columns can be
# sliced as X_features[:, i].
# Note: this rebinds X_features from a list to an ndarray; the cells below rely
# on the array form (.shape, column slicing).
X_features = np.array(X_features)
X_features

array([[-7.40397041, -3.27093099, -9.95486045],
       [ 3.32784036, -5.39649292, -4.58631452],
       [ 0.32200276, -7.99569231,  0.21943335],
       [-4.5696793 , -8.65819862, -3.13429176],
       [ 6.74697393,  3.52910245, -2.57814672],
       [ 7.09500818, -9.60759377, -8.55924924],
       [-9.41506982, -8.78397559,  8.67079613],
       [-1.44286017,  3.98605815, -8.35793103],
       [-4.64883913, -0.99891606, -6.58366267],
       [ 8.48324844, -1.76825531, -9.66812093],
       [ 9.8829259 ,  4.05804438, -8.00210891],
       [-3.81004676,  1.85257885, -0.92781592],
       [-3.34827373, -7.78520927,  7.34567897],
       [-8.4678162 , -5.75337581,  7.30968224],
       [-1.78017547, -7.0678343 , -8.96200947]])

In [25]:
# Size of the second axis of the array, i.e. the number of feature columns.
X_features.shape[1]

3

In [26]:
# Print each feature column in turn: every row of the transposed 2-D array is
# one column of the original matrix.
num_features = X_features.shape[1]
for feature_num, feature_column in enumerate(X_features.T):
    print(feature_column)

[-7.40397041  3.32784036  0.32200276 -4.5696793   6.74697393  7.09500818
 -9.41506982 -1.44286017 -4.64883913  8.48324844  9.8829259  -3.81004676
 -3.34827373 -8.4678162  -1.78017547]
[-3.27093099 -5.39649292 -7.99569231 -8.65819862  3.52910245 -9.60759377
 -8.78397559  3.98605815 -0.99891606 -1.76825531  4.05804438  1.85257885
 -7.78520927 -5.75337581 -7.0678343 ]
[-9.95486045 -4.58631452  0.21943335 -3.13429176 -2.57814672 -8.55924924
  8.67079613 -8.35793103 -6.58366267 -9.66812093 -8.00210891 -0.92781592
  7.34567897  7.30968224 -8.96200947]


In [27]:
# Wrap the feature array in a DataFrame and append the labels as a "y_target"
# column, so per-class statistics can be computed with groupby below.
x_df = pd.DataFrame(X_features).assign(y_target=test_target)
x_df

Unnamed: 0,0,1,2,y_target
0,-7.40397,-3.270931,-9.95486,1
1,3.32784,-5.396493,-4.586315,0
2,0.322003,-7.995692,0.219433,0
3,-4.569679,-8.658199,-3.134292,0
4,6.746974,3.529102,-2.578147,1
5,7.095008,-9.607594,-8.559249,1
6,-9.41507,-8.783976,8.670796,1
7,-1.44286,3.986058,-8.357931,0
8,-4.648839,-0.998916,-6.583663,0
9,8.483248,-1.768255,-9.668121,0


In [28]:
# Per-class mean of every feature column; use it to cross-check the "mean"
# entries in model_nb.ccps_likelihoods.
x_df.groupby(by="y_target").mean()

Unnamed: 0_level_0,0,1,2
y_target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,-1.293047,-4.29626,-2.181941
1,0.187949,-2.755801,-4.330485


In [29]:
# Per-class variance of every feature column; use it to cross-check the
# "variance" entries in model_nb.ccps_likelihoods.
# Note: pandas' var() defaults to ddof=1 (sample variance), so a model that
# uses the population variance (ddof=0) would not match these numbers exactly.
x_df.groupby(by="y_target").var()

Unnamed: 0_level_0,0,1,2
y_target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,28.179759,19.154122,43.840276
1,59.072258,34.885101,44.688341


In [None]:
# Step 1: Calculate the prior probability for each class label.

# [?? To do: Handle exceptions when input arrays are not the right type -- e.g., are strings instead of ints and floats ??]

# Prep: 

In [None]:
# Step 2: Find the conditional (likelihood) probability of each attribute for each class.
# Step 3: Plug these values into Bayes' formula and calculate the posterior probability.
# Step 4: See which class has the highest posterior probability; the input is assigned to that class.

In [None]:
# [?? To do: Kernel density estimation KDE ??]