In [1]:
import numpy as np

In [2]:
def create_dataset(n_rows=10, n_cols=2, prob=(0.5, 0.5), seed=None):
    '''
    creates a 2D numpy array with 0s and 1s for columns
    
    INPUT:
        n_rows = (int) number of rows in dataset
        n_cols = (int) number of columns starting with target followed by features
        prob = (tuple) probability of success for target, feature 1, feature 2, ..., feature n
    OUTPUT:
        dataset (numpy array)
    '''
    # error handling
    assert type(n_rows) == int, 'n_rows must be an integer'
    assert type(n_cols) == int, 'n_cols must be an integer'
    assert type(prob) == tuple, 'prob must be a tuple of probabilities'
    assert len(prob) == n_cols, 'tuple must contain probabilities for each n_col'
    assert type(seed) == int, 'seed must be an integer'
    
    # reproducibility
    if seed:
        np.random.seed(seed)
        
    # create dataset
    for i, p in enumerate(prob):
        if i < 1:
            dataset = np.random.binomial(n=1, p=prob[i], size=n_rows)
        else:
            column = np.random.binomial(n=1, p=prob[i], size=n_rows)
            dataset = np.c_[dataset, column]
    
    return dataset

In [43]:
probabilities = (0.5, 0.5, 0.5)
data = create_dataset(n_rows=3, n_cols=3, prob=probabilities, seed=42)

In [44]:
data

array([[0, 1, 0],
       [1, 0, 1],
       [1, 0, 1]])

In [45]:
X = data[:, 1:]
y = data[:, 0]

In [239]:
class BernoulliNB:
    
    def __init__(self):
        self.prob_target_success_ = None
        self.prob_target_failure_ = None
    
    def fit(self, X, y):
        # target
        self.prob_target_success_ = np.mean(y)
        self.prob_target_failure_ = 1 - self.prob_target_success_
        # features
        self.prob_feature_success_ = np.mean(X, axis=0)
        self.prob_feature_failure_ = 1 - self.prob_feature_success_
    
    def predict(self, X):
        # mask to filter success & failure probabilities
        feature_mask_success = np.where(np.array(X)==1) 
        feature_mask_failure = np.where(np.array(X)==0)
        
        # filter probabilities
        feature_success = self.prob_feature_success_[feature_mask_success]
        feature_failure = self.prob_feature_failure_[feature_mask_failure]
        
        # calculate posterior probability
        success = self.prob_target_success_ * np.prod(feature_success)
        failure = self.prob_target_failure_ * np.prod(feature_failure)
        
        # show posterior probability by class 
        print('P(failure): {} | P(success): {}'.format(failure, success))
        
        return np.argmax( np.array([failure, success]) )

In [240]:
nb = BernoulliNB()
nb.fit(X,y)

In [241]:
test = np.array([0,0,1,1]).reshape(2,2)

In [242]:
nb.predict([0,1])

P(failure): 0.22222222222222227 | P(success): 0.4444444444444444


1