In [1]:
import numpy as np

In [2]:
def create_dataset(n_rows=10, n_cols=2, prob=(0.5, 0.5), seed=None):
    '''
    creates a 2D numpy array whose columns are independent Bernoulli (0/1) draws
    
    INPUT:
        n_rows = (int) number of rows in dataset
        n_cols = (int) number of columns starting with target followed by features
        prob = (tuple) probability of success for target, feature 1, feature 2, ..., feature n
        seed = (int or None) seed passed to np.random.seed for reproducibility;
               None (the default) leaves numpy's global random state untouched
    OUTPUT:
        dataset (numpy array) of shape (n_rows, n_cols)
    RAISES:
        AssertionError if an argument has the wrong type or len(prob) != n_cols
    '''
    # error handling (numpy integers are accepted alongside built-in ints)
    int_types = (int, np.integer)
    assert isinstance(n_rows, int_types), 'n_rows must be an integer'
    assert isinstance(n_cols, int_types), 'n_cols must be an integer'
    assert type(prob) == tuple, 'prob must be a tuple of probabilities'
    assert len(prob) == n_cols, 'tuple must contain probabilities for each n_col'
    # seed is optional: None must pass validation because it is the default
    assert seed is None or isinstance(seed, int_types), 'seed must be an integer or None'
    
    # reproducibility ("is not None" so that seed=0 is honoured as a real seed)
    if seed is not None:
        np.random.seed(seed)
    
    # nothing to draw: return an empty array with the documented 2D layout
    if not prob:
        return np.empty((n_rows, 0), dtype=int)
    
    # create dataset: one Bernoulli column per probability, drawn in order so a
    # given seed yields the same values column by column
    columns = [np.random.binomial(n=1, p=p, size=n_rows) for p in prob]
    
    # column_stack always yields a 2D array, even when there is only one column
    return np.column_stack(columns)

In [3]:
# success probability for each column: the target first, then three features
probabilities = (0.5, 0.5, 0.5, 0.5)
# small dataset generated with a fixed seed so the values can be reproduced
data = create_dataset(n_rows=4, n_cols=4, prob=probabilities, seed=42)

In [4]:
# display the generated dataset (column 0 is the target, the rest are features)
data

array([[0, 0, 1, 1],
       [1, 0, 1, 0],
       [1, 0, 0, 0],
       [1, 1, 1, 0]])

In [5]:
# features: every column after the first
X = data[:, 1:]
# target: the first column
y = data[:, 0]

In [6]:
# display the feature matrix (one row per sample, one column per feature)
X

array([[0, 1, 1],
       [0, 1, 0],
       [0, 0, 0],
       [1, 1, 0]])

In [7]:
class BernoulliNB:
    '''
    Naive Bayes classifier for binary (0/1) features and a binary (0/1) target
    
    No smoothing is applied, so a feature value never seen with a class gets a
    likelihood of exactly 0 for that class.
    
    ATTRIBUTES (set by fit):
        prob_target_one_ = prior P(y=1)
        prob_target_zero_ = prior P(y=0)
        prob_features_ = array of shape (4, n_features) with rows
            0: P(x=0 | y=0)
            1: P(x=1 | y=0)
            2: P(x=0 | y=1)
            3: P(x=1 | y=1)
    '''
    
    
    def __init__(self):
        self.prob_target_one_ = None
        self.prob_target_zero_ = None
        self.prob_features_ = None
     
    
    def _reshape(self, X):
        '''return X as a 2D (n_samples, n_features) array; a 1D input is treated as one feature column'''
        X = np.asarray(X)
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        return X
    
    
    def fit(self, X, y):
        '''
        calculate priors and likelihoods
        
        INPUT:
            X = array of 0s and 1s, shape (n_samples, n_features); a 1D array is
                treated as a single feature with n_samples values
            y = array of 0s and 1s, shape (n_samples,)
        OUTPUT:
            None (results are stored on the instance)
        '''
        # setup
        X = self._reshape(X)
        y = np.asarray(y)
        
        # priors (target)
        self.prob_target_one_ = np.mean(y)
        self.prob_target_zero_ = 1 - self.prob_target_one_
        
        # likelihoods (features)
        rows = []
        for label in (0, 1):
            in_class = (y == label)
            # P(x_j == label | y == label) for every feature j at once
            prob_match = (X[in_class] == label).sum(axis=0) / in_class.sum()
            prob_mismatch = 1 - prob_match
            # keep each class block ordered as [P(x=0 | y), P(x=1 | y)] so that a
            # feature value can be used directly as a row index in predict
            if label == 0:
                rows.extend([prob_match, prob_mismatch])
            else:
                rows.extend([prob_mismatch, prob_match])
        
        self.prob_features_ = np.array(rows, dtype=float)
    
    
    def predict(self, X):
        '''
        return class scores and the most likely class
        
        INPUT:
            X = integer array of 0s and 1s; either one sample of shape
                (n_features,) or a batch of shape (n_samples, n_features)
        OUTPUT:
            ((class_zero_posterior, class_one_posterior), predicted_class)
            The posteriors are unnormalised (prior * product of likelihoods).
            For a single sample every element is a scalar; for a batch every
            element is an array with one entry per sample.
        RAISES:
            RuntimeError if fit has not been called
            ValueError if X does not have the number of features seen in fit
        '''
        if self.prob_features_ is None:
            raise RuntimeError('fit must be called before predict')
        
        # setup
        X = np.array(X)
        n_cols = X.shape[-1]
        n_features = self.prob_features_.shape[1]
        if n_cols != n_features:
            raise ValueError('X has %d features but the model was fit with %d'
                             % (n_cols, n_features))
        class_zero_probs, class_one_probs = np.split(self.prob_features_, 2)
        
        # likelihoods: the feature value (0 or 1) selects the row and the feature
        # position selects the column; broadcasting handles one sample or many
        feature_idx = np.arange(n_cols)
        class_zero_likelihoods = class_zero_probs[X, feature_idx]
        class_one_likelihoods = class_one_probs[X, feature_idx]
        
        # posteriors: multiply over the feature axis only so that each sample
        # keeps its own score
        class_zero_posterior = self.prob_target_zero_ * np.prod(class_zero_likelihoods, axis=-1)
        class_one_posterior = self.prob_target_one_ * np.prod(class_one_likelihoods, axis=-1)
        
        posteriors = (class_zero_posterior, class_one_posterior)
        # axis=0 compares class 0 against class 1 independently for every sample
        return posteriors, np.argmax(posteriors, axis=0)

In [8]:
# train the classifier on the features and target split out above
nb = BernoulliNB()
nb.fit(X,y)

In [9]:
# fitted likelihoods, one column per feature; rows are
# P(x=0|y=0), P(x=1|y=0), P(x=0|y=1), P(x=1|y=1)
nb.prob_features_

array([[ 1.        ,  0.        ,  0.        ],
       [ 0.        ,  1.        ,  1.        ],
       [ 0.66666667,  0.33333333,  1.        ],
       [ 0.33333333,  0.66666667,  0.        ]])

In [10]:
# score one sample; returns ((class 0 score, class 1 score), predicted class)
nb.predict(np.array([1,0,0]))

((0.0, 0.083333333333333343), 1)