In [1]:
import numpy as np

In [2]:
def create_dataset(n_rows=10, n_cols=2, prob=(0.5, 0.5), seed=None):
    '''
    creates a 2D numpy array with 0s and 1s for columns
    
    INPUT:
        n_rows = (int) number of rows in dataset
        n_cols = (int) number of columns starting with target followed by features
        prob = (tuple) probability of success for target, feature 1, feature 2, ..., feature n
    OUTPUT:
        dataset (numpy array)
    '''
    # error handling
    assert type(n_rows) == int, 'n_rows must be an integer'
    assert type(n_cols) == int, 'n_cols must be an integer'
    assert type(prob) == tuple, 'prob must be a tuple of probabilities'
    assert len(prob) == n_cols, 'tuple must contain probabilities for each n_col'
    assert type(seed) == int, 'seed must be an integer'
    
    # reproducibility
    if seed:
        np.random.seed(seed)
        
    # create dataset
    for i, p in enumerate(prob):
        if i < 1:
            dataset = np.random.binomial(n=1, p=prob[i], size=n_rows)
        else:
            column = np.random.binomial(n=1, p=prob[i], size=n_rows)
            dataset = np.c_[dataset, column]
    
    return dataset

In [3]:
probabilities = (0.5, 0.5, 0.5, 0.5)
data = create_dataset(n_rows=4, n_cols=4, prob=probabilities, seed=42)

In [4]:
data

array([[0, 0, 1, 1],
       [1, 0, 1, 0],
       [1, 0, 0, 0],
       [1, 1, 1, 0]])

In [5]:
X = data[:, 1:]
y = data[:, 0]

In [6]:
X

array([[0, 1, 1],
       [0, 1, 0],
       [0, 0, 0],
       [1, 1, 0]])

In [7]:
np.isin(X, 1)

array([[False,  True,  True],
       [False,  True, False],
       [False, False, False],
       [ True,  True, False]], dtype=bool)

In [8]:
len(np.unique(y))

2

In [22]:
class BernoulliNB:
    
    
    def __init__(self):
        self.prob_target_success_ = None
        self.prob_target_failure_ = None
     
    
    def _reshape(self, X):
        '''converts array to appropriate dimensions'''
        try:
            n_cols = X.shape[1]
        except:
            n_cols = X.reshape(-1,1)
        return n_cols
    
    
    def fit(self, X, y):
        '''calculate priors and likelihoods'''
        # setup
        n_classes = len(np.unique(y))
        n_cols = self._reshape(X)
        
        # priors (target)
        self.prob_target_success_ = np.mean(y)
        self.prob_target_failure_ = 1 - self.prob_target_success_
        
        # likelihoods (features)
        #*******************************
        # Cleanup w/vectorized version *
        #*******************************
        self.prob_features_ = np.zeros((2, n_cols), dtype=float)
        for i in range(n_classes):
            for j in range(n_cols):
                column = X[:,j]
                self.prob_features_[i,j] = np.mean(y[column==i])
    
    
    def predict(self, X):
        '''return most likely class'''
        X = np.array(X)
        rows = X.shape[0]
        n_cols = self._reshape(X)
            
        probabilities = self.prob_features_[X, np.arange(rows)]
        print(probabilities)
        return np.prod(probabilities)
        # mask to filter success & failure probabilities
        #feature_mask_success = np.where(np.array(X)==1) 
        #feature_mask_failure = np.where(np.array(X)==0)
        
        # filter probabilities
        #feature_success = self.prob_feature_success_[feature_mask_success]
        #feature_failure = self.prob_feature_failure_[feature_mask_failure]
        
        # calculate posterior probability
        #success = self.prob_target_success_ * np.prod(feature_success)
        #failure = self.prob_target_failure_ * np.prod(feature_failure)
        
        # show posterior probability by class 
        #print('P(failure): {} | P(success): {}'.format(failure, success))
        
        #return np.argmax( np.array([failure, success]) )

In [23]:
nb = BernoulliNB()
nb.fit(X,y)

In [24]:
nb.predict([1,0,0])

[ 1.  1.  1.]


1.0

In [25]:
nb.prob_features_

array([[ 0.66666667,  1.        ,  1.        ],
       [ 1.        ,  0.66666667,  0.        ]])

In [29]:
data

array([[0, 0, 1, 1],
       [1, 0, 1, 0],
       [1, 0, 0, 0],
       [1, 1, 1, 0]])

In [26]:
test = np.array([0,1,1])
test

array([0, 1, 1])

In [27]:
nb.prob_features_[test, np.arange(len(test))]

array([ 0.66666667,  0.66666667,  0.        ])

In [15]:
np.argwhere(X)

array([[0, 1],
       [0, 2],
       [1, 1],
       [3, 0],
       [3, 1]])

In [16]:
np.split

<function numpy.lib.shape_base.split>