# Introduction
This is my from scratch implementation of **Bernoulli Naive Bayes**. 

Note that the model currently only handles **binary classification**. 

In [1]:
import numpy as np

In [2]:
def create_dataset(n_rows=10, n_cols=2, probs=(0.5, 0.5), seed=None):
    '''
    creates a 2D numpy array with 0s and 1s for columns
    
    INPUT:
        n_rows = (int) number of rows in dataset
        n_cols = (int) number of columns starting with target followed by features
        prob = (tuple) probability of success for target, feature 1, feature 2, ..., feature n
    OUTPUT:
        dataset (numpy array)
    '''
    # error handling
    assert type(n_rows) == int, 'n_rows must be an integer'
    assert type(n_cols) == int, 'n_cols must be an integer'
    assert type(probs) == tuple, 'prob must be a tuple of probabilities'
    assert len(probs) == n_cols, 'tuple must contain probabilities for each n_col'
    assert type(seed) == int, 'seed must be an integer'
    
    # reproducibility
    if seed:
        np.random.seed(seed)
        
    # create dataset
    for i, prob in enumerate(probs):
        if i < 1:
            dataset = np.random.binomial(n=1, p=prob, size=n_rows)
        else:
            column = np.random.binomial(n=1, p=prob, size=n_rows)
            dataset = np.c_[dataset, column]
    
    return dataset

In [3]:
probabilities = (0.3, 0.5, 0.2, 0.5)
data = create_dataset(n_rows=4000, n_cols=4, probs=probabilities, seed=42)

In [4]:
data

array([[0, 1, 0, 1],
       [1, 1, 0, 0],
       [1, 1, 0, 0],
       ..., 
       [0, 0, 0, 1],
       [0, 1, 1, 1],
       [0, 0, 0, 1]])

In [5]:
X = data[:, 1:]
y = data[:, 0]

In [6]:
X

array([[1, 0, 1],
       [1, 0, 0],
       [1, 0, 0],
       ..., 
       [0, 0, 1],
       [1, 1, 1],
       [0, 0, 1]])

---

In [7]:
class BernoulliNB:
    '''Bernoulli Naive Bayes for binary classification only.'''
    
    
    def __init__(self):
        self.class_one_prior_ = None
        self.class_zero_prior_ = None
        self.likelihoods_ = None
     
    
    def _reshape(self, X):
        '''converts array to appropriate dimensions'''
        try:
            self._n_cols = X.shape[1]
        except:
            self._n_cols = X.reshape(-1,1)
        return self._n_cols
    
    
    def fit(self, X, y, laplace_smoothing=True):
        '''calculate priors and likelihoods'''
        # setup
        n_bins = 2
        self._n_cols = self._reshape(X)
        self.likelihoods_ = np.zeros((n_bins, self._n_cols), dtype=float)
        
        # priors (target)
        self.class_one_prior_ = np.mean(y)
        self.class_zero_prior_ = 1 - self.class_one_prior_
        
        # additive smoothing
        if laplace_smoothing:
            numerator = 1
            denominator = len(X)
        else:
            numerator = 0
            denominator = 1

        # calculate likelihood matrix
        for i in range(n_bins):
            for j in range(self._n_cols):
                column = X[:,j]
                y_intersect_col = np.logical_and(y==i, column==i)
                y_intersect_col = sum(y_intersect_col)
                sum_y = sum(y==i)
                self.likelihoods_[i,j] = (y_intersect_col) / (sum_y) + numerator/denominator
        
        # find likelihood matrix complements
        prob_complements = 1 - self.likelihoods_
        self.likelihoods_ = np.concatenate((self.likelihoods_, prob_complements), axis=0)
        # reorder likelihood matrix
        self.likelihoods_ = self.likelihoods_[[0,2,3,1],:]
    
    
    def predict(self, X, return_probs=True):
        '''return most likely class'''
        
        # error checking
        assert type(X) == type(np.array([])), "X must be a numpy ndarray!"
        
        # setup
        try: 
            n_cols = X.shape[1] 
            n_rows = X.shape[0]
        except: 
            n_cols = X.shape[0]
            n_rows = int(len(X) / n_cols)
        assert n_cols == self._n_cols, "number of columns in X don't match those in the training set!"
        
        # more setup
        predictions = np.zeros((n_rows,1), dtype=int)
        probs = []
        class_zero_probs,  class_one_probs = np.split(self.likelihoods_, 2)
        
        # main logic
        for i in range(n_rows):
            # likelihoods
            observation = X[i]
            class_zero_likelihoods = class_zero_probs[observation, np.arange(self._n_cols)]
            class_one_likelihoods = class_one_probs[observation, np.arange(self._n_cols)]
            
            # posteriors
            class_zero_posterior = self.class_zero_prior_ * np.prod(class_zero_likelihoods)
            class_one_posterior = self.class_one_prior_ * np.prod(class_one_likelihoods)
            probs.append((class_zero_posterior, class_one_posterior))
            
            # save predictions for outputting
            if n_rows == 1:
                predictions = np.argmax((class_zero_posterior, class_one_posterior))
            else:
                predictions[i] = np.argmax((class_zero_posterior, class_one_posterior))
            
        # whether to return tuple of probabilities    
        if return_probs:
            return probs, predictions
        else:
            return predictions

## Testing

In [8]:
nb = BernoulliNB()
nb.fit(X, y, laplace_smoothing=True)
nb.likelihoods_

array([[ 0.50682661,  0.79619739,  0.51713589],
       [ 0.49317339,  0.20380261,  0.48286411],
       [ 0.52544503,  0.81103896,  0.48163711],
       [ 0.47455497,  0.18896104,  0.51836289]])

In [9]:
nb_smooth = BernoulliNB()
nb_smooth.fit(X, y, laplace_smoothing=False)
nb_smooth.likelihoods_

array([[ 0.50657661,  0.79594739,  0.51688589],
       [ 0.49342339,  0.20405261,  0.48311411],
       [ 0.52569503,  0.81128896,  0.48188711],
       [ 0.47430497,  0.18871104,  0.51811289]])

#### Single data point

In [10]:
one_d = np.array([0,0,0])
one_d

array([0, 0, 0])

#### Two data points

In [11]:
two_d = np.array([0,0,0,1,1,1]).reshape(2,-1)
two_d

array([[0, 0, 0],
       [1, 1, 1]])

### Predictions

#### 1D Array

In [12]:
nb.predict(one_d, return_probs=True)

([(0.14675556313974908, 0.0609087486122162)], 0)

In [13]:
nb_smooth.predict(one_d, return_probs=True)

([(0.14656622722773208, 0.060988152329824151)], 0)

#### 2D Array - No Smoothing

In [14]:
nb.predict(two_d, return_probs=False)

array([[0],
       [0]])

In [15]:
probs, predictions = nb.predict(two_d)
for prob, prediction in zip(probs, predictions):
    print('P(class 0) = {:9.4} | P(class=1) = {:9.4} | Prediction = {}'.format(prob[0], prob[1], prediction[0]))

P(class 0) =    0.1468 | P(class=1) =   0.06091 | Prediction = 0
P(class 0) =   0.03413 | P(class=1) =   0.01379 | Prediction = 0


#### 2D Array - Laplace Smoothing

In [16]:
nb_smooth.predict(two_d, return_probs=False)

array([[0],
       [0]])

In [17]:
probs, predictions = nb_smooth.predict(two_d)
for prob, prediction in zip(probs, predictions):
    print('P(class 0) = {:8.4} | P(class=1) = {:8.4} | Prediction = {}'.format(prob[0], prob[1], prediction[0]))

P(class 0) =   0.1466 | P(class=1) =  0.06099 | Prediction = 0
P(class 0) =  0.03421 | P(class=1) =  0.01376 | Prediction = 0


## Extensions for Future Work

1. Log transform probabilities to handle underflow