In [1]:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix

We'll be using the [Congressional Voting Records dataset](https://archive.ics.uci.edu/ml/datasets/Congressional+Voting+Records) to predict the party of a congressperson based on their voting record (each vote is a Yes/No binary feature). Let's first read in the data:

In [2]:
votes = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/voting-records/house-votes-84.data', header=None)

In [3]:
votes.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,republican,n,y,n,y,y,y,n,n,n,y,?,y,y,y,n,y
1,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,?
2,democrat,?,y,y,?,y,y,n,n,n,n,y,n,y,y,n,n
3,democrat,n,y,y,n,?,y,n,n,n,n,y,n,y,n,n,y
4,democrat,y,y,y,n,y,y,n,n,n,n,y,?,y,y,y,y
5,democrat,n,y,y,n,y,y,n,n,n,n,n,n,y,y,y,y
6,democrat,n,y,n,y,y,y,n,n,n,n,n,n,?,y,y,y
7,republican,n,y,n,y,y,y,n,n,n,n,n,n,y,y,?,y
8,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,y
9,democrat,y,y,y,n,n,n,y,y,y,n,n,n,n,n,?,?


So there are some missing values here, represented by a question mark. For now, let's replace these with the most common vote in each column (after first converting each `?` to `None`), and then convert the votes to numerical values (1 for `y`, 0 for `n`) so we can actually use `scikit-learn`'s Bernoulli Naive Bayes classifier, `BernoulliNB`.

In [4]:
# '?' marks a missing vote: turn it into a null, fill each column's nulls with
# that column's most frequent value, then encode y/n as 1/0
votes = (
    votes
    .replace({'?': None})
    .apply(lambda col: col.fillna(col.value_counts().index[0]))
    .replace({'y': 1, 'n': 0})
)
votes.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,republican,0,1,0,1,1,1,0,0,0,1,0,1,1,1,0,1
1,republican,0,1,0,1,1,1,0,0,0,0,0,1,1,1,0,1
2,democrat,0,1,1,0,1,1,0,0,0,0,1,0,1,1,0,0
3,democrat,0,1,1,0,1,1,0,0,0,0,1,0,1,0,0,1
4,democrat,1,1,1,0,1,1,0,0,0,0,1,0,1,1,1,1


In [5]:
votes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 435 entries, 0 to 434
Data columns (total 17 columns):
0     435 non-null object
1     435 non-null int64
2     435 non-null int64
3     435 non-null int64
4     435 non-null int64
5     435 non-null int64
6     435 non-null int64
7     435 non-null int64
8     435 non-null int64
9     435 non-null int64
10    435 non-null int64
11    435 non-null int64
12    435 non-null int64
13    435 non-null int64
14    435 non-null int64
15    435 non-null int64
16    435 non-null int64
dtypes: int64(16), object(1)
memory usage: 57.9+ KB


OK, looks good. Now let's create some more meaningful column names:

In [6]:
# Label column first, then vote1..voteN. The names are built from the column
# count rather than the current labels, so re-running this cell yields the same
# names instead of compounding them (e.g. 'votevote1').
votes.columns = ['party'] + ['vote' + str(i) for i in range(1, votes.shape[1])]
votes.head(10)

Unnamed: 0,party,vote1,vote2,vote3,vote4,vote5,vote6,vote7,vote8,vote9,vote10,vote11,vote12,vote13,vote14,vote15,vote16
0,republican,0,1,0,1,1,1,0,0,0,1,0,1,1,1,0,1
1,republican,0,1,0,1,1,1,0,0,0,0,0,1,1,1,0,1
2,democrat,0,1,1,0,1,1,0,0,0,0,1,0,1,1,0,0
3,democrat,0,1,1,0,1,1,0,0,0,0,1,0,1,0,0,1
4,democrat,1,1,1,0,1,1,0,0,0,0,1,0,1,1,1,1
5,democrat,0,1,1,0,1,1,0,0,0,0,0,0,1,1,1,1
6,democrat,0,1,0,1,1,1,0,0,0,0,0,0,1,1,1,1
7,republican,0,1,0,1,1,1,0,0,0,0,0,0,1,1,0,1
8,republican,0,1,0,1,1,1,0,0,0,0,0,1,1,1,0,1
9,democrat,1,1,1,0,0,0,1,1,1,0,0,0,0,0,0,1


Let's now split `votes` into a train and test set and see what we get when we fit and evaluate `BernoulliNB` on it:

In [7]:
# hold out the last 25% of rows (in file order, no shuffling) as the test set
cut = round(len(votes) * .25)
votes_train = votes.iloc[:-cut]
votes_test = votes.iloc[-cut:]

In [8]:
X_train, X_test = votes_train.drop(columns='party'), votes_test.drop(columns='party')
y_train, y_test = votes_train['party'], votes_test['party']

In [9]:
clf = BernoulliNB()
clf.fit(X_train, y_train)

BernoulliNB()

In [10]:
y_pred_sklearn = clf.predict(X_test)
clf.score(X_test, y_test)

0.8715596330275229

In [11]:
confusion_matrix(y_test, y_pred_sklearn)

array([[55, 11],
       [ 3, 40]])

Now let's work to make our own Bernoulli Naive Bayes classifier and see how that compares.

In [12]:
class Bernoulli_NB():
    """Bernoulli Naive Bayes classifier for binary (0/1) features.

    Parameters
    ----------
    smooth : number, default 1
        Additive (Laplace) smoothing: `smooth` is added to every per-class
        "yes" count and `2 * smooth` to every per-class total, so that a
        feature/class combination with zero yes (or zero no) observations
        does not produce a probability of exactly 0 or 1.

    Attributes (set by `fit`)
    -------------------------
    yes : pandas.DataFrame, classes x features
        Smoothed P(feature = 1 | class); rows are sorted by class label.
    target_dist : numpy.ndarray
        Class priors, in the same row order as `yes`.
    """

    def __init__(self, smooth=1):
        # smoothing parameter in case there are zero yes or no values for a feature/class combo
        self.smooth = smooth

    def fit(self, X, y):
        """Estimate class priors and per-class "yes" probabilities.

        Parameters
        ----------
        X : pandas.DataFrame of 0/1 features, one row per observation.
        y : pandas.Series of class labels, index-aligned with `X`.

        Returns
        -------
        self, so calls can be chained (e.g. `Bernoulli_NB().fit(X, y).predict(X)`).
        """
        # combine target with features and get smoothed fractions of "yes" (1) values for each class
        combined = pd.concat([X, y.rename('target')], axis=1)
        by_class = combined.groupby('target')
        self.yes = (by_class.sum() + self.smooth).div(by_class.count() + 2 * self.smooth)

        # value_counts() orders by frequency while groupby orders by label, so
        # align the priors to the rows of `yes` by label before dropping the
        # index; otherwise they are swapped whenever the most frequent class
        # is not the first one alphabetically.
        priors = y.value_counts(normalize=True).reindex(self.yes.index)
        self.target_dist = np.array(priors)
        return self

    def predict(self, X):
        """Return the most probable class label for each row of `X`.

        `X` is read positionally: its i-th column is matched with the i-th
        feature seen in `fit`. Any non-zero value counts as a "yes".
        Scores are accumulated as sums of logs rather than products of
        probabilities, so wide inputs do not underflow to zero.
        """
        voted_yes = np.asarray(X) != 0
        p_yes = np.asarray(self.yes, dtype=float)

        # log(0) = -inf is a legitimate score when smooth == 0, so silence the warning
        with np.errstate(divide='ignore'):
            log_yes = np.log(p_yes)
            log_no = np.log(1 - p_yes)
            log_prior = np.log(np.asarray(self.target_dist, dtype=float))

        # one column of joint log-probabilities per class; np.where (rather than a
        # matrix product) keeps 0 * -inf from turning into NaN
        scores = np.column_stack([
            np.where(voted_yes, log_yes[k], log_no[k]).sum(axis=1) + log_prior[k]
            for k in range(len(log_prior))
        ])
        labels = np.asarray(self.yes.index)
        return labels[scores.argmax(axis=1)]

    def score(self, X, y):
        """Return the accuracy of `predict(X)` against the true labels `y`."""
        return np.mean(self.predict(X) == np.asarray(y))

In order to fit and predict the classes, I've first taken advantage of there being only two possible values for each and every feature (for the dataset in this example, a yes or no vote). So I really only need to calculate all the conditional probabilities (likelihoods) that a given observation (voter) voted yes if they were a Democrat or a Republican (and can easily compute the same for no votes):

In [13]:
yes = (votes_train.groupby('party').sum()+1).div(votes_train.groupby('party').count() + 2)
yes

Unnamed: 0_level_0,vote1,vote2,vote3,vote4,vote5,vote6,vote7,vote8,vote9,vote10,vote11,vote12,vote13,vote14,vote15,vote16
party,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
democrat,0.581281,0.507389,0.906404,0.034483,0.206897,0.448276,0.79803,0.847291,0.82266,0.477833,0.438424,0.118227,0.310345,0.37931,0.625616,0.965517
republican,0.173228,0.598425,0.141732,0.96063,0.952756,0.874016,0.275591,0.19685,0.11811,0.559055,0.094488,0.811024,0.858268,0.984252,0.094488,0.685039


In [14]:
1 - yes # no votes for each party

Unnamed: 0_level_0,vote1,vote2,vote3,vote4,vote5,vote6,vote7,vote8,vote9,vote10,vote11,vote12,vote13,vote14,vote15,vote16
party,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
democrat,0.418719,0.492611,0.093596,0.965517,0.793103,0.551724,0.20197,0.152709,0.17734,0.522167,0.561576,0.881773,0.689655,0.62069,0.374384,0.034483
republican,0.826772,0.401575,0.858268,0.03937,0.047244,0.125984,0.724409,0.80315,0.88189,0.440945,0.905512,0.188976,0.141732,0.015748,0.905512,0.314961


Adding 1 to all the yes-vote counts and 2 to all the total counts is an application of [Laplace Smoothing](https://towardsdatascience.com/introduction-to-na%C3%AFve-bayes-classifier-fa59e3e24aaf) (see section 3) to take care of cases where the count of a yes or no vote is zero (since we are estimating posterior probabilities by taking products, a single zero would wipe out the whole product). Adding 1 to each vote's yes and no counts for each party, and therefore 2 to the total count for each party, ensures that the corresponding entries of `yes` and `1 - yes` still add up to 1.

Now taking a look at a sample test observation, I can decide which of `yes` or `1-yes` to evaluate as part of a "rolling" product to calculate the relative probability that test observation is Democrat or Republican, of course taking into account the training target distribution of Republicans and Democrats (the prior):

In [15]:
# Class priors from the training labels. value_counts() orders by frequency, so
# sort by label to line the priors up positionally with the rows of `yes`
# (groupby sorts its keys); otherwise they would be swapped whenever the most
# common party is not first alphabetically.
party_dist = np.array(y_train.value_counts(normalize=True).sort_index())
party_dist # Democrat, Republican

array([0.61656442, 0.38343558])

In [16]:
test_vote = X_test.iloc[0]
test_vote

vote1     1
vote2     1
vote3     0
vote4     1
vote5     0
vote6     0
vote7     1
vote8     1
vote9     1
vote10    0
vote11    1
vote12    0
vote13    0
vote14    1
vote15    0
vote16    1
Name: 326, dtype: int64

In [17]:
# get probability of each vote if Democrat/Republican for this observation's votes
pd.DataFrame([yes.iloc[:,i] if t else 1-yes.iloc[:,i] for i,t in enumerate(test_vote)]).T 

Unnamed: 0_level_0,vote1,vote2,vote3,vote4,vote5,vote6,vote7,vote8,vote9,vote10,vote11,vote12,vote13,vote14,vote15,vote16
party,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
democrat,0.581281,0.507389,0.093596,0.034483,0.793103,0.551724,0.79803,0.847291,0.82266,0.522167,0.438424,0.881773,0.689655,0.37931,0.374384,0.965517
republican,0.173228,0.598425,0.858268,0.96063,0.047244,0.125984,0.275591,0.19685,0.11811,0.440945,0.094488,0.188976,0.141732,0.984252,0.905512,0.685039


In [18]:
# get the relative (unnormalized) probability that the observation is Democrat or Republican:
# the product of its per-vote likelihoods, multiplied by the party prior
pd.DataFrame([yes.iloc[:,i] if t else 1-yes.iloc[:,i] for i,t in enumerate(test_vote)]).T.prod(axis=1).mul(party_dist)

party
democrat      2.726804e-06
republican    8.515422e-10
dtype: float64

In [19]:
# pick the party (class) whose relative (posterior) probability is the largest
pd.DataFrame([yes.iloc[:,i] if t else 1-yes.iloc[:,i] for i,t in enumerate(test_vote)]).T.\
    prod(axis=1).\
    mul(party_dist).\
    idxmax()

'democrat'

Now let's look at our class in action:

In [20]:
bnb = Bernoulli_NB()

In [21]:
bnb.fit(X_train, y_train)

In [22]:
y_pred = bnb.predict(X_test)

In [23]:
bnb.score(X_test, y_test)

0.8715596330275229

In [24]:
confusion_matrix(y_test, y_pred)

array([[55, 11],
       [ 3, 40]])

In [25]:
(y_pred == y_pred_sklearn).mean()

1.0

Seems to have given us the same results as `scikit-learn`!