In [94]:
import pandas as pd
import numpy as np
np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)                 

# names of columns
with open("./data/house-votes-84.txt") as file:
    labels = [line.strip() for line in file]

# load data
df = pd.read_csv("./data/house-votes-84.csv",header=None,names=labels)
df.head()

Unnamed: 0,party,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-afric
0,republican,n,y,n,y,y,y,n,n,n,y,?,y,y,y,n,y
1,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,?
2,democrat,?,y,y,?,y,y,n,n,n,n,y,n,y,y,n,n
3,democrat,n,y,y,n,?,y,n,n,n,n,y,n,y,n,n,y
4,democrat,y,y,y,n,y,y,n,n,n,n,y,?,y,y,y,y


In [95]:
# All attributes are boolean (y,n) => (1,0); label is also boolean (dem,rep) => (1,0)
# Replace '?' with NaN
df_enc = df.replace(to_replace=['democrat', 'republican', 'y','n','?'],value=[1,0,1,0,np.nan])
df_enc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 435 entries, 0 to 434
Data columns (total 17 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   party                                  435 non-null    int64  
 1   handicapped-infants                    423 non-null    float64
 2   water-project-cost-sharing             387 non-null    float64
 3   adoption-of-the-budget-resolution      424 non-null    float64
 4   physician-fee-freeze                   424 non-null    float64
 5   el-salvador-aid                        420 non-null    float64
 6   religious-groups-in-schools            424 non-null    float64
 7   anti-satellite-test-ban                421 non-null    float64
 8   aid-to-nicaraguan-contras              420 non-null    float64
 9   mx-missile                             413 non-null    float64
 10  immigration                            428 non-null    float64
 11  synfue

In [96]:
# Check for weird/wrong stuff
df_enc.describe(include="all")

Unnamed: 0,party,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-afric
count,435.0,423.0,387.0,424.0,424.0,420.0,424.0,421.0,420.0,413.0,428.0,414.0,404.0,410.0,418.0,407.0,331.0
mean,0.613793,0.44208,0.503876,0.596698,0.417453,0.504762,0.641509,0.567696,0.57619,0.501211,0.504673,0.362319,0.423267,0.509756,0.593301,0.427518,0.812689
std,0.48744,0.497222,0.500632,0.49114,0.493721,0.500574,0.480124,0.495985,0.49475,0.500605,0.500563,0.481252,0.49469,0.500516,0.491806,0.495327,0.390752
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
50%,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0
75%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [97]:
class BernoulliNB:
    def __init__(self):
        self.priors = None
        self.marginals = None
        self.classes = None
        self.X = None
        self.y = None
        self.test = None
        self.Xmeans = None
    
    def __initialiseClasses(self): 
        self.classes = np.unique(self.y)
    
    def __initialiseClassPriors(self): # P(C_k) for classes
        # np.unique(y) RETURNS SORTED so [0,1]
        self.priors = np.array([self.X[self.y == c].shape[0] / self.X.shape[0] for c in self.classes])  # obs for which class is c / # all obs

    def __initialiseMarginals(self): # P(x_i|C_k)
        # example: feature x_i = "age" column
        # we have classes and features
        # for bernoulli model MLE of P(x_i|C_k) = mean(x_i)
        # for each class we calculate the mean of x_i to get an array of P(x_i|C_k)
            # columns are axis = 0
            # LAPLACE SMOOTHING if all examples in class c have x_i= then the whole product would be 0 or log(0)=inf
            self.marginals = np.array([(np.sum(self.X[self.y == c],axis=0)+1) / (self.X[self.y == c].shape[0]+self.X.shape[1]) for c in self.classes])
            # obs from class c => flatten them into a f-sized array of means (f - # of features)
            # +-------+-------------------------+-------------------------+-----+
            # | class | p_0 (mean of feature 1) | p_1 (mean of feature 2) | ... |
            # +-------+-------------------------+-------------------------+-----+
            # | 0     | 0.3                     | 0.4                     |     |
            # +-------+-------------------------+-------------------------+-----+
            # | 1     | 0.7                     | 0.6                     |     |
            # +-------+-------------------------+-------------------------+-----+
            # C'class' isnt a real column
    
    def __initialiseMeans(self, X):
        self.XMeans = X.mean(axis=0)

    def fit(self, X, y):
        # fill X NaN with the means of each column
        self.__initialiseMeans(X)
        X_clean = X.fillna(value=self.XMeans,axis=0)
        self.X = X_clean.to_numpy()
        # y shouldn't have missing values
        if y.isnull().values.any():
            raise Exception("NaN not allowed for labels.")
        self.y = y.to_numpy()
        self.__initialiseClasses()
        self.__initialiseClassPriors()
        self.__initialiseMarginals()
    
    # Use log instead to prevent over?flow
    def predict(self, x):
        # x is a f-length vector
        # for each class calculate p(c)*prod(p^x_i*(1-p)^(1-x_i))
        # pick class with highest probability
        x_clean = x.fillna(value=self.XMeans)
        x = x_clean.to_numpy()
        p = np.multiply(np.power(self.marginals,x), np.power(1-self.marginals,1-x)) # table of P(x_i|C_k) where rows are for different classes
        logp = np.log(p) # log of all probs, still a table
        s = self.priors+np.sum(logp,axis=1) # k-length vector of P(C_k)prod(P(x_i|C_k))
        # return index (which is the class) of max(p)
        return np.argmax(s)

In [98]:
def CV(model, X, y, k = 10):
    
    n = X.shape[0]
    inds = np.arange(0,n)
    np.random.shuffle(inds)
    folds = np.array_split(inds,k) # k groups of size l//(n+1) or l//n

    batches = []
    for i, test in enumerate(folds): #(one group, k-1 group) == (test, train)
        train = np.concatenate(np.delete(folds,i)).ravel()
        batches.append((test, train))
    
    scores = []
    
    for test, train in batches:
        X_train, y_train, X_test, y_test = X.iloc[train], y.iloc[train], X.iloc[test], y.iloc[test]
        model.fit(X_train, y_train)
        y_pred = [model.predict(x) for _, x in X_test.iterrows()]

        accuracy = np.sum(y_pred == y_test) / y_test.shape[0]
        scores.append(accuracy)
    
    avg_score = np.average(scores)
    return avg_score, scores

In [99]:
X= df_enc.iloc[:,1:]
y = df_enc.iloc[:,0]

In [105]:
model = BernoulliNB()
avg_score, scores = CV(model, X, y)

In [106]:
print("Model's average score is {:.2f}%.".format(100*avg_score))

Model's average score is 89.65%.
