In [1]:
import cPickle as cp
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as sci

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Imputer

# NAIVE BAYES CLASSIFIER CLASS
class NBC:
    """Naive Bayes classifier over binary, real and categorical features.

    ``feature_types`` is a sequence with one code per modelled column:

    * ``'b'`` -- binary feature coded 0/1, where the value 2 marks a missing
      observation;
    * ``'r'`` -- real-valued feature modelled with a normal distribution;
    * ``'c'`` -- categorical feature modelled with per-category frequencies.

    After ``fit`` the instance holds:

    * ``pi``    -- dict mapping class label -> prior probability;
    * ``theta`` -- dict mapping class label -> list of per-feature parameters
      (length-3 array for 'b', length-2 array for 'r', dict for 'c').
    """

    # Value used in binary columns to flag a missing observation.
    _MISSING_CODE = 2
    # Lower bound for the fitted standard deviation of real features; a
    # constant column would otherwise give std == 0 and a NaN log-density.
    _MIN_STD = 1e-9

    # Initialisation
    def __init__(self, feature_types=0, num_classes=0):
        self.feature_types = feature_types
        self.num_classes = num_classes
        self.theta = 0
        self.pi = 0

    def _fit_binary(self, column):
        """Return ``[P(x=0), P(x=1), 1]`` estimated from the observed entries.

        The third slot is the factor applied to a missing value (code 2):
        a likelihood of 1 adds log(1) = 0, so missing entries are ignored.
        """
        values = column.astype(float)
        n_missing = np.sum(values == self._MISSING_CODE)
        n_observed = values.size - n_missing

        params = np.zeros(3)
        params[2] = 1
        if n_observed:
            p_one = float(np.sum(values == 1)) / n_observed
            params[0] = 1 - p_one
            params[1] = p_one
        else:
            # Every value is missing: fall back to an uninformative 50/50.
            params[0] = 0.5
            params[1] = 0.5
        return params

    def _fit_real(self, column):
        """Return ``[mean, std]`` of the column, with std floored above zero."""
        values = column.astype(float)
        params = np.zeros(2)
        params[0] = np.mean(values)
        params[1] = max(np.std(values), self._MIN_STD)
        return params

    def _fit_categorical(self, column, categories):
        """Return a dict mapping each category to its relative frequency."""
        n_rows = column.shape[0]
        # float() forces true division; integer division would turn every
        # probability into 0 or 1 on Python 2.
        return {cat: float(np.sum(column == cat)) / n_rows
                for cat in categories}

    def fit(self, X_train, Y_train):
        """Estimate class priors and per-class feature distributions.

        X_train is a 2-D array (rows are samples); only the first
        ``len(feature_types)`` columns are used. Y_train holds one class
        label per row. The fitted ``pi`` and ``theta`` are stored on the
        instance and printed.
        """
        X_train = np.asarray(X_train)
        Y_train = np.asarray(Y_train)
        n_samples = X_train.shape[0]
        n_features = np.size(self.feature_types)

        pi = {}
        theta = {}

        for cls in np.unique(Y_train):
            # Boolean mask of the rows that belong to the current class
            in_class = Y_train == cls
            X_cls = X_train[in_class]

            pi[cls] = float(np.sum(in_class)) / n_samples

            # A real list, so that per-feature slots can be assigned by index
            params = [None] * n_features

            for i in range(n_features):
                kind = self.feature_types[i]
                column = X_cls[:, i]

                if kind == 'b':
                    params[i] = self._fit_binary(column)
                elif kind == 'r':
                    params[i] = self._fit_real(column)
                elif kind == 'c':
                    # Categories come from the full training column so that
                    # every class has an entry for every observed category.
                    params[i] = self._fit_categorical(
                        column, np.unique(X_train[:, i]))

            theta[cls] = params

        self.theta = theta
        self.pi = pi
        print(pi)
        print(theta)

    def predict(self, X_test):
        """Return the most probable class label for every row of X_test.

        The result is a 1-D NumPy array of labels taken from the keys of
        ``pi`` (i.e. the labels seen in ``fit``). Ties go to the class
        visited last. A categorical value that was not present in the
        training data raises ``KeyError``.
        """
        X_test = np.asarray(X_test)
        n_samples = X_test.shape[0]
        n_features = np.size(self.feature_types)

        y_hat = []

        # log(0) = -inf is a legitimate score here (zero-probability event),
        # so silence the divide-by-zero warning NumPy would emit for it.
        with np.errstate(divide='ignore'):
            for n in range(n_samples):
                best_score = -float('inf')
                best_cls = 0

                for cls in self.pi:
                    score = np.log(self.pi[cls])

                    for j in range(n_features):
                        kind = self.feature_types[j]
                        value = X_test[n, j]
                        params = self.theta[cls][j]

                        # Binary Features
                        if kind == 'b':
                            # Data usually arrives as floats; an integer
                            # index is required to address the vector.
                            score += np.log(params[int(float(value))])

                        # Real Features
                        elif kind == 'r':
                            score += sci.norm.logpdf(
                                float(value), params[0], params[1])

                        # Categorical Features
                        elif kind == 'c':
                            score += np.log(params[value])

                    if score >= best_score:
                        best_score = score
                        best_cls = cls

                y_hat.append(best_cls)

        return np.array(y_hat)
        

# Load the voting data set: a pickled (features, labels) pair.
# NOTE: unpickling executes arbitrary code embedded in the file -- only load
# pickles that come from a trusted source.
with open('voting-full.cPickle', 'rb') as data_file:
    X, y = cp.load(data_file)

print(np.shape(X))
print(np.shape(y))
print('\n')


# Work on a small slice (first 10 samples) so the output stays readable.
X = X[:10,:16]
y = y[:10]
print(X)
print(y)
print('\n')

# Only five feature types are declared, so the classifier models just the
# first five columns of X, all of them treated as binary.
nbc = NBC(['b','b','b','b','b'], 2)
nbc.fit(X,y)
print('\n')
print(nbc.predict(X))




        

(435, 16)
(435,)


[[ 0.  0.  1.  0.  0.  1.  1.  1.  1.  1.  1.  0.  1.  1.  0.  1.]
 [ 1.  0.  1.  0.  0.  0.  1.  1.  1.  0.  0.  0.  0.  0.  1.  2.]
 [ 1.  1.  1.  0.  0.  1.  2.  1.  0.  0.  1.  1.  0.  1.  0.  2.]
 [ 1.  0.  1.  1.  0.  0.  0.  1.  1.  1.  0.  0.  0.  1.  1.  1.]
 [ 2.  2.  2.  0.  0.  0.  1.  1.  1.  1.  0.  0.  1.  0.  1.  1.]
 [ 0.  1.  1.  0.  0.  0.  1.  1.  1.  0.  0.  0.  1.  0.  2.  2.]
 [ 1.  0.  1.  0.  0.  2.  1.  1.  1.  0.  2.  2.  0.  2.  2.  2.]
 [ 0.  0.  0.  1.  1.  1.  0.  0.  0.  0.  0.  1.  1.  1.  0.  0.]
 [ 0.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  2.  1.  2.  2.]
 [ 1.  0.  0.  1.  1.  1.  0.  0.  0.  0.  1.  1.  1.  1.  0.  0.]]
[ 1.  1.  1.  0.  1.  1.  1.  0.  0.  0.]


{0.0: 0.4, 1.0: 0.6}
{0.0: [array([ 0.5,  0.5,  1. ]), array([ 1.,  0.,  1.]), array([ 0.66666667,  0.33333333,  1.        ]), array([ 0.,  1.,  1.]), array([ 0.33333333,  0.66666667,  1.        ])], 1.0: [array([ 0.4,  0.6,  1. ]), array([ 0.6,  0.4,  1. ]), array(

