In [152]:
import numpy as np
import math
from pandas import get_dummies, DataFrame
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score, KFold
from utils import load_data
from scipy.stats import norm

In [298]:
class NaiveBayes:

    def __init__(self):
        """
        Your initialization procedure if required
        """
        pass
    
    def create_dict(self, x):
        dct = dict()
        if type(x[0]) == float:
            dct['mean'], dct['sigma'] = self.estimate_mean_and_stdev(x)
        else:
            for unq_v in np.unique(x):
                dct[unq_v] = (x==unq_v).sum()
        return dct

    def fit(self, X, Y):
        """
        This method calculates class probabilities and conditional probabilities to be used for prediction

        Both numerical and categorical features are accepted.
        Conditional probability of numerical features is calculated based on Probability Density Function
        (assuming normal distribution)

        :param X: training data, numpy array of shape (n,m)
        :param Y: training labels, numpy array of shape (n,1)
        """
        # TODO START YOUR CODE HERE
        self.m_pos = Y[Y[:,0]=='+'].size
        self.m_neg = Y[Y[:,0]=='-'].size
        self.n = X.shape[1]
        self.pos_proba = (Y=='+').mean()
        self.neg_proba = 1-self.pos_proba
        self.pos_dicts = [self.create_dict(X[:,i][Y[:,0]=='+']) for i in range(X.shape[1])]
        self.neg_dicts = [self.create_dict(X[:,i][Y[:,0]=='-']) for i in range(X.shape[1])]
        # END YOUR CODE HERE

    @staticmethod
    def estimate_mean_and_stdev(values):
        """
        Estimates parameters of normal distribution - empirical mean and standard deviation
        :param values: attribute sample values
        :return: mean, stdev
        """
        # TODO START YOUR CODE HERE
        m = values.size
        mean = values.sum()/m
        std = ((values-mean).dot(values-mean)/m)**.5
        return mean, std
        # END YOUR CODE HERE

    @staticmethod
    def calc_probability(val, mean, stdev):
        """
        Estimates probability of encountering a point (val) given parameters of normal distribution
        based on probability density function
        :param val: point
        :param mean: mean value
        :param stdev: standard deviation
        :return: relative likelihood of a point
        """
        # TODO START YOUR CODE HERE
        return np.sqrt(2*np.pi*stdev**2)**-1*np.e**-((val-mean)**2/2/stdev**2)
        # END YOUR CODE HERE

    def predict(self, X):
        """
        Predict class labels for given input. Refer to lecture slides for corresponding formula
        :param X: test data, numpy array of shape (n,m)
        :return: numpy array of predictions
        """
        # TODO START YOUR CODE HERE
        probas = []
        alpha = 1
        for x in X:
            pos = sum([np.log((self.pos_dicts[i][x[i]]+alpha)/(self.m_pos+alpha*self.n)) if x[i] in self.pos_dicts[i] else np.log(alpha*(self.m_pos+alpha*self.n)**-1) if type(x[i])!=float else np.log(self.calc_probability(x[i], self.pos_dicts[i]['mean'], self.pos_dicts[i]['sigma'])) for i in range(x.size)])
            neg = sum([np.log((self.neg_dicts[i][x[i]]+alpha)/(self.m_neg+alpha*self.n)) if x[i] in self.neg_dicts[i] else np.log(alpha*(self.m_neg+alpha*self.n)**-1) if type(x[i])!=float else np.log(self.calc_probability(x[i], self.pos_dicts[i]['mean'], self.pos_dicts[i]['sigma'])) for i in range(x.size)])
            probas += '+' if pos>neg else'-'
        return np.array(probas)
        # END YOUR CODE HERE

    def get_params(self, deep = False):
        return {}

In [294]:
X, Y = load_data("crx.data.csv")
# indexes of numerical attributes
numerical_attrs = [1, 2, 7, 10, 13, 14]
X[:, numerical_attrs] = X[:, numerical_attrs].astype(float)

In [295]:
# categorical features only. Use this to test your initial implementation
X_cat = np.delete(X, numerical_attrs, 1)

In [299]:
clf = NaiveBayes()
clf.fit(X, Y)
Y_pred = clf.predict(X[[0],:])


In [302]:
scores = cross_val_score(NaiveBayes(), X_cat, Y, cv=KFold(n_splits=15, shuffle=True), scoring='accuracy')
print("Categorical Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Categorical Accuracy: 0.86 (+/- 0.09)


In [77]:
# use this as a benchmark. Your algorithm (on categorical features) should reach the same accuracy
X_dummy = DataFrame.as_matrix(get_dummies(DataFrame(X_cat)))
scores = cross_val_score(MultinomialNB(), X_dummy, Y.ravel(), cv=KFold(n_splits=15, shuffle=True), scoring='accuracy')
print("Categorical Accuracy of Standard NB: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Categorical Accuracy of Standard NB: 0.86 (+/- 0.09)


In [275]:
# all (mixed) features. Use this to test your final implementation
scores = cross_val_score(NaiveBayes(), X, Y, cv=KFold(n_splits=15, shuffle=True), scoring='accuracy')
print("Overall Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

# write your thoughts here (if any)

Overall Accuracy: 0.85 (+/- 0.12)
