# Results for Gaussian Naive Bayes

In [1]:
%matplotlib inline

In [2]:
from matplotlib import pyplot as plt

In [27]:
import os
import pickle
import numpy as np
from scipy import sparse
from functools import partial
from sklearn.externals import joblib
from sklearn.cross_validation import KFold
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
# from midi_ml.models.linear_decision_rules import NaiveBayesClassifier

In [4]:
# Load the composer label for every piece and the matching feature matrix,
# densifying the matrix right away via todense().
labels = joblib.load("/home/jovyan/persistent_data/data/dumps/labeled_corpus_labels.pkl")
features = joblib.load("/home/jovyan/persistent_data/data/dumps/labeled_corpus_matrix.pkl").todense()

# Row positions of the two composers kept for the binary task.
bach_labels = [row for row, composer in enumerate(labels) if composer == "bach-js"]
mozart_labels = [row for row, composer in enumerate(labels) if composer == "mozart"]

# Bach rows first, then Mozart rows; `.A` turns the dense matrix into a plain ndarray.
X = features[bach_labels + mozart_labels].A
# Target vector in the same order: 1 = Bach, 0 = Mozart.
y = np.array([1] * len(bach_labels) + [0] * len(mozart_labels))
y = y.reshape(-1)
# The full dense matrix is no longer needed once X has been extracted.
del features

In [5]:
# Dimensions (rows, columns) of the Bach/Mozart feature matrix built above.
X.shape

(3027, 16384)

In [28]:
class NaiveBayesClassifier(object):
    """
    Classifiers of the Naive Bayes family. All input features are assumed to be drawn from
     a distribution of the same parametric form
     http://people.csail.mit.edu/jrennie/papers/icml03-nb.pdf

    Typical use: construct with the training data, call ``fit()``, then ``predict()``.
    """

    def __init__(self,
                 X: np.ndarray,
                 y: np.ndarray,
                 parametric_form: str = "multinomial",
                 keep_copy_of_X: bool = True,
                 smoothing: int = 1):
        """
        :param X: (N * M)-dimensional array containing the input data in matrix form
        :param y: length-N array containing the target variable (e.g. encoded as 0 and 1); labels must be sortable
        :param parametric_form: the parametric form that the features are assumed to take (used to define PDF)
        :param keep_copy_of_X: Whether to keep a copy of X in memory (to be used for making predictions on the training set)
        :param smoothing: Smoothing parameter for dirichlet prior in multinomial model (set to None if "bernoulli" or "gaussian" is chosen)
        :raises ValueError: if ``parametric_form`` is not one of the supported families
        """
        parametric_forms = ["bernoulli", "multinomial", "gaussian"]
        if parametric_form not in parametric_forms:
            raise ValueError("Please select a distribution in %s" % str(parametric_forms))
        self.X = X
        self.y = y
        self.classes_ = set(self.y)
        self.parametric_form_ = parametric_form
        self.keep_copy_of_X = keep_copy_of_X
        # Compare by value: identity (`is`) only works by accident for interned literals
        # and silently drops the smoothing for strings built at runtime.
        if parametric_form == "multinomial":
            self.smoothing_ = smoothing
        else:
            self.smoothing_ = None
        self.num_records_ = None  # type: int
        self.X_given_class_ = {}        # class label -> rows of X belonging to that class
        self.thetas_ = {}               # class label -> log feature probabilities (bernoulli / multinomial)
        self.priors_ = {}               # class label -> fraction of training rows in that class
        self.log_pdf_given_class_ = {}  # class label -> callable returning per-feature log densities (gaussian)

    @staticmethod
    def log_gaussian_pdf(x: float, mu: float, sigma: float) -> np.ndarray:
        """
        Log of the Gaussian probability density function
        :param x: value (or np.array of values) at which we compute the relative log-likelihood of drawing that point
        :param mu: mean of the Gaussian
        :param sigma: standard deviation of the Gaussian
        :return: Array with the log-probability of each x
        """
        return np.array(np.log(1. / np.sqrt(2 * np.pi * sigma ** 2)) - (x - mu) ** 2 / (2 * sigma ** 2))

    def _get_class_conditional_data(self):
        """
        Separate the data by class and estimate the class priors from the class frequencies.
        Drops the reference to the full X afterwards when ``keep_copy_of_X`` is False.
        :return:
        """
        self.num_records_ = self.X.shape[0]
        for c in self.classes_:
            self.X_given_class_[c] = self.X[np.where(self.y == c)]
            self.priors_[c] = float(self.X_given_class_[c].shape[0]) / self.num_records_
        if not self.keep_copy_of_X:
            self.X = None

    def _make_predictions(self, X: np.ndarray = None):
        """
        Predict the class of the input values X
        :param X: matrix to make predictions with
        :return: array with the predicted class label of every row of X
        :raises ValueError: if the configured parametric form is not supported
        """
        # Give every class its own score column instead of using the label itself as the
        # column index, so labels do not have to be exactly 0..K-1.
        ordered_classes = sorted(self.classes_)
        scores = np.zeros((X.shape[0], len(ordered_classes)))
        for column, c in enumerate(ordered_classes):
            if self.parametric_form_ in ("multinomial", "bernoulli"):
                # NOTE(review): for "bernoulli" only the x * log(p) term is used here; the
                # (1 - x) * log(1 - p) term is not included - confirm this is intended.
                class_conditional_log_probabilities = np.dot(X, self.thetas_[c])
            elif self.parametric_form_ == "gaussian":
                # A feature with zero variance inside a class yields NaN log-densities;
                # nan_to_num maps those to 0, so that feature does not contribute for that class.
                class_conditional_log_probabilities = np.nan_to_num(self.log_pdf_given_class_[c](X)).sum(axis=1)
            else:
                raise ValueError("Must select proper feature family to make predictions")
            # we add the log of the prior probability of the class as an "intercept"
            scores[:, column] = class_conditional_log_probabilities + np.log(self.priors_[c])
        # Map the winning column back to its class label (identity for 0/1 labels).
        return np.asarray(ordered_classes)[scores.argmax(axis=1)]

    def predict(self, new_X: np.ndarray = None) -> np.ndarray:
        """
        Exposed API to make predictions
        :param new_X: New set of X values to make predictions with (optional)
        :return: array with the predicted class label of every row
        :raises ValueError: if no new_X is given and the training X was not kept
        """
        if new_X is None:
            if not self.keep_copy_of_X:
                raise ValueError("Must keep copy of X in order to make predictions")
            return self._make_predictions(self.X)
        else:
            return self._make_predictions(new_X)

    def _train_bernoulli_model(self):
        """
        Train a Bernoulli Naive Bayes
        :return:
        """
        # We use the log of the probability that a document is drawn from this parametric
        # form of the distribution to ease the computation (by avoiding multiplying very small numbers)
        for c in self.classes_:
            class_size = self.X_given_class_[c].shape[0]
            feature_counts = self.X_given_class_[c].sum(axis=0)
            # add-one smoothing: (count + 1) / (class_size + 2)
            self.thetas_[c] = np.log(feature_counts + 1) - np.log(class_size + 2)

    def _train_multinomial_model(self):
        """
        Train a Multinomial Naive Bayes
        :return:
        """
        # We use the log of the probability that a document is drawn from this parametric
        # form of the distribution to ease the computation (by avoiding multiplying very small numbers)
        for c in self.classes_:
            # get values of n
            feature_sums = self.X_given_class_[c].sum(axis=0)
            # the total smoothing mass is spread evenly over the features
            alpha_i = float(self.smoothing_) / self.X_given_class_[c].shape[1]
            alpha = self.smoothing_
            self.thetas_[c] = np.log(feature_sums + alpha_i) - np.log(feature_sums.sum() + alpha)

    # TODO: in high dimensions we'll be storing 10s of thousands of functions which could be inefficient
    def _train_gaussian_model(self):
        """
        Train a Gaussian Naive Bayes: per class, store a log-density function parameterised
        by the per-feature mean and standard deviation of that class' rows.
        :return:
        """
        for c in self.classes_:
            means = self.X_given_class_[c].mean(axis=0)
            variances = self.X_given_class_[c].var(axis=0)
            self.log_pdf_given_class_[c] = partial(self.log_gaussian_pdf,
                                                   mu=means,
                                                   sigma=np.sqrt(variances))

    def _get_parametric_probability_estimates(self, parametric_form: str):
        """
        Estimate the parameters of the parametric probability distributions
        :param parametric_form: the form we assume for estimating PDFs/PMFs
        :return:
        :raises ValueError: if ``parametric_form`` is not a supported family
        """
        if parametric_form == "bernoulli":
            self._train_bernoulli_model()
        elif parametric_form == "multinomial":
            self._train_multinomial_model()
        elif parametric_form == "gaussian":
            self._train_gaussian_model()
        else:
            raise ValueError("Please select a valid family of probability distributions")

    def fit(self):
        """
        Exposed API for training model: splits the data by class, then estimates the
        parameters of the configured distribution family.
        :return:
        """
        self._get_class_conditional_data()
        self._get_parametric_probability_estimates(parametric_form=self.parametric_form_)


# Train Models

In [29]:
# 5-fold cross-validation of the Gaussian Naive Bayes model.
# The three lists accumulate across folds and are re-dumped after every fold,
# so the files written for the last fold contain the results of all folds.
predicted = []
actuals = []
nb_models = []
dump_dir = "./dumps/gaussian_nb/"
# Make sure the target directory exists so a trained fold is not lost on dump.
os.makedirs(dump_dir, exist_ok=True)
for fold, (train_idx, test_idx) in enumerate(KFold(n=X.shape[0], n_folds=5, shuffle=True)):
    X_train = X[train_idx]
    X_test = X[test_idx]
    y_train = y[train_idx]
    y_test = y[test_idx]
    nb = NaiveBayesClassifier(X_train, y_train,
                              parametric_form="gaussian", keep_copy_of_X=False)

    nb.fit()

    preds = nb.predict(X_test)
    predicted.append(preds)
    actuals.append(y_test)
    nb_models.append(nb)
    for dump_name, payload in (("nb_model", nb_models),
                               ("predicted", predicted),
                               ("actuals", actuals)):
        dump_path = "{folder}{name}_fold_{fold}.pkl".format(folder=dump_dir,
                                                            name=dump_name,
                                                            fold=str(fold))
        # Context manager closes (and flushes) the file even if pickling fails.
        with open(dump_path, 'wb') as dump_file:
            pickle.dump(payload, dump_file)



In [31]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [32]:
# Folder the cross-validation cell wrote its pickles to; list it to see which dumps exist.
model_folder = "./dumps/gaussian_nb/"
os.listdir(model_folder)

['predicted_fold_0.pkl',
 'actuals_fold_2.pkl',
 'predicted_fold_1.pkl',
 'nb_model_fold_1.pkl',
 'actuals_fold_4.pkl',
 'predicted_fold_3.pkl',
 'predicted_fold_4.pkl',
 'predicted_fold_2.pkl',
 'actuals_fold_0.pkl',
 'nb_model_fold_2.pkl',
 'nb_model_fold_4.pkl',
 'actuals_fold_3.pkl',
 'nb_model_fold_3.pkl',
 'actuals_fold_1.pkl',
 'nb_model_fold_0.pkl']

In [33]:
# The fold-4 dumps were written last and hold the accumulated per-fold lists.
# Use context managers so the file handles are closed after loading.
with open(model_folder + "predicted_fold_4.pkl", 'rb') as dump_file:
    predicted = pickle.load(dump_file)
with open(model_folder + "actuals_fold_4.pkl", 'rb') as dump_file:
    actuals = pickle.load(dump_file)

In [34]:
# Heading / scorer pairs, in the order they are reported for each fold.
fold_report = (
    ("\taccuracy", accuracy_score),
    ("\tf1", f1_score),
    ("\trecall", recall_score),
    ("\tprecision", precision_score),
)
for pred, actual in zip(predicted, actuals):
    # One report per fold: confusion matrix first, then each score under its heading.
    print("confusion matrix")
    print(confusion_matrix(actual, pred))
    for heading, scorer in fold_report:
        print(heading)
        print("\t\t", scorer(actual, pred))

confusion matrix
[[ 41  87]
 [ 19 459]]
	accuracy
		 0.825082508251
	f1
		 0.896484375
	recall
		 0.960251046025
	precision
		 0.840659340659
confusion matrix
[[ 47  96]
 [ 18 445]]
	accuracy
		 0.811881188119
	f1
		 0.886454183267
	recall
		 0.961123110151
	precision
		 0.822550831793
confusion matrix
[[ 37 105]
 [  9 454]]
	accuracy
		 0.811570247934
	f1
		 0.888454011742
	recall
		 0.980561555076
	precision
		 0.812164579606
confusion matrix
[[ 50 108]
 [ 13 434]]
	accuracy
		 0.8
	f1
		 0.877654196158
	recall
		 0.970917225951
	precision
		 0.80073800738
confusion matrix
[[ 45  95]
 [ 27 438]]
	accuracy
		 0.798347107438
	f1
		 0.877755511022
	recall
		 0.941935483871
	precision
		 0.821763602251


In [42]:
from sklearn.metrics import matthews_corrcoef

In [43]:
# Per-fold score histories; later cells summarise `accuracy` and `mcc`.
accuracy = []
precision = []
recall = []
f1 = []
mcc = []
# Reload the accumulated per-fold results, closing the files once read.
with open(model_folder + "predicted_fold_4.pkl", 'rb') as dump_file:
    predicted = pickle.load(dump_file)
with open(model_folder + "actuals_fold_4.pkl", 'rb') as dump_file:
    actuals = pickle.load(dump_file)

# (heading, scorer, history list) in the order the scores are reported.
metric_table = (
    ("\taccuracy", accuracy_score, accuracy),
    ("\tf1", f1_score, f1),
    ("\trecall", recall_score, recall),
    ("\tprecision", precision_score, precision),
    ("\tmcc", matthews_corrcoef, mcc),
)
for pred, actual in zip(predicted, actuals):
    print("confusion matrix")
    print(confusion_matrix(actual, pred))
    for heading, scorer, history in metric_table:
        print(heading)
        score = scorer(actual, pred)
        history.append(score)
        print("\t\t", score)

confusion matrix
[[ 41  87]
 [ 19 459]]
	accuracy
		 0.825082508251
	f1
		 0.896484375
	recall
		 0.960251046025
	precision
		 0.840659340659
	mcc
		 0.383423059696
confusion matrix
[[ 47  96]
 [ 18 445]]
	accuracy
		 0.811881188119
	f1
		 0.886454183267
	recall
		 0.961123110151
	precision
		 0.822550831793
	mcc
		 0.397642916482
confusion matrix
[[ 37 105]
 [  9 454]]
	accuracy
		 0.811570247934
	f1
		 0.888454011742
	recall
		 0.980561555076
	precision
		 0.812164579606
	mcc
		 0.38556000048
confusion matrix
[[ 50 108]
 [ 13 434]]
	accuracy
		 0.8
	f1
		 0.877654196158
	recall
		 0.970917225951
	precision
		 0.80073800738
	mcc
		 0.413293172757
confusion matrix
[[ 45  95]
 [ 27 438]]
	accuracy
		 0.798347107438
	f1
		 0.877755511022
	recall
		 0.941935483871
	precision
		 0.821763602251
	mcc
		 0.343018183263


In [37]:
# Average of the per-fold accuracy scores collected in `accuracy`.
np.mean(accuracy)

0.80937621034830765

In [44]:
# Spread of the Matthews correlation coefficient across folds: min, mean, max.
for summarize in (np.min, np.mean, np.max):
    print(summarize(mcc))

0.343018183263
0.384587466536
0.413293172757


In [45]:
# Spread of the accuracy across folds: min, mean, max.
for summarize in (np.min, np.mean, np.max):
    print(summarize(accuracy))

0.798347107438
0.809376210348
0.825082508251
