# Results for Multinomial Naive Bayes

In [1]:
%matplotlib inline

In [2]:
from matplotlib import pyplot as plt

In [3]:
import os
import pickle

import numpy as np
from scipy import sparse
from sklearn.externals import joblib
from sklearn.cross_validation import KFold
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import (accuracy_score, f1_score, matthews_corrcoef,
                             precision_score, recall_score)
from midi_ml.models.linear_decision_rules import NaiveBayesClassifier

In [4]:
# Load the labelled corpus and keep only the Bach and Mozart pieces.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load dumps you trust.
labels = joblib.load("/home/jovyan/persistent_data/data/dumps/labeled_corpus_labels.pkl")
features = joblib.load("/home/jovyan/persistent_data/data/dumps/labeled_corpus_matrix.pkl")

# Row positions of each composer's pieces within the corpus.
bach_labels = [k for k, label in enumerate(labels) if label == "bach-js"]
mozart_labels = [k for k, label in enumerate(labels) if label == "mozart"]

# Select the rows while the matrix is still sparse and densify only that
# selection, instead of materialising the whole corpus as a dense matrix first.
X = features.tocsr()[bach_labels + mozart_labels].toarray()

# Binary target aligned with the row order of X: 1 = Bach, 0 = Mozart.
y = np.array([1] * len(bach_labels) + [0] * len(mozart_labels))

# The full corpus matrix is no longer needed; release it.
del features

In [11]:
# Dimensions of the dense feature matrix X.
X.shape

(3027, 16384)

# Train Models

In [5]:
# 5-fold cross-validation of the multinomial Naive Bayes classifier.
#
# After every fold that fits successfully, the *cumulative* lists of models,
# predictions and ground-truth labels are pickled. The files written for the
# last fold therefore contain the results of every fold fitted so far.
dump_template = "./dumps/multinomial_nb/{stem}_fold_{fold}.pkl"
os.makedirs("./dumps/multinomial_nb/", exist_ok=True)  # dumps below need the folder to exist

predicted = []
actuals = []
nb_models = []
i = 0  # number of folds fitted so far; used in the dump file names
# A fixed random_state makes the shuffled split identical on every run.
for train_idx, test_idx in KFold(n=X.shape[0], n_folds=5, shuffle=True, random_state=0):
    X_train = X[train_idx]
    X_test = X[test_idx]
    y_train = y[train_idx]
    y_test = y[test_idx]
    nb = NaiveBayesClassifier(X_train, y_train,
                              parametric_form="multinomial", keep_copy_of_X=False)
    try:
        nb.fit()
    except Exception as exc:
        # Best effort: a fold that cannot be fitted is skipped, but the reason
        # is reported instead of being silently discarded. Unlike a bare
        # `except`, this lets KeyboardInterrupt stop the run.
        print("Skipping fold: NaiveBayesClassifier.fit() raised {!r}".format(exc))
        continue

    preds = nb.predict(X_test)
    predicted.append(preds)
    actuals.append(y_test)
    nb_models.append(nb)

    # Persist the cumulative state; `with` guarantees each file is flushed and closed.
    for stem, payload in (("nb_model", nb_models),
                          ("predicted", predicted),
                          ("actuals", actuals)):
        with open(dump_template.format(stem=stem, fold=str(i)), 'wb') as fh:
            pickle.dump(payload, fh)
    i += 1

In [7]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

In [8]:
# Folder that holds the pickled dumps for this model.
model_folder = "./dumps/multinomial_nb/"
# Show the folder's contents to check which dump files are present.
os.listdir(model_folder)

['predicted_fold_0.pkl',
 'actuals_fold_2.pkl',
 'predicted_fold_1.pkl',
 'nb_model_fold_1.pkl',
 'actuals_fold_4.pkl',
 'predicted_fold_3.pkl',
 'predicted_fold_4.pkl',
 'predicted_fold_2.pkl',
 'actuals_fold_0.pkl',
 'nb_model_fold_2.pkl',
 'nb_model_fold_4.pkl',
 'actuals_fold_3.pkl',
 'nb_model_fold_3.pkl',
 'actuals_fold_1.pkl',
 'nb_model_fold_0.pkl']

In [9]:
# Reload the predictions and ground-truth labels from the "fold_4" dump files.
# NOTE(review): pickle.load executes arbitrary code -- only load files you trust.
# `with` closes each file handle as soon as its contents are read.
with open(model_folder + "predicted_fold_4.pkl", 'rb') as fh:
    predicted = pickle.load(fh)
with open(model_folder + "actuals_fold_4.pkl", 'rb') as fh:
    actuals = pickle.load(fh)

In [19]:
# Per-fold evaluation: print the confusion matrix and a set of scores for every
# (predictions, ground truth) pair, and collect each score in its own list so
# later cells can summarise them.
accuracy = []
precision = []
recall = []
f1 = []
mcc = []

# NOTE(review): pickle.load executes arbitrary code -- only load files you trust.
with open(model_folder + "predicted_fold_4.pkl", 'rb') as fh:
    predicted = pickle.load(fh)
with open(model_folder + "actuals_fold_4.pkl", 'rb') as fh:
    actuals = pickle.load(fh)

# (printed label, scoring function, list receiving the per-fold value),
# listed in the order in which the scores are reported.
metric_table = [
    ("accuracy", accuracy_score, accuracy),
    ("f1", f1_score, f1),
    ("recall", recall_score, recall),
    ("precision", precision_score, precision),
    ("mcc", matthews_corrcoef, mcc),
]

for pred, actual in zip(predicted, actuals):
    print("confusion matrix")
    print(confusion_matrix(actual, pred))
    for label, score_fn, history in metric_table:
        print("\t" + label)
        value = score_fn(actual, pred)
        history.append(value)
        print("\t\t", value)

confusion matrix
[[100  44]
 [ 78 384]]
	accuracy
		 0.798679867987
	f1
		 0.862921348315
	recall
		 0.831168831169
	precision
		 0.897196261682
	mcc
		 0.491175475206
confusion matrix
[[108  35]
 [ 95 368]]
	accuracy
		 0.785478547855
	f1
		 0.849884526559
	recall
		 0.794816414687
	precision
		 0.913151364764
	mcc
		 0.494844742462
confusion matrix
[[ 80  48]
 [ 81 396]]
	accuracy
		 0.786776859504
	f1
		 0.85993485342
	recall
		 0.830188679245
	precision
		 0.891891891892
	mcc
		 0.420679359178
confusion matrix
[[114  33]
 [ 81 377]]
	accuracy
		 0.811570247934
	f1
		 0.86866359447
	recall
		 0.823144104803
	precision
		 0.919512195122
	mcc
		 0.549361582055
confusion matrix
[[104  45]
 [ 72 384]]
	accuracy
		 0.806611570248
	f1
		 0.867796610169
	recall
		 0.842105263158
	precision
		 0.895104895105
	mcc
		 0.512339914643


In [18]:
from sklearn.metrics import matthews_corrcoef

In [13]:
# Average of the accuracy values collected for each fold.
np.mean(accuracy)

0.79782341870550688

In [23]:
# Spread of the per-fold Matthews correlation coefficient: lowest, mean, highest.
for summarise in (np.min, np.mean, np.max):
    print(summarise(mcc))

0.420679359178
0.493680214709
0.549361582055


In [24]:
# Spread of the per-fold accuracy: lowest, mean, highest.
for summarise in (np.min, np.mean, np.max):
    print(summarise(accuracy))

0.785478547855
0.797823418706
0.811570247934
