# Results for Linear Discriminant Analysis

In [1]:
import os
import pickle
import numpy as np
from scipy import sparse
from sklearn.externals import joblib
from sklearn.cross_validation import KFold
from sklearn.metrics import confusion_matrix
from sklearn.cross_validation import train_test_split
from midi_ml.models.linear_decision_rules import LinearDiscriminantAnalysis

In [2]:
# Load the pickled corpus labels and feature matrix, then keep only the
# rows labelled "bach-js" or "mozart" as a two-class problem.
labels = joblib.load("/home/jovyan/persistent_data/data/dumps/labeled_corpus_labels.pkl")
features = joblib.load("/home/jovyan/persistent_data/data/dumps/labeled_corpus_matrix.pkl")
# presumably a scipy sparse matrix (it exposes .todense()) -- TODO confirm
features = features.todense()

# Row positions of each composer inside the full corpus.
bach_labels = [row for row, composer in enumerate(labels) if composer == "bach-js"]
mozart_labels = [row for row, composer in enumerate(labels) if composer == "mozart"]

# Bach rows first, Mozart rows second; .A yields a plain ndarray.
selected_rows = bach_labels + mozart_labels
X = features[selected_rows].A

# Targets follow the same row order as X: 1 for Bach, 0 for Mozart.
y = np.array([1] * len(bach_labels) + [0] * len(mozart_labels))
y = y.reshape(-1)

# The full dense corpus is no longer needed once X has been extracted.
del features

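As an example, plot the single dimension that the LDA projects onto. (Note: the cells below do not produce that plot; they run 5-fold cross-validation of the LDA classifier for several regularization strengths and pickle the per-fold predictions.)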

In [None]:
# 5-fold cross-validation of the LDA classifier at one regularization strength.
# After every fold the running `predicted` / `actuals` lists are pickled, so
# the file written for fold k holds the results of folds 0..k.
regularization_parameter = 0.01
dump_folder = "./dumps/lda/"
# Make sure the output folder exists before the first dump.
os.makedirs(dump_folder, exist_ok=True)
predicted = []
actuals = []
# A fixed random_state makes the shuffled fold assignment reproducible.
for i, (train_idx, test_idx) in enumerate(
        KFold(n=X.shape[0], n_folds=5, shuffle=True, random_state=0)):
    X_train = X[train_idx]
    X_test = X[test_idx]
    y_train = y[train_idx]
    y_test = y[test_idx]
    # The project LDA class takes the training data in its constructor and
    # is fitted with a separate, argument-less fit() call.
    lda = LinearDiscriminantAnalysis(X_train, y_train, regularization=regularization_parameter, keep_copy_of_X=False)
    lda.fit()
    preds = lda.predict(X_test)
    predicted.append(preds)
    actuals.append(y_test)
    # Drop the fitted model before the next fold builds a new one.
    del lda
    # `with` closes each file handle; a bare open() inside pickle.dump leaks it.
    predicted_path = dump_folder + "predicted_fold_{fold}_reg_{reg}.pkl".format(
        fold=str(i), reg=str(regularization_parameter))
    with open(predicted_path, 'wb') as predicted_file:
        pickle.dump(predicted, predicted_file)
    actuals_path = dump_folder + "actuals_fold_{fold}_reg_{reg}.pkl".format(
        fold=str(i), reg=str(regularization_parameter))
    with open(actuals_path, 'wb') as actuals_file:
        pickle.dump(actuals, actuals_file)

In [None]:
# 5-fold cross-validation of the LDA classifier at one regularization strength.
# After every fold the running `predicted` / `actuals` lists are pickled, so
# the file written for fold k holds the results of folds 0..k.
regularization_parameter = 0.1
dump_folder = "./dumps/lda/"
# Make sure the output folder exists before the first dump.
os.makedirs(dump_folder, exist_ok=True)
predicted = []
actuals = []
# A fixed random_state makes the shuffled fold assignment reproducible.
for i, (train_idx, test_idx) in enumerate(
        KFold(n=X.shape[0], n_folds=5, shuffle=True, random_state=0)):
    X_train = X[train_idx]
    X_test = X[test_idx]
    y_train = y[train_idx]
    y_test = y[test_idx]
    # The project LDA class takes the training data in its constructor and
    # is fitted with a separate, argument-less fit() call.
    lda = LinearDiscriminantAnalysis(X_train, y_train, regularization=regularization_parameter, keep_copy_of_X=False)
    lda.fit()
    preds = lda.predict(X_test)
    predicted.append(preds)
    actuals.append(y_test)
    # Drop the fitted model before the next fold builds a new one.
    del lda
    # `with` closes each file handle; a bare open() inside pickle.dump leaks it.
    predicted_path = dump_folder + "predicted_fold_{fold}_reg_{reg}.pkl".format(
        fold=str(i), reg=str(regularization_parameter))
    with open(predicted_path, 'wb') as predicted_file:
        pickle.dump(predicted, predicted_file)
    actuals_path = dump_folder + "actuals_fold_{fold}_reg_{reg}.pkl".format(
        fold=str(i), reg=str(regularization_parameter))
    with open(actuals_path, 'wb') as actuals_file:
        pickle.dump(actuals, actuals_file)

In [None]:
# 5-fold cross-validation of the LDA classifier at one regularization strength.
# After every fold the running `predicted` / `actuals` lists are pickled, so
# the file written for fold k holds the results of folds 0..k.
regularization_parameter = 0.3
dump_folder = "./dumps/lda/"
# Make sure the output folder exists before the first dump.
os.makedirs(dump_folder, exist_ok=True)
predicted = []
actuals = []
# A fixed random_state makes the shuffled fold assignment reproducible.
for i, (train_idx, test_idx) in enumerate(
        KFold(n=X.shape[0], n_folds=5, shuffle=True, random_state=0)):
    X_train = X[train_idx]
    X_test = X[test_idx]
    y_train = y[train_idx]
    y_test = y[test_idx]
    # The project LDA class takes the training data in its constructor and
    # is fitted with a separate, argument-less fit() call.
    lda = LinearDiscriminantAnalysis(X_train, y_train, regularization=regularization_parameter, keep_copy_of_X=False)
    lda.fit()
    preds = lda.predict(X_test)
    predicted.append(preds)
    actuals.append(y_test)
    # Drop the fitted model before the next fold builds a new one.
    del lda
    # `with` closes each file handle; a bare open() inside pickle.dump leaks it.
    predicted_path = dump_folder + "predicted_fold_{fold}_reg_{reg}.pkl".format(
        fold=str(i), reg=str(regularization_parameter))
    with open(predicted_path, 'wb') as predicted_file:
        pickle.dump(predicted, predicted_file)
    actuals_path = dump_folder + "actuals_fold_{fold}_reg_{reg}.pkl".format(
        fold=str(i), reg=str(regularization_parameter))
    with open(actuals_path, 'wb') as actuals_file:
        pickle.dump(actuals, actuals_file)

In [36]:
# 5-fold cross-validation of the LDA classifier at one regularization strength.
# After every fold the running `predicted` / `actuals` lists are pickled, so
# the file written for fold k holds the results of folds 0..k.
regularization_parameter = 0.001
dump_folder = "./dumps/lda/"
# Make sure the output folder exists before the first dump.
os.makedirs(dump_folder, exist_ok=True)
predicted = []
actuals = []
# A fixed random_state makes the shuffled fold assignment reproducible.
for i, (train_idx, test_idx) in enumerate(
        KFold(n=X.shape[0], n_folds=5, shuffle=True, random_state=0)):
    X_train = X[train_idx]
    X_test = X[test_idx]
    y_train = y[train_idx]
    y_test = y[test_idx]
    # The project LDA class takes the training data in its constructor and
    # is fitted with a separate, argument-less fit() call.
    lda = LinearDiscriminantAnalysis(X_train, y_train, regularization=regularization_parameter, keep_copy_of_X=False)
    lda.fit()
    preds = lda.predict(X_test)
    predicted.append(preds)
    actuals.append(y_test)
    # Drop the fitted model before the next fold builds a new one.
    del lda
    # `with` closes each file handle; a bare open() inside pickle.dump leaks it.
    predicted_path = dump_folder + "predicted_fold_{fold}_reg_{reg}.pkl".format(
        fold=str(i), reg=str(regularization_parameter))
    with open(predicted_path, 'wb') as predicted_file:
        pickle.dump(predicted, predicted_file)
    actuals_path = dump_folder + "actuals_fold_{fold}_reg_{reg}.pkl".format(
        fold=str(i), reg=str(regularization_parameter))
    with open(actuals_path, 'wb') as actuals_file:
        pickle.dump(actuals, actuals_file)

# Results

In [14]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, matthews_corrcoef

In [39]:
# Regularization strengths whose pickled cross-validation results are reported.
# NOTE(review): no visible cell produces dumps for 0.5 -- confirm they exist.
regularizations = [
    0.001,
    0.01,
    0.1,
    0.3,
    0.5,
]

In [40]:
# Folder holding the pickled per-fold predictions and actuals for the LDA runs.
model_folder = "./dumps/lda/"

In [41]:
# Per-regularization lists of per-fold scores, filled in by the loop below.
accuracy = {reg: [] for reg in regularizations}
precision = {reg: [] for reg in regularizations}
recall = {reg: [] for reg in regularizations}
f1 = {reg: [] for reg in regularizations}
mcc = {reg: [] for reg in regularizations}

# (printed label, scoring function, destination dict), in reporting order.
metric_table = [
    ("accuracy", accuracy_score, accuracy),
    ("f1", f1_score, f1),
    ("recall", recall_score, recall),
    ("precision", precision_score, precision),
    ("mcc", matthews_corrcoef, mcc),
]

for reg in regularizations:
    print("Regularization: %s" % reg)
    # Only the fold-4 dumps are read: they were written after the last fold
    # and therefore contain the accumulated results of all five folds.
    # `with` closes each handle; a bare open() inside pickle.load leaks it.
    with open(model_folder + "predicted_fold_4_reg_%s.pkl" % reg, 'rb') as predicted_file:
        predicted = pickle.load(predicted_file)
    with open(model_folder + "actuals_fold_4_reg_%s.pkl" % reg, 'rb') as actuals_file:
        actuals = pickle.load(actuals_file)
    for pred, actual in zip(predicted, actuals):
        print("confusion matrix")
        print(confusion_matrix(actual, pred))
        for metric_name, scorer, scores_by_reg in metric_table:
            print("\t%s" % metric_name)
            score = scorer(actual, pred)
            scores_by_reg[reg].append(score)
            print("\t\t", score)


    print("\n\n\n\n")

Regularization: 0.001
confusion matrix
[[ 67  75]
 [ 16 448]]
	accuracy
		 0.849834983498
	f1
		 0.90780141844
	recall
		 0.965517241379
	precision
		 0.856596558317
	mcc
		 0.538816197119
confusion matrix
[[ 78  61]
 [ 16 451]]
	accuracy
		 0.872937293729
	f1
		 0.921348314607
	recall
		 0.96573875803
	precision
		 0.880859375
	mcc
		 0.611908878937
confusion matrix
[[ 75  65]
 [ 17 448]]
	accuracy
		 0.864462809917
	f1
		 0.916155419223
	recall
		 0.963440860215
	precision
		 0.873294346979
	mcc
		 0.586237304508
confusion matrix
[[ 91  60]
 [ 13 441]]
	accuracy
		 0.879338842975
	f1
		 0.923560209424
	recall
		 0.971365638767
	precision
		 0.880239520958
	mcc
		 0.658421252548
confusion matrix
[[ 80  59]
 [ 15 451]]
	accuracy
		 0.877685950413
	f1
		 0.924180327869
	recall
		 0.967811158798
	precision
		 0.88431372549
	mcc
		 0.628251769351





Regularization: 0.01
confusion matrix
[[ 76  59]
 [ 12 459]]
	accuracy
		 0.882838283828
	f1
		 0.928210313448
	recall
		 0.974522292994
	p

In [53]:
# Smallest, average and largest per-fold MCC for one regularization strength.
reg = 0.001
for summarize in (np.min, np.mean, np.max):
    print(summarize(mcc[reg]))

0.538816197119
0.604727080493
0.658421252548


In [60]:
# Smallest, average and largest per-fold accuracy for one regularization strength.
reg = 0.3
for summarize in (np.min, np.mean, np.max):
    print(summarize(accuracy[reg]))

0.852892561983
0.870161197938
0.887788778878


In [46]:
# Smallest, average and largest per-fold accuracy for one regularization strength.
reg = 0.1
for summarize in (np.min, np.mean, np.max):
    print(summarize(accuracy[reg]))

0.876033057851
0.887018520034
0.907438016529
