In [None]:
""" usage:
        a4.py train TRAIN_FEATURE_FILE [--new] [--validate]
        a4.py classify MUSIC_FEATURE_FILE
    The files should be CSV with 6 columns, the last of which is the target/label/class (or empty, if classifying), and the first of which is ignored.
"""
#-------------------------------------------------------------------------------
# Name: Pat Kujawa
# Purpose: MM audio classification asn 4
#-------------------------------------------------------------------------------

from __future__ import division
import os, sys
import docopt

import numpy as np
import cPickle as pickle

from sklearn.naive_bayes import GaussianNB
from sklearn import cross_validation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.cross_validation import cross_val_score


# File the trained classifier is pickled to by train() and loaded from by
# train() and classify(). It is a relative path, so it is resolved against the
# current working directory.
picklePath = r"classifier.pickle"
# Display names of the two classes, indexed by the boolean "is music" label.
target_names = ['speech', 'music']  # false, true


def preProcess(csvFile, classifying=False):
    """Load features, file names and labels from a comma-separated file.

    Column 0 is read as the file name, columns 1-4 as numeric features and
    column 5 as the boolean IsMusic label.

        :param csvFile: the CSV source handed to ``np.genfromtxt``
        :param classifying: when true the label column is not read
        :returns: ``(data, names, targets)``; ``targets`` is ``None`` when
            ``classifying`` is true, otherwise a bool array (True = music)
    """
    def read_columns(columns, **options):
        # Each call parses the source again, keeping only the requested columns.
        return np.genfromtxt(csvFile, delimiter=',', usecols=columns, **options)

    names = read_columns(0, dtype=str)
    data = read_columns((1, 2, 3, 4))
    targets = None if classifying else read_columns(5, dtype=bool)  # bool ismusic
    return data, names, targets


def _fit_and_report_holdout(classifier, data, names, targets):
    """Fit ``classifier`` on a random 2/3 split and print hold-out and leave-one-out results."""
    # t_ means target, as in expected/desired classification
    x_train, x_test, t_train, t_test, trainFiles, testFiles = \
            cross_validation.train_test_split(data, targets, names, test_size=0.33)

    # show which files are used for train/test
    print('Training files: ' + ','.join(trainFiles))
    print('Test files: ' + ','.join(testFiles))

    classifier.fit(x_train, t_train)  # train

    print('Prior probabilities (n={}):'.format(len(trainFiles)))
    for cls, prob in zip(classifier.classes_, classifier.class_prior_):
        # classes_ holds the bool labels; convert explicitly before indexing the list
        print('%s %s' % (target_names[int(cls)], prob))

    print("Accuracy for 2/3 training, 1/3 test:")
    print(classifier.score(x_test, t_test))  # test

    predicted = classifier.predict(x_test)

    # scikit-learn's metrics take (y_true, y_pred) in that order: rows of the
    # matrix are the true classes, columns the predicted ones.
    print("Confusion matrix for 2/3 training, 1/3 test:")
    print(confusion_matrix(t_test, predicted))

    print('Classification report for 2/3 training, 1/3 test:')
    print(classification_report(t_test, predicted, target_names=target_names))

    print('leave one out cv')
    # cross validation with leave one out
    # http://stackoverflow.com/questions/17499068/train-scikit-svm-customize-score-assessment
    scores = cross_val_score(classifier, data, targets,
            cv=cross_validation.LeaveOneOut(len(targets)))
    print('%s %s / %s = %s' % (scores, np.sum(scores), len(scores), np.mean(scores)))


def train(data, names, targets, startNew=False, cv=False):
    """Train a Gaussian naive Bayes classifier and serialize it to ``picklePath``.

    Without ``cv`` the classifier is fitted on all of ``data``. With ``cv`` it
    is fitted on a random 2/3 of the rows, evaluated on the remaining 1/3
    (accuracy, confusion matrix, classification report), and a leave-one-out
    cross-validation over all rows is printed. In both cases the fitted
    classifier is pickled afterwards; a failure to write the pickle is
    reported on stderr and otherwise ignored.

        :param data: feature rows as returned by ``preProcess``
        :param names: file names parallel to ``data``; only used in the printed output
        :param targets: labels parallel to ``data`` (True = music)
        :param startNew: create a new classifier if true, else reuse the
            previously pickled classifier object when it can be loaded.
            NOTE(review): ``fit()`` is called on the reused object, which
            looks like it re-estimates the model from the given data rather
            than adding to earlier training -- confirm whether
            ``partial_fit`` was intended.
        :param cv: do cross-validation with a subset of items
        :returns: an empty string
    """
    classifier = None
    if not startNew:
        try:
            with open(picklePath, 'rb') as f:
                # NOTE: unpickling can execute arbitrary code; only load a
                # pickle that this script wrote itself.
                classifier = pickle.load(f)
        except Exception:
            sys.stderr.write("Couldn't deserialize classifier. Creating a new one instead \n")

    # From DZone.com refcard: Data Mining - Discovering and Visualizing Patterns with Python by Giuseppe Vettigli
    # (An SVM, sklearn.svm.SVC, was tried here as well and classified everything as speech.)
    if classifier is None:
        classifier = GaussianNB()

    if cv:
        _fit_and_report_holdout(classifier, data, names, targets)
    else:
        classifier.fit(data, targets)  # training
        print('Trained on all files: ' + ','.join(names))

    # Persist on both paths so a later ``classify`` run finds the classifier
    # whether or not validation was requested.
    try:
        with open(picklePath, 'wb') as f:
            pickle.dump(classifier, f, protocol=pickle.HIGHEST_PROTOCOL)
    except Exception:
        sys.stderr.write("Error persisting classifier to file. Are you in a protected directory\n")
    return ''


def classify(data):
    """Predict the class of one feature vector with the pickled classifier.

        :param data: 1-D array holding the feature values of a single file
        :returns: the matching entry of ``target_names`` ('speech' or
            'music'), or ``None`` when no classifier could be loaded from
            ``picklePath`` (a message is written to stderr in that case)
    """
    assert data.ndim == 1, "classify expects the features of exactly one file"
    try:
        with open(picklePath, 'rb') as f:
            # NOTE: unpickling can execute arbitrary code; only load a pickle
            # that this script wrote itself.
            classifier = pickle.load(f)
    except Exception:
        sys.stderr.write("Error: no classifier found. Need to train first.\n")
        return None
    # predict() works on a batch of samples, so present the single feature
    # vector as a one-row 2-D array.
    result = classifier.predict(data.reshape(1, -1))
    # The prediction is a bool label; convert it explicitly before indexing the list.
    return target_names[int(result[0])]


def main():
    """Parse the command line (see the module usage text) and run the chosen command.

    ``train`` loads the feature file and trains/serializes the classifier;
    ``classify`` loads the feature file without its label column and prints
    the predicted class name.
    """
    args = docopt.docopt(__doc__, options_first=False)
    if args['train']:
        data, names, targets = preProcess(args['TRAIN_FEATURE_FILE'])
        # train() prints its own progress and report; its return value is always ''.
        train(data, names, targets, startNew=args['--new'], cv=args['--validate'])
    elif args['classify']:
        # The label column may be empty when classifying, so do not parse it.
        features = preProcess(args['MUSIC_FEATURE_FILE'], classifying=True)[0]
        print(classify(features))


if __name__ == '__main__':
    main()