In [2]:
folder_path = "SpeechAO_AllActors/"

In [3]:
import sys
import os
import subprocess
import xgboost as xgb

In [4]:
def get_wav_file_paths(folder_path):
    return list(map(lambda a: folder_path + str(a), 
               filter(lambda a: ".wav" in str(a), subprocess.check_output(['ls', folder_path]).splitlines())))

In [5]:
import sys, aubio
import sys
from aubio import source, pvoc, mfcc
from numpy import vstack, zeros, diff
import numpy as np


def process_wav_file(filename,
                     samplerate = 0,
                     win_s = 1024,
                     seconds_window = 3,
                     svm = True):
    hop_s = win_s // 4
    filename = filename.replace("b'", "")[:-1]
    #print(filename)
    n_filters = 40              # must be 40 for mfcc
    n_coeffs = 13
    s = source(filename, samplerate, hop_s)
    p = pvoc(win_s, hop_s)
    m = mfcc(win_s, n_filters, n_coeffs, samplerate)
    n_samples = 1#s.duration / s.samplerate / seconds_window
    if n_samples == 0:
        return []
    mfccs = zeros([n_coeffs,])
    frames_read = 0
    while True:
        samples, read = s()
        #print(samples, read)
        spec = p(samples)
        mfcc_out = m(spec)
        mfccs = vstack((mfccs, mfcc_out))
        frames_read += read
        if read < hop_s: break

    mfccs1 = diff(mfccs, axis = 0)
    mfccs2 = diff(mfccs, axis = 0)
    #print mfccs.shape, mfccs1.shape, mfccs2.shape
    all_data = np.concatenate((mfccs[1:,:], mfccs1, mfccs1), 1)
    
    final = []
    size_row = len(all_data) / n_samples
    if svm:
        final.append(get_mean_avg_etc(all_data))
    else:    
        final.append(all_data)
#     for i in range(n_samples):
#         if svm:
#             final.append(get_mean_avg_etc(all_data[i*size_row: (i+1)*size_row]))
#         else:    
#             final.append(all_data[i*size_row: (i+1)*size_row])
    return final
    

In [6]:
#Speech (1 = neutral, 2 = calm, 3 = happy, 4 = sad, 5 = angry, 6 = fearful, 7 = disgust, 8 = surprised)

def get_label(filename, svm):
    row = np.zeros(8)
    filename = filename.split("-")
    row[int(filename[2]) - 1] = 1
    if svm:
        return int(filename[2]) - 1
    else:
        return row
    

In [7]:
def get_mean_avg_etc(row):
    new_row = []
    new_row += list(row.mean(axis = 0))
    new_row += list(row.std(axis = 0))
    new_row += list(row.min(axis = 0))
    new_row += list(row.max(axis = 0))
    return new_row

In [8]:
import random 
def make_dataset(folder_path, svm = True):
    filenames = get_wav_file_paths(folder_path)
    np.random.shuffle(filenames)
    y = []
    X = []
    for filename in filenames:
        tmp_x = process_wav_file(filename, svm = svm)
        n = len(tmp_x)
        if svm:
            tmp_y = [get_label(filename, svm), ]*n
        else:
            tmp_y = [list(get_label(filename, svm)), ]*n


        X += tmp_x
        y += tmp_y
    return np.array(X), np.array(y)

In [9]:
X,y = make_dataset(folder_path)

In [105]:
X.shape, y.shape

((1440, 156), (1440,))

In [101]:
import sklearn.preprocessing
#X = sklearn.preprocessing.normalize(X)

In [53]:
### CLF 1 - SVM

In [40]:
import gensim
import sklearn, sklearn.datasets
import sklearn.naive_bayes, sklearn.linear_model, sklearn.svm, sklearn.neighbors, sklearn.ensemble
import matplotlib.pyplot as plt
import scipy.sparse
import numpy as np
import time, re
from sklearn.feature_extraction.text import TfidfVectorizer

scipy.sparse.sparsetools is a private module for scipy.sparse, and should not be used.
  _deprecated()


In [41]:
def baseline(train_data, train_labels, test_data, test_labels, omit=[]):
    """Train various classifiers to get a baseline."""
    clf, train_accuracy, test_accuracy, train_f1, test_f1, exec_time = [], [], [], [], [], []
    #clf.append(sklearn.neighbors.KNeighborsClassifier(n_neighbors=10))
    #clf.append(sklearn.ensemble.AdaBoostClassifier())
    #clf.append(sklearn.naive_bayes.BernoulliNB(alpha=.01))
    clf.append(sklearn.ensemble.RandomForestClassifier(n_estimators=100, max_depth=5))
    #clf.append(sklearn.naive_bayes.MultinomialNB(alpha=.01))
    clf.append(sklearn.ensemble.GradientBoostingClassifier(max_depth=5))
    #clf.append(sklearn.svm.SVC())
    clf.append(sklearn.linear_model.RidgeClassifier())
    #clf.append(sklearn.svm.LinearSVC())
    for i,c in enumerate(clf):
        if i not in omit:
            print(c)
            #t_start = time.process_time()
            c.fit(train_data, train_labels)
            train_pred = c.predict(train_data)
            test_pred = c.predict(test_data)
            train_accuracy.append('{:5.2f}'.format(100*sklearn.metrics.accuracy_score(train_labels, train_pred)))
            test_accuracy.append('{:5.2f}'.format(100*sklearn.metrics.accuracy_score(test_labels, test_pred)))
            train_f1.append('{:5.2f}'.format(100*sklearn.metrics.f1_score(train_labels, train_pred, average='weighted')))
            test_f1.append('{:5.2f}'.format(100*sklearn.metrics.f1_score(test_labels, test_pred, average='weighted')))
            #exec_time.append('{:5.2f}'.format(time.process_time() - t_start))
    print('Train accuracy:      {}'.format(' '.join(train_accuracy)))
    print('Test accuracy:       {}'.format(' '.join(test_accuracy)))
    print('Train F1 (weighted): {}'.format(' '.join(train_f1)))
    print('Test F1 (weighted):  {}'.format(' '.join(test_f1)))
    #print('Execution time:      {}'.format(' '.join(exec_time)))

In [11]:
limit = int(X.shape[0] / 10 * 8)

In [12]:
X_train = X[:limit]
y_train = y[:limit]
X_val = X[limit:]
y_val = y[limit:]

In [36]:
xg_train = xgb.DMatrix( X_train, label=y_train)
xg_test = xgb.DMatrix(X_val, label=y_val)
# setup parameters for xgboost
param = {}
# use softmax multi-class classification
param['objective'] = 'multi:softmax'
# scale weight of positive examples
param['eta'] = 0.1
param['max_depth'] = 6
param['silent'] = 1
param['nthread'] = 10
param['num_class'] = 8

watchlist = [ (xg_train,'train'), (xg_test, 'test') ]
num_round = 96
bst = xgb.train(param, xg_train, num_round, watchlist );
# get prediction
pred = bst.predict( xg_test )


print ('predicting, classification error=%f' % (sum( int(pred[i]) != y_val[i] for i in range(len(y_val))) / float(len(y_val)) ))

# do the same thing again, but output probabilities
param['objective'] = 'multi:softprob'
bst = xgb.train(param, xg_train, num_round, watchlist );
# Note: this convention has been changed since xgboost-unity
# get prediction, this is in 1D array, need reshape to (ndata, nclass)
yprob = bst.predict( xg_test ).reshape( y_val.shape[0], 8 )
ylabel = np.argmax(yprob, axis=1)

print ('predicting, classification error=%f' % (sum( int(ylabel[i]) != y_val[i] for i in range(len(y_val))) / float(len(y_val)) ))



[0]	train-merror:0.173611	test-merror:0.673611
[1]	train-merror:0.126736	test-merror:0.635417
[2]	train-merror:0.092014	test-merror:0.635417
[3]	train-merror:0.08941	test-merror:0.614583
[4]	train-merror:0.06684	test-merror:0.607639
[5]	train-merror:0.054688	test-merror:0.618056
[6]	train-merror:0.049479	test-merror:0.59375
[7]	train-merror:0.038194	test-merror:0.586806
[8]	train-merror:0.032118	test-merror:0.590278
[9]	train-merror:0.02691	test-merror:0.59375
[10]	train-merror:0.023438	test-merror:0.583333
[11]	train-merror:0.017361	test-merror:0.583333
[12]	train-merror:0.015625	test-merror:0.576389
[13]	train-merror:0.013021	test-merror:0.576389
[14]	train-merror:0.012153	test-merror:0.572917
[15]	train-merror:0.007812	test-merror:0.5625
[16]	train-merror:0.006944	test-merror:0.565972
[17]	train-merror:0.005208	test-merror:0.5625
[18]	train-merror:0.005208	test-merror:0.555556
[19]	train-merror:0.005208	test-merror:0.552083
[20]	train-merror:0.002604	test-merror:0.548611
[21]	train-

[95]	train-merror:0	test-merror:0.506944
predicting, classification error=0.506944


In [42]:
print(baseline(X_train, y_train, X_val, y_val))

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)


  'precision', 'predicted', average, warn_for)


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=5,
              max_features=None, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=None,
              subsample=1.0, verbose=0, warm_start=False)
RidgeClassifier(alpha=1.0, class_weight=None, copy_X=True, fit_intercept=True,
        max_iter=None, normalize=False, random_state=None, solver='auto',
        tol=0.001)
Train accuracy:      74.31 100.00 59.29
Test accuracy:       45.83 49.31 42.71
Train F1 (weighted): 72.86 100.00 57.10
Test F1 (weighted):  42.46 49.31 40.61
None


In [61]:
from collections import Counter
Counter(y_train)

Counter({0: 80, 1: 153, 2: 144, 3: 152, 4: 153, 5: 160, 6: 156, 7: 154})

In [62]:
rnnX,rnny = make_dataset(folder_path, svm=False)

In [63]:
rnnX_train = rnnX[:limit]
rnny_train = rnny[:limit]
rnnX_val = rnnX[limit:]
rnny_val = rnny[limit:]

In [86]:
Counter([len(x) for x in rnnX_train])

Counter({551: 1,
         564: 1,
         570: 2,
         576: 9,
         582: 5,
         589: 13,
         595: 8,
         601: 17,
         607: 26,
         614: 20,
         620: 29,
         626: 19,
         632: 24,
         639: 34,
         645: 38,
         651: 56,
         657: 57,
         664: 54,
         670: 56,
         676: 52,
         682: 54,
         689: 46,
         695: 45,
         701: 60,
         707: 43,
         714: 38,
         720: 37,
         726: 32,
         732: 25,
         739: 20,
         745: 27,
         751: 18,
         758: 19,
         764: 14,
         770: 22,
         776: 19,
         783: 12,
         789: 13,
         795: 11,
         801: 10,
         808: 10,
         814: 6,
         820: 8,
         826: 8,
         833: 2,
         839: 5,
         845: 1,
         851: 2,
         858: 1,
         864: 5,
         870: 4,
         876: 1,
         889: 3,
         895: 3,
         901: 1,
         908: 1,
         914:

In [88]:
'''Trains a LSTM on the IMDB sentiment classification task.
The dataset is actually too small for LSTM to be of any advantage
compared to simpler, much faster methods such as TF-IDF + LogReg.
Notes:

- RNNs are tricky. Choice of batch size is important,
choice of loss and optimizer is critical, etc.
Some configurations won't converge.

- LSTM loss decrease patterns during training can be quite different
from what you see with CNNs/MLPs/etc.
'''
from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding,TimeDistributed
from keras.layers import LSTM, GRU
from keras.datasets import imdb

max_features = 20000
maxlen = 810  # cut texts after this number of words (among top max_features most common words)
batch_size = 32


y_train = rnny_train
y_test = rnny_val
print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(rnnX_train, maxlen=maxlen)
x_test = sequence.pad_sequences(rnnX_val, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)


Pad sequences (samples x time)
x_train shape: (1152, 810, 39)
x_test shape: (288, 810, 39)


In [91]:
print('Build model...')
model = Sequential()
model.add(TimeDistributed(Dense(39), input_shape=(maxlen, 39)))
model.add(GRU(output_dim=39,return_sequences=True, dropout=0.2))
model.add(GRU(output_dim=20,return_sequences=True, dropout=0.2))
model.add(GRU(output_dim=10,return_sequences=False, dropout=0.2))
model.add(Dense(8, activation='softmax'))

# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

Build model...




In [92]:

print('Train...')
model.fit(x_train, rnny_train,
          batch_size=batch_size,
          epochs=15,
          verbose=1,
          validation_data=(x_test, y_test))
score, acc = model.evaluate(x_test, y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Train...
Train on 1152 samples, validate on 288 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test score: 1.88621328937
Test accuracy: 0.246527777778
