In [1]:
import os
import numpy as np
from scipy.io import wavfile
from python_speech_features import mfcc

In [2]:
def features_extraction(sig, rate, nfft=2048, **kwargs):
    """Compute the mean and standard deviation of each MFCC of an audio signal.

    The MFCC matrix returned by ``mfcc`` is reduced over its first axis, so
    every audio clip is summarised by two fixed-length vectors regardless of
    its duration.

    Multi-channel input (a 2-D array with more than one column, which is what
    ``scipy.io.wavfile.read`` yields for e.g. stereo files) is averaged down
    to a single channel first, because ``mfcc`` expects a one-channel signal.

    :param sig: the audio signal from which to compute features. Either an N*1 array
        or an (N, channels) array, which is mixed down to mono.
    :param rate: the samplerate of the signal we are working with.
    :param nfft: the FFT size. Default is 2048.
    :param kwargs: forwarded unchanged to ``python_speech_features.mfcc``:

        * winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
        * winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
        * numcep: the number of cepstrum to return, default 13
        * nfilt: the number of filters in the filterbank, default 26.
        * lowfreq: lowest band edge of mel filters. In Hz, default is 0.
        * highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
        * preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
        * ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22.
        * appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy.
        * winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
    :returns: Two numpy arrays of size (numcep,). First vector contains mean of MFCCs, second - standard deviation of MFCCs
    """
    sig = np.asarray(sig)
    # Only touch genuinely multi-channel data; mono input is passed through
    # untouched so existing results do not change.
    if sig.ndim == 2 and sig.shape[1] > 1:
        sig = sig.mean(axis=1)
    mfcc_feat = mfcc(sig, rate, nfft=nfft, **kwargs)
    # Axis 0 is reduced, leaving one value per cepstral coefficient.
    mfcc_mean = mfcc_feat.mean(axis=0)
    mfcc_std = mfcc_feat.std(axis=0)
    return mfcc_mean, mfcc_std

In [75]:
DATASET = './datasets/mix/'

# Every sub-folder of DATASET is treated as one category. os.listdir() returns
# entries in arbitrary order, so the names are sorted to keep the
# index -> category mapping identical across runs and machines.
categorie_folders = sorted(
    name for name in os.listdir(DATASET)
    if os.path.isdir(os.path.join(DATASET, name))
)
# Integer class label -> category (folder) name.
categories = dict(enumerate(categorie_folders))

In [77]:
# Build the feature matrix: one row per readable .wav file, laid out as
# [MFCC means..., MFCC stds..., category index] for a total of 27 columns.
# Rows are collected in a list and stacked once at the end; concatenating onto
# the growing array inside the loop would copy the whole matrix for every file.
dataset_rows = []
for category_N, category in categories.items():
    category_dir = os.path.join(DATASET, category)
    # Sorted so the row order does not depend on the directory listing order.
    waves = sorted(f for f in os.listdir(category_dir) if f.endswith('.wav'))
    for wav in waves:
        wav_path = os.path.join(category_dir, wav)
        try:
            rate, sig = wavfile.read(wav_path)
        except ValueError:
            # Unreadable file: report it and carry on with the rest.
            print('ValueError: ' + wav_path)
            continue
        mfcc_mean, mfcc_std = features_extraction(sig, rate)
        # reshape(27) keeps the original guard: fail loudly if the feature
        # vector does not have the expected width.
        dataset_rows.append(np.concatenate((mfcc_mean, mfcc_std, [category_N])).reshape(27))
# Keep the (0, 27) shape when no file could be read.
dataset = np.vstack(dataset_rows) if dataset_rows else np.zeros((0, 27))

In [78]:
# Sanity check: one row per file that was read, 27 columns
# (MFCC means + MFCC stds + the category index in the last column).
dataset.shape

(4048, 27)

In [76]:
# Show the integer label -> category name mapping; the integer is what gets
# stored in the last column of each dataset row.
categories

{0: 'Negative', 1: 'Neutral', 2: 'Positive'}

In [79]:
# Persist the full feature matrix (features plus the label column) as a
# comma-separated file in the current working directory.
np.savetxt("dataset_toronto+ravdess_three_categories(without calm).csv", dataset, delimiter=",")

In [80]:
# Random 70/30 train/test split of the rows.
# A fixed seed makes the split -- and every score computed from it --
# reproducible on re-run. Indexing with a permutation builds a shuffled copy,
# so `dataset` itself keeps its original row order and this cell can be
# re-executed without changing its input.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.7

shuffled_rows = np.random.RandomState(SPLIT_SEED).permutation(dataset.shape[0])
dataset_shuffled = dataset[shuffled_rows]
dataset_train_len = int(dataset.shape[0] * TRAIN_FRACTION)
dataset_ravdess_train = dataset_shuffled[:dataset_train_len]
dataset_ravdess_test = dataset_shuffled[dataset_train_len:]

In [81]:
# The last column of every row is the category index (target); all columns
# before it are the features.
dataset_train_X, dataset_train_Y = dataset_ravdess_train[:, :-1], dataset_ravdess_train[:, -1]
dataset_test_X, dataset_test_Y = dataset_ravdess_test[:, :-1], dataset_ravdess_test[:, -1]

In [93]:
from sklearn.svm import SVC

In [104]:
# Cubic polynomial-kernel SVM. gamma is pinned to 'auto' (1 / n_features),
# which is the value shown in the fitted estimator's recorded repr; newer
# scikit-learn releases default to 'scale', which would silently train a
# different model.
clf = SVC(kernel='poly', degree=3, gamma='auto')

In [95]:
# Sanity check: training rows x feature columns (label column removed).
dataset_train_X.shape

(2833, 26)

In [105]:
# Train the classifier on the training split: feature columns as inputs,
# category indices as targets.
clf.fit(dataset_train_X, dataset_train_Y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [106]:
# Predicted category index for every test-split row; used for the confusion
# matrix computed further down.
dataset_predict_Y = clf.predict(dataset_test_X)

In [107]:
# Overall score on the test split (for scikit-learn classifiers `score` is
# documented as mean accuracy).
clf.score(dataset_test_X, dataset_test_Y)

0.87983539094650209

In [99]:
from sklearn.metrics import confusion_matrix

In [108]:
# Arguments are (true labels, predicted labels); per scikit-learn's
# documented convention entry [i, j] then counts samples of true class i
# predicted as class j.
c_matrix = confusion_matrix(dataset_test_Y, dataset_predict_Y)

In [109]:
# Show the raw counts; row/column indices correspond to the numeric labels
# listed in `categories`.
c_matrix

array([[649,  15,  48],
       [ 22, 106,   7],
       [ 51,   3, 314]], dtype=int64)

In [110]:
# Per-class recall: correctly classified samples (diagonal entry) divided by
# all samples whose true label is that class (row sum). The class count is
# taken from the matrix itself instead of being hard-coded, so this keeps
# working if the number of category folders changes.
for i in range(c_matrix.shape[0]):
    print(c_matrix[i, i] / c_matrix[i].sum())

0.911516853933
0.785185185185
0.853260869565
