In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_agg import FigureCanvasAgg as FigureCanvas
import seaborn as sns
import librosa
import librosa.display
import os
import soundfile
import glob

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,MinMaxScaler,OneHotEncoder,StandardScaler
from sklearn.metrics import accuracy_score
from opensoundscape.audio import Audio
from opensoundscape.spectrogram import Spectrogram

from tensorflow.python.keras.models import Sequential,Model
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.losses import binary_crossentropy,categorical_crossentropy
from tensorflow.keras.metrics import Accuracy
from tensorflow.keras.layers import Add,LSTM,Dense,Conv1D,Embedding
from tensorflow.keras.activations import relu,softmax

sc = StandardScaler()
enc = LabelEncoder()
mms = MinMaxScaler()
oh = OneHotEncoder()
sns.set_style('darkgrid')

emotions_encoder = {
    '01': 'Neutral',
    '02': 'Calm',
    '03': 'Happy',
    '04': 'Sad',
    '05': 'Angry',
    '06': 'Fearful',
    '07': 'Disgust',
    '08': 'Surprised'
}

observed_emotions = [x for x in emotions_encoder.values()]

In [33]:
class Feature():

    def __init__(self, mfcc=True, chroma=True, mel=True, zcr=True, spread=True, mean=True):

        self.mfcc = mfcc
        self.chroma = chroma
        self.mel = mel
        self.zcr = zcr
        self.spread = spread
        self.mean = mean


    def extract_feature(self,file_name,is_data=False,data_inp = [],sample_rate_inp=int()):

        if not is_data:
            data = []    
            with soundfile.SoundFile(file_name) as sound_file:
                X = sound_file.read(dtype="float32")
                data.append(X)
                sample_rate=sound_file.samplerate
                self.sample_rate = sample_rate
                result=np.array([])

                if self.mfcc:
                    mfccs=np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=50).T, axis=0)
                    result=np.hstack((result, mfccs))

                if self.chroma:
                    stft=np.abs(librosa.stft(X))
                    chroma=np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate,n_chroma=50).T,axis=0)
                    result=np.hstack((result, chroma))

                if self.mel:
                    mel=np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
                    result=np.hstack((result, mel))

                if self.zcr:
                    Z = np.mean(librosa.feature.zero_crossing_rate(y=X),axis=1)
                    result=np.hstack((result, Z))

                if self.spread:
                    var = np.var(X)
                    result=np.hstack((result, var))

                if self.mean:
                    mean = np.mean(X)
                    result=np.hstack((result, mean))
        if is_data:
            data = data_inp
            self.sample_rate = sample_rate_inp
            result=np.array([])
            X = data_inp

            if self.mfcc:
                mfccs=np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=50).T, axis=0)
                result=np.hstack((result, mfccs))

            if self.chroma:
                stft=np.abs(librosa.stft(X))
                chroma=np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate,n_chroma=50).T,axis=0)
                result=np.hstack((result, chroma))

            if self.mel:
                mel=np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
                result=np.hstack((result, mel))

            if self.zcr:
                Z = np.mean(librosa.feature.zero_crossing_rate(y=X),axis=1)
                result=np.hstack((result, Z))

            if self.spread:
                var = np.var(X)
                result=np.hstack((result, var))

            if self.mean:
                mean = np.mean(X)
                result=np.hstack((result, mean))

        self.timeseries_data = data

        return result

    def engineer(self,Paths=[],isData=False,Data_inp=[],sr=int()):
        
        X = []
        self.sample_rate = sr
        self.Paths = Paths
        for file in Paths:
            feature=self.extract_feature(file,isData,Data_inp,self.sample_rate)
            X.append(feature)
        self.features = X
        return np.array(X)



X,Y=[],[]

for file in glob.glob("Data/Actor_*/*.wav"):
    file_name=os.path.basename(file)
    emotion=emotions_encoder[file_name.split("-")[2]]
    X.append(file)
    Y.append(emotion)



In [34]:
Feat = Feature()

Z = Feat.engineer(X)

In [15]:
class Augment():

    def __init__(self,noise=True,stretch=True,roll=True,pitch=True,stretch_rate = 0.6,pitch_factor = 0.8):
        self.noise = noise
        self.time_stretch = stretch
        self.time_roll = roll
        self.pitch = pitch
        self.stretch_rate = stretch_rate
        self.pitch_factor = pitch_factor

    def load(self,path):
        aud, sr = librosa.load(path=path,duration=2.5,offset=0.5)
        self.aud = aud
        self.sampling_rate = sr

    def augment(self,X):

        noised = []
        stretched = []
        rolled = []
        pitched = []

        for path in X:

            if self.noise:
                self.load(path)
                aud = self.aud
                noise_amp = 0.035*np.random.uniform()*np.amax(aud)
                aud = aud + noise_amp*np.random.normal(size=aud.shape[0])
                noised.append(aud)

            if self.time_stretch:
                self.load(path)
                aud = self.aud
                stretched.append(librosa.effects.time_stretch(aud, self.stretch_rate))

            if self.time_roll:
                self.load(path)
                aud = self.aud
                shift_range = int(np.random.uniform(low=-5, high = 5)*1000)
                rolled.append(np.roll(aud, shift_range))

            if self.pitch:
                self.load(path)
                aud = self.aud
                sampling_rate = self.sampling_rate
                pitched.append(librosa.effects.pitch_shift(aud, sampling_rate, self.pitch_factor))

        return (noised,stretched,rolled,pitched)

aug = Augment()


In [17]:
tup = aug.augment(X)

In [36]:
model = Sequential()
model.add(In)
model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()
history_ltsm = model.fit(Z, observed_emotions, epochs=10, batch_size=60,)

ValueError: This model has not yet been built. Build the model first by calling `build()` or calling `fit()` with some data, or specify an `input_shape` argument in the first layer(s) for automatic build.