In [1]:
# %% Import the libraries for speech emotion recognition (SER); librosa is the main one for audio analysis
import pandas as pd
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import json
from PyEMD import EMD
import opensmile
import warnings
import time
import os
from tqdm import tqdm
warnings.filterwarnings(action="ignore")


SoX could not be found!

    If you do not have SoX, proceed here:
     - - - http://sox.sourceforge.net/ - - -

    If you do (or think that you should) have SoX, double-check your
    path variables.
    


In [None]:
# Scratch check: is this utterance shorter than two thirds of a 3 s segment (at 16 kHz)?
audiopath = "/Users/talen/Documents/Datasets/IEMOCAP/Data/Ses01F_impro01_F008.wav"
# The sampling rate is passed by keyword: recent librosa releases no longer accept it positionally.
sig, sr = librosa.load(audiopath, sr=16000)
len(sig) < 2/3 * (3*16000)

In [2]:
# %% Prepare the downloaded IEMOCAP dataset for SER
# The IEMOCAP dataset was retrieved by requesting to the IEMOCAP development team https://sail.usc.edu/iemocap/release_form.php
# Deal with the IEMOCAP metadata to gain the file paths of the improvised speeches in the four desired emotion classes
# Read the metadata about the dataset
df_descri = pd.read_csv("/Users/talen/Documents/Datasets/IEMOCAP/iemocap_metadata.csv")

# Only select the improvised samples to create a description file.
# An explicit copy is taken so that rewriting the "path" column below operates on an
# independent frame instead of assigning into a filtered slice of df_descri.
df_impro = df_descri[df_descri["method"] == "impro"].copy()

# Replace the default file-path with the local file-path after downloaded in the author's computer:
# keep only the file name (last "/"-separated component) and prepend the local data folder
local_data_dir = "/Users/talen/Documents/Datasets/IEMOCAP/Data/"
old_paths = df_impro["path"].map(str)
new_paths = [local_data_dir + old_path.split("/")[-1] for old_path in old_paths]
df_impro["path"] = new_paths

# Select the data about the angry, happy, sad, neutral emotions from the description file
df_ang = df_impro[df_impro["emotion"] == "ang"]
df_hap = df_impro[df_impro["emotion"] == "hap"]
df_sad = df_impro[df_impro["emotion"] == "sad"]
df_neu = df_impro[df_impro["emotion"] == "neu"]

# Concatenate the data of the four emotions (row order: ang, hap, sad, neu)
df_IEMOCAP = pd.concat([df_ang, df_hap, df_sad, df_neu])
df_IEMOCAP.shape

(2280, 7)

In [3]:
# %% Prepare the downloaded EMODB dataset for SER
#The dataset was retrieved from and processed according to EMODB data website http://www.emodb.bilderbar.info/download/
#10 speakers: 03, 08, 09, 10, 11, 12, 13, 14, 15, 16
#Emotion translation: Wut -> angry; Langeweile -> boredom; Ekel -> disgust; Angst -> fear;
                    # Freude -> happiness; Trauer -> sadness

path_EMODB = "/Users/talen/Documents/Datasets/EMODB/wav/"

# Sixth character of an EMODB file name -> emotion label; any other code is labelled "neutral"
EMODB_EMOTION_CODES = {
    "W": "angry",
    "L": "boredom",
    "A": "fear",
    "F": "happiness",
    "T": "sadness",
    "E": "disgust",
}


def Collect_EMODB_files(root):
    """Walk `root` and describe every .wav file found below it.

    Parameters
    ----------
    root : str
        Folder that is searched recursively with os.walk.

    Returns
    -------
    tuple of three lists (speakers, emotions, file_paths), aligned by position:
    the first two characters of each file name, the emotion label derived from
    the sixth character of the file name, and the full path of the file.
    """
    speakers, emotions, file_paths = [], [], []
    for folder, _, files in os.walk(root):
        for file in files:
            stem, extension = os.path.splitext(file)
            # Skip anything that is not an audio file (e.g. .DS_Store) or whose name is
            # too short to carry the emotion code at position 5
            if extension.lower() != ".wav" or len(stem) < 6:
                continue
            #Get the speaker ID
            speakers.append(file[:2])
            #Get the emotion class
            emotions.append(EMODB_EMOTION_CODES.get(file[5], "neutral"))
            #Get the file path; joined with the folder being walked so that files in
            #sub-folders get their real location
            file_paths.append(os.path.join(folder, file))
    return speakers, emotions, file_paths


speakers, emotions, file_paths = Collect_EMODB_files(path_EMODB)

df_EMODB = pd.DataFrame(data={
    "Speaker":speakers,
    "Emotion":emotions,
    "File_path":file_paths
})

In [4]:
# Preview the first rows of the EMODB description table (speaker ID, emotion label, file path)
df_EMODB.head()

Unnamed: 0,Speaker,Emotion,File_path
0,16,boredom,/Users/talen/Documents/Datasets/EMODB/wav/16a0...
1,14,angry,/Users/talen/Documents/Datasets/EMODB/wav/14a0...
2,10,fear,/Users/talen/Documents/Datasets/EMODB/wav/10a0...
3,13,disgust,/Users/talen/Documents/Datasets/EMODB/wav/13a0...
4,14,angry,/Users/talen/Documents/Datasets/EMODB/wav/14a0...


In [None]:
# Create the containers that hold the LLDs, log-Mel spectrograms, smfcc and their emotion classes
_FEATURE_KEYS = ["LLDs", "Log-Mel-spectram", "smfcc", "class", "LLDs_ori", "spectrogram_ori"]


def _empty_feature_store():
    """Return a new dict that maps every feature key to its own empty list."""
    return {key: [] for key in _FEATURE_KEYS}


#For IEMOCAP dataset: emotion classes for 3s-segment-level: ang -> 0, hap -> 1, sad -> 2, neu -> 3
# Layout: session ("1".."5") -> gender ("M"/"F") -> feature key -> list
Audio_features_IEMOCAP = {
    session: {gender: _empty_feature_store() for gender in ("M", "F")}
    for session in ("1", "2", "3", "4", "5")
}

#For EMODB dataset: speakers: 03, 08, 09, 10, 11, 12, 13, 14, 15, 16
# Layout: speaker ID -> feature key -> list
Audio_features_EMODB = {
    speaker: _empty_feature_store()
    for speaker in ("03", "08", "09", "10", "11", "12", "13", "14", "15", "16")
}

In [None]:
# %% Define the functions for preprocessing and feature extraction

# Sampling and quantising the raw audio file into the digital signal
def Sampling_and_quantising(file_path):
    """Load an audio file as a digital signal at a sampling rate of 16 kHz.

    Parameters
    ----------
    file_path : path of the audio file, handed to librosa.load.

    Returns
    -------
    The (signal, sampling rate) pair produced by librosa.load.
    """
    loaded_signal, sampling_rate = librosa.load(file_path, sr=16000)
    return loaded_signal, sampling_rate


# Extract the ComParE_2016 features (functionals level) by openSMILE, both with and without the mfcc-related columns
def Gain_LLDs(signal, sr):
    """Run openSMILE (ComParE_2016 feature set, functionals level) on a signal.

    Parameters
    ----------
    signal : audio samples passed to opensmile's process_signal.
    sr : sampling rate of `signal`.

    Returns
    -------
    (values, values_ori) : two plain Python lists taken from the first row of
    the openSMILE result. `values_ori` holds every column; `values` holds the
    same row after all columns whose name contains "mfcc" have been removed.
    """
    extractor = opensmile.Smile(
        feature_set=opensmile.FeatureSet.ComParE_2016,
        feature_level=opensmile.FeatureLevel.Functionals,
    )
    features = extractor.process_signal(signal, sr)

    # Complete feature row, MFCC columns included
    values_ori = features.values[0].tolist()

    # Same row without the MFCC-related columns
    mfcc_columns = [column for column in features.columns if "mfcc" in column]
    values = features.drop(columns=mfcc_columns).values[0].tolist()

    return values, values_ori


# Compute the Log-Mel-spectrogram
def Gain_Log_Mel_spectrogram(signal, sr, n_fft, n_mels, window):
    """Compute the log-Mel spectrogram (in dB) of a signal.

    Parameters
    ----------
    signal : audio samples.
    sr : sampling rate of `signal`.
    n_fft : FFT size of the short-time Fourier transform.
    n_mels : number of Mel filter banks.
    window : window specification forwarded to librosa.stft.

    Returns
    -------
    numpy array as returned by librosa.power_to_db; rows are Mel bands and
    columns are time frames. Callers that need a time-major or
    JSON-serialisable form can apply `.T` / `.tolist()` to the result.
    """
    # 1. Short-time Fourier transform of the segment
    stft = librosa.stft(y=signal, n_fft=n_fft, window=window)

    # 2. Power spectrogram. The squared magnitude is used because the Mel projection
    #    below is followed by power_to_db, which converts *power* values to decibels.
    power_spectrogram = np.abs(stft) ** 2

    # 3. Apply the Mel filter banks to the power spectrogram
    mel_spectrogram = librosa.feature.melspectrogram(S=power_spectrogram, sr=sr, n_mels=n_mels)

    # 4. Logarithm (dB) of the Mel spectrogram coefficients
    log_mel_spectrogram = librosa.power_to_db(mel_spectrogram)

    return log_mel_spectrogram


# Compute the MFCCs
def Gain_MFCCs(signal, sr, n_fft, n_mels, n_mfcc, window):
    """Compute MFCCs of a signal and return them as nested lists, one row per time frame.

    Parameters
    ----------
    signal : audio samples.
    sr : sampling rate of `signal`.
    n_fft : FFT size forwarded to librosa.feature.mfcc.
    n_mels : number of Mel filter banks.
    n_mfcc : number of coefficients to compute.
    window : window specification forwarded to librosa.feature.mfcc.

    Returns
    -------
    list of lists: the transposed MFCC matrix converted with `.tolist()`.
    """
    coefficients = librosa.feature.mfcc(
        y=signal,
        sr=sr,
        n_mfcc=n_mfcc,
        dct_type=2,
        norm="ortho",
        n_mels=n_mels,
        n_fft=n_fft,
        window=window,
    )

    # Transpose so that each row corresponds to one time frame, then convert to plain lists
    return coefficients.T.tolist()


#Generate the spectrogram images
def Spectrogram_img(spectro, sr, path):
    """Render a spectrogram without axes and save it as an image file.

    Parameters
    ----------
    spectro : 2-D array drawn with librosa.display.specshow (Mel-scaled y axis).
    sr : sampling rate used by specshow to scale the axes.
    path : destination file passed to savefig.
    """
    # A dedicated figure per call: drawing on the implicit current figure would stack
    # every new spectrogram on top of the previous ones when this runs in a loop.
    fig, ax = plt.subplots()
    try:
        librosa.display.specshow(spectro, x_axis="time", y_axis="mel", sr=sr, ax=ax)
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)
        fig.savefig(path)
    finally:
        # Release the figure so memory does not grow with the number of saved images
        plt.close(fig)
    

In [None]:
# %% Extract segment-level features (3 s segments) from raw audio signals
def _split_into_3s_segments(signal, sr):
    """Cut a signal into 3-second segments of identical length.

    A signal shorter than 3 s is zero-padded to one segment. A longer signal is
    cut into consecutive 3 s pieces; the remainder is zero-padded and kept only
    when it is longer than two thirds of a segment, otherwise it is dropped.
    """
    No_samples_in_3s = sr * 3
    audio_length = librosa.get_duration(y=signal, sr=sr)

    if audio_length < 3:
        return [librosa.util.fix_length(data=signal, size=No_samples_in_3s)]

    N_3s = int(audio_length / 3)  # How many full segments of 3s in current signal
    segments = [
        signal[N_3 * No_samples_in_3s:(N_3 + 1) * No_samples_in_3s]
        for N_3 in range(N_3s)
    ]

    rest_part = signal[N_3s * No_samples_in_3s:]
    if len(rest_part) > 2/3 * No_samples_in_3s:
        segments.append(librosa.util.fix_length(data=rest_part, size=No_samples_in_3s))

    return segments


def _remove_signal_trend(segment, max_imf=9, zcr_ratio=0.01):
    """Remove the slow trend of a segment with the zero-crossing criterion on its EMD.

    The segment is decomposed with EMD. Every component after the first whose number
    of zero crossings, relative to the first component, is below `zcr_ratio` is
    treated as part of the trend; the sum of those components is subtracted from the
    segment. When the first component has no zero crossing the ratio is undefined
    and the segment is returned unchanged.
    """
    IMFs = EMD().emd(segment, max_imf=max_imf)

    n_R_imf_1 = np.count_nonzero(librosa.zero_crossings(IMFs[0], pad=False, zero_pos=True))
    if n_R_imf_1 == 0:
        return segment

    trend = np.zeros_like(IMFs[0])
    for imf in IMFs[1:]:
        n_R_imf_i = np.count_nonzero(librosa.zero_crossings(imf, pad=False, zero_pos=True))
        # Criterion: ZCR_IMF_i / ZCR_IMF_1 < zcr_ratio (all components have the same length)
        if n_R_imf_i / n_R_imf_1 < zcr_ratio:
            trend = trend + imf

    return segment - trend


def Extract_segment_level_features(current_row, total_samples=2280):
    """Create and save the spectrogram images of every 3 s segment of one utterance.

    The function is currently configured for the EMODB description table: it reads
    the "Speaker", "Emotion" and "File_path" entries of `current_row`. The IEMOCAP
    variant and the storage of features in the Audio_features_* dictionaries are
    kept below as commented-out code.

    For each segment two log-Mel spectrograms (512-point STFT, 40 Mel filter banks)
    are rendered with Spectrogram_img: one of the original segment and one of the
    segment after trend removal. The images are written below
    /Users/talen/Desktop/Spectrograms_EMODB/ with the file name
    "<utterance>.<segment>.<speaker>.<emotion>.<ori|EMD>.jpg".

    Parameters
    ----------
    current_row : row of the description table (supports lookup by column name).
    total_samples : number of utterances of the whole run; only used for the
        progress message and the remaining-time estimate.

    Relies on the module-level variables `h` (1-based utterance counter, incremented
    by this function) and `start_time` (time.time() taken when the run started).
    """
    global h, start_time

    # Gain the audio file path, together with its emotion classes, and session path
#         #For IEMOCAP dataset
#     audiofile = str(current_row["path"])
#     emotion = str(current_row["emotion"])
#     session = str(current_row["session"])
#     gender = str(current_row["gender"])

        #For EMODB dataset
    speaker = str(current_row["Speaker"])
    emoji = str(current_row["Emotion"])
    file_path = str(current_row["File_path"])

    # Sampling and quantising the raw audio into the digital signal
        #For IEMOCAP dataset -> audiofile; For EMODB dataset -> file_path
    signal, sr = Sampling_and_quantising(file_path)

    # Align the signal to 3s-segments (zero-padding / segmenting as needed)
    signals_aligned = _split_into_3s_segments(signal, sr)

    # enumerate gives every segment its own index `s`, so that the images of different
    # segments of the same utterance do not overwrite each other
    for s, signal_aligned in enumerate(tqdm(signals_aligned)):
        # Channel 1: extracting features from original signal
#         # Extract the segment-level LLDs with their functionals from the original signal by openSMILE
#         llds, llds_ori = Gain_LLDs(signal=signal_aligned, sr=sr)

#         #Restore the IEMOCAP data features (class: ang -> 0, hap -> 1, sad -> 2, anything else -> 3)
#         Audio_features_IEMOCAP[session][gender]["LLDs"].append(llds)
#         Audio_features_IEMOCAP[session][gender]["LLDs_ori"].append(llds_ori)
#         Audio_features_IEMOCAP[session][gender]["class"].append({"ang": 0, "hap": 1, "sad": 2}.get(emotion, 3))

#         #Restore the EMODB data features
#         #(class: angry -> 0, boredom -> 1, fear -> 2, happiness -> 3, sadness -> 4, disgust -> 5, anything else -> 6)
#         Audio_features_EMODB[speaker]["LLDs"].append(llds)
#         Audio_features_EMODB[speaker]["LLDs_ori"].append(llds_ori)
#         Audio_features_EMODB[speaker]["class"].append(
#             {"angry": 0, "boredom": 1, "fear": 2, "happiness": 3, "sadness": 4, "disgust": 5}.get(emoji, 6))

        # Channel 2: extracting features from the signal with trend removed
        # (EMD decomposition + zero-crossing criterion, see _remove_signal_trend)
        signal_trend_removed = _remove_signal_trend(signal_aligned)

        # Extract segment-level spectrograms from the signal with trend removed and from the original signal
        # Calculate the segment-level log-mel-spectrograms by 512 point STFT and 40 mel-filter banks
        # Apply the hamming window
        spectro = Gain_Log_Mel_spectrogram(signal=signal_trend_removed, sr=sr, n_fft=512, n_mels=40, window="ham")
        spectro_ori = Gain_Log_Mel_spectrogram(signal=signal_aligned, sr=sr, n_fft=512, n_mels=40, window="ham")

#         # Calculate the 14 smfcc by 512 point STFT and 40 mel-filter banks
#         smfcc = Gain_MFCCs(signal=signal_trend_removed, sr=sr, n_fft=512, n_mels=40, n_mfcc=14, window="ham")

#         #For storing the IEMOCAP data features
#         Audio_features_IEMOCAP[session][gender]["Log-Mel-spectram"].append(spectro)
#         Audio_features_IEMOCAP[session][gender]["spectrogram_ori"].append(spectro_ori)
#         Audio_features_IEMOCAP[session][gender]["smfcc"].append(smfcc)

#         #For storing the EMODB data features
#         Audio_features_EMODB[speaker]["Log-Mel-spectram"].append(spectro)
#         Audio_features_EMODB[speaker]["spectrogram_ori"].append(spectro_ori)
#         Audio_features_EMODB[speaker]["smfcc"].append(smfcc)

        #Store the spectrogram images
            #Store the original spectrogram
        Spectrogram_img(spectro=spectro_ori, sr=sr,
                        path="/Users/talen/Desktop/Spectrograms_EMODB/ori/"+str(h)+"."+str(s)+"."+speaker+"."+emoji+"."+"ori"+".jpg")
        print("Ori spectrogram of utterance{} - segment {} is done!".format(h, s))
            #Store the spectrogram after removing signal trend
        Spectrogram_img(spectro=spectro, sr=sr,
                        path="/Users/talen/Desktop/Spectrograms_EMODB/EMD/"+str(h)+"."+str(s)+"."+speaker+"."+emoji+"."+"EMD"+".jpg")
        print("EMD spectrogram of utterance{} - segment {} is done!".format(h, s))

    # Estimate the remaining time from the average time per processed utterance
    end_time = time.time()
    used_time = end_time - start_time
    remaining_seconds = (used_time / h) * (total_samples - h)
    rest_time_h = int(remaining_seconds / 3600)
    rest_time_m = int((remaining_seconds % 3600) / 60)

    print("Sample {}/{} is done!\nEstimated completion will be {} hour {}mins".format(h, total_samples, rest_time_h, rest_time_m))
    h += 1

In [None]:
#%% Iterate all the speech samples in IEMOCAP dataset
# Utterance counter and start time that Extract_segment_level_features reads as globals
start_time = time.time()
h = 1

# NOTE(review): Extract_segment_level_features currently reads the EMODB columns
# ("Speaker", "Emotion", "File_path"); its IEMOCAP variant has to be re-enabled
# before this cell can process df_IEMOCAP rows.
for _, row1 in df_IEMOCAP.iterrows():
    Extract_segment_level_features(row1)

# Store the Audio_features file locally
data_path1 = "/Users/talen/Desktop/Audio_features_IEMOCAP.json"

with open(data_path1, "w") as fp:
    fp.write(json.dumps(Audio_features_IEMOCAP, indent=4))

In [None]:
#%% Iterate all the speech samples in EMODB dataset
# Utterance counter and start time that Extract_segment_level_features reads as globals
start_time = time.time()
h = 1

# One call per row of the EMODB description table, with a progress bar over the rows
for _, row2 in tqdm(df_EMODB.iterrows(), total=len(df_EMODB)):
    Extract_segment_level_features(row2)

# # Store the Audio_features file locally
# data_path2 = "/Users/talen/Desktop/Audio_features_EMODB.json"

# with open(data_path2, "w") as fp:
#     json.dump(Audio_features_EMODB, fp, indent=4)

print("All done!")