# Introduction
Running this preprocessing notebook will result in creation of black and white images of the audio files

We set up the properties of the output and import all libraries.

In [1]:
SR = 44100
IMG_SIZE = (224, 224)
SEGMENT_DURATION = 5
SEGMENT_STEP = 2

SEGMENT_LENGTH = SEGMENT_DURATION * SR
SEGMENT_STEP_LENGTH = SEGMENT_STEP * SR

SAVE_SPLITS = True
NOICE_REDUCTION = True
CREATE_COMBINED_FILES = False

TAG = f'nr{NOICE_REDUCTION}_step{SEGMENT_STEP}s_len{SEGMENT_DURATION}s'

In [2]:
import os
from tqdm import tqdm

import pandas as pd
import numpy as np
import librosa
from pydub import AudioSegment
import noisereduce as nr

import skimage.io
from skimage.transform import resize
from skimage.util import img_as_ubyte

from scipy.io.wavfile import write

# Data Preprocessing
Next, we define emotions which we'll include in the dataset and create the output folders.

In [3]:
OUTPUT_FOLDER = f'../../data/prepared/mel-spectrogram/combined/prepared_images_{IMG_SIZE[0]}_{TAG}'
OUTPUT_FOLDER_SPLIT_AUDIO = f'../../data/prepared/audio/split_{TAG}'
OUTPUT_FOLDER_COMBINED_AUDIO = f'../../data/prepared/combined_audio'

EMOTIONS = ['happy', 'surprise', 'anger', 'sad', 'neutral', 'disgust', 'fear']
EMOTIONS_MAP = {
    'happy': 0,
    'surprise': 1,
    'anger': 2,
    'sad': 3,
    'neutral': 4,
    'disgust': 5,
    'fear': 6
}

if not os.path.exists(OUTPUT_FOLDER):
    os.makedirs(OUTPUT_FOLDER)

if not os.path.exists(OUTPUT_FOLDER_SPLIT_AUDIO):
    os.makedirs(OUTPUT_FOLDER_SPLIT_AUDIO)

if not os.path.exists(OUTPUT_FOLDER_COMBINED_AUDIO):
    os.makedirs(OUTPUT_FOLDER_COMBINED_AUDIO)

for emotion in EMOTIONS:
    if not os.path.exists(f'{OUTPUT_FOLDER}/{emotion}'):
        os.mkdir(f'{OUTPUT_FOLDER}/{emotion}')
    if not os.path.exists(f'{OUTPUT_FOLDER_SPLIT_AUDIO}/{emotion}'):
        os.mkdir(f'{OUTPUT_FOLDER_SPLIT_AUDIO}/{emotion}')

Read the reference dataset with all the metadata of the audio files.

In [4]:
df_ = pd.read_csv('../../data/reference_df.csv')
df_.head()

Unnamed: 0,source,dataname,speaker,emotion,intensity,duration,samplerate,gender,statement,filename
0,https://zenodo.org/record/4066235#.Yz_WRNJBwUF,vivae,vivae_S10,achievement,strong,1,44100,,,S10_achievement_strong_01.wav
1,https://zenodo.org/record/4066235#.Yz_WRNJBwUF,vivae,vivae_S02,pain,strong,1,44100,,,S02_pain_strong_05.wav
2,https://zenodo.org/record/4066235#.Yz_WRNJBwUF,vivae,vivae_S08,surprise,low,1,44100,,,S08_surprise_low_02.wav
3,https://zenodo.org/record/4066235#.Yz_WRNJBwUF,vivae,vivae_S05,anger,peak,1,44100,,,S05_anger_peak_02.wav
4,https://zenodo.org/record/4066235#.Yz_WRNJBwUF,vivae,vivae_S08,anger,moderate,1,44100,,,S08_anger_moderate_07.wav


Check which emotions are present in the dataset and keep only needed once.

In [5]:
print(np.unique(df_.emotion))

['achievement' 'anger' 'disgust' 'fear' 'happy' 'neutral' 'pain'
 'pleasure' 'sad' 'surprise']


In [6]:
df = df_[df_.emotion.isin(EMOTIONS)]

Define preprocessing functions.

In [7]:
def construct_audio_path(row):

    data_path = ''
    if row.dataname == 'tess':
        data_path = 'utoronto/data'
    elif row.dataname == 'vivae':
        data_path = 'VIVAE/core_set'
    elif row.dataname == 'ravdess':
        data_path = f'RAVDESS/Audio_Speech_Actors/Actor_{str(row.speaker).split("_")[1]}'

    audio_path = f'../../data/{data_path}/{row.filename}'
    return audio_path

In [8]:
def preprocess_audio(audio_path):

    TMP_FILE = 'tmp.wav'
    x, sr = librosa.load(audio_path, sr = SR)
    normalized_sound=librosa.util.normalize(x, norm=5)
    xt, index = librosa.effects.trim(normalized_sound, top_db = 30)

    # It seems like noice reduction makes sound more neutral
    # final_x = nr.reduce_noise(y=xt,
    #                           y_noise=xt,sr=sr)

    write(TMP_FILE, SR, xt)
    audio = AudioSegment.from_file(TMP_FILE, format="wav")
    os.remove(TMP_FILE)

    return audio

In [9]:
# preprocess_audio('../../data/RAVDESS/Audio_Speech_Actors/Actor_18/03-01-06-01-02-01-18.wav')

In [10]:
def combine_audio(data, emotion):
    data = data.sample(frac=1)
    data.index = np.arange(1,len(data)+1)
    combined = None

    for index, row in data.iterrows():

        # Person in audio samples is emotionless
        if row.filename.startswith("OAF"):
            continue

        audio_path = construct_audio_path(row)
        try:
            _ = AudioSegment.from_file(audio_path, format="wav")
            sound = preprocess_audio(audio_path)
        except:
            print(audio_path)
            continue

        if combined is None:
            combined = sound
            continue
        else:
            combined = combined + sound

    file_handle = combined.export(f"{OUTPUT_FOLDER_COMBINED_AUDIO}/{emotion}.wav", format="wav")

In [11]:
def cut_audio(audio_path):
    x, sr = librosa.load(audio_path.path, sr = SR)
    # x = librosa.util.normalize(x, norm=5)
    splits = []

    i = 0
    while (SEGMENT_STEP_LENGTH * i + SEGMENT_LENGTH) < len(x):

        # Get window of data
        w = x[SEGMENT_STEP_LENGTH * i : SEGMENT_STEP_LENGTH * i + SEGMENT_LENGTH]

        if NOICE_REDUCTION:
            w = librosa.util.normalize(w, norm=5)
            w = nr.reduce_noise(y=w, y_noise=w, sr=SR)

        splits.append(w)
        i+=1
        if SAVE_SPLITS:
            write(f'{OUTPUT_FOLDER_SPLIT_AUDIO}/{audio_path.name.split(".")[0]}/{i+1}.wav', SR, w)

    return splits

In [12]:
def scale_minmax(X, min=0.0, max=1.0):
    X_std = (X - X.min()) / (X.max() - X.min())
    X_scaled = X_std * (max - min) + min
    return X_scaled

def to_spectrogram_image(data, path):
    mels = librosa.feature.melspectrogram(data)
    mels = np.log(mels + 1e-9) # add small number to avoid log(0)

    # min-max scale to fit inside 8-bit range
    img = scale_minmax(mels, 0, 255).astype(np.uint8)
    img = np.flip(img, axis=0) # put low frequencies at the bottom in image
    img = 255-img # invert. make black==more energy
    img = img_as_ubyte(img)
    img = resize(img, IMG_SIZE)

    # save as PNG
    skimage.io.imsave(path, img)

# def convert_to_images(audio_path, save_path):
#
#
#     # Loading the audio file
#     # x, sr = librosa.load(audio_path, sr = None)
#     #
#     # # Normalizing +5.0db, transform audio signals to an array
#     # normalized_sound=librosa.util.normalize(x, norm=5)
#     #
#     # # Trimming the silence in the beginning and end
#     # xt, index = librosa.effects.trim(normalized_sound, top_db = 30)
#     # padded_x = np.pad(xt, (0, sr * AUDIO_LENGTH - len(xt)), 'constant')
#     # final_x = nr.reduce_noise(y=padded_x,
#     #                           y_noise=padded_x,sr=sr)
#
#
#
#     # Saving as image
#     to_spectrogram_image(final_x, save_path)

Preprocess all audio files and save them as mel spectrograms.

In [13]:
if CREATE_COMBINED_FILES:
    for e in tqdm(EMOTIONS):
        combine_audio(df[df.emotion == e], e)

In [14]:
for file in os.scandir(OUTPUT_FOLDER_COMBINED_AUDIO):
    splits = cut_audio(file)
    for index, split in enumerate(splits):
        to_spectrogram_image(split, f'{OUTPUT_FOLDER}/{file.name.split(".")[0]}/{index}.png')
    # print(sr)
    # print(file.name.split('.')[0])
    # audio_path = construct_audio_path(row)
    # convert_to_images(file.path, file.name.split('.')[0])

 -2.8858788e-04  1.5152883e-03] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mels = librosa.feature.melspectrogram(data)
  0.00058066] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mels = librosa.feature.melspectrogram(data)
 -0.01637362] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mels = librosa.feature.melspectrogram(data)
  mels = librosa.feature.melspectrogram(data)
  0.00223904] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mels = librosa.feature.melspectrogram(data)
  2.5156417e-06 -2.5293944e-06] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mels = librosa.feature.melspectrogram(data)
  mels = librosa.feature.melspectrogram(data)
  mels = librosa.feature.melspectrogram(data)
 -1.3780198e-03 -9.8010376e-03] as keywor