# Introduction
Running this preprocessing notebook converts every audio file of the dataset into a black-and-white (grayscale) mel-spectrogram image.

We set up the properties of the output and import all libraries.

In [1]:
# Target clip length. convert_to_image multiplies it by the sample rate to get
# the number of samples each clip is padded to (i.e. seconds, assuming the
# sample rate is given in Hz).
AUDIO_LENGTH = 5
# Output shape handed to skimage's resize for every spectrogram image.
# IMG_SIZE[0] is also embedded in the name of the output folder.
IMG_SIZE = (224, 224)

In [2]:
import os

import pandas as pd
import numpy as np
import librosa

import skimage.io
from skimage.transform import resize
from skimage.util import img_as_ubyte

# Data Preprocessing
Next, we define the emotions that will be included in the dataset and create the output folders.

In [3]:
# Root folder for the generated spectrogram images; IMG_SIZE[0] is embedded in
# the folder name.
OUTPUT_FOLDER = f'../../data/prepared/mel-spectrogram/pad/prepared_images_{IMG_SIZE[0]}'

# Emotions kept in the dataset. The position in this list is the class id.
EMOTIONS = ['happy', 'surprise', 'anger', 'sad', 'neutral', 'disgust', 'fear']
# Derived from EMOTIONS so the two tables cannot drift apart. Evaluates to
# {'happy': 0, 'surprise': 1, 'anger': 2, 'sad': 3, 'neutral': 4, 'disgust': 5, 'fear': 6}.
EMOTIONS_MAP = {name: class_id for class_id, name in enumerate(EMOTIONS)}

# Create the root folder and one sub-folder per emotion. exist_ok=True makes
# the cell safe to re-run and removes the check-then-create race of testing
# os.path.exists first.
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

for emotion in EMOTIONS:
    os.makedirs(f'{OUTPUT_FOLDER}/{emotion}', exist_ok=True)

Read the reference dataset with all the metadata of the audio files.

In [4]:
# Load the reference table holding the metadata of the audio files.
df_ = pd.read_csv('../../data/reference_df.csv')
# Preview the first rows to check that the table was read as expected.
df_.head()

Unnamed: 0,source,dataname,speaker,emotion,intensity,duration,samplerate,gender,statement,filename
0,https://zenodo.org/record/4066235#.Yz_WRNJBwUF,vivae,vivae_S10,achievement,strong,1,44100,,,S10_achievement_strong_01.wav
1,https://zenodo.org/record/4066235#.Yz_WRNJBwUF,vivae,vivae_S02,pain,strong,1,44100,,,S02_pain_strong_05.wav
2,https://zenodo.org/record/4066235#.Yz_WRNJBwUF,vivae,vivae_S08,surprise,low,1,44100,,,S08_surprise_low_02.wav
3,https://zenodo.org/record/4066235#.Yz_WRNJBwUF,vivae,vivae_S05,anger,peak,1,44100,,,S05_anger_peak_02.wav
4,https://zenodo.org/record/4066235#.Yz_WRNJBwUF,vivae,vivae_S08,anger,moderate,1,44100,,,S08_anger_moderate_07.wav


Check which emotions are present in the dataset and keep only the needed ones.

In [5]:
# List the distinct emotion labels found in the reference table.
print(np.unique(df_.emotion))

['achievement' 'anger' 'disgust' 'fear' 'happy' 'neutral' 'pain'
 'pleasure' 'sad' 'surprise']


In [6]:
# Keep only the rows whose emotion label is one of the selected EMOTIONS.
is_selected_emotion = df_['emotion'].isin(EMOTIONS)
df = df_.loc[is_selected_emotion]

Define preprocessing functions.

In [7]:
def scale_minmax(X, min=0.0, max=1.0):
    """Linearly rescale the values of ``X`` to the range ``[min, max]``.

    Parameters
    ----------
    X : numpy.ndarray
        Array to rescale. It is not modified.
    min, max : float
        Lower and upper bound of the output range. The names shadow the
        builtins inside this function; they are kept so that existing
        keyword callers continue to work.

    Returns
    -------
    numpy.ndarray
        Array with the shape of ``X`` in which the smallest input value is
        mapped to ``min`` and the largest to ``max``. If all values of ``X``
        are equal, every element of the result is ``min``.
    """
    lowest = X.min()
    value_range = X.max() - lowest
    if value_range == 0:
        # Constant input: dividing by the zero range would fill the result
        # with NaN, so place every value at the bottom of the output range.
        X_std = np.zeros_like(X, dtype=float)
    else:
        X_std = (X - lowest) / value_range
    X_scaled = X_std * (max - min) + min
    return X_scaled

def spectrogram_image(data, path, sr=22050):
    """Save the log-mel spectrogram of an audio signal as a grayscale PNG.

    Parameters
    ----------
    data : numpy.ndarray
        Audio time series passed to ``librosa.feature.melspectrogram``.
    path : str
        Destination file of the image.
    sr : int, optional
        Sample rate of ``data`` used to build the mel filter bank. The default
        is librosa's own default, which is what this function used implicitly
        before the parameter existed. Pass the real rate of the signal when it
        differs.

    Notes
    -----
    The image is resized to the notebook-level ``IMG_SIZE``. Low frequencies
    end up at the bottom of the image and darker pixels mean more energy.
    """
    # Keyword arguments are required: recent librosa releases no longer accept
    # the signal as a positional argument.
    mels = librosa.feature.melspectrogram(y=data, sr=sr)
    mels = np.log(mels + 1e-9) # add small number to avoid log(0)

    # min-max scale to fit inside 8-bit range
    img = scale_minmax(mels, 0, 255).astype(np.uint8)
    img = np.flip(img, axis=0) # put low frequencies at the bottom in image
    img = 255 - img # invert. make black==more energy

    # resize() returns a float image in [0, 1], so the conversion back to
    # 8-bit has to happen AFTER resizing. Converting before (as the code used
    # to do) is a no-op on uint8 data and leaves a float image for imsave.
    img = resize(img, IMG_SIZE)
    img = img_as_ubyte(img)

    # save as PNG
    skimage.io.imsave(path, img)

def convert_to_image(audio_path, samplerate, save_path):
    """Turn one audio file into a fixed-length mel-spectrogram image.

    The signal is loaded, normalized, stripped of leading and trailing
    silence, brought to exactly ``samplerate * AUDIO_LENGTH`` samples and
    handed to ``spectrogram_image``.

    Parameters
    ----------
    audio_path : str
        Audio file to read.
    samplerate : int
        Sample rate used to compute the target number of samples.
    save_path : str
        Destination file of the spectrogram image.
    """
    # Loading the audio file; sr=None keeps the file's own sample rate
    x, _ = librosa.load(audio_path, sr=None)

    # Normalize the signal to unit L5 norm. Note that norm=5 selects the order
    # of the norm; it is not a level in decibels.
    normalized_sound = librosa.util.normalize(x, norm=5)

    # Trimming the silence in the beginning and end (more than 30 dB below
    # the peak counts as silence)
    xt, _ = librosa.effects.trim(normalized_sound, top_db=30)

    # Bring the clip to exactly the target length. Longer clips are cut first,
    # because np.pad raises on a negative pad width; shorter clips are
    # zero-padded at the end.
    target_length = int(samplerate * AUDIO_LENGTH)
    xt = xt[:target_length]
    padded_x = np.pad(xt, (0, target_length - len(xt)), 'constant')

    # Saving as image
    spectrogram_image(padded_x, save_path)

Preprocess all audio files and save them as mel spectrograms.

In [8]:
%%capture
# Convert every selected recording into a spectrogram image. The image is
# stored in the folder of its emotion and named after the row index.
for index, row in df.iterrows():

    dataset = row.dataname
    if dataset == 'ravdess':
        # RAVDESS keeps one folder per actor; the actor number is the part of
        # the speaker id after the first underscore.
        actor = str(row.speaker).split("_")[1]
        data_path = f'RAVDESS/Audio_Speech_Actors/Actor_{actor}'
    else:
        # Fixed folder per dataset; any other dataset name gives an empty path.
        data_path = {
            'tess': 'utoronto/data',
            'vivae': 'VIVAE/core_set',
        }.get(dataset, '')

    audio_path = f'../../data/{data_path}/{row.filename}'
    image_path = f'{OUTPUT_FOLDER}/{row.emotion}/{index}.png'
    convert_to_image(audio_path, row.samplerate, image_path)