In [16]:
# import libraries
import os
import librosa as li
import numpy as np
import matplotlib.pyplot as plt

In [14]:
# Root folder containing the raw recordings
base_dir = "/home/gaspar/code/eloisedupenhoat/speech_emotion_recognition/raw_data"

# Preview the first ten entries of raw_data, in alphabetical order
entries = os.listdir(base_dir)
entries.sort()
print(entries[:10])

['Actor_01', 'Actor_02', 'Actor_03', 'Actor_04', 'Actor_05', 'Actor_06', 'Actor_07', 'Actor_08', 'Actor_09', 'Actor_10']


In [36]:
# Collect the path of every .wav file found under base_dir (recursively).
all_paths = []
for dirpath, dirnames, files in os.walk(base_dir):
    # Sorting dirnames in place makes os.walk descend into the
    # sub-directories in alphabetical order, so the resulting list is
    # deterministic from one run to the next.
    dirnames.sort()
    for file in sorted(files):
        if file.endswith('.wav'):
            # os.path.join uses the platform separator and avoids a doubled
            # slash when dirpath already ends with one.
            all_paths.append(os.path.join(dirpath, file))

# The first ten paths all come from the first directory yielded by the walk
# and are already in sorted order, so no extra sort is needed for the preview.
print(all_paths[:10])

['/home/gaspar/code/eloisedupenhoat/speech_emotion_recognition/raw_data/Actor_01/03-01-01-01-01-01-01.wav', '/home/gaspar/code/eloisedupenhoat/speech_emotion_recognition/raw_data/Actor_01/03-01-01-01-01-02-01.wav', '/home/gaspar/code/eloisedupenhoat/speech_emotion_recognition/raw_data/Actor_01/03-01-01-01-02-01-01.wav', '/home/gaspar/code/eloisedupenhoat/speech_emotion_recognition/raw_data/Actor_01/03-01-01-01-02-02-01.wav', '/home/gaspar/code/eloisedupenhoat/speech_emotion_recognition/raw_data/Actor_01/03-01-02-01-01-01-01.wav', '/home/gaspar/code/eloisedupenhoat/speech_emotion_recognition/raw_data/Actor_01/03-01-02-01-01-02-01.wav', '/home/gaspar/code/eloisedupenhoat/speech_emotion_recognition/raw_data/Actor_01/03-01-02-01-02-01-01.wav', '/home/gaspar/code/eloisedupenhoat/speech_emotion_recognition/raw_data/Actor_01/03-01-02-01-02-02-01.wav', '/home/gaspar/code/eloisedupenhoat/speech_emotion_recognition/raw_data/Actor_01/03-01-02-02-01-01-01.wav', '/home/gaspar/code/eloisedupenhoat/s

In [41]:
# Folder that receives the spectrogram images. It is the same for every file,
# so it is defined (and created if missing) once, before the loop.
image_dir = "/home/gaspar/code/eloisedupenhoat/speech_emotion_recognition/image_data"
os.makedirs(image_dir, exist_ok=True)

# Only the first ten recordings are converted here.
for path in all_paths[:10]:

    # Load the audio with librosa:
    #   y  is a NumPy array with the waveform data
    #   sr is the sample rate (resampled to 16 kHz)
    y, sr = li.load(path, sr=16000)

    # An empty or all-zero recording has no peak to normalise against:
    # dividing by zero would fill the waveform with NaN and break the
    # following steps, so such files are skipped.
    peak = np.abs(y).max() if y.size else 0.0
    if peak == 0:
        print(f"Skipping empty or silent file: {path}")
        continue

    # Scale the waveform so that its maximum absolute value is 1
    y = y / peak

    # Remove leading and trailing silence.
    # Silence is defined by a decibel threshold below the reference level
    # (top_db); the call relies on librosa's default value for it.
    y, _ = li.effects.trim(y)

    # Compute the mel spectrogram: a 2D array
    # (rows = frequency ; columns = time ; values = intensity)
    S = li.feature.melspectrogram(y=y, sr=sr)

    # Convert the spectrogram from raw energy (power) to the decibel (log)
    # scale, relative to its own maximum
    S_dB = li.power_to_db(S, ref=np.max)

    # Build the output file name: same base name as the audio file, with the
    # extension swapped for .jpg. splitext only touches the real extension,
    # unlike str.replace which would rewrite any ".wav" inside the name.
    stem = os.path.splitext(os.path.basename(path))[0]
    image_path = os.path.join(image_dir, stem + ".jpg")

    # Render the spectrogram on its own figure, without axes or padding,
    # save it, then close that figure so memory does not grow with the loop.
    fig, ax = plt.subplots()
    ax.axis('off')
    ax.imshow(S_dB, aspect='auto', cmap='magma', origin='lower')
    fig.savefig(image_path, bbox_inches='tight', pad_inches=0)
    plt.close(fig)