In [1]:
import os

# Environment variables are set before the remaining imports so that any library
# reading them at import time sees these values.
# NOTE(review): presumably the locale overrides avoid encoding problems in child
# tools — confirm they are still needed.
os.environ['LC_ALL'] ='C.UTF-8'
os.environ['LANG'] = 'C.UTF-8'
# NOTE(review): CUDA_LAUNCH_BLOCKING is normally a debugging aid; nothing in the
# visible cells uses CUDA — confirm whether it is still required.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

import random
import numpy as np
import pandas as pd
import time

from pathlib import Path
from tqdm.auto import tqdm

import torchaudio



In [2]:
def prepare_RAVDESS_DS(path_audios):
    """
    Build a DataFrame indexing every ``.wav`` file found one directory level below ``path_audios``.

    The resulting frame has one row per recording:
     _______________________________________________________________________________________________________
    |          name          |                  path                       |    emotion     |    actor     |
     _______________________________________________________________________________________________________
    |  01-01-01-01-01-01-01  |  <path_audios>/<subdir>/01-01-01-01-01-01-01.wav  |  Neutral  |      1       |
     _______________________________________________________________________________________________________

    ``name`` is the file name without its extension. The emotion is decoded from the third
    dash-separated field of the name (1-based code mapped to a label) and the actor id from
    the last field.

    :param path_audios: Folder whose immediate sub-folders contain the .wav recordings.
    :return: pandas DataFrame with columns ``name``, ``path`` (pathlib.Path), ``emotion`` and ``actor``.
        The frame is empty (no columns) when no file matches.
    :raises ValueError, IndexError, KeyError: if a file name does not follow the dash-separated layout
        or carries an emotion code outside 1-8.
    """
    dict_emotions_ravdess = {
        0: 'Neutral',
        1: 'Calm',
        2: 'Happy',
        3: 'Sad',
        4: 'Angry',
        5: 'Fear',
        6: 'Disgust',
        7: 'Surprise'
    }
    data = []
    for path in tqdm(Path(path_audios).glob("*/*.wav")):
        # Path.name is separator-agnostic; splitting str(path) on '/' left the whole
        # path in `name` on Windows, where the separator is a backslash.
        name = path.name.split('.')[0]
        fields = name.split("-")
        label = dict_emotions_ravdess[int(fields[2]) - 1]  # Start emotions in 0
        actor = int(fields[-1])

        data.append({
            "name": name,
            "path": path,
            "emotion": label,
            "actor": actor
        })
    return pd.DataFrame(data)



def generate_train_test(fold, df, save_path=""):
    """
    Split the recordings into train and test sets for one fold of a subject-wise cross-validation.

    Every actor belongs to exactly one fold; the actors of the requested fold form the test set
    and all remaining actors form the train set.

    :param fold:[int] Index of the fold to hold out as test set [0 - 4]
    :param df:[DataFrame] File index with an 'actor' column, as produced by prepare_RAVDESS_DS(..)
    :param save_path:[str] Existing folder where train.csv and test.csv (tab separated) are written;
        nothing is written when it is the empty string
    :return: tuple (train_df, test_df), both with a fresh 0..n-1 index
    """
    held_out_actors = {
        0: [2,5,14,15,16],
        1: [3, 6, 7, 13, 18],
        2: [10, 11, 12, 19, 20],
        3: [8, 17, 21, 23, 24],
        4: [1, 4, 9, 22],
    }[fold]

    # Boolean mask marking the rows that belong to the held-out actors.
    is_test_row = df['actor'].isin(held_out_actors)
    test_df = df.loc[is_test_row].reset_index(drop=True)
    train_df = df.loc[~is_test_row].reset_index(drop=True)

    if save_path != "":
        csv_options = dict(sep="\t", encoding="utf-8", index=False)
        train_df.to_csv(f"{save_path}/train.csv", **csv_options)
        test_df.to_csv(f"{save_path}/test.csv", **csv_options)

    return train_df, test_df

In [3]:
from datasets import load_dataset, load_metric
import os

# Index every recording once, then write train.csv / test.csv for each of the 5 folds.
df = prepare_RAVDESS_DS('dataset')

# Fail early with a readable message: an empty index has no 'actor' column, which
# would otherwise surface as an opaque KeyError inside generate_train_test.
if df.empty:
    raise FileNotFoundError("No .wav files found under 'dataset/*/' - check the dataset location")

for fold in tqdm(range(5), desc='make csv files.....'):
    save_path = os.path.join('audio_48k', "fold"+str(fold))
    os.makedirs(save_path, exist_ok=True)

    # The CSV files are fully written when this call returns, so the former
    # 10-second sleep per fold only slowed the cell down and has been removed.
    generate_train_test(fold, df, save_path)

0it [00:00, ?it/s]

make csv files.....:   0%|          | 0/5 [00:00<?, ?it/s]

In [4]:
import librosa


def speech_file_to_array_fn(path, sample_rate=48000, duration=3, offset=0.5):
    """
    Load an audio recording as a fixed-length array of samples.

    The recording is read with librosa at ``sample_rate``, starting ``offset`` seconds into the
    file and reading at most ``duration`` seconds. The result is copied into a zero-filled
    buffer of ``int(sample_rate * duration)`` samples, so shorter recordings are zero-padded
    at the end and every returned array has the same length.

    :param path:[str] Path to the wav file.
    :param sample_rate:[int] Sampling rate requested from the loader.
    :param duration:[float] Number of seconds to keep (must be a number, not None).
    :param offset:[float] Number of seconds skipped at the start of the file.
    :return: 1-D numpy array of length ``int(sample_rate * duration)``.
    """
    waveform, _ = librosa.load(path, duration=duration, offset=offset, sr=sample_rate)

    target_len = int(sample_rate * duration)
    waveform_homo = np.zeros(target_len)

    # Clip defensively: if the loader ever returns a sample more than requested
    # (e.g. rounding after resampling) the copy must not overflow the buffer.
    n_valid = min(len(waveform), target_len)
    waveform_homo[:n_valid] = waveform[:n_valid]

    return waveform_homo

def label_to_id(label):
    """
    Map an emotion label to its integer class id.

    :param label:[str] One of 'Neutral', 'Calm', 'Happy', 'Sad', 'Angry', 'Fear', 'Disgust', 'Surprise'.
    :return: Position of the label in the list above (0-7), or -1 when the label is unknown.
    """
    label_list = ['Neutral', 'Calm', 'Happy', 'Sad', 'Angry', 'Fear', 'Disgust', 'Surprise']

    # The list is a non-empty constant, so the former emptiness check and its
    # fall-through `return label` were dead code and have been removed.
    return label_list.index(label) if label in label_list else -1

def preprocess_function(examples, input_column = "path", output_column = "emotion", sample_rate = 48000):
    """
    Load the recordings together with their integer labels.

    :param examples: Table-like object that returns a column when indexed by its name
        (the notebook passes the train/validation splits loaded from the fold CSVs).
    :param input_column:[str]  Column that contains the paths to the recordings
    :param output_column:[str]  Column that contains the emotion label of each recording
    :param sample_rate:[int]  Sampling rate forwarded to speech_file_to_array_fn
    :return: dict with 'input_values' (list of waveform arrays) and 'labels' (list of int ids,
        -1 for labels label_to_id does not know)
    """
    speech_list = [
        speech_file_to_array_fn(path, sample_rate=sample_rate)
        for path in examples[input_column]
    ]
    target_list = [label_to_id(label) for label in examples[output_column]]

    result = {
        'input_values': speech_list,
        'labels': target_list
    }

    return result

In [5]:
def feature_melspectrogram(
    waveform,
    sample_rate,
    fft = 1024,
    winlen = 512,
    window='hamming',
    hop=256,
    mels=128,
    ):
    """
    Compute a mel spectrogram of ``waveform`` and convert it to decibels.

    :param waveform: Audio samples passed to librosa as ``y``.
    :param sample_rate: Sampling rate of ``waveform``; half of it is used as the top mel frequency.
    :param fft: FFT size (librosa ``n_fft``).
    :param winlen: Analysis window length (librosa ``win_length``).
    :param window: Window function name.
    :param hop: Hop between consecutive frames (librosa ``hop_length``).
    :param mels: Number of mel bands.
    :return: Output of librosa.power_to_db applied to the mel spectrogram, referenced to its maximum.
    """
    # Upper edge of the mel filter bank: half the sampling rate.
    top_frequency = sample_rate/2

    power_spectrogram = librosa.feature.melspectrogram(
        y=waveform,
        sr=sample_rate,
        n_fft=fft,
        win_length=winlen,
        window=window,
        hop_length=hop,
        n_mels=mels,
        fmax=top_frequency)

    # Express power relative to the loudest bin, in dB.
    return librosa.power_to_db(power_spectrogram, ref=np.max)

def feature_mfcc(
    waveform,
    sample_rate,
    n_mfcc = 40,
    fft = 1024,
    winlen = 512,
    window='hamming',
    mels=128,
    hop=512
    ):
    """
    Compute the MFCCs of ``waveform`` for every STFT frame.

    :param waveform: Audio samples passed to librosa as ``y``.
    :param sample_rate: Sampling rate of ``waveform``; half of it is used as the top mel frequency.
    :param n_mfcc: Number of cepstral coefficients returned per frame.
    :param fft: FFT size (librosa ``n_fft``).
    :param winlen: Analysis window length (librosa ``win_length``).
    :param window: Window function name.
    :param mels: Number of mel bands the coefficients are derived from.
    :param hop: Hop between consecutive frames (librosa ``hop_length``). The default of 512 is
        the value librosa used implicitly while this argument was not forwarded.
    :return: Array returned by librosa.feature.mfcc (coefficients x frames).
    """
    # `mels` mel bands are reduced to `n_mfcc` coefficients per frame.
    mfc_coefficients=librosa.feature.mfcc(
        y=waveform,
        sr=sample_rate,
        n_mfcc=n_mfcc,
        n_fft=fft,
        win_length=winlen,
        window=window,
        hop_length=hop,
        n_mels=mels,
        fmax=sample_rate/2
        )

    return mfc_coefficients

def get_features(waveforms, sample_rate=48000):
    """
    Compute the MFCC features of every waveform.

    :param waveforms: Iterable of audio sample arrays.
    :param sample_rate: Sampling rate handed to feature_mfcc for each waveform.
    :return: list with one feature_mfcc result per waveform, in input order.
    """
    return [feature_mfcc(samples, sample_rate) for samples in waveforms]

In [6]:
def awgn_waveforms(waveform, multiples=2, bits=16, snr_min=15, snr_max=30):
    """
    Create ``multiples`` noisy copies of ``waveform`` by adding scaled Gaussian noise.

    One integer SNR (in dB) is drawn per call from ``np.random.randint(snr_min, snr_max)``,
    i.e. ``snr_max`` itself is never drawn, and is shared by all copies. Each noise row is
    scaled so that the ratio of signal power to added-noise power equals that SNR.

    :param waveform: 1-D numpy array of samples.
    :param multiples: Number of noisy copies to generate.
    :param bits: Bit depth used to normalise signal and noise before measuring their power.
        Both are divided by the same constant, so it cancels in the power ratio.
    :param snr_min: Lowest SNR in dB that can be drawn (inclusive).
    :param snr_max: Upper bound of the SNR in dB (exclusive).
    :return: Array of shape (multiples, len(waveform)) with the noisy copies.
    """
    n_samples = len(waveform)

    # Draw order matters for reproducibility under a fixed seed: noise first, SNR second.
    noise = np.random.normal(size=(multiples, n_samples))

    # Bring signal and noise to the same scale before measuring power.
    full_scale = 2.0**(bits-1)
    scaled_wave = waveform / full_scale
    scaled_noise = noise / full_scale

    # Mean power of the signal (scalar) and of each noise row (one value per copy).
    signal_power = np.sum(scaled_wave ** 2) / n_samples
    noise_power = np.sum(scaled_noise ** 2, axis=1) / n_samples

    # Target SNR in dB, shared by every copy produced in this call.
    snr = np.random.randint(snr_min, snr_max)

    # Per-copy gain that brings each noise row to the power implied by the target SNR:
    # desired noise power = signal power * 10**(-snr/10).
    gain = np.sqrt((signal_power / noise_power) * 10 ** (- snr / 10))

    # Broadcast the gain over the sample axis and add the noise to the clean signal.
    return waveform + gain[:, np.newaxis] * noise

def augment_awgn_waveforms(waveforms, features, emotions, multiples, sample_rate):
    """
    Extend a feature list and its labels with MFCCs of noisy copies of every waveform.

    For each waveform, ``multiples`` noisy versions are produced with awgn_waveforms, their
    MFCCs are appended to ``features`` and the label of the originating waveform is appended
    to the label list. The i-th waveform is assumed to own the i-th label.

    :param waveforms: Iterable of clean waveforms, in the same order as ``emotions``.
    :param features: list of features of the clean waveforms; it is extended in place.
    :param emotions: numpy array of labels (must provide ``.tolist()``).
    :param multiples: Number of noisy copies per waveform.
    :param sample_rate: Sampling rate handed to feature_mfcc.
    :return: tuple (features, labels) where ``features`` is the same list object that was
        passed in and ``labels`` is a new Python list.
    """
    # Work on a Python list: appending to it is cheap, unlike growing a numpy array.
    labels = emotions.tolist()

    for position, clean_waveform in enumerate(waveforms):
        for noisy_waveform in awgn_waveforms(clean_waveform, multiples=multiples):
            # Features of the noisy copy go after the native data ...
            features.append(feature_mfcc(noisy_waveform, sample_rate=sample_rate))
            # ... and inherit the label of the waveform they were derived from.
            labels.append(labels[position])

    return features, labels

In [7]:
from sklearn.preprocessing import StandardScaler

def _store_scaled(target, scaled):
    """Write ``scaled`` back into ``target`` when that is lossless, else return ``scaled``."""
    if np.issubdtype(target.dtype, np.floating):
        target[...] = scaled
        return target
    # Writing floats into a non-float array would truncate them; hand back the new array instead.
    return scaled


def feature_scaling(X_train, X_test):
    """
    Standardise the features with statistics learned on the training set only.

    Each sample is flattened to a vector, a StandardScaler is fitted on the flattened training
    samples and applied to both sets, and the results are reshaped to the original shapes.

    Previously the scaled arrays were only bound to local names and never returned, so callers
    kept their unscaled data. The scaled values are now written back into the arrays that were
    passed in (when they hold floats) and are also returned, so both
    ``feature_scaling(a, b)`` and ``a, b = feature_scaling(a, b)`` leave the caller with scaled data.

    :param X_train: numpy array of training features, first axis = samples.
    :param X_test: numpy array of test features with the same per-sample shape as ``X_train``.
    :return: tuple (X_train_scaled, X_test_scaled) with the shapes of the inputs.
    """
    scaler = StandardScaler()

    # Fit on train only, so no statistics leak from the test set.
    train_flat = np.reshape(X_train, (X_train.shape[0], -1))
    train_scaled = np.reshape(scaler.fit_transform(train_flat), X_train.shape)

    test_flat = np.reshape(X_test, (X_test.shape[0], -1))
    test_scaled = np.reshape(scaler.transform(test_flat), X_test.shape)

    return _store_scaled(X_train, train_scaled), _store_scaled(X_test, test_scaled)

In [8]:
import numpy as np

# Folder that receives one .npy file per fold.
save_dir = 'numpy_48k_1d'

# NOTE(review): dataset_1d is never filled in this cell — confirm whether a later cell uses it.
dataset_1d = []
for fold in tqdm(range(5), desc=f'fold data preprocessing.....'):
    # Folder holding the train.csv / test.csv written for this fold.
    save_path = os.path.join('audio_48k', "fold"+str(fold))
    
    data_files = {
        "train": os.path.join(save_path, "train.csv"),
        "validation": os.path.join(save_path, "test.csv"),
    }
    
    # Load the tab-separated fold CSVs; test.csv is exposed as the "validation" split.
    dataset = load_dataset("csv", data_files=data_files, delimiter="\t", )
    train_dataset = dataset["train"]
    eval_dataset = dataset["validation"]
    
    # Read every recording as a fixed-length waveform and map its label to an id.
    train = preprocess_function(train_dataset)
    test = preprocess_function(eval_dataset)
    
    X_train = np.array(train["input_values"])
    y_train = np.array(train['labels'])
    X_test = np.array(test["input_values"])
    y_test = np.array(test['labels'])
    
    print(f'train, test waveforms shape')
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    
    # MFCC features of the clean waveforms.
    features_train = get_features(X_train, 48000)
    features_test = get_features(X_test, 48000)
    
    # Append the MFCCs of 2 noisy copies per waveform, tripling each set.
    # NOTE(review): the test split is augmented with noise as well — confirm this is intended.
    features_train, y_train = augment_awgn_waveforms(X_train, features_train, y_train, 2, 48000)
    features_test, y_test = augment_awgn_waveforms(X_test, features_test, y_test, 2, 48000)
    # From here on X_* hold MFCC features instead of raw waveforms.
    X_train = np.array(features_train)
    y_train = np.array(y_train)
    X_test = np.array(features_test)
    y_test = np.array(y_test)
    
    print(f'train, test awgn shape')
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    
    # NOTE(review): the result of feature_scaling is not captured here, so X_train / X_test
    # are only scaled if feature_scaling modifies its arguments in place — verify.
    feature_scaling(X_train, X_test)
    print(f'train, test feature scaling shape')
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, '\n\n')
    
    numpy_name = os.path.join(save_dir, str(fold) + '.npy')
    os.makedirs(save_dir, exist_ok=True)
    
    # The four arrays are written back to back into one file; a reader has to call
    # np.load on the same open file handle four times, in this order.
    with open(numpy_name, 'wb') as f:
        np.save(f, X_train)
        np.save(f, y_train)
        np.save(f, X_test)
        np.save(f, y_test)

fold data preprocessing.....:   0%|          | 0/5 [00:00<?, ?it/s]

Using custom data configuration default-b8bcc55f28dc144d


Downloading and preparing dataset csv/default to C:/Users/devLupin/.cache/huggingface/datasets/csv/default-b8bcc55f28dc144d/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to C:/Users/devLupin/.cache/huggingface/datasets/csv/default-b8bcc55f28dc144d/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


  0%|          | 0/2 [00:00<?, ?it/s]

train, test waveforms shape
(1140, 144000) (1140,) (300, 144000) (300,)
train, test awgn shape
(3420, 40, 282) (3420,) (900, 40, 282) (900,)
train, test feature scaling shape
(3420, 40, 282) (3420,) (900, 40, 282) (900,) 




Using custom data configuration default-6481803daa7c2721


Downloading and preparing dataset csv/default to C:/Users/devLupin/.cache/huggingface/datasets/csv/default-6481803daa7c2721/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to C:/Users/devLupin/.cache/huggingface/datasets/csv/default-6481803daa7c2721/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


  0%|          | 0/2 [00:00<?, ?it/s]

train, test waveforms shape
(1140, 144000) (1140,) (300, 144000) (300,)
train, test awgn shape
(3420, 40, 282) (3420,) (900, 40, 282) (900,)
train, test feature scaling shape
(3420, 40, 282) (3420,) (900, 40, 282) (900,) 




Using custom data configuration default-551db4a7f964cc4f


Downloading and preparing dataset csv/default to C:/Users/devLupin/.cache/huggingface/datasets/csv/default-551db4a7f964cc4f/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to C:/Users/devLupin/.cache/huggingface/datasets/csv/default-551db4a7f964cc4f/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


  0%|          | 0/2 [00:00<?, ?it/s]

train, test waveforms shape
(1140, 144000) (1140,) (300, 144000) (300,)
train, test awgn shape
(3420, 40, 282) (3420,) (900, 40, 282) (900,)
train, test feature scaling shape
(3420, 40, 282) (3420,) (900, 40, 282) (900,) 




Using custom data configuration default-10664dd408e3abb0


Downloading and preparing dataset csv/default to C:/Users/devLupin/.cache/huggingface/datasets/csv/default-10664dd408e3abb0/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to C:/Users/devLupin/.cache/huggingface/datasets/csv/default-10664dd408e3abb0/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


  0%|          | 0/2 [00:00<?, ?it/s]

train, test waveforms shape
(1140, 144000) (1140,) (300, 144000) (300,)
train, test awgn shape
(3420, 40, 282) (3420,) (900, 40, 282) (900,)
train, test feature scaling shape
(3420, 40, 282) (3420,) (900, 40, 282) (900,) 




Using custom data configuration default-325a78e528c91f5d


Downloading and preparing dataset csv/default to C:/Users/devLupin/.cache/huggingface/datasets/csv/default-325a78e528c91f5d/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to C:/Users/devLupin/.cache/huggingface/datasets/csv/default-325a78e528c91f5d/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


  return pd.read_csv(xopen(filepath_or_buffer, "rb", use_auth_token=use_auth_token), **kwargs)


  0%|          | 0/2 [00:00<?, ?it/s]

train, test waveforms shape
(1200, 144000) (1200,) (240, 144000) (240,)
train, test awgn shape
(3600, 40, 282) (3600,) (720, 40, 282) (720,)
train, test feature scaling shape
(3600, 40, 282) (3600,) (720, 40, 282) (720,) 


