In [1]:
import os

# Process-wide environment settings, assigned before the remaining imports of this cell run.
# Both locale variables are pinned to the UTF-8 flavour of the "C" locale.
os.environ['LC_ALL'] ='C.UTF-8'
os.environ['LANG'] = 'C.UTF-8'
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid (synchronous kernel
# launches, slower GPU runs) — confirm it is still wanted for regular executions.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

import random
import numpy as np
import pandas as pd
import time

from pathlib import Path
from tqdm import tqdm

import torchaudio



In [2]:
from pathlib import Path
from tqdm import tqdm

def prepare_RAVDESS_DS(path_audios):
    """
    Generation of the dataframe with the information of the dataset. The dataframe has the following structure:
     ______________________________________________________________________________________________________________________________
    |             name            |                     path                                   |     emotion      |     actor     |
    ______________________________________________________________________________________________________________________________
    |  01-01-01-01-01-01-01       |    <RAVDESS_dir>/audios_16kHz/01-01-01-01-01-01-01.wav     |     Neutral      |     1         |
    ______________________________________________________________________________________________________________________________
    ...
    The file name (without extension) is split on "-": the third field is the 1-based emotion code and the
    last field is the actor id. Files whose name cannot be parsed that way are skipped with a warning.

    :param path_audios: Path to the folder that contains all the audios in .wav format, 16kHz and single-channel(mono)
    :return: DataFrame with the columns name, path, emotion and actor (always present, even when no file is
             found), with rows ordered by path.
    """
    import warnings

    dict_emotions_ravdess = {
        0: 'Neutral',
        1: 'Calm',
        2: 'Happy',
        3: 'Sad',
        4: 'Angry',
        5: 'Fear',
        6: 'Disgust',
        7: 'Surprise'
    }
    columns = ["name", "path", "emotion", "actor"]
    data = []
    # Sorting makes the row order (and every CSV derived from it) independent of the
    # order in which the file system happens to list the directory.
    for path in tqdm(sorted(Path(path_audios).glob("*.wav"))):
        # Path.name is separator-agnostic; splitting str(path) on '/' leaves the
        # directory attached to the name when the OS uses back-slashes.
        name = path.name.split('.')[0]
        fields = name.split("-")

        try:
            label = dict_emotions_ravdess[int(fields[2]) - 1]  # Start emotions in 0
            actor = int(fields[-1])
        except (IndexError, ValueError, KeyError) as e:
            # Not a RAVDESS-style name: report it instead of aborting the whole scan.
            warnings.warn(f"Skipping {path}: unexpected file name ({e!r})")
            continue

        data.append({
            "name": name,
            "path": path,
            "emotion": label,
            "actor": actor
        })
    # Explicit columns keep the schema stable when nothing was collected.
    return pd.DataFrame(data, columns=columns)



def generate_train_test(fold, df, save_path=""):
    """
    Split the recordings into train and test sets for one fold of a subject-wise 5-fold cross-validation:
    the actors assigned to the fold form the test set, every other actor goes to the train set.
    :param fold:[int] Fold whose split is requested [ranging from 0 - 4]
    :param df:[DataFrame] Full list of files, as produced by prepare_RAVDESS_DS(..); needs an 'actor' column
    :param save_path:[str] Folder where train.csv and test.csv are written (tab separated); "" disables saving
    :return: (train_df, test_df), both re-indexed from 0
    """
    actors_per_fold = {
        0: [2, 5, 14, 15, 16],
        1: [3, 6, 7, 13, 18],
        2: [10, 11, 12, 19, 20],
        3: [8, 17, 21, 23, 24],
        4: [1, 4, 9, 22],
    }

    # One boolean mask decides membership; its negation selects the training rows.
    held_out = df['actor'].isin(actors_per_fold[fold])
    test_df = df.loc[held_out].reset_index(drop=True)
    train_df = df.loc[~held_out].reset_index(drop=True)

    if save_path == "":
        return train_df, test_df

    csv_options = dict(sep="\t", encoding="utf-8", index=False)
    train_df.to_csv(f"{save_path}/train.csv", **csv_options)
    test_df.to_csv(f"{save_path}/test.csv", **csv_options)
    return train_df, test_df

In [3]:
from datasets import load_dataset, load_metric
import os

# Index every recording once, then write the train/test CSVs of the five folds.
audio_dir = 'audio_16k'
df = prepare_RAVDESS_DS(audio_dir)

# Fail early with a readable message instead of producing five pairs of empty CSVs
# (or an obscure KeyError) when the audio folder is missing or empty.
if df.empty:
    raise FileNotFoundError(f"No .wav recordings found in '{audio_dir}'")

for fold in range(5):
    save_path = os.path.join('via_wav2vec_fold', "fold"+str(fold))
    os.makedirs(save_path, exist_ok=True)

    # The CSVs are written and closed inside this call, so no pause is needed between folds.
    generate_train_test(fold, df, save_path)

2880it [00:00, 142764.57it/s]


In [24]:
import librosa

# Sampling rate (Hz) every recording is resampled to when it is loaded.
target_sampling_rate = 16000

def speech_file_to_array_fn(path, num_samples=35000):
    """
    Loader of audio recordings. It reads up to 3 seconds of the recording starting 0.5 seconds into the file,
    resampled to target_sampling_rate, and returns a fixed-length array with the samples of the audio.
    Longer excerpts are truncated and shorter ones are zero-padded at the end, so every recording yields an
    array of the same length and the results can be stacked into a 2-D matrix.
    :param path:[str] Path to the wav file.
    :param num_samples:[int] Length of the returned array, in samples.
    :return: 1-D numpy array with exactly num_samples values.
    """
    speech, _ = librosa.load(path, duration=3, offset=0.5, sr=target_sampling_rate)

    clip = speech[:num_samples]
    # Start from silence and copy the audio over it; whatever the clip does not
    # cover stays at zero (the padding).
    ret = np.zeros(num_samples, dtype=speech.dtype)
    ret[:len(clip)] = clip

    return ret

def label_to_id(label):
    """
    Map an emotion name to its integer class id.
    :param label: Emotion name, e.g. 'Neutral' or 'Angry'.
    :return: Position of the name in the ordered emotion list (0 - 7), or -1 when the name is not in it.
    """
    emotions = ('Neutral', 'Calm', 'Happy', 'Sad', 'Angry', 'Fear', 'Disgust', 'Surprise')

    try:
        return emotions.index(label)
    except ValueError:
        # Unknown emotion name.
        return -1

def preprocess_function(examples, input_column="path", output_column="emotion"):
    """
    Turn a set of samples into model inputs plus integer labels.
    Every recording is loaded with speech_file_to_array_fn, every emotion name is converted with label_to_id,
    and the loaded audio is passed through the module-level `processor` at the module-level
    `target_sampling_rate`.
    :param examples: Samples of the training or test set, indexable by column name.
    :param input_column:[str]  Column that contains the paths to the recordings
    :param output_column:[str]  Column that contains the emotion associated to each recording
    :return: The processor output with an extra "labels" entry holding the label ids.
    """
    # Audio first, labels second.
    waveforms = list(map(speech_file_to_array_fn, examples[input_column]))
    label_ids = list(map(label_to_id, examples[output_column]))

    features = processor(waveforms, sampling_rate=target_sampling_rate)
    features["labels"] = label_ids

    return features

In [25]:
from transformers import Wav2Vec2Processor

# Checkpoint whose processor is used by preprocess_function to prepare the raw audio.
model_id = 'jonatasgrosman/wav2vec2-large-xlsr-53-english'

# Loaded once at module level; preprocess_function reads it as a global.
processor = Wav2Vec2Processor.from_pretrained(model_id)

In [26]:
import numpy as np

# Folder that receives one "<fold>.npy" file per fold; created once, before the loop.
save_dir = 'via_wav2vec'
os.makedirs(save_dir, exist_ok=True)

# NOTE(review): never filled in the cells visible here — kept in case a later cell reads it.
dataset_1d = []
for fold in range(5):
    # Folder written by the fold-generation cell. It is only read here, so it is not
    # (re)created: a missing fold should surface as a load error, not as an empty folder.
    save_path = os.path.join('via_wav2vec_fold', "fold"+str(fold))

    data_files = {
        "train": os.path.join(save_path, "train.csv"),
        "validation": os.path.join(save_path, "test.csv"),
    }

    #Load data
    dataset = load_dataset("csv", data_files=data_files, delimiter="\t", )
    train_dataset = dataset["train"]
    eval_dataset = dataset["validation"]

    train = preprocess_function(train_dataset)
    test = preprocess_function(eval_dataset)

    X_train = np.array(train["input_values"])
    y_train = np.array(train['labels'])
    X_test = np.array(test["input_values"])
    y_test = np.array(test['labels'])

    # Quick sanity check of the split sizes and of the fixed sample length.
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

    numpy_name = os.path.join(save_dir, str(fold) + '.npy')

    # The four arrays are written back-to-back into a single file. To read them back,
    # open the file once and call np.load four times on that handle, in this same order.
    with open(numpy_name, 'wb') as f:
        np.save(f, X_train)
        np.save(f, y_train)
        np.save(f, X_test)
        np.save(f, y_test)

Using custom data configuration default-f638cf0f2351e4c8
Found cached dataset csv (C:/Users/devLupin/.cache/huggingface/datasets/csv/default-f638cf0f2351e4c8/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


  0%|          | 0/2 [00:00<?, ?it/s]

(2280, 35000) (2280,) (600, 35000) (600,)


Using custom data configuration default-a5c3999e06a65152
Found cached dataset csv (C:/Users/devLupin/.cache/huggingface/datasets/csv/default-a5c3999e06a65152/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


  0%|          | 0/2 [00:00<?, ?it/s]

(2280, 35000) (2280,) (600, 35000) (600,)


Using custom data configuration default-92ae2741f8d47caf
Found cached dataset csv (C:/Users/devLupin/.cache/huggingface/datasets/csv/default-92ae2741f8d47caf/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


  0%|          | 0/2 [00:00<?, ?it/s]

(2280, 35000) (2280,) (600, 35000) (600,)


Using custom data configuration default-920f55d6ce208578
Found cached dataset csv (C:/Users/devLupin/.cache/huggingface/datasets/csv/default-920f55d6ce208578/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


  0%|          | 0/2 [00:00<?, ?it/s]

(2280, 35000) (2280,) (600, 35000) (600,)


Using custom data configuration default-46673fa6ba765cb7
Found cached dataset csv (C:/Users/devLupin/.cache/huggingface/datasets/csv/default-46673fa6ba765cb7/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)


  0%|          | 0/2 [00:00<?, ?it/s]

(2400, 35000) (2400,) (480, 35000) (480,)
