In [1]:
def prepare_RAVDESS_DS(path_audios):
    """
    Generation of the dataframe with the information of the dataset. The dataframe has one row per .wav file found
    (recursively) under ``path_audios`` and the following structure:
     ______________________________________________________________________________________________________________________________
    |             name            |                     path                                   |     emotion      |     actor     |
    ______________________________________________________________________________________________________________________________
    |    01-01-01-01-01-01-01     |    <RAVDESS_dir>/audios_16kHz/01-01-01-01-01-01-01.wav     |     Neutral      |     1         |
    ______________________________________________________________________________________________________________________________
    ...
    The file name (without extension) is split on "-": the third field is the 1-based emotion code and the last
    field is the actor id. Files whose name cannot be parsed that way are skipped with a warning instead of
    aborting the whole scan.

    :param path_audios: Path to the folder that contains all the audios in .wav format, 16kHz and single-channel(mono)
    :return:[DataFrame] Columns "name" (file name up to the first "."), "path" (pathlib.Path of the file),
             "emotion" (label from the table below) and "actor" (int). Rows are sorted by path; the four columns
             are present even when no file is found.
    """
    import warnings

    dict_emotions_ravdess = {
        0: 'Neutral',
        1: 'Calm',
        2: 'Happy',
        3: 'Sad',
        4: 'Angry',
        5: 'Fear',
        6: 'Disgust',
        7: 'Surprise'
    }
    data = []
    # glob() yields files in filesystem order; sorting keeps the dataframe identical across machines and runs.
    for path in tqdm(sorted(Path(path_audios).glob("**/*.wav"))):
        # Path.name strips the directories with the separator of the running OS
        # (splitting the string on '/' leaves the whole path untouched on Windows).
        name = path.name.split('.')[0]
        fields = name.split("-")
        try:
            label = dict_emotions_ravdess[int(fields[2]) - 1]  # Start emotions in 0
            actor = int(fields[-1])
        except (IndexError, ValueError, KeyError) as e:
            # Too few fields, non-numeric field or emotion code outside 1-8.
            warnings.warn(f"Skipping '{path}': unexpected file name ({e!r})")
            continue

        data.append({
            "name": name,
            "path": path,
            "emotion": label,
            "actor": actor
        })
    return pd.DataFrame(data, columns=["name", "path", "emotion", "actor"])



def generate_train_test(fold, df, save_path=""):
    """
    Divide the data in train and test in a subject-wise 5-CV way. The division is generated before running the training
    of each fold: the recordings of the actors assigned to ``fold`` form the test set, all the others the train set.
    :param fold:[int] Fold to create the train and test sets [ranging from 0 - 4]
    :param df:[DataFrame] Dataframe with the complete list of files generated by prepare_RAVDESS_DS(..) function
                          (only the 'actor' column is read here)
    :param save_path:[str] Folder where train.csv and test.csv (tab-separated, UTF-8, no index) are written.
                          The folder is created when missing. Nothing is written when it is "" or None.
    :return:[tuple] (train_df, test_df), both with a fresh 0..n-1 index
    :raises KeyError: if ``fold`` is not one of 0-4
    """
    import os

    actors_per_fold = {
        0: [2,5,14,15,16],
        1: [3, 6, 7, 13, 18],
        2: [10, 11, 12, 19, 20],
        3: [8, 17, 21, 23, 24],
        4: [1, 4, 9, 22],
    }

    # Compute the membership mask once and use it (and its negation) for both splits.
    is_test_actor = df['actor'].isin(actors_per_fold[fold])
    test_df = df.loc[is_test_actor].reset_index(drop=True)
    train_df = df.loc[~is_test_actor].reset_index(drop=True)

    if save_path:
        # to_csv does not create missing folders, so make sure the target exists first.
        os.makedirs(save_path, exist_ok=True)
        train_df.to_csv(os.path.join(save_path, "train.csv"), sep="\t", encoding="utf-8", index=False)
        test_df.to_csv(os.path.join(save_path, "test.csv"), sep="\t", encoding="utf-8", index=False)
    return train_df, test_df

In [2]:
def seed_libs(seed=2020):
    """
    Fix the seeds for the random generators of the non-torch libraries: Python's ``random`` module and
    NumPy's global generator. The seed is also exported as PYTHONHASHSEED.
    :param seed: Seed to pass to the random seed generators
    """
    random.seed(seed)
    np.random.seed(seed)
    # NOTE(review): Python documents PYTHONHASHSEED as an environment variable, so exporting it here
    # presumably only reaches child processes started later, not this interpreter - confirm.
    os.environ["PYTHONHASHSEED"] = str(seed)


def seed_torch(seed=2020):
    """
    Fix the seeds for the random generators of torch and other libraries
    :param seed: Seed to pass to the random seed generators (forwarded to seed_libs as well)
    """

    # Forward the caller's seed; a hard-coded 2020 here would silently ignore the argument.
    seed_libs(seed)
    # NOTE(review): presumably the cuBLAS workspace setting that PyTorch asks for when deterministic
    # algorithms are requested; nothing in this function enables them - confirm it is still needed.
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

In [None]:
# Notebook setup: environment variables, imports, run parameters and random seeds.
# The environment variables below are set before transformers/datasets are imported.
import os
os.environ['LC_ALL'] ='C.UTF-8'
os.environ['LANG'] = 'C.UTF-8'
# NOTE(review): CUDA_LAUNCH_BLOCKING is normally given as 0 or 1; confirm what "-1" is meant to do.
os.environ['CUDA_LAUNCH_BLOCKING'] = "-1"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import transformers
from transformers import AutoConfig
from transformers import EvalPrediction
from transformers import TrainingArguments

# NOTE(review): load_metric is not used anywhere in the visible cells.
from datasets import load_dataset, load_metric

# Project-local modules. NOTE(review): names used in this notebook without an explicit import
# (Wav2Vec2Processor, preprocess_function, compute_metrics, tqdm, Path, pd, np, torch, ...) presumably
# come from these star imports - verify, and prefer explicit imports.
from DataCollatorCTCWithPadding import *
from Wav2VecAuxClasses import *
from CTCTrainer import *
from my_functions import *
import time
from datetime import datetime
import random

#Read input parameters
audios_dir = 'audio_16k'
cache_dir = 'MMEmotionRecognition/data/Audio/cache_dir'
out_dir = 'FineTuningWav2Vec2_out'
model_id = 'jonatasgrosman/wav2vec2-large-xlsr-53-english'

#PARAMETERS #######################
out_dir_models = os.path.join(out_dir, "trained_models/wav2vec2-xlsr-ravdess-speech-emotion-recognition") #out path to save trained models
data_path = os.path.join(out_dir,"data") #Path to save csvs generated containing the recording information (path, name, emotion...)

# We need to specify the input and output column
input_column = "path" # Name of the column that will contain the path of the recordings
output_column = "emotion" # Name of the column that will contain the labels of the recordings
pooling_mode = "mean" #Type of pooling to apply to the embeddings generated at the output of the transformer module to collapse all the timesteps of the recordings into a single vector
now = datetime.now()
current_time = now.strftime("%Y%m%d_%H%M%S") # Timestamp used as the per-run sub-folder name in the output paths
seed = 2020
epochs = 10 #Epochs to train the model

#END OF PARAMETERS #######################
seed_torch(seed=seed) #Set random seeds

#######################
#PREPARE DATASET
#Generate complete dataframe with RAVDESS samples
# The list of recordings is the same for every fold, so the audio folder is scanned only once.
df = prepare_RAVDESS_DS(audios_dir)

for fold in range(5): # 5-CV strategy
    #Define paths, create aux. folders and callbacks to save data
    fold_name = "fold"+str(fold)
    out_dir_models_path = os.path.join(out_dir_models, current_time, fold_name)
    save_path = os.path.join(data_path, current_time, fold_name)
    # NOTE(review): transformers and datasets are already imported at this point; if they read these
    # variables at import time, setting them here does not redirect the caches - confirm.
    os.environ['TRANSFORMERS_CACHE'] = os.path.join(cache_dir, current_time, fold_name)
    os.environ['HF_DATASETS_CACHE'] = os.path.join(cache_dir, current_time, fold_name)
    os.makedirs(save_path, exist_ok=True)
    print("SAVING DATA IN: ", save_path)
    callbackTB = transformers.integrations.TensorBoardCallback()

    # Write train.csv / test.csv for this fold. The files are written synchronously, so they can be
    # loaded right away (no waiting needed).
    generate_train_test(fold, df, save_path)
    data_files = {
        "train": os.path.join(save_path, "train.csv"),
        "validation": os.path.join(save_path, "test.csv"),
    }

    #Load data
    dataset = load_dataset("csv", data_files=data_files, delimiter="\t", )
    train_dataset = dataset["train"]
    eval_dataset = dataset["validation"]
    print("Processing fold: ", str(fold), " - actors in Train fold: ",set(train_dataset["actor"]))
    print("Processing fold: ", str(fold), " - actors in Eval fold: ", set(eval_dataset["actor"]))
    label_list = train_dataset.unique(output_column)
    label_list.sort()  # Let's sort it for determinism
    num_labels = len(label_list)
    print(f"A classification problem with {num_labels} classes: {label_list}")

    # LOAD PRE-TRAINED MODEL ON ASR
    # config
    config = AutoConfig.from_pretrained(
        model_id, #path to the model of HuggingFace lib. that we will use as baseline to fine-tune.
        num_labels=num_labels, # num classes
        label2id={label: i for i, label in enumerate(label_list)}, # dict that maps emotions -> numbers
        id2label={i: label for i, label in enumerate(label_list)}, # dict that maps numbers -> emotions
        finetuning_task="wav2vec2_clf",
    )

    #Add in the config variable the 'pooling_mode'
    setattr(config, 'pooling_mode', pooling_mode)

    #Load the processor for the type of model (Wav2Vec2.0 in our case) and get the expected sampling rate (16kHZ in our case)
    processor = Wav2Vec2Processor.from_pretrained(model_id, )
    target_sampling_rate = processor.feature_extractor.sampling_rate
    print(f"The target sampling rate: {target_sampling_rate}")

    # Same pre-processing settings for both splits.
    # NOTE(review): preprocess_function is defined outside this cell; it presumably reads `processor` and
    # `target_sampling_rate` set above - verify.
    map_kwargs = dict(batch_size=100, batched=True, num_proc=4)
    print("Generating training...")
    train_dataset = train_dataset.map(preprocess_function, **map_kwargs)
    print("Generating test...")
    eval_dataset = eval_dataset.map(preprocess_function, **map_kwargs)

    #MODEL
    print("Training model...")
    #Set data collator to pad the small recordings
    data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
    is_regression = False

    #Create the architecture: Wav2Vec2.0 model + mean pooling + MLP (1024, 8)
    model = Wav2Vec2ForSpeechClassification.from_pretrained(
        model_id,
        config=config,
    )

    #Freeze feature encoder layers (CNNs) of wav2vec2.0 & train the transformer module and the MLP that we have added (randomly initialized)
    model.freeze_feature_extractor()

    #Set training arguments/parameters
    training_args = TrainingArguments(
        output_dir=out_dir_models_path,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        gradient_accumulation_steps=2,
        evaluation_strategy="steps",
        prediction_loss_only=False,
        num_train_epochs=epochs,
        fp16=True,
        save_steps=10,
        eval_steps=10,
        logging_steps=10,
        learning_rate=1e-4,
        save_total_limit=5,
        load_best_model_at_end=True,
        metric_for_best_model="eval_accuracy",
        seed=seed, )

    #Build the trainer with the model, the padding collator, the metrics and both splits
    trainer = CTCTrainer(
        model=model,
        data_collator=data_collator,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=processor.feature_extractor,
        callbacks = [callbackTB])

    #Start training the network using the train_dataset & evaluating it on the eval_dataset passed as parameters
    # to the CTCTrainer
    trainer.train()

SAVING DATA IN:  FineTuningWav2Vec2_out\data\20220925_201018\fold0


2880it [00:00, 93147.10it/s]
Using custom data configuration default-ba4989c74941ac51


Downloading and preparing dataset csv/default to C:\Users\devLupin\.cache\huggingface\datasets\csv\default-ba4989c74941ac51\0.0.0\6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e...


  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to C:\Users\devLupin\.cache\huggingface\datasets\csv\default-ba4989c74941ac51\0.0.0\6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

Processing fold:  0  - actors in Train fold:  {1, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 17, 18, 19, 20, 21, 22, 23, 24}
Processing fold:  0  - actors in Eval fold:  {2, 5, 14, 15, 16}
A classification problem with 8 classes: ['Angry', 'Calm', 'Disgust', 'Fear', 'Happy', 'Neutral', 'Sad', 'Surprise']
The target sampling rate: 16000
Generating training...
Generating test...
Training model...


Some weights of the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english were not used when initializing Wav2Vec2ForSpeechClassification: ['lm_head.weight', 'lm_head.bias']
- This IS expected if you are initializing Wav2Vec2ForSpeechClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSpeechClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSpeechClassification were not initialized from the model checkpoint at jonatasgrosman/wav2vec2-large-xlsr-53-english and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this mode

Step,Training Loss,Validation Loss,Accuracy
10,2.0912,2.168486,0.133333
20,2.1124,2.081263,0.141667
30,2.0984,2.111488,0.141667
40,2.1068,2.081519,0.133333
50,2.0263,2.10352,0.141667


The following columns in the evaluation set  don't have a corresponding argument in `Wav2Vec2ForSpeechClassification.forward` and have been ignored: actor, emotion, path, name.
***** Running Evaluation *****
  Num examples = 600
  Batch size = 4
Saving model checkpoint to FineTuningWav2Vec2_out\trained_models/wav2vec2-xlsr-ravdess-speech-emotion-recognition\20220925_201018\fold0\checkpoint-10
Configuration saved in FineTuningWav2Vec2_out\trained_models/wav2vec2-xlsr-ravdess-speech-emotion-recognition\20220925_201018\fold0\checkpoint-10\config.json
Model weights saved in FineTuningWav2Vec2_out\trained_models/wav2vec2-xlsr-ravdess-speech-emotion-recognition\20220925_201018\fold0\checkpoint-10\pytorch_model.bin
Configuration saved in FineTuningWav2Vec2_out\trained_models/wav2vec2-xlsr-ravdess-speech-emotion-recognition\20220925_201018\fold0\checkpoint-10\preprocessor_config.json
  return (input_length - kernel_size) // stride + 1
The following columns in the evaluation set  don't have a c