https://huggingface.co/blog/fine-tune-wav2vec2-english

https://github.com/Demfier/multimodal-speech-emotion-recognition/blob/master/2_build_audio_vectors.ipynb

In [3]:
import logging
import pathlib
import re
import sys
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional, Set, Union

import datasets
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from packaging import version

import librosa
from lang_trans import arabic

import soundfile as sf
from transformers.trainer_utils import get_last_checkpoint
import os

In [4]:
from transformers import (
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    Wav2Vec2CTCTokenizer,
    Wav2Vec2FeatureExtractor,
    Wav2Vec2Processor,
    is_apex_available,
    trainer_utils,
)

In [108]:
import yaml

# Read configuration file with all the necessary parameters
with open('conf.yaml') as file:
  config = yaml.safe_load(file)

In [6]:
@dataclass
class Orthography:
    """
    Orthography scheme used for text normalization and tokenization.

    Args:
        do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`False`):
            Whether or not to accept lowercase input and lowercase the output when decoding.
        vocab_file (:obj:`str`, `optional`, defaults to :obj:`None`):
            File containing the vocabulary.
        word_delimiter_token (:obj:`str`, `optional`, defaults to :obj:`"|"`):
            The token used for delimiting words; it needs to be in the vocabulary.
        translation_table (:obj:`Dict[str, str]`, `optional`, defaults to :obj:`{}`):
            Table to use with `str.translate()` when preprocessing text (e.g., "-" -> " ").
        words_to_remove (:obj:`Set[str]`, `optional`, defaults to :obj:`set()`):
            Words to remove when preprocessing text (e.g., "sil").
        untransliterator (:obj:`Callable[[str], str]`, `optional`, defaults to :obj:`None`):
            Function that untransliterates text back into native writing system.
    """

    do_lower_case: bool = False
    vocab_file: Optional[str] = None
    word_delimiter_token: Optional[str] = "|"
    translation_table: Optional[Dict[str, str]] = field(default_factory=dict)
    words_to_remove: Optional[Set[str]] = field(default_factory=set)
    untransliterator: Optional[Callable[[str], str]] = None
    tokenizer: Optional[str] = None

    @classmethod
    def from_name(cls, name: str):
        if name == "librispeech":
            return cls()
        if name == "timit":
            return cls(
                do_lower_case=True,
                # break compounds like "quarter-century-old" and replace pauses "--"
                translation_table=str.maketrans({"-": " "}),
            )
        if name == "buckwalter":
            translation_table = {
                "-": " ",  # sometimes used to represent pauses
                "^": "v",  # fixing "tha" in arabic_speech_corpus dataset
            }
            return cls(
                vocab_file=pathlib.Path(__file__).parent.joinpath("vocab/buckwalter.json"),
                word_delimiter_token="/",  # "|" is Arabic letter alef with madda above
                translation_table=str.maketrans(translation_table),
                words_to_remove={"sil"},  # fixing "sil" in arabic_speech_corpus dataset
                untransliterator=arabic.buckwalter.untransliterate,
            )
        raise ValueError(f"Unsupported orthography: '{name}'.")

    def preprocess_for_training(self, text: str) -> str:
        if len(self.translation_table) > 0:
            text = text.translate(self.translation_table)
        
        if len(self.words_to_remove) == 0:
            try:
                text = " ".join(text.split())  # clean up whitespaces
            except Exception:
                text = "NULL"
        else:
            text = " ".join(w for w in text.split() if w not in self.words_to_remove)  # and clean up whilespaces
        return text

    def create_processor(self, model_args: dict) -> Wav2Vec2Processor:
        feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
                                model_args["name"], 
                                cache_dir=model_args['cache_dir']
                            )
        if self.vocab_file:
            tokenizer = Wav2Vec2CTCTokenizer(
                self.vocab_file,
                cache_dir=model_args['cache_dir'],
                do_lower_case=self.do_lower_case,
                word_delimiter_token=self.word_delimiter_token,
            )
        else:
            tokenizer = Wav2Vec2CTCTokenizer.from_pretrained(
                self.tokenizer,
                cache_dir=model_args['cache_dir'],
                do_lower_case=self.do_lower_case,
                word_delimiter_token=self.word_delimiter_token,
            )
        return Wav2Vec2Processor(feature_extractor, tokenizer)


In [7]:
orthography = Orthography.from_name(config['dataset']['orthography'])
orthography.tokenizer = config['model']['tokenizer']

processor = orthography.create_processor(config['model'])


  "Passing `gradient_checkpointing` to a config initialization is deprecated and will be removed in v5 "


In [16]:
processor

Wav2Vec2Processor:
- feature_extractor: Wav2Vec2FeatureExtractor {
  "do_normalize": true,
  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
  "feature_size": 1,
  "padding_side": "right",
  "padding_value": 0.0,
  "return_attention_mask": false,
  "sampling_rate": 16000
}

- tokenizer: PreTrainedTokenizer(name_or_path='facebook/wav2vec2-base', vocab_size=32, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'})

In [8]:
INPUT_IEMOCAP_FOLDER = config['dataset']['folder_path']
SPLIT_ID = config['dataset']['split_id']

In [103]:
import pandas as pd

df_train = pd.read_csv(f'{INPUT_IEMOCAP_FOLDER}/iemocap_{SPLIT_ID}.train.csv')
df_val = pd.read_csv(f'{INPUT_IEMOCAP_FOLDER}/iemocap_{SPLIT_ID}.test.csv')

# Change file location to previous folder '../'
df_train['file'] = df_train['file'].map(lambda x: f"../{x}")
df_val['file'] = df_val['file'].map(lambda x: f"../{x}")

In [104]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(df_train)
val_dataset = Dataset.from_pandas(df_val)
cls_label_map = {"e0":0, "e1":1, "e2":2, "e3":3}

In [96]:
from model import Wav2Vec2ForCTCnCLS

model = Wav2Vec2ForCTCnCLS.from_pretrained(
    config['model']['name'],
    cache_dir=config['model']['cache_dir'],
    gradient_checkpointing=config['training']['gradient_checkpointing'],
    vocab_size=len(processor.tokenizer),
    cls_len=len(cls_label_map),
    alpha=config['model']['alpha'],
)

loading configuration file https://huggingface.co/facebook/wav2vec2-base/resolve/main/config.json from cache at cache\c7746642f045322fd01afa31271dd490e677ea11999e68660a92619ec7c892b4.ce1f96bfaf3d7475cb8187b9668c7f19437ade45fb9ceb78d2b06a2cec198015
Model config Wav2Vec2Config {
  "activation_dropout": 0.0,
  "adapter_kernel_size": 3,
  "adapter_stride": 2,
  "add_adapter": false,
  "apply_spec_augment": true,
  "architectures": [
    "Wav2Vec2ForPreTraining"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 1,
  "classifier_proj_size": 256,
  "codevector_dim": 256,
  "contrastive_logits_temperature": 0.1,
  "conv_bias": false,
  "conv_dim": [
    512,
    512,
    512,
    512,
    512,
    512,
    512
  ],
  "conv_kernel": [
    10,
    3,
    3,
    3,
    3,
    2,
    2
  ],
  "conv_stride": [
    5,
    2,
    2,
    2,
    2,
    2,
    2
  ],
  "ctc_loss_reduction": "sum",
  "ctc_zero_infinity": false,
  "diversity_loss_weight": 0.1,
  "do_stable_layer_norm": false,
  "eos_toke

In [97]:
wer_metric = datasets.load_metric("wer")

target_sr = processor.feature_extractor.sampling_rate #if config['training']['target_feature_extractor_sampling_rate'] else None

vocabulary_chars_str = "".join(t for t in processor.tokenizer.get_vocab().keys() if len(t) == 1)
vocabulary_text_cleaner = re.compile(  # remove characters not in vocabulary
    f"[^\s{re.escape(vocabulary_chars_str)}]",  # allow space in addition to chars in vocabulary
    flags=re.IGNORECASE if processor.tokenizer.do_lower_case else 0,
)

In [98]:
train_dataset

Dataset({
    features: ['file', 'emotion', 'text'],
    num_rows: 5024
})

In [91]:
text_updates = []

def prepare_sample(sample, audio_only=False):
    sample["speech"], sample["sampling_rate"] = librosa.load(sample[config['dataset']['speech_file_column']], 
                                                                    sr=target_sr)
    
    if audio_only is False:
        # Normalize and clean up text; order matters!
        updated_text = orthography.preprocess_for_training(sample[config['dataset']['target_text_column']])
        updated_text = vocabulary_text_cleaner.sub("", updated_text)
        if updated_text != sample[config['dataset']['target_text_column']]:
            text_updates.append((sample[config['dataset']['target_text_column']], updated_text))
            sample[config['dataset']['target_text_column']] = updated_text
    return sample

In [105]:
if config['training']['do_train']:
    train_dataset = train_dataset.map(prepare_sample, remove_columns=[config['dataset']['speech_file_column']])

if config['training']['do_eval']:
    val_dataset = val_dataset.map(prepare_sample, remove_columns=[config['dataset']['speech_file_column']])

100%|██████████| 5024/5024 [01:32<00:00, 54.58ex/s]
100%|██████████| 507/507 [00:12<00:00, 41.24ex/s]


In [82]:
val_dataset._data

InMemoryTable
emotion: string
text: string
speech: list<item: float>
  child 0, item: float
sampling_rate: int64
----
emotion: [["e0","e0","e0","e0","e2","e0","e2","e2","e3","e3"]]
text: [["EXCUSE ME ","YEAH ","IS THERE A PROBLEM ","WELL WHAT'S THE PROBLEM LET ME CHANGE IT ","THAT'S OUT OF CONTROL ","CLEARLY  YOU KNOW DO YOU HAVE LIKE A SUPERVISOR OR SOMETHING ","I DON'T UNDERSTAND WHY THIS IS SO COMPLICATED FOR PEOPLE WHEN THEY GET HERE  IT'S JUST A SIMPLE FORM I JUST NEED AN ID ","YEAH DO YOU WANT  TO SEE MY SUPERVISOR HUH YEAH DO YOU WANT TO SEE MY SUPERVISOR FINE I'LL BE RIGHT BACK ","DID YOU GET THE MAIL SO YOU SAW MY LETTER ","YEAH I KNOW "]]
speech: [[[-0.0050354004,-0.0049743652,-0.0038146973,-0.0032653809,-0.0025939941,...,-0.0033569336,-0.0030212402,-0.0026550293,-0.0031738281,-0.004180908],[0.0009460449,-0.0009460449,-0.0007019043,-0.0005493164,-0.0021972656,...,-0.0010986328,-0.0011901855,-0.00045776367,-0.00033569336,-0.0012817383],...,[0.0007324219,0.00048828125,0.0006103

In [32]:
with processor.as_target_processor():
    a = processor(train_dataset[0]['text'])
a

{'input_ids': [9, 5, 28, 6, 4], 'attention_mask': [1, 1, 1, 1, 1]}

In [106]:

def prepare_dataset(batch, processor, cls_label_map: dict, 
                    config: dict, audio_only=False):

    # Check that all files have the correct sampling rate
    assert (
        len(set(batch["sampling_rate"])) == 1
    ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."
    
    batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values
    if audio_only is False:
        # Map labels to their respective integers
        cls_labels = list(map(lambda e: cls_label_map[e], batch["emotion"]))
        
        with processor.as_target_processor():
            batch["labels"] = processor(batch[config['dataset']['target_text_column']]).input_ids
        for i in range(len(cls_labels)):
            batch["labels"][i].append(cls_labels[i]) # batch["labels"] element has to be a single list
    return batch

In [109]:
if config['training']['do_train']:
    prepared_train_dataset = train_dataset.map(
                        prepare_dataset,
                        fn_kwargs={'processor': processor, 'cls_label_map': cls_label_map,
                                   'config': config},
                        batch_size=config['training']['per_device_train_batch_size'],
                        batched=True,
                        num_proc=config['training']['preprocessing_num_workers'],
                    )

if config['training']['do_eval']:
    prepared_val_dataset = val_dataset.map(
                        prepare_dataset,
                        fn_kwargs={'processor': processor, 'cls_label_map': cls_label_map,
                                   'config': config},
                        batch_size=config['training']['per_device_train_batch_size'],
                        batched=True,
                        num_proc=config['training']['preprocessing_num_workers'],
                    )

  tensor = as_tensor(value)
 36%|███▋      | 914/2512 [01:21<02:22, 11.20ba/s]


ArrowMemoryError: realloc of size 2147483648 failed

In [84]:
prepared_train_dataset

Dataset({
    features: ['emotion', 'text', 'speech', 'sampling_rate', 'input_values', 'labels'],
    num_rows: 10
})

In [59]:
@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that will dynamically pad the inputs received.
    Args:
        processor (:class:`~transformers.Wav2Vec2Processor`)
            The processor used for processing the data.
        padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:
            * :obj:`True` or :obj:`'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
              sequence if provided).
            * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the
              maximum acceptable input length for the model if that argument is not provided.
            * :obj:`False` or :obj:`'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of
              different lengths).
        max_length (:obj:`int`, `optional`):
            Maximum length of the ``input_values`` of the returned list and optionally padding length (see above).
        max_length_labels (:obj:`int`, `optional`):
            Maximum length of the ``labels`` returned list and optionally padding length (see above).
        pad_to_multiple_of (:obj:`int`, `optional`):
            If set will pad the sequence to a multiple of the provided value.
            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
    """

    processor: Wav2Vec2Processor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None
    audio_only = False

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # split inputs and labels since they have to be of different lenghts and need
        # different padding methods
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        
        if self.audio_only is False: # Training
            label_features = [{"input_ids": feature["labels"][:-1]} for feature in features]
            cls_labels = [feature["labels"][-1] for feature in features]

        batch = self.processor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )
        if self.audio_only is False:
            with self.processor.as_target_processor():
                labels_batch = self.processor.pad(
                    label_features,
                    padding=self.padding,
                    max_length=self.max_length_labels,
                    pad_to_multiple_of=self.pad_to_multiple_of_labels,
                    return_tensors="pt",
                )

            # Replace padding with -100 to ignore loss correctly
            ctc_labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
            batch["labels"] = (ctc_labels, torch.tensor(cls_labels)) # labels = (ctc_labels, cls_labels)

        return batch

In [60]:
data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)

In [62]:
def compute_metrics(pred):
    cls_pred_logits = pred.predictions[1]
    cls_pred_ids = np.argmax(cls_pred_logits, axis=-1)
    total = len(pred.label_ids[1])
    correct = (cls_pred_ids == pred.label_ids[1]).sum().item() # label = (ctc_label, cls_label)

    ctc_pred_logits = pred.predictions[0]
    ctc_pred_ids = np.argmax(ctc_pred_logits, axis=-1)
    pred.label_ids[0][pred.label_ids[0] == -100] = processor.tokenizer.pad_token_id
    ctc_pred_str = processor.batch_decode(ctc_pred_ids)
    # we do not want to group tokens when computing the metrics
    ctc_label_str = processor.batch_decode(pred.label_ids[0], group_tokens=False)
    
    wer = wer_metric.compute(predictions=ctc_pred_str, references=ctc_label_str)
    return {"acc": correct/total, "wer": wer, "correct": correct, "total": total, "strlen": len(ctc_label_str)}


In [63]:
class CTCTrainer(Trainer):
    def _prepare_inputs(self, inputs: Dict[str, Union[torch.Tensor, Any]]) -> Dict[str, Union[torch.Tensor, Any]]:
        for k, v in inputs.items():
            if isinstance(v, torch.Tensor):
                kwargs = dict(device=self.args.device)
                # if self.deepspeed and inputs[k].dtype != torch.int64:
                #     kwargs.update(dict(dtype=self.args.hf_deepspeed_config.dtype()))
                inputs[k] = v.to(**kwargs)

            if k == 'labels': # labels are list of tensor, not tensor, special handle here
                for i in range(len(inputs[k])):
                    kwargs = dict(device=self.args.device)
                    # if self.deepspeed and inputs[k][i].dtype != torch.int64:
                    #     kwargs.update(dict(dtype=self.args.hf_deepspeed_config.dtype()))
                    inputs[k][i] = inputs[k][i].to(**kwargs)

        if self.args.past_index >= 0 and self._past is not None:
            inputs["mems"] = self._past

        return inputs

    def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:
        """
        Perform a training step on a batch of inputs.

        Subclass and override to inject custom behavior.

        Args:
            model (:obj:`nn.Module`):
                The model to train.
            inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model.

                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
                argument :obj:`labels`. Check your model's documentation for all accepted arguments.

        Return:
            :obj:`torch.Tensor`: The tensor with training loss on this batch.
        """

        model.train()
        inputs = self._prepare_inputs(inputs)

        if self.use_amp:
            with autocast():
                loss = self.compute_loss(model, inputs)
        else:
            loss = self.compute_loss(model, inputs)

        if self.args.n_gpu > 1:
            loss = loss.mean()

        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps

        if self.use_amp:
            self.scaler.scale(loss).backward()
        elif self.use_apex:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        elif self.deepspeed:
            self.deepspeed.backward(loss)
        else:
            loss.backward()

        return loss.detach()

In [None]:
trainer = CTCTrainer(
        model=model,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        train_dataset=prepared_train_dataset,
        eval_dataset=prepared_val_dataset,
        tokenizer=processor.feature_extractor,
    )

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `Wav2Vec2ForCTCnCLS.forward` and have been ignored: sampling_rate, text, emotion, speech. If sampling_rate, text, emotion, speech are not expected by `Wav2Vec2ForCTCnCLS.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 6
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
100%|██████████| 6/6 [00:08<00:00,  1.22s/it]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 6/6 [00:08<00:00,  1.39s/it]

{'train_runtime': 8.3723, 'train_samples_per_second': 3.583, 'train_steps_per_second': 0.717, 'train_loss': 562.1226399739584, 'epoch': 3.0}





TrainOutput(global_step=6, training_loss=562.1226399739584, metrics={'train_runtime': 8.3723, 'train_samples_per_second': 3.583, 'train_steps_per_second': 0.717, 'train_loss': 562.1226399739584, 'epoch': 3.0})