In [1]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.1


In [2]:
!pip install fsspec==2023.9.0 datasets==2.11.0

Collecting datasets==2.11.0
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting dill<0.3.7,>=0.3.0 (from datasets==2.11.0)
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
Collecting multiprocess (from datasets==2.11.0)
  Downloading multiprocess-0.70.14-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, datasets
  Attempting uninstall: dill
    Found existing installation: dill 0.3.7
    Uninstalling dill-0.3.7:
      Su

In [3]:
import torchaudio
import librosa
import soundfile
import os, glob
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
from datasets import load_dataset, concatenate_datasets
from dataclasses import dataclass
from typing import Optional, Tuple, Dict, List, Union
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2PreTrainedModel, Wav2Vec2Model
from transformers.file_utils import ModelOutput
from transformers import Wav2Vec2Processor, Wav2Vec2FeatureExtractor
from transformers import TrainingArguments, AutoConfig, EvalPrediction
import evaluate

In [4]:
from typing import Any, Dict, Union

import torch
from packaging import version
from torch import nn

from transformers import (
    Trainer,
    is_apex_available,
)

if is_apex_available():
    from apex import amp

if version.parse(torch.__version__) >= version.parse("1.6"):
    _is_native_amp_available = True
    from torch.cuda.amp import autocast

**Preparing data for wWav2Vec training**

If we make a split by sklearn.train_test_split(), then the records of one person can get into both the training and validation sets, which will lead us to data leak. So I decided to split the sample into 6 folds of 4 people in each one (approximately 80/20). I wanted to run training six times and then average parameters (as it's said in Yandex Handbook), but there was not enough time.

**P.S.** It can be seen that each actor has about the same set of emotions in terms of the number of recordings, so such a separation is OK.

In [5]:
#note that there are no broken paths or files
def wav2vec_load_data(num_actors_in_fold : int) -> list:
    #data consists of pairs: {path, emotion}
    #emotion is the third id in the basename
    data = [[]]
    for file in sorted(glob.glob("media_converted/Actor_*/*.wav")):
        path = os.path.abspath(file)
        name = os.path.basename(file).split('.')[0]
        actor_num = int(name.split('-')[-1])
        if actor_num / num_actors_in_fold > len(data):
            data.append([])
        emotion = name.split('-')[2]
        data[-1].append({
            "path": path, 
            "emotion": emotion})
    return data

In [6]:
data = wav2vec_load_data(num_actors_in_fold=4)

pd.DataFrame(data[0]).to_csv("folds/fold_1.csv", sep="\t", encoding="utf-8", index=False)
pd.DataFrame(data[1]).to_csv("folds/fold_2.csv", sep="\t", encoding="utf-8", index=False)
pd.DataFrame(data[2]).to_csv("folds/fold_3.csv", sep="\t", encoding="utf-8", index=False)
pd.DataFrame(data[3]).to_csv("folds/fold_4.csv", sep="\t", encoding="utf-8", index=False)
pd.DataFrame(data[4]).to_csv("folds/fold_5.csv", sep="\t", encoding="utf-8", index=False)
pd.DataFrame(data[5]).to_csv("folds/fold_6.csv", sep="\t", encoding="utf-8", index=False)

In [7]:
data_files = {
    "fold_1": "folds/fold_1.csv", 
    "fold_2": "folds/fold_2.csv",
    "fold_3": "folds/fold_3.csv",
    "fold_4": "folds/fold_4.csv",
    "fold_5": "folds/fold_5.csv",
    "fold_6": "folds/fold_6.csv"
}
dataset = load_dataset("csv", data_files=data_files, delimiter="\t")

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-1e8571c7d31fa2b1/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/6 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/6 [00:00<?, ?it/s]

Generating fold_1 split: 0 examples [00:00, ? examples/s]

Generating fold_2 split: 0 examples [00:00, ? examples/s]

Generating fold_3 split: 0 examples [00:00, ? examples/s]

Generating fold_4 split: 0 examples [00:00, ? examples/s]

Generating fold_5 split: 0 examples [00:00, ? examples/s]

Generating fold_6 split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-1e8571c7d31fa2b1/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/6 [00:00<?, ?it/s]

In [8]:
train_dataset = concatenate_datasets([
    dataset['fold_1'],
    dataset['fold_2'],
    dataset['fold_3'],
    dataset['fold_4'],
    dataset['fold_5']
])

eval_dataset = dataset['fold_6']

In [9]:
label_list = train_dataset.unique("emotion")
label_list.sort()
num_labels = len(label_list)

**Data preprocessing**

Wavs were downsampled in advance using a script that lies on the github

I decided to use following model, because other once didn't fit my RAM.

In [10]:
model_name_or_path = "facebook/wav2vec2-base-960h"

In [11]:
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=8,
    label2id={label: i for i, label in enumerate(label_list)},
    id2label={i: label for i, label in enumerate(label_list)},
    finetuning_task="wav2vec2_clf",
)
setattr(config, 'pooling_mode', 'mean')

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

In [12]:
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)

Downloading (…)rocessor_config.json:   0%|          | 0.00/159 [00:00<?, ?B/s]

In [13]:
def preprocess_function(data):
    speech_list = [librosa.load(path)[0] for path in data["path"]]
    target_list = [label-1 for label in data["emotion"]]

    result = feature_extractor(speech_list, sampling_rate=16000)
    result["labels"] = list(target_list)

    return result

In [14]:
train_dataset = train_dataset.map(
    preprocess_function,
    batch_size=8,
    batched=True,
)
eval_dataset = eval_dataset.map(
    preprocess_function,
    batch_size=8,
    batched=True,
)

Map:   0%|          | 0/1200 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

In [15]:
@dataclass
class SpeechClassifierOutput(ModelOutput):
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None

In [16]:
class Wav2Vec2ClassificationHead(nn.Module):
    """Head for wav2vec classification task."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x

In [17]:
class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.config = config

        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = Wav2Vec2ClassificationHead(config)

        self.init_weights()

    def freeze_feature_extractor(self):
        self.wav2vec2.feature_extractor._freeze_parameters()
    
    def forward(
            self,
            input_values,
            attention_mask=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
            labels=None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.wav2vec2(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = outputs[0]
        hidden_states = torch.mean(hidden_states, dim=1)
        logits = self.classifier(hidden_states)
        
        #dealing with single-label classification problem
        loss = None
        self.config.problem_type = "single_label_classification"
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SpeechClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

**Set up trainer**

In [18]:
@dataclass
class DataCollatorCTCWithPadding:
    feature_extractor: Wav2Vec2FeatureExtractor
    padding: Union[bool, str] = True
    max_length: Optional[int] = None
    max_length_labels: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    pad_to_multiple_of_labels: Optional[int] = None
        
    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        input_features = [{"input_values": feature["input_values"]} for feature in features]
        label_features = [feature["labels"] for feature in features]

        d_type = torch.long if isinstance(label_features[0], int) else torch.float

        batch = self.feature_extractor.pad(
            input_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        batch["labels"] = torch.tensor(label_features, dtype=d_type)

        return batch

I couldn't figure out which metric to use in this task (because there are no obvious metrics like WER for speech recognition), so I calculate the basic ones: accuracy, precision, recall and f1. It's worth noting that I use **micro-averaging** because the classes are evenly distributed.

In [19]:
def compute_metrics(p : EvalPrediction):
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")
    precision_metric = evaluate.load("precision")
    recall_metric = evaluate.load("recall")
    
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(preds, axis=1)
    
    results = {}
    results.update(accuracy_metric.compute(predictions=preds, references=p.label_ids))
    results.update(f1_metric.compute(predictions=preds, references=p.label_ids, average='micro'))
    results.update(precision_metric.compute(predictions=preds, references=p.label_ids, average='micro'))
    results.update(recall_metric.compute(predictions=preds, references=p.label_ids, average='micro'))
    return results

The easiest way to train a pre-trained model is to use the class Trainer, so that's it

In [20]:
class CTCTrainer(Trainer):
    def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]) -> torch.Tensor:
        model.train()
        inputs = self._prepare_inputs(inputs)

        if self.use_cuda_amp:
            with autocast():
                loss = self.compute_loss(model, inputs)
        else:
            loss = self.compute_loss(model, inputs)

        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps

        if self.use_cuda_amp:
            self.scaler.scale(loss).backward()
        elif self.use_apex:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        elif self.deepspeed:
            self.deepspeed.backward(loss)
        else:
            loss.backward()

        return loss.detach()

In [21]:
data_collator = DataCollatorCTCWithPadding(feature_extractor=feature_extractor, padding=True)

In [22]:
model = Wav2Vec2ForSpeechClassification.from_pretrained(
    model_name_or_path,
    config=config,
)
model.freeze_feature_extractor()

Downloading model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForSpeechClassification were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.weight', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


As it is usually done, I won't change the standard constants $\beta_1, \beta_2, \varepsilon$ and will iterate over the $\gamma$. Unfortunately, I did not have enough time to implement a search of hyperparameters, but it is known (https://twitter.com/karpathy/status/801621764144971776) that optimal $\gamma$ somewhere between $0.0001$ and $0.0005$, so I chose $\gamma = 0.0003$.

Unfortunately, I had to run training on CPU.

In [23]:
training_args = TrainingArguments(
    adam_beta1=0.9,
    adam_beta2=0.999,
    adam_epsilon=1e-08,
    dataloader_pin_memory=True,
    
    ddp_find_unused_parameters=True,
    gradient_checkpointing=False,
    
    debug=[],
    deepspeed=None,
    disable_tqdm=False,
    do_eval=True,
    do_predict=True,
    do_train=True,
    eval_accumulation_steps=None,
    eval_steps=100,
    evaluation_strategy="steps",
    fp16=False,
    fp16_full_eval=False,
    gradient_accumulation_steps=2,
    greater_is_better=True,
    group_by_length=False,
    ignore_data_skip=False,
    label_names=None,
    label_smoothing_factor=0.2,
    learning_rate=3e-4,
    load_best_model_at_end=False,
    
    #for clipping:
    max_grad_norm=1.0,
    
    metric_for_best_model=None,
    
    num_train_epochs=5.0,
    optim="adamw_torch",
    output_dir="model",
    overwrite_output_dir=True,
    past_index=-1,
    per_device_eval_batch_size=1,
    per_device_train_batch_size=1,
    remove_unused_columns=True,
    report_to=['tensorboard'],
    resume_from_checkpoint=None,
    save_on_each_node=False,
    save_steps=100,
    logging_steps=100,
    save_total_limit=2,
    seed=42,
    sharded_ddp=[],
    skip_memory_metrics=True,
    tf32=None,
    tpu_metrics_debug=False,
    tpu_num_cores=None,
    use_legacy_prediction_loop=False,
    warmup_ratio=0.0,
    warmup_steps=0,
    weight_decay=0.0,
)

In [24]:
trainer = CTCTrainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=feature_extractor,
)

In [25]:
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
100,2.147,2.105827,0.133333,0.133333,0.133333,0.133333
200,2.1246,2.111636,0.133333,0.133333,0.133333,0.133333
300,2.1439,2.074491,0.133333,0.133333,0.133333,0.133333
400,2.1054,2.075143,0.133333,0.133333,0.133333,0.133333
500,2.0963,2.103988,0.133333,0.133333,0.133333,0.133333
600,2.0897,2.0798,0.133333,0.133333,0.133333,0.133333
700,2.083,2.078034,0.133333,0.133333,0.133333,0.133333
800,2.0986,2.077122,0.133333,0.133333,0.133333,0.133333
900,2.0864,2.07039,0.133333,0.133333,0.133333,0.133333
1000,2.0856,2.074863,0.133333,0.133333,0.133333,0.133333


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

TrainOutput(global_step=3000, training_loss=2.0876193389892577, metrics={'train_runtime': 15401.6308, 'train_samples_per_second': 0.39, 'train_steps_per_second': 0.195, 'total_flos': 2.7743577889190323e+17, 'train_loss': 2.0876193389892577, 'epoch': 5.0})

**Prediction**

Let's add the ability to save the model for prediction and trainer for inference.

In [28]:
model.save_pretrained("/kaggle/working/")
trainer.save_model("/kaggle/working/")

Preparing test dataset

In [33]:
def wav2vec_load_test() -> list:
    data_test = []
    for file in sorted(glob.glob("vk-test/original/*.wav")):
        path = os.path.abspath(file)
        name = os.path.basename(file).split('.')[0]
        data_test.append({
            "path": path, 
            "name": name})
    return data_test

In [34]:
data_test = wav2vec_load_test()
pd.DataFrame(data_test).to_csv("test.csv", sep="\t", encoding="utf-8", index=False)

In [45]:
test_dataset = load_dataset("csv", data_files={"test": "/kaggle/working/test.csv"}, delimiter='\t')

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-616b0267e1d337c1/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-616b0267e1d337c1/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [36]:
model_name_or_path = "/kaggle/working/"

Loading the model from last checkpoint

In [40]:
device = torch.device("cpu")
config = AutoConfig.from_pretrained(model_name_or_path)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
sampling_rate = feature_extractor.sampling_rate
model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path).to(device)

In [46]:
def speech_file_to_array_fn(batch):
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    speech_array = speech_array.squeeze().numpy()
    speech_array = librosa.resample(np.asarray(speech_array), orig_sr=sampling_rate, target_sr=feature_extractor.sampling_rate)

    batch["speech"] = speech_array
    return batch

In [47]:
test_dataset = test_dataset.map(speech_file_to_array_fn)

Map:   0%|          | 0/1550 [00:00<?, ? examples/s]

In [48]:
def predict(batch):
    features = feature_extractor(batch["speech"], sampling_rate=feature_extractor.sampling_rate, return_tensors="pt", padding=True)

    input_values = features.input_values.to(device)

    with torch.no_grad():
        logits = model(input_values).logits 

    batch["predicted"] = logits
    return batch

In [49]:
result = test_dataset.map(predict, batched=True, batch_size=8)

Map:   0%|          | 0/1550 [00:00<?, ? examples/s]

In [62]:
result

DatasetDict({
    test: Dataset({
        features: ['path', 'name', 'speech', 'predicted'],
        num_rows: 1550
    })
})

I don't know why the first one is negative...

In [63]:
preds = result['test']['predicted']
names = result['test']['name']

In [75]:
for _ in range(1550):
    print(f"{names[_]}|{preds[_][0]}|{preds[_][1]}|{preds[_][2]}|"
          f"{preds[_][3]}|{preds[_][4]}|{preds[_][5]}|"
          f"{preds[_][6]}|{preds[_][7]}")

10_20_0|-0.454452782869339|0.044134195894002914|0.0208074152469635|0.06429166346788406|0.05212129279971123|0.06579016894102097|0.015442373231053352|0.07688643038272858
10_20_1|-0.454452782869339|0.04413419961929321|0.02080741710960865|0.06429165601730347|0.05212129279971123|0.06579016149044037|0.015442375093698502|0.07688643038272858
10_20_2|-0.4544528126716614|0.04413419961929321|0.0208074152469635|0.06429165601730347|0.05212128907442093|0.06579017639160156|0.015442373231053352|0.07688643038272858
10_20_3|-0.454452782869339|0.044134195894002914|0.0208074152469635|0.06429165601730347|0.05212128907442093|0.06579016149044037|0.015442376025021076|0.07688643038272858
10_20_4|-0.454452782869339|0.04413419961929321|0.0208074189722538|0.06429165601730347|0.05212128907442093|0.06579016149044037|0.015442374162375927|0.07688642293214798
10_21_0|-0.4544528126716614|0.044134195894002914|0.02080741710960865|0.06429165601730347|0.05212128907442093|0.06579016149044037|0.015442373231053352|0.076886430