In [None]:
# Force a UTF-8 locale for this kernel and anything it launches.
%env LC_ALL=C.UTF-8
%env LANG=C.UTF-8
# Point the Hugging Face model/dataset caches at a local "cache" directory
# (a relative path, so it is resolved against the notebook's working directory).
%env TRANSFORMERS_CACHE=cache
%env HF_DATASETS_CACHE=cache
# Debugging aid: CUDA_LAUNCH_BLOCKING=1 presumably makes kernel launches synchronous so
# errors surface at the failing call. NOTE(review): likely slows training — confirm it is still wanted.
%env CUDA_LAUNCH_BLOCKING=1

In [None]:
import os
import pandas as pd
import numpy as np
import random
import torchaudio
from datasets import load_dataset

In [None]:
# Column of the CSV that holds the path of each audio file (the model input).
input_column = "path"

# One target column per emotion. preprocess_function walks this list in order, so the
# order here is the column order of the label matrix it builds.
key_list = [
    "despair",
    "sadness",
    "pain",
    "guilt",
    "confuse",
    "helplessness",
    "resentment",
    "fear",
    "numbness",
    "anxiety",
    "grievance",
]

# Individual names, kept for code that refers to a single column.
(
    output_despair_column,
    output_sadness_column,
    output_pain_column,
    output_guilt_column,
    output_confuse_column,
    output_helplessness_column,
    output_resentment_column,
    output_fear_column,
    output_numbness_column,
    output_anxiety_column,
    output_grievance_column,
) = key_list

In [None]:
# Label vocabulary shared by every emotion column: each target is binary.
# NOTE(review): assumes the CSV stores these columns as the integers 0/1 — confirm;
# label_to_id maps any value outside this list to -1.
label_list = [0, 1]
num_labels = len(label_list)

In [None]:
# !source /etc/network_turbo
from transformers import AutoConfig, AutoFeatureExtractor
# Path of the pretrained Whisper checkpoint (relative to the notebook).
model_name_or_path = "../models/openai-whisper-medium"
# Pooling applied over the sequence axis of the hidden states before classification.
pooling_mode = "mean"

# Model configuration, extended with the label mappings and the pooling mode.
config = AutoConfig.from_pretrained(
    model_name_or_path,
    num_labels=num_labels,
    label2id=dict(zip(label_list, range(len(label_list)))),
    id2label=dict(enumerate(label_list)),
    finetuning_task="whisper_clf",
)
config.pooling_mode = pooling_mode

In [None]:
# Load the feature extractor stored with the checkpoint. Its sampling rate is the rate
# every audio file is resampled to in speech_file_to_array_fn.
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name_or_path,)
target_sampling_rate = feature_extractor.sampling_rate

In [None]:
def speech_file_to_array_fn(path):
    """Load an audio file as a mono 1-D array sampled at ``target_sampling_rate``.

    Args:
        path: Location of an audio file readable by ``torchaudio.load``.

    Returns:
        numpy.ndarray: 1-D waveform at the module-level ``target_sampling_rate``.
    """
    speech_array, sampling_rate = torchaudio.load(path)
    # torchaudio returns (channels, time). Averaging the channel axis gives one waveform
    # for stereo files as well; a plain squeeze() would leave them 2-D. For a
    # single-channel file the mean is the channel itself.
    if speech_array.dim() > 1:
        speech_array = speech_array.mean(dim=0)
    # Only build a resampler when the file is not already at the target rate.
    if sampling_rate != target_sampling_rate:
        resampler = torchaudio.transforms.Resample(sampling_rate, target_sampling_rate)
        speech_array = resampler(speech_array)
    return speech_array.numpy()

def label_to_id(label, label_list):
    """Translate ``label`` into its position inside ``label_list``.

    Returns:
        ``label`` unchanged when ``label_list`` is empty, ``-1`` when ``label`` is not
        in ``label_list``, otherwise the index of its first occurrence.
    """
    if len(label_list) == 0:
        return label
    if label not in label_list:
        return -1
    return label_list.index(label)

def label_to_id_float(label, label_list):
    """Same lookup as ``label_to_id``, with the result converted to ``float``."""
    label_id = label_to_id(label, label_list)
    return float(label_id)

def preprocess_function(examples):
    """Turn a batch of CSV rows into model inputs and a label matrix.

    Args:
        examples: Batch mapping column names to lists of values; it must contain
            ``input_column`` and every column named in ``key_list``.

    Returns:
        dict with ``input_features`` (output of the feature extractor) and ``labels``
        (array with one row per example and one column per entry of ``key_list``).
    """
    waveforms = [speech_file_to_array_fn(audio_path) for audio_path in examples[input_column]]

    # Rows are emotions at this point; the transpose below makes rows the examples.
    ids_per_column = [
        [label_to_id(value, label_list) for value in examples[column]]
        for column in key_list
    ]
    label_matrix = np.array(ids_per_column).transpose()

    extracted = feature_extractor(waveforms, return_tensors="pt", sampling_rate=target_sampling_rate)
    return {"input_features": extracted.input_features, "labels": label_matrix}

In [None]:
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from transformers.file_utils import ModelOutput


@dataclass
class SpeechClassifierOutput(ModelOutput):
    """Container for the outputs of a speech classification model.

    Every field defaults to ``None``.
    NOTE(review): nothing visible in this notebook constructs this class;
    ``WhisperForSpeechClassification.forward`` returns a dict or a tensor instead.
    """
    # Loss value, when one was computed.
    loss: Optional[torch.FloatTensor] = None
    # Classification scores.
    logits: torch.FloatTensor = None
    # Optional tuple of hidden-state tensors.
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    # Optional tuple of attention tensors.
    attentions: Optional[Tuple[torch.FloatTensor]] = None


In [None]:
import torch
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss

from transformers.models.whisper.modeling_whisper import WhisperModel

# Head module that turns the pooled audio representation produced by Whisper into the
# output scores of a classification task.
class WhisperClassificationHead(nn.Module):
    """Two-layer classification head applied to pooled Whisper features."""

    def __init__(self, config):
        super().__init__()
        hidden = config.hidden_size
        self.dense = nn.Linear(hidden, hidden)
        self.dropout = nn.Dropout(p=0.0)
        self.out_proj = nn.Linear(hidden, config.num_labels)

    def forward(self, features, **kwargs):
        """Return class scores for ``features``.

        Pipeline: dropout -> dense -> tanh -> dropout -> out_proj. The dropout
        probability is 0, so the dropout layers currently leave the values unchanged.
        Extra keyword arguments are accepted and ignored.
        """
        projected = self.dense(self.dropout(features))
        activated = self.dropout(torch.tanh(projected))
        return self.out_proj(activated)

# Combines the pretrained Whisper model with one classification head per emotion to
# form an end-to-end multi-label speech classifier.
class WhisperForSpeechClassification(nn.Module):
    """Whisper backbone followed by one independent classification head per emotion.

    ``forward`` returns the stacked per-emotion logits and, when ``labels`` are given,
    the sum of the per-emotion cross-entropy losses.
    """

    # Head order. Column ``i`` of ``labels`` is scored by the head of ``EMOTIONS[i]``;
    # each head is stored as the attribute ``<emotion>_classifier`` (these names are the
    # state-dict prefixes, so they must not change).
    EMOTIONS = (
        "despair",
        "sadness",
        "pain",
        "guilt",
        "confuse",
        "helplessness",
        "resentment",
        "fear",
        "numbness",
        "anxiety",
        "grievance",
    )

    def __init__(self, config):
        """
        Args:
            config: Configuration providing ``num_labels``, ``pooling_mode``,
                ``hidden_size`` and ``init_std``.
        """
        super().__init__()
        self.num_labels = config.num_labels
        self.pooling_mode = config.pooling_mode
        self.config = config

        # Pretrained weights come from the module-level ``model_name_or_path``.
        self.whisper = WhisperModel.from_pretrained(model_name_or_path)

        for emotion in self.EMOTIONS:
            head = WhisperClassificationHead(config)
            # Initialise the new head layers only. The previous ``self._init_weights(self)``
            # never matched any layer type (a no-op), and applying the initialiser to the
            # whole model would overwrite the pretrained Whisper weights.
            head.apply(self._init_weights)
            setattr(self, f"{emotion}_classifier", head)

    def _init_weights(self, module):
        """Initialise ``module`` if it is a Linear/Conv1d/Embedding layer.

        Weights are drawn from a normal distribution with std ``config.init_std``;
        biases and the embedding padding row are zeroed. Other module types are left
        untouched.
        """
        std = self.config.init_std
        if isinstance(module, (nn.Linear, nn.Conv1d)):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    def freeze_feature_extractor(self):
        """Freeze the Whisper encoder through its ``_freeze_parameters`` helper."""
        self.whisper.encoder._freeze_parameters()

    def merged_strategy(
            self,
            hidden_states,
            mode="mean"
    ):
        """Pool ``hidden_states`` over dimension 1.

        Args:
            hidden_states: Tensor whose dimension 1 is reduced.
            mode: One of ``"mean"``, ``"sum"`` or ``"max"``.

        Raises:
            Exception: If ``mode`` is not one of the supported pooling modes.
        """
        if mode == "mean":
            outputs = torch.mean(hidden_states, dim=1)
        elif mode == "sum":
            outputs = torch.sum(hidden_states, dim=1)
        elif mode == "max":
            outputs = torch.max(hidden_states, dim=1)[0]
        else:
            raise Exception(
                "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")

        return outputs

    def forward(
            self,
            input_features,
            labels=None
    ):
        """Classify a batch of audio features.

        Args:
            input_features: Batch of Whisper input features; dimension 0 is the batch.
            labels: Optional integer tensor with one column per entry of ``EMOTIONS``.

        Returns:
            ``{'loss': ..., 'output': ...}`` when ``labels`` is given, otherwise only
            the output tensor. The output stacks the per-emotion logits along
            dimension 1 (``(batch, len(EMOTIONS), num_labels)`` assuming the pooled
            features are ``(batch, hidden)``).
        """
        batch_size = input_features.shape[0]
        # One start token per example, created on the same device as the inputs rather
        # than unconditionally on CUDA, so the model also runs on CPU.
        decoder_input_ids = torch.full(
            (batch_size, 1),
            self.whisper.config.decoder_start_token_id,
            dtype=torch.long,
            device=input_features.device,
        )
        hidden_states = self.whisper(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state
        pooled = self.merged_strategy(hidden_states, mode=self.pooling_mode)

        head_logits = [
            getattr(self, f"{emotion}_classifier")(pooled) for emotion in self.EMOTIONS
        ]

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss()
            # Accumulate in head order, i.e. the same left-to-right sum as before.
            for column, logits in enumerate(head_logits):
                head_loss = loss_fct(logits.view(-1, self.num_labels), labels[:, column])
                loss = head_loss if loss is None else loss + head_loss

        output = torch.stack(head_logits, dim=1)
        return {'loss': loss, 'output': output} if loss is not None else output

In [None]:
from dataclasses import dataclass
from typing import Dict, List, Union
import torch


@dataclass
class DataCollatorCTCWithPadding:
    """
    Data collator that stacks already-preprocessed examples into one batch.

    Despite its name, the collator takes no configuration and performs no padding:
    the ``input_features`` and ``labels`` of the examples are handed to
    ``torch.tensor`` as nested lists, so every example must already have the same
    shape.

    Returns a dict with:
        * ``input_features``: tensor of dtype ``torch.half``.
        * ``labels``: tensor of dtype ``torch.long``.
    """
    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        # Collect both columns first, then convert, so a missing key is reported
        # before any tensor is built.
        inputs = [example["input_features"] for example in features]
        targets = [example["labels"] for example in features]

        return {
            "input_features": torch.tensor(inputs, dtype=torch.half),
            "labels": torch.tensor(targets, dtype=torch.long),
        }

In [None]:
# The collator takes no arguments; it only stacks input features and labels into tensors.
data_collator = DataCollatorCTCWithPadding()

Next, the evaluation metrics are defined. There are many pre-defined metrics for classification/regression problems; here we report accuracy, weighted F1, and weighted recall (F1 and recall are averaged over the emotion columns) for classification, and MSE for regression. You can define other metrics on your own.

In [None]:
import numpy as np
from transformers import EvalPrediction
from sklearn.metrics import f1_score, recall_score

# Switch between regression metrics (MSE) and classification metrics.
is_regression = False

def compute_metrics(p: EvalPrediction):
    """Compute evaluation metrics from the predictions collected by the Trainer.

    Args:
        p: Prediction object exposing ``predictions`` and ``label_ids``.

    Returns:
        ``{"mse": ...}`` when ``is_regression`` is true; otherwise a dict with the
        overall ``accuracy`` and the weighted ``f1`` and ``recall`` averaged over the
        label columns.
    """
    raw = p.predictions
    if isinstance(raw, dict):
        raw = raw['output']
    elif isinstance(raw, (tuple, list)):
        # A single-element container holds the logits; otherwise they are at index 1.
        raw = raw[0] if len(raw) == 1 else raw[1]

    if is_regression:
        squeezed = np.squeeze(raw)
        return {"mse": ((squeezed - p.label_ids) ** 2).mean().item()}

    # Class decision per example and per label column (classes live on axis 2).
    preds = np.argmax(raw, axis=2)
    accuracy = (preds == p.label_ids).astype(np.float32).mean().item()

    num_columns = preds.shape[-1]
    f1_total = 0
    recall_total = 0
    for column in range(num_columns):
        truth = p.label_ids[:, column]
        guess = preds[:, column]
        f1_total += f1_score(truth, guess, average='weighted')
        recall_total += recall_score(truth, guess, average='weighted')

    return {
        "accuracy": accuracy,
        "f1": f1_total / num_columns,
        "recall": recall_total / num_columns,
    }

In [None]:
# Build the classifier (its constructor loads the pretrained Whisper weights) and
# freeze the Whisper encoder.
model = WhisperForSpeechClassification(config)
model.freeze_feature_extractor()

In [None]:
from transformers import TrainingArguments

# Training configuration shared by every Trainer created in this notebook.
training_args = TrainingArguments(
    output_dir="../outputs_multi/Whisper-medium-output",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,  # 16 x 2 = 32 samples per optimizer step and device
    evaluation_strategy="steps",
    num_train_epochs=3.0,
    fp16=True,  # mixed precision; the data collator also emits float16 inputs
    save_steps=50,
    eval_steps=25,
    logging_steps=10,
    learning_rate=1e-4,
    save_total_limit=2,  # NOTE(review): presumably only the newest checkpoints are kept — confirm the one loaded in the test cell survives
    report_to="none"
)

For future use, we can create our own training script; here we do it in a simple way. You can add more on your own.

In [None]:
from typing import Any, Dict, Union

import torch
from packaging import version
from torch import nn

from transformers import (
    Trainer,
    is_apex_available,
)

# apex is optional: its amp module is imported only when the package is installed.
# CTCTrainer.training_step uses `amp` on its apex code path.
if is_apex_available():
    from apex import amp

# torch >= 1.6 is treated as providing native AMP.
# NOTE(review): `_is_native_amp_available` stays undefined on older torch, and neither it
# nor `autocast` is referenced elsewhere in the visible notebook.
if version.parse(torch.__version__) >= version.parse("1.6"):
    _is_native_amp_available = True
    from torch.cuda.amp import autocast


class CTCTrainer(Trainer):
    """``Trainer`` subclass with an explicit, overridable ``training_step``."""

    def training_step(self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]], num_items_in_batch=None) -> torch.Tensor:
        """
        Perform a training step on a batch of inputs.

        Subclass and override to inject custom behavior.

        Args:
            model (:obj:`nn.Module`):
                The model to train.
            inputs (:obj:`Dict[str, Union[torch.Tensor, Any]]`):
                The inputs and targets of the model.

                The dictionary will be unpacked before being fed to the model. Most models expect the targets under the
                argument :obj:`labels`. Check your model's documentation for all accepted arguments.
            num_items_in_batch (optional):
                Accepted so the override keeps working with transformers releases whose training loop passes this
                extra argument to ``training_step``. It is not used: the loss is scaled by
                ``gradient_accumulation_steps`` below.

        Return:
            :obj:`torch.Tensor`: The tensor with training loss on this batch.
        """
        model.train()
        inputs = self._prepare_inputs(inputs)

        with self.compute_loss_context_manager():
            loss = self.compute_loss(model, inputs)

        # Average over the accumulation window so the accumulated gradient matches
        # one large batch.
        if self.args.gradient_accumulation_steps > 1:
            loss = loss / self.args.gradient_accumulation_steps

        # `use_apex` is read defensively in case the installed Trainer does not define it.
        if getattr(self, "use_apex", False):
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            self.accelerator.backward(loss)

        return loss.detach()


Now, all instances can be passed to Trainer and we are ready to start training!

In [None]:
import gc

# 5-fold cross-validation over ../data/total.csv. Every fold trains a freshly loaded
# model on four fifths of the rows and evaluates it on the remaining fifth.
total_train_data = pd.read_csv('../data/total.csv')
n_folds = 5
eval_data_num = int(len(total_train_data) / n_folds)
# Metrics of every fold, in fold order. Kept in memory because all folds share
# training_args.output_dir, so files written per fold presumably overwrite each other.
fold_metrics = []

for i in range(n_folds):
    # The last fold takes the final eval_data_num rows; rows left over when the dataset
    # size is not a multiple of n_folds are therefore only ever used for training.
    if i == n_folds - 1:
        eval_start = len(total_train_data) - eval_data_num
        eval_stop = len(total_train_data)
    else:
        eval_start = i * eval_data_num
        eval_stop = (i + 1) * eval_data_num

    eval_data = total_train_data.iloc[eval_start:eval_stop, :]
    # Shuffle every fold the same way, with a per-fold seed so reruns produce the same files.
    train_data = pd.concat(
        [total_train_data.iloc[:eval_start, :], total_train_data.iloc[eval_stop:, :]],
        axis=0,
        ignore_index=True,
    ).sample(frac=1, random_state=i)

    eval_data.to_csv('../data/5fold_eval_{}.csv'.format(i), index=False)
    train_data.to_csv('../data/5fold_train_{}.csv'.format(i), index=False)

    data_files = {
        "train": "../data/5fold_train_{}.csv".format(i),
        "validation": "../data/5fold_eval_{}.csv".format(i),
    }

    dataset = load_dataset("csv", data_files=data_files, delimiter=",", )
    train_dataset = dataset["train"].map(
        preprocess_function,
        batch_size=100,
        batched=True,
        drop_last_batch=False
    )
    eval_dataset = dataset["validation"].map(
        preprocess_function,
        batch_size=100,
        batched=True,
        drop_last_batch=False
    )

    # Drop the previous fold's trainer and model before building new ones so that two
    # copies of Whisper do not have to coexist in memory.
    trainer = None
    model = None
    gc.collect()
    torch.cuda.empty_cache()

    # Start every fold from the pretrained weights. Reusing one model across folds lets
    # it train on rows that later serve as another fold's validation data, which
    # inflates the validation metrics of the later folds.
    model = WhisperForSpeechClassification(config)
    model.freeze_feature_extractor()

    trainer = CTCTrainer(
        model=model,
        data_collator=data_collator,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=feature_extractor
    )

    trainer.train()

    metrics = trainer.evaluate()
    max_eval_samples = len(eval_dataset)
    metrics["eval_samples"] = max_eval_samples

    trainer.log_metrics("eval", metrics)
    trainer.save_metrics("eval", metrics)
    fold_metrics.append({"fold": i, **metrics})

# One row per fold, for a side-by-side view of the cross-validation results.
pd.DataFrame(fold_metrics)


Test code (测试代码): load a saved checkpoint and evaluate it on the test set.

In [None]:
from safetensors import safe_open

# Load the model: feature extractor and weights come from a saved training checkpoint.
feature_extractor = AutoFeatureExtractor.from_pretrained('../outputs_multi/Whisper-medium-output/checkpoint-1200')
target_sampling_rate = feature_extractor.sampling_rate
model = WhisperForSpeechClassification(config)

# Read every tensor of the checkpoint onto the CPU and load them into the model.
with safe_open('../outputs_multi/Whisper-medium-output/checkpoint-1200/model.safetensors', framework="pt", device='cpu') as f:
    tensors = {k: f.get_tensor(k) for k in f.keys()}
model.load_state_dict(tensors)

# Read the test set and run the same preprocessing as for training.
data_files = {
    "test": "../data/test.csv"
}
dataset = load_dataset("csv", data_files=data_files, delimiter=",", )
test_dataset = dataset["test"].map(
    preprocess_function,
    batch_size=100,
    batched=True,
    drop_last_batch=False
)

# Evaluate on the test set.
trainer = CTCTrainer(
    model=model,
    data_collator=data_collator,
    args=training_args,
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
    tokenizer=feature_extractor
)

metrics = trainer.evaluate()
max_eval_samples = len(test_dataset)
metrics["eval_samples"] = max_eval_samples

trainer.log_metrics("eval", metrics)
trainer.save_metrics("eval", metrics)