In [34]:
from datasets import load_dataset

In [35]:
dataset = load_dataset("audiofolder", data_dir="music-segments-16k/")

Resolving data files: 100%|██████████| 17394/17394 [00:03<00:00, 5661.57it/s]
Found cached dataset audiofolder (/home/akhmed.sakip/.cache/huggingface/datasets/audiofolder/default-a678e5f9fb0932ec/0.0.0/6cbdd16f8688354c63b4e2a36e1585d05de285023ee6443ffd71c4182055c0fc)
100%|██████████| 1/1 [00:00<00:00, 19.46it/s]


In [37]:
dataset

DatasetDict({
    train: Dataset({
        features: ['audio'],
        num_rows: 17394
    })
})

In [59]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from transformers import AutoConfig, Wav2Vec2FeatureExtractor, Wav2Vec2ForSequenceClassification, Wav2Vec2PreTrainedModel
from transformers import Wav2Vec2Model

import librosa
import IPython.display as ipd
import numpy as np
import pandas as pd

In [62]:
from dataclasses import dataclass
from typing import Optional, Tuple
import torch
from transformers.file_utils import ModelOutput

@dataclass
class SpeechClassifierOutput(ModelOutput):
    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None

In [63]:
class Wav2Vec2ClassificationHead(nn.Module):
    """Head for wav2vec classification task."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.dropout = nn.Dropout(config.final_dropout)
        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, features, **kwargs):
        x = features
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        x = self.out_proj(x)
        return x

class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.pooling_mode = config.pooling_mode
        self.config = config

        self.wav2vec2 = Wav2Vec2Model(config)
        self.classifier = Wav2Vec2ClassificationHead(config)

        self.init_weights()

    def freeze_feature_extractor(self):
        self.wav2vec2.feature_extractor._freeze_parameters()

    def merged_strategy(
            self,
            hidden_states,
            mode="mean"
    ):
        if mode == "mean":
            outputs = torch.mean(hidden_states, dim=1)
        elif mode == "sum":
            outputs = torch.sum(hidden_states, dim=1)
        elif mode == "max":
            outputs = torch.max(hidden_states, dim=1)[0]
        else:
            raise Exception(
                "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")

        return outputs

    def forward(
            self,
            input_values,
            attention_mask=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
            labels=None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.wav2vec2(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        hidden_states = outputs[0]
        hidden_states = self.merged_strategy(hidden_states, mode=self.pooling_mode)
        logits = self.classifier(hidden_states)

        loss = None
        if labels is not None:
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return SpeechClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

In [64]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name_or_path = "m3hrdadfi/wav2vec2-base-100k-gtzan-music-genres"
config = AutoConfig.from_pretrained(model_name_or_path)
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name_or_path)
sampling_rate = feature_extractor.sampling_rate
model = Wav2Vec2ForSpeechClassification.from_pretrained(model_name_or_path).to(device)

In [65]:
def speech_file_to_array_fn(path, sampling_rate):
    speech_array, _sampling_rate = torchaudio.load(path)
    resampler = torchaudio.transforms.Resample(_sampling_rate)
    speech = resampler(speech_array).squeeze().numpy()
    return speech


def predict(path, sampling_rate):
    speech = speech_file_to_array_fn(path, sampling_rate)
    inputs = feature_extractor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
    inputs = {key: inputs[key].to(device) for key in inputs}

    with torch.no_grad():
        logits = model(**inputs).logits

    scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
    outputs = [{"Label": config.id2label[i], "Score": f"{round(score * 100, 3):.1f}%"} for i, score in enumerate(scores)]
    return outputs


In [76]:
path = "/home/akhmed.sakip/Documents/NLP703/Project/nlp703-speech-processing/notebooks/music-segments-16k/item699_21.wav"
outputs = predict(path, sampling_rate)

In [77]:
outputs

[{'Label': 'blues', 'Score': '0.0%'},
 {'Label': 'classical', 'Score': '0.0%'},
 {'Label': 'country', 'Score': '23.7%'},
 {'Label': 'disco', 'Score': '0.4%'},
 {'Label': 'hiphop', 'Score': '0.0%'},
 {'Label': 'jazz', 'Score': '0.6%'},
 {'Label': 'metal', 'Score': '0.0%'},
 {'Label': 'pop', 'Score': '75.0%'},
 {'Label': 'reggae', 'Score': '0.1%'},
 {'Label': 'rock', 'Score': '0.1%'}]

In [9]:
model_name = 'facebook/wav2vec2-base-960h'

In [13]:
from transformers import Wav2Vec2Processor, Wav2Vec2ForPreTraining

processor = Wav2Vec2Processor.from_pretrained(model_name)
model = Wav2Vec2ForPreTraining.from_pretrained(model_name)

Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForPreTraining: ['lm_head.bias', 'lm_head.weight']
- This IS expected if you are initializing Wav2Vec2ForPreTraining from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForPreTraining from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForPreTraining were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['quantizer.weight_proj.bias', 'project_q.weight', 'project_hid.bias', 'project_hid.weight', 'quantizer.codevectors', 'wav2vec2.masked_spec_embed', 'project_q.bias', 'quantizer.weight_proj.weight']
You should probably 

In [16]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(output_dir="test_trainer")

In [26]:
from typing import Any, Dict
import torch

class Wav2Vec2DataCollator:
    def __init__(self, processor):
        self.processor = processor

    def __call__(self, features: Dict[str, Any]) -> Dict[str, torch.Tensor]:
        batch = self.processor.pad(
            features,
            padding=True,
            return_tensors="pt"
        )
        return batch


In [27]:
data_collator = Wav2Vec2DataCollator(processor)

In [28]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    data_collator = Wav2Vec2DataCollator(processor),
)

In [30]:
trainer.train()

IndexError: Invalid key: 16824 is out of bounds for size 0

In [31]:
print(dataset['train'])

Dataset({
    features: ['audio'],
    num_rows: 17394
})
