In [2]:
import pandas as pd
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2FeatureExtractor, AutoConfig
from transformers.models.wav2vec2.modeling_wav2vec2 import (
    Wav2Vec2PreTrainedModel,
    Wav2Vec2Model
)
import torch
import numpy as np
from tqdm import tqdm
tqdm.pandas()
from torch import nn



  from .autonotebook import tqdm as notebook_tqdm


In [34]:
def pool(input : np.ndarray , mode : str) -> torch.Tensor:
    """
    Reduce ``input`` along its first axis and return the result as a tensor.

    Supported modes are 'mean', 'max' and 'median'.

    Parameters
    ----------
    input : np.ndarray
        Array to reduce. Axis 0 is collapsed, so a 2-D array of shape
        ``(a, b)`` yields a tensor of shape ``(b,)`` and a 1-D array yields a
        0-d (scalar) tensor.
    mode : str
        One of 'mean', 'max' or 'median'.

    Returns
    -------
    torch.Tensor
        The reduced values, converted to torch's default floating dtype.

    Raises
    ------
    NotImplementedError
        If ``mode`` is not one of the supported values.
    """
    if mode == 'mean':
        reduced = input.mean(0)
    elif mode == 'max':
        reduced = input.max(0)
    elif mode == 'median':
        reduced = np.median(input, 0)
    else:
        raise NotImplementedError("The supported modes are 'mean', 'max' and 'median'")

    # Reducing a 1-D array produces a NumPy scalar. The legacy
    # ``torch.Tensor(...)`` constructor does not reliably treat a scalar as
    # data, so go through ``np.asarray`` + ``torch.as_tensor`` which handles
    # both the 0-d and the n-d case.
    return torch.as_tensor(np.asarray(reduced), dtype=torch.get_default_dtype())

In [35]:
def speech_file_to_array_fn(path , target_sampling_rate, pooling_mode="median"):
    """
    Load an audio file, resample it and pool it along its first axis.

    Parameters
    ----------
    path :
        Location of the audio file, passed straight to ``torchaudio.load``.
    target_sampling_rate : int
        Sampling rate the waveform is resampled to.
    pooling_mode : str, optional
        Reduction handed to ``pool`` ('mean', 'max' or 'median').
        Defaults to 'median'.

    Returns
    -------
    torch.Tensor
        The result of ``pool`` applied along axis 0 of the resampled waveform.
    """
    speech_array, sampling_rate = torchaudio.load(path)
    resampler = torchaudio.transforms.Resample(sampling_rate, target_sampling_rate)

    # Do not squeeze: torchaudio.load returns the waveform as (channels, frames)
    # by default. Squeezing a single-channel file would drop the channel axis,
    # and the axis-0 pooling below would then collapse the time axis instead.
    speech = resampler(speech_array).numpy()
    return pool(speech , pooling_mode)

In [33]:
# Sanity check: shape of the pooled waveform for the first file at 16 kHz.
# NOTE: `batch` is only assigned in a later cell of this notebook, so that
# cell has to be run before this one.
speech_file_to_array_fn(batch[0] , 16000).shape

torch.Size([36181])

In [36]:
def pad_and_process_values(input_val, processor , target_sampling_rate):
    """
    Right-pad every entry of ``input_val`` with zeros to a common length and
    run the padded batch through ``processor``.

    Parameters
    ----------
    input_val :
        Sequence of array-likes exposing ``.shape``; each is padded along its
        first axis.
    processor :
        Callable invoked as ``processor(padded, sampling_rate=...)`` whose
        result is indexed with ``'input_values'``.
    target_sampling_rate :
        Forwarded to ``processor`` as ``sampling_rate``.

    Returns
    -------
    The ``'input_values'`` entry of the processor's result.
    """
    # Target length is taken from the largest shape in the batch.
    longest = max(np.shape(item) for item in input_val)[0]

    padded = [
        np.pad(item, (0, longest - item.shape[0]), 'constant')
        for item in input_val
    ]

    processed = processor(padded, sampling_rate=target_sampling_rate)
    return processed['input_values']


In [37]:
def Processor(batch):
    """
    Turn a list of audio file paths into a tensor of processed input values.

    The ``Wav2Vec2Processor`` for "facebook/wav2vec2-large-960h" is loaded on
    every call. Each path is read with ``speech_file_to_array_fn`` at the
    processor's sampling rate, then the batch is padded and processed by
    ``pad_and_process_values``.

    Parameters
    ----------
    batch :
        Iterable of audio file paths.

    Returns
    -------
    torch.Tensor
        The processed values stacked into a single tensor.
    """
    checkpoint = "facebook/wav2vec2-large-960h"
    wav2vec_processor = Wav2Vec2Processor.from_pretrained(checkpoint)
    sampling_rate = wav2vec_processor.feature_extractor.sampling_rate

    waveforms = [
        speech_file_to_array_fn(wav_path, sampling_rate) for wav_path in batch
    ]

    processed = pad_and_process_values(waveforms, wav2vec_processor, sampling_rate)
    return torch.Tensor(np.array(processed))

In [38]:
# Run the Processor pipeline and inspect the shape of the second item.
# NOTE: `batch` is only assigned in a later cell of this notebook, so that
# cell has to be run before this one.
p = Processor(batch) 
p[1].shape

torch.Size([36181])

In [2]:
from torch import nn
class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
    """
    Wrapper around a ``Wav2Vec2Model`` backbone with a pooling helper.

    ``forward`` returns the backbone's output as-is; this class defines no
    classification head, and ``merged_strategy`` is not called by ``forward``.
    """

    def __init__(self, config):
        super().__init__(config)

        self.config = config
        self.wav2vec2 = Wav2Vec2Model(config)
        # NOTE: the init_weights() call is disabled (commented out) in this class.
        # self.init_weights()

    def freeze_feature_extractor(self):
        """Freeze the parameters of the backbone's feature extractor."""
        self.wav2vec2.feature_extractor._freeze_parameters()

    def merged_strategy(self, hidden_states, mode="mean"):
        """
        Reduce ``hidden_states`` over dimension 1.

        ``mode`` must be 'mean', 'sum' or 'max'; any other value raises
        ``Exception``.
        """
        if mode == "mean":
            return torch.mean(hidden_states, dim=1)
        if mode == "sum":
            return torch.sum(hidden_states, dim=1)
        if mode == "max":
            # torch.max over a dim returns (values, indices); keep the values.
            return torch.max(hidden_states, dim=1)[0]
        raise Exception(
            "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")

    def forward(
        self,
        input_values,
        attention_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        labels=None,
    ):
        """
        Delegate to the wrapped ``Wav2Vec2Model`` and return its output.

        ``return_dict`` falls back to ``self.config.use_return_dict`` when it
        is ``None``. ``labels`` is accepted but not used.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict
        return self.wav2vec2(
            input_values,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        

In [3]:
from transformers import AutoConfig
# Load the configuration of the "facebook/wav2vec2-large-960h" checkpoint.
config = AutoConfig.from_pretrained("facebook/wav2vec2-large-960h")


In [4]:
# Build the wrapper from the same checkpoint. The log printed by this cell
# reports the checkpoint's `lm_head.*` weights as unused and
# `wav2vec2.masked_spec_embed` as newly initialised.
model = Wav2Vec2ForSpeechClassification.from_pretrained( "facebook/wav2vec2-large-960h" , config=config)

Some weights of the model checkpoint at facebook/wav2vec2-large-960h were not used when initializing Wav2Vec2ForSpeechClassification: ['lm_head.weight', 'lm_head.bias']
- This IS expected if you are initializing Wav2Vec2ForSpeechClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForSpeechClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForSpeechClassification were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# Freeze the backbone's feature extractor, then query the feature-extractor
# output length for an input of 1024 samples.
model.freeze_feature_extractor()
model._get_feat_extract_output_lengths(1024).item()

In [23]:
# Feature-extractor output length for an input of 3280 samples
# (the recorded output of this cell is 10).
model._get_feat_extract_output_lengths(3280).item()

10

In [6]:
def FeatureExtractor(batch):
    """
    Turn a list of audio file paths into a tensor of extracted input values.

    The ``Wav2Vec2FeatureExtractor`` for
    "facebook/wav2vec2-base-100k-voxpopuli" is loaded on every call. Each path
    is read with ``speech_file_to_array_fn`` at the extractor's sampling rate,
    then the batch is padded and processed by ``pad_and_process_values``.

    Parameters
    ----------
    batch :
        Iterable of audio file paths.

    Returns
    -------
    torch.Tensor
        The extracted values stacked into a single tensor.
    """
    model_path = "facebook/wav2vec2-base-100k-voxpopuli"
    feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_path)
    target_sampling_rate = feature_extractor.sampling_rate

    speech_list = [
        speech_file_to_array_fn(input_path, target_sampling_rate)
        for input_path in batch
    ]

    # pad_and_process_values takes (input_val, processor, target_sampling_rate).
    # The previous call passed an extra `number_of_channels` argument, which
    # does not match that signature and raises a TypeError.
    speech_list = pad_and_process_values(speech_list, feature_extractor, target_sampling_rate)
    return torch.Tensor(np.array(speech_list))

In [8]:
# Two sample utterances used by the smoke-test cells in this notebook.
batch = [
    "../data/test_splits_wav/dia0_utt0.wav",
    "../data/test_splits_wav/dia1_utt0.wav",
]

The first WAV file produces 217086 values.

In [7]:
# Run the Processor pipeline on `batch` and inspect the shape of the second
# item (same statements as the earlier Processor smoke-test cell).
p = Processor(batch) 
p[1].shape

torch.Size([36181])

In [31]:
# Run the FeatureExtractor pipeline on `batch` and inspect the shape of the
# first item.
fe = FeatureExtractor(batch)
fe[0].shape

torch.Size([217086])

In [32]:
# Both pipelines must return exactly one item per input path. Fail loudly on
# a mismatch instead of silently printing nothing.
assert len(p) == len(fe) == len(batch), (
    f"batch size mismatch: len(p)={len(p)}, len(fe)={len(fe)}, len(batch)={len(batch)}"
)
print(len(p))

2


In [33]:
# Load the configuration of the "facebook/wav2vec2-base-100k-voxpopuli"
# checkpoint and build a bare Wav2Vec2Model from it.
# NOTE: this rebinds `config`, which an earlier cell set to the
# "facebook/wav2vec2-large-960h" configuration.
# NOTE(review): constructing the model from a config alone presumably leaves
# the weights randomly initialised; use Wav2Vec2Model.from_pretrained(...) if
# pretrained weights are intended -- confirm.
config = AutoConfig.from_pretrained(
        "facebook/wav2vec2-base-100k-voxpopuli",
    )
wav2vec2 = Wav2Vec2Model(config)

In [None]:
# Run the backbone on the extracted features without tracking gradients.
# The model returns an output object rather than a tensor, so the shape is
# read from its `last_hidden_state` field; return_dict=True guarantees that
# attribute access is available.
with torch.no_grad():
    outputs = wav2vec2(
        fe,
        attention_mask=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=True,
    )
outputs.last_hidden_state.shape

In [4]:
from transformers.modeling_outputs import SequenceClassifierOutput
import torch
# Hand-built classifier output holding a 4 x 7 logits matrix (4 rows of 7
# values) with no loss, hidden states or attentions attached.
output = SequenceClassifierOutput(loss=None, logits=torch.Tensor([[-4.4651e-02,  7.5752e-05, -2.2422e-02, -4.2450e-03,  2.3928e-02,
         -2.7536e-02, -4.1361e-02],
        [-5.3067e-02,  9.4667e-03, -1.5667e-02, -6.3375e-03,  4.1833e-02,
         -1.1422e-02, -4.0160e-02],
        [-3.5620e-02,  2.4163e-03, -1.4061e-02,  1.4004e-02,  2.4372e-02,
         -2.5579e-02, -4.3301e-02],
        [-5.9695e-02, -8.3358e-03, -1.8136e-02, -1.5357e-02,  6.5647e-02,
         -3.3532e-02, -5.0336e-02]]), hidden_states=None, attentions=None)

In [6]:
# Index of the largest logit in each row (reduction over dim=1).
torch.argmax(output.logits , dim = 1)

tensor([4, 4, 4, 4])

In [5]:
# Mean of the logits in each row (reduction over dim=1).
torch.mean(output.logits, dim=1)

tensor([-0.0166, -0.0108, -0.0111, -0.0171])