#### NOTE: Also need to identify how fairseq evaluates model by wer/cer on librispeech

In [1]:
from load_fsq_model import load_model

In [2]:
model = load_model('wav2vec_small_960h.pt')

In [3]:
model.eval();

In [4]:
# Get one audio sample from librispeech
import torchaudio
test_data = torchaudio.datasets.LIBRISPEECH("../", "test-clean", download=True)
sample = test_data[0][0]
sample.shape

torch.Size([1, 166960])

In [5]:
json_dict = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3, "|": 4, "E": 5, "T": 6, "A": 7, "O": 8, "N": 9, "I": 10, "H": 11, "S": 12, "R": 13, "D": 14, "L": 15, "U": 16, "M": 17, "W": 18, "C": 19, "F": 20, "G": 21, "Y": 22, "P": 23, "B": 24, "V": 25, "K": 26, "'": 27, "X": 28, "J": 29, "Q": 30, "Z": 31}

In [6]:
import numpy as np
from itertools import groupby

class Decoder:
    def __init__(self, json_dict):
        self.dict = json_dict
        self.look_up = np.asarray(list(self.dict.keys()))

    def decode(self, ids):
        converted_tokens = self.look_up[ids]
        fused_tokens = [tok[0] for tok in groupby(converted_tokens)]
        output = ' '.join(''.join(''.join(fused_tokens).split("<s>")).split("|"))
        return output

In [7]:
decoder = Decoder(json_dict=json_dict)

In [8]:
model.cuda();

In [9]:
logits = model(source=sample.cuda(), padding_mask=None)["encoder_out"]

In [10]:
from tqdm.auto import tqdm
import torch
from jiwer import wer

wer_ = []

for i, data in enumerate(tqdm(test_data)):
    logits = model(source=data[0].cuda(), padding_mask=None)["encoder_out"]
    predicted_ids = torch.argmax(logits[:, 0], axis=-1)
    predictions = decoder.decode(predicted_ids.cpu())
    labels = data[2]
    
    wer_.append(wer(labels, predictions))
    
print(f"WER: {np.mean(wer_)}")

  0%|          | 0/2620 [00:00<?, ?it/s]

WER: 0.038536101512586685


---

In [None]:
from torch.utils.data import DataLoader
test_dataloader = DataLoader(self.train_data, batch_size=6, collate_fn=data_collator, num_workers=8)

In [None]:
from torch.nn.utils.rnn import pad_sequence

def data_collator(features):
    # split inputs and labels since they have to be of different lengths and need
    # different padding methods
    words = [torch.tensor(word) for word, _ in batch]
    src = pad_sequence(words, batch_first=True, padding_value=0)
    input_features = [feature[0] for feature in features]
    label_features = [feature[2] for feature in features]

    batch = self.processor.pad(
        input_features,
        padding=self.padding,
        max_length=self.max_length,
        pad_to_multiple_of=self.pad_to_multiple_of,
        return_tensors="pt",
    )
    with self.processor.as_target_processor():
        labels_batch = self.processor.pad(
            label_features,
            padding=self.padding,
            max_length=self.max_length_labels,
            pad_to_multiple_of=self.pad_to_multiple_of_labels,
            return_tensors="pt",
        )

    # replace padding with -100 to ignore loss correctly
    labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)

    batch["labels"] = labels

    return batch