In [1]:
from datasets import load_dataset
from utils import mp3_compress, opus_compress, encodec_compress
from transformers import pipeline
from evaluate import evaluator
import encodec

2023-10-12 12:37:13.941857: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-12 12:37:13.962497: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def mp3_compress_cv(sample):
    """Round-trip one dataset sample's audio through ``mp3_compress``.

    The sample is modified in place and also returned, which lets the function
    be handed straight to ``Dataset.map``.

    Args:
        sample: Mapping with an ``'audio'`` entry that holds ``'array'`` and
            ``'sampling_rate'``. The array must expose ``unsqueeze`` (the
            dataset in this notebook is loaded with ``with_format("torch")``).

    Returns:
        The same ``sample``, with ``sample['audio']['array']`` replaced by
        element ``[0][0]`` of the codec output and ``sample['bps']`` set to the
        second value returned by ``mp3_compress``.
    """
    clip = sample['audio']
    # A leading dimension is prepended before calling the codec helper
    # (presumably the channel/batch axis mp3_compress expects -- TODO confirm).
    decoded, bitrate = mp3_compress(clip['array'].unsqueeze(0), clip['sampling_rate'])
    clip['array'] = decoded[0][0]
    sample['bps'] = bitrate
    return sample
def opus_compress_cv(sample):
    """Round-trip one dataset sample's audio through ``opus_compress``.

    The sample is modified in place and also returned, which lets the function
    be handed straight to ``Dataset.map``.

    Args:
        sample: Mapping with an ``'audio'`` entry that holds ``'array'`` and
            ``'sampling_rate'``. The array must expose ``unsqueeze`` (the
            dataset in this notebook is loaded with ``with_format("torch")``).

    Returns:
        The same ``sample``, with ``sample['audio']['array']`` replaced by
        element ``[0]`` of the codec output and ``sample['bps']`` set to the
        second value returned by ``opus_compress``.
    """
    clip = sample['audio']
    # Unlike the MP3 wrapper, the new dimension is inserted at position 1
    # (presumably opus_compress wants a trailing channel axis -- TODO confirm).
    decoded, bitrate = opus_compress(clip['array'].unsqueeze(1), clip['sampling_rate'])
    clip['array'] = decoded[0]
    sample['bps'] = bitrate
    return sample

# Device string handed to the EnCodec model and to encodec_compress below;
# as written this requires a CUDA-capable GPU.
device = "cuda"

# 48 kHz EnCodec model, moved to `device` on construction (Module.to returns
# the module itself) and configured with a target bandwidth of 6
# (presumably kbps, per the EnCodec API -- TODO confirm).
encodec_model_48_6 = encodec.EncodecModel.encodec_model_48khz().to(device)
encodec_model_48_6.set_target_bandwidth(6)
def encodec_48_6_compress(sample):
    """Round-trip one dataset sample's audio through ``encodec_compress``.

    Uses the module-level ``encodec_model_48_6`` model and ``device``. The
    sample is modified in place and also returned, which lets the function be
    handed straight to ``Dataset.map``.

    Args:
        sample: Mapping with an ``'audio'`` entry that holds ``'array'`` and
            ``'sampling_rate'``. The array must expose ``unsqueeze`` (the
            dataset in this notebook is loaded with ``with_format("torch")``).

    Returns:
        The same ``sample``, with ``sample['audio']['array']`` replaced by the
        first value returned by ``encodec_compress`` (stored as-is, without
        indexing) and ``sample['bps']`` set to the second.
    """
    clip = sample['audio']
    decoded, bitrate = encodec_compress(
        clip['array'].unsqueeze(0),
        clip['sampling_rate'],
        encodec_model_48_6,
        device,
    )
    clip['array'] = decoded
    sample['bps'] = bitrate
    return sample

In [3]:
# Per-sample transforms applied to the uncompressed dataset with Dataset.map.
# Order matters: the compressed datasets (and therefore the result rows after
# the uncompressed baseline) follow this exact order.
audio_compression_methods = [mp3_compress_cv, opus_compress_cv, encodec_48_6_compress]
# Model identifiers of the Whisper checkpoints under test; one ASR pipeline is
# built per entry, and result columns follow this order.
models = ["openai/whisper-tiny", "openai/whisper-small", "openai/whisper-medium"]

In [4]:
# common_voice[0] is the uncompressed baseline: the first 300 validation
# samples of English Common Voice 11.0, formatted as torch tensors so the
# compression wrappers can call .unsqueeze() on the audio array.
common_voice = [
    load_dataset(
        path="mozilla-foundation/common_voice_11_0",
        name="en",
        split="validation[:300]",
    ).with_format(type="torch")
]

In [5]:
# Build one compressed copy of the baseline dataset (index 0) per compression
# method, in the order of `audio_compression_methods`.
# Slice assignment replaces everything after the baseline instead of
# appending, so re-running this cell does not keep growing `common_voice`
# (which would silently add duplicate rows to the evaluation results).
common_voice[1:] = [
    common_voice[0].map(compress)
    for compress in audio_compression_methods
]

In [None]:
# One automatic-speech-recognition pipeline per checkpoint in `models`, kept in
# the same order so that pipe[i] corresponds to models[i].
# The pipelines reuse the notebook-wide `device` setting rather than a second
# hard-coded device string, so the codec model and the ASR models cannot drift
# onto different devices when that setting is changed.
pipe = [
    pipeline(
        task="automatic-speech-recognition",
        model=checkpoint,
        device=device,
    )
    for checkpoint in models
]

In [7]:
# Evaluator object for the "automatic-speech-recognition" task, obtained from
# the `evaluate` library's evaluator() factory.
task_evaluator = evaluator("automatic-speech-recognition")
# Remove the 'truncation' entry from the evaluator's PIPELINE_KWARGS
# (presumably because the ASR pipelines used here do not accept that
# argument -- TODO confirm). pop() returns the removed value, which is what
# this cell displays (True); the None default means a missing key does not
# raise.
task_evaluator.PIPELINE_KWARGS.pop('truncation', None)

True

In [8]:
# WER evaluation of every (dataset variant, Whisper pipeline) pair.
# eval_results[i][j] is the result dict returned by the evaluator for
# common_voice[i] (0 = uncompressed baseline, then one entry per compression
# method) transcribed by pipe[j].
eval_results = []
for dataset in common_voice:
    # The numpy-formatted view does not depend on the pipeline, so build it
    # once per dataset instead of once per (dataset, pipeline) pair.
    dataset_np = dataset.with_format("numpy")
    eval_results.append([
        task_evaluator.compute(
            model_or_pipeline=asr_pipeline,
            data=dataset_np,
            input_column="audio",
            label_column="sentence",
            metric="wer",
        )
        for asr_pipeline in pipe
    ])

In [9]:
# WER matrix: one row per dataset variant in `common_voice`, one column per
# pipeline in `pipe`.
[[result['wer'] for result in row] for row in eval_results]

[[0.32797202797202796, 0.17097902097902098, 0.1486013986013986],
 [0.622027972027972, 0.2409090909090909, 0.19545454545454546],
 [0.48671328671328673, 0.2947552447552448, 0.2444055944055944],
 [0.5863636363636363, 0.26153846153846155, 0.20454545454545456]]