In [45]:
from datasets import load_dataset
from utils import mp3_compress, opus_compress, encodec_compress
from utils import hf_audio_encode
from transformers import pipeline
from evaluate import evaluator
import encodec
import matplotlib.pyplot as plt
import numpy as np
import cdpam
import torch
import gc

In [2]:
def mp3_compress_cv(sample):
    audio = sample['audio']['array'].unsqueeze(0)
    fs = sample['audio']['sampling_rate']
    audio,bps = mp3_compress(audio,fs)
    encoded = hf_audio_encode(audio,fs)
    sample['audio'] = encoded
    sample['bps'] = bps
    return sample
def opus_compress_cv(sample):
    audio = sample['audio']['array'].unsqueeze(0)
    fs = sample['audio']['sampling_rate']
    audio,bps = opus_compress(audio,fs)
    encoded = hf_audio_encode(audio,fs)
    sample['audio'] = encoded
    sample['bps'] = bps
    return sample

device = "cuda"
encodec_model_48_3 = encodec.EncodecModel.encodec_model_48khz()
encodec_model_48_3.set_target_bandwidth(6)
encodec_model_48_3.to(device)
def encodec_48_3_compress(sample):
    audio = sample['audio']['array'].unsqueeze(0)
    fs = sample['audio']['sampling_rate']
    audio,bps = encodec_compress(audio,fs, encodec_model_48_3, device)
    encoded = hf_audio_encode(audio,fs)
    sample['audio'] = encoded
    sample['bps'] = bps
    return sample 

In [3]:
audio_compression_methods = [
    mp3_compress_cv,
    opus_compress_cv,
    encodec_48_3_compress
]

In [4]:
common_voice = load_dataset("mozilla-foundation/common_voice_11_0",
                             "en",
                             split="validation[:100]"
                            ).with_format("torch")

In [5]:
exclude_idx = [362, 711]
common_voice = [common_voice.select(
    (
        i for i in range(len(common_voice)) 
        if i not in set(exclude_idx)
    )
)]



In [None]:
for method in audio_compression_methods:
    common_voice.append(common_voice[0].map(method))

In [54]:
cdpam_metric = cdpam.CDPAM()
mse_metric = torch.nn.MSELoss()
cdpam_distance = []
mse_distance = []
for dataset in common_voice:
    cdpam_distance.append([])
    mse_distance.append([])
    for i_sample,sample in enumerate(common_voice[0]):
        compressed_sample = dataset[i_sample]
        sample_rate = sample['audio']['sampling_rate']
        reference = sample['audio']['array'].unsqueeze(0)
        distorted = compressed_sample['audio']['array'].unsqueeze(0)
        cdpam_distance[-1].append(cdpam_metric.forward(reference,distorted).detach().cpu().item())
        mse_distance[-1].append(mse_metric.forward(reference,distorted).detach().cpu().item())
        gc.collect()
        torch.cuda.empty_cache()

In [63]:
PSNR = [-10*np.log10(np.mean(d)) for d in mse_distance][1:]
PSNR

  PSNR = [-10*np.log10(np.mean(d)) for d in mse_distance][1:]


[33.898382799187345, 26.702614854902365, 29.0428171579391]

In [64]:
cdpam_PSNR = [-10*np.log10(np.mean(d)) for d in cdpam_distance][1:]
cdpam_PSNR

  cdpam_PSNR = [-10*np.log10(np.mean(d)) for d in cdpam_distance][1:]


[37.89975888664649, 38.27911434099799, 46.34223939544932]

In [65]:
audio_bps = [method['bps'].mean().item() for method in common_voice[1:]]
audio_bps

[0.6696820259094238, 0.14399589598178864, 0.12622858583927155]