In [1]:
from datasets import load_dataset
from IPython.display import Audio as AudioDisplay
from torchmetrics.functional.audio.pesq import perceptual_evaluation_speech_quality as pesq
from transformers import AutoFeatureExtractor, MimiModel, DacModel, AutoProcessor
import torch

In [2]:
from codec_latent_denoiser import CodecLatentDenoiser

In [3]:
# Load the pretrained latent denoiser; the bare `model` on the last line
# displays its module tree.
# NOTE(review): the load log reports checkpoint weights that were not used
# (denoiser.layer1.bias, denoiser.layer2.bias, denoiser.layer_norm.bias) --
# confirm the model definition matches this checkpoint.
model = CodecLatentDenoiser.from_pretrained("gokulkarthik/codec-latent-denoiser-default")
model

config.json:   0%|          | 0.00/814 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/157M [00:00<?, ?B/s]

Some weights of the model checkpoint at gokulkarthik/codec-latent-denoiser-default were not used when initializing CodecLatentDenoiser: ['denoiser.layer1.bias', 'denoiser.layer2.bias', 'denoiser.layer_norm.bias']
- This IS expected if you are initializing CodecLatentDenoiser from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CodecLatentDenoiser from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


CodecLatentDenoiser(
  (codec): DacModel(
    (encoder): DacEncoder(
      (conv1): Conv1d(1, 64, kernel_size=(7,), stride=(1,), padding=(3,))
      (block): ModuleList(
        (0): DacEncoderBlock(
          (res_unit1): DacResidualUnit(
            (snake1): Snake1d()
            (conv1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(3,))
            (snake2): Snake1d()
            (conv2): Conv1d(64, 64, kernel_size=(1,), stride=(1,))
          )
          (res_unit2): DacResidualUnit(
            (snake1): Snake1d()
            (conv1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(9,), dilation=(3,))
            (snake2): Snake1d()
            (conv2): Conv1d(64, 64, kernel_size=(1,), stride=(1,))
          )
          (res_unit3): DacResidualUnit(
            (snake1): Snake1d()
            (conv1): Conv1d(64, 64, kernel_size=(7,), stride=(1,), padding=(27,), dilation=(9,))
            (snake2): Snake1d()
            (conv2): Conv1d(64, 64, kernel_size=(1,), st

In [2]:
# Dataset repository id: paired clean/noisy speech recordings (16 kHz per the name).
data_path = "JacobLinCool/VoiceBank-DEMAND-16k"

In [3]:
# Reuse `data_path` instead of repeating the repository id, so the dataset is
# configured in exactly one place. The trailing `ds` displays the splits.
ds = load_dataset(data_path, num_proc=32)
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'clean', 'noisy'],
        num_rows: 11572
    })
    test: Dataset({
        features: ['id', 'clean', 'noisy'],
        num_rows: 824
    })
})

In [4]:
# Pick one training row to inspect; each row holds an id plus a `clean` and a
# `noisy` audio entry (path, array, sampling_rate).
sample_idx = 0
sample = ds['train'][sample_idx]
sample

{'id': 'p226_001',
 'clean': {'path': 'p226_001.wav',
  'array': array([-0.00286865, -0.00497437, -0.00469971, ..., -0.00201416,
         -0.00152588, -0.00021362], shape=(36480,)),
  'sampling_rate': 16000},
 'noisy': {'path': 'p226_001.wav',
  'array': array([-0.00283813, -0.00488281, -0.00473022, ...,  0.006073  ,
          0.00564575,  0.00540161], shape=(36480,)),
  'sampling_rate': 16000}}

In [None]:
# Listen to the clean reference recording at its recorded sampling rate.
AudioDisplay(sample['clean']['array'], rate=sample['clean']['sampling_rate'])

In [6]:
# Listen to the noisy counterpart of the same utterance.
AudioDisplay(sample['noisy']['array'], rate=sample['noisy']['sampling_rate'])

In [7]:
# Baseline: PESQ of the unprocessed noisy recording against its clean reference.
preds = torch.from_numpy(sample['noisy']['array'])
target = torch.from_numpy(sample['clean']['array'])
sampling_rate = sample['noisy']['sampling_rate']
# The score below is computed in wide-band mode with fs=sampling_rate, so both
# recordings must be 16 kHz; check the clean side too, not only the noisy one.
assert sampling_rate == 16000, f"expected 16 kHz audio, got {sampling_rate}"
assert sample['clean']['sampling_rate'] == sampling_rate, "clean/noisy sampling rates differ"
print(preds.shape, target.shape)
# PESQ compares the two signals sample-for-sample, so they must be equally long.
assert preds.shape == target.shape, "clean/noisy signals differ in length"
score = pesq(preds=preds, target=target, fs=sampling_rate, mode="wb").item()
print(score)

torch.Size([36480]) torch.Size([36480])


ModuleNotFoundError: PESQ metric requires that pesq is installed. Either install as `pip install torchmetrics[audio]` or `pip install pesq`.

In [8]:
# PESQ sanity checks: each signal scored against itself, then the clean signal
# scored with the noisy one as reference. `score` keeps the last value.
for estimate, reference in ((preds, preds), (target, target), (target, preds)):
    score = pesq(preds=estimate, target=reference, fs=sampling_rate, mode="wb").item()
    print(score)

4.643888473510742
4.643888473510742
1.1161136627197266


# Let's try DAC

In [8]:
# Load the DAC codec and its processor from the same checkpoint id so the two
# stay in sync. Both are read as module-level globals by later cells.
dac_model_path = "descript/dac_16khz"
dac_model = DacModel.from_pretrained(dac_model_path)
dac_processor = AutoProcessor.from_pretrained(dac_model_path)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [9]:
def dac_forward(audio_sample, n_quantizers=12):
    """Round-trip a waveform through the DAC codec (encode -> quantize -> decode).

    Reads the module-level ``dac_model`` and ``dac_processor``.

    Args:
        audio_sample: Waveform tensor, indexed along its first dimension
            (1-D as used in this notebook). It is handed to the processor with
            ``dac_processor.sampling_rate``, so it is assumed to already be at
            the codec's rate -- no resampling happens here.
        n_quantizers: Number of quantizers forwarded to ``dac_model.encode``.

    Returns:
        A CPU-decoded reconstruction written into a tensor created with
        ``torch.zeros_like(audio_sample)``, i.e. same shape and dtype as the
        input. If the codec returns fewer samples than the input, the tail
        stays zero; if it returns more, the surplus samples are dropped.
    """
    inputs = dac_processor(
        raw_audio=audio_sample,
        sampling_rate=dac_processor.sampling_rate,
        return_tensors="pt",
    )

    # Inference only: no need to build an autograd graph for the round trip.
    with torch.no_grad():
        encoder_outputs = dac_model.encode(inputs["input_values"], n_quantizers=n_quantizers)
        decoded = dac_model.decode(
            quantized_representation=encoder_outputs.quantized_representation
        ).audio_values
    audio_values = decoded.cpu().detach()[0]

    # The decoded length can differ from the input length. Copy only the
    # overlapping part so a longer reconstruction does not raise a shape
    # mismatch on the slice assignment.
    n_samples = min(audio_sample.shape[0], audio_values.shape[0])
    audio_values_tensor = torch.zeros_like(audio_sample)
    audio_values_tensor[:n_samples] = audio_values[:n_samples]

    return audio_values_tensor

In [10]:
# Round-trip both recordings through DAC. The reconstructions must keep the
# input shapes so they can be scored sample-for-sample against the originals.
noisy, clean = (torch.from_numpy(sample[key]['array']) for key in ('noisy', 'clean'))
noisy_transformed, clean_transformed = dac_forward(noisy), dac_forward(clean)
assert (clean.shape, noisy.shape) == (clean_transformed.shape, noisy_transformed.shape)

In [11]:
# Play the clean tensor at the rate recorded with the sample, not a hard-coded 16000.
AudioDisplay(clean, rate=sample['clean']['sampling_rate'])

In [12]:
# Play the noisy tensor at the rate recorded with the sample, not a hard-coded 16000.
AudioDisplay(noisy, rate=sample['noisy']['sampling_rate'])

In [13]:
# DAC reconstruction of the clean recording; the round trip keeps the sample
# count, so it plays back at the source recording's rate.
AudioDisplay(clean_transformed, rate=sample['clean']['sampling_rate'])

In [14]:
# DAC reconstruction of the noisy recording; the round trip keeps the sample
# count, so it plays back at the source recording's rate.
AudioDisplay(noisy_transformed, rate=sample['noisy']['sampling_rate'])

In [15]:
# How much does DAC preserve PESQ? Score each reconstruction against the
# signal it was reconstructed from (clean first, then noisy).
for reconstructed, reference in ((clean_transformed, clean), (noisy_transformed, noisy)):
    print(pesq(preds=reconstructed, target=reference, fs=sampling_rate, mode="wb").item())

ModuleNotFoundError: PESQ metric requires that pesq is installed. Either install as `pip install torchmetrics[audio]` or `pip install pesq`.

In [16]:
# PESQ against the clean reference: the raw noisy signal first, then the
# noisy signal after the DAC round trip.
for candidate in (noisy, noisy_transformed):
    print(pesq(preds=candidate, target=clean, fs=sampling_rate, mode="wb").item())

ModuleNotFoundError: PESQ metric requires that pesq is installed. Either install as `pip install torchmetrics[audio]` or `pip install pesq`.

## Batch operations