In [2]:
from datasets import load_dataset, Audio
from transformers import EncodecModel, AutoProcessor
import numpy as np
import torch
import torchaudio

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# dummy dataset, however you can swap this with an dataset on the 🤗 hub or bring your own
librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

Found cached dataset librispeech_asr_dummy (/home/gmongaras/.cache/huggingface/datasets/hf-internal-testing___librispeech_asr_dummy/clean/2.1.0/d3bc4c2bc2078fcde3ad0f0f635862e4c0fef78ba94c4a34c4c250a097af240b)


In [4]:
# load the model + processor (for pre-processing the audio)
model = EncodecModel.from_pretrained("facebook/encodec_24khz").eval().cuda()
processor = AutoProcessor.from_pretrained("facebook/encodec_24khz")

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


In [5]:
# cast the audio data to the correct sampling rate for the model
librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=processor.sampling_rate))
audio_sample = librispeech_dummy[0]["audio"]["array"]

In [8]:
# Save the audio clip
torchaudio.save("sample.wav", torch.tensor(audio_sample).unsqueeze(0).float(), sample_rate=processor.sampling_rate)

In [159]:
# Process as batch
inputs = processor(audio_sample.tolist(), sampling_rate=processor.sampling_rate, return_tensors="pt", padding=True)

In [160]:
inputs

{'padding_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], dtype=torch.int32), 'input_values': tensor([[[0.0023, 0.0025, 0.0019,  ..., 0.0006, 0.0010, 0.0008]]])}

In [161]:
inputs["input_values"].shape

torch.Size([1, 1, 140520])

In [162]:
# Encode inputs
encoder_outputs = model.encode(inputs["input_values"].cuda(), inputs["padding_mask"].cuda(), bandwidth=24.0)

In [163]:
# Decode inputs
audio_values = model.decode(encoder_outputs.audio_codes, encoder_outputs.audio_scales, inputs["padding_mask"].cuda())[0]

In [164]:
# Save the audio clip
torchaudio.save("test2.wav", audio_values.reshape(-1).cpu().squeeze().unsqueeze(0).float(), sample_rate=processor.sampling_rate)

In [165]:
# or the equivalent with a forward pass
audio_values = model(inputs["input_values"].cuda(), inputs["padding_mask"].cuda(), bandwidth=24.0).audio_values

# you can also extract the discrete codebook representation for LM tasks
# output: concatenated tensor of all the representations
audio_codes = model(inputs["input_values"].cuda(), inputs["padding_mask"].cuda(), bandwidth=24.0).audio_codes

In [166]:
audio_values.shape

torch.Size([1, 1, 140520])

In [168]:
# Output is of shape (batch_size, num_features/num_filters, num_timesteps)
audio_codes.shape

torch.Size([1, 1, 32, 440])

In [23]:
# Get the quantizer from the model
quantizer = model.quantizer

In [32]:
# Dequantize the outputs
audio_values = [quantizer.decode(audio_code) for audio_code in audio_codes]

In [34]:
audio_values[0].shape

torch.Size([2, 128, 440])

In [35]:
# How do the dequantize outptus compare with the outptus before quantization?
audio_values

[tensor([[[-0.6202, -0.0275, -0.0275,  ..., -0.0275, -0.0275, -0.0275],
          [ 8.5276,  8.8629,  8.8629,  ...,  8.8629,  8.8629,  8.8629],
          [-3.5410, -3.4825, -3.4825,  ..., -3.4825, -3.4825, -3.4825],
          ...,
          [-0.5991, -0.4798, -0.4798,  ..., -0.4798, -0.4798, -0.4798],
          [-1.7363, -1.9271, -1.9271,  ..., -1.9271, -1.9271, -1.9271],
          [ 1.7694,  1.8864,  1.8864,  ...,  1.8864,  1.8864,  1.8864]],
 
         [[ 2.9739,  2.9739,  2.9739,  ...,  1.2346,  5.7468,  5.7468],
          [12.2339, 12.2339, 12.2339,  ..., 11.7971, 13.1144, 13.1144],
          [-4.0380, -4.0380, -4.0380,  ..., -4.6215, -4.3802, -4.3802],
          ...,
          [ 0.9357,  0.9357,  0.9357,  ..., -0.8098,  2.1371,  2.1371],
          [-2.6692, -2.6692, -2.6692,  ..., -3.3131, -4.3044, -4.3044],
          [ 2.1866,  2.1866,  2.1866,  ...,  2.3223,  4.2336,  4.2336]]],
        device='cuda:0')]

In [None]:
import torch
import torchaudio
import matplotlib.pyplot as plt
import matplotlib

In [None]:
import sys
sys.path.append('../')
from src.encodec.model import EncodecModel
from src.encodec.modules.seanet import SEANetEncoder, SEANetDecoder
from src.encodec.quantization.vq import ResidualVectorQuantizer

In [3]:
# Let's say we have a batch of audio, each 10 seconds each
batch = torch.randn(20, 1, 16000*10)

In [4]:
# Create the model
dim = 128
encoder, decoder, vq = SEANetEncoder(dimension=dim), SEANetDecoder(dimension=dim), ResidualVectorQuantizer(dim)
Model = EncodecModel(
    encoder=encoder,
    decoder=decoder,
    quantizer=vq,
    target_bandwidths=[6.0, 8.0, 10.0, 12.0, 14.0, 16.0],
    sample_rate=16000,
    channels=1
)

In [5]:
# Put batch and model on GPU
batch = batch.cuda()
Model = Model.cuda()

In [6]:
Model(batch)

tensor([[[-0.0789, -0.0704, -0.0792,  ..., -0.0790, -0.0896, -0.0600]],

        [[-0.0789, -0.0704, -0.0792,  ..., -0.0790, -0.0896, -0.0600]],

        [[-0.0789, -0.0704, -0.0792,  ..., -0.0790, -0.0896, -0.0600]],

        ...,

        [[-0.0789, -0.0704, -0.0792,  ..., -0.0790, -0.0896, -0.0600]],

        [[-0.0789, -0.0704, -0.0792,  ..., -0.0790, -0.0896, -0.0600]],

        [[-0.0789, -0.0704, -0.0792,  ..., -0.0790, -0.0896, -0.0600]]],
       device='cuda:0', grad_fn=<SliceBackward0>)

In [7]:
# Adam optimizer
optimizer = torch.optim.Adam(Model.parameters(), lr=1e-4)

In [8]:
# Do a single pass and loss step to see gpu memory usage
optimizer.zero_grad()
loss = Model(batch).mean()
loss.backward()
optimizer.step()

In [10]:
# Get memory usage
print(f"{torch.cuda.memory_allocated() / 1e9} GB")

9.257893376 GB
