In [1]:
from datasets import load_dataset, Audio
from transformers import EncodecModel, AutoProcessor
import numpy as np
import torch
import torchaudio

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# dummy dataset, however you can swap this with an dataset on the 🤗 hub or bring your own
librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

Found cached dataset librispeech_asr_dummy (/home/gmongaras/.cache/huggingface/datasets/hf-internal-testing___librispeech_asr_dummy/clean/2.1.0/d3bc4c2bc2078fcde3ad0f0f635862e4c0fef78ba94c4a34c4c250a097af240b)


In [3]:
# load the model + processor (for pre-processing the audio)
model = EncodecModel.from_pretrained("facebook/encodec_24khz").eval().cuda()
processor = AutoProcessor.from_pretrained("facebook/encodec_24khz")

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


In [4]:
# cast the audio data to the correct sampling rate for the model
librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=processor.sampling_rate))
audio_sample = librispeech_dummy[0]["audio"]["array"]

In [5]:
# Save the audio clip
torchaudio.save("sample.wav", torch.tensor(audio_sample).unsqueeze(0).float(), sample_rate=processor.sampling_rate)

In [6]:
# Process as batch
inputs = processor(audio_sample.tolist(), sampling_rate=processor.sampling_rate, return_tensors="pt", padding=True)

In [7]:
inputs

{'padding_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], dtype=torch.int32), 'input_values': tensor([[[0.0023, 0.0025, 0.0019,  ..., 0.0006, 0.0010, 0.0008]]])}

In [8]:
inputs["input_values"].shape

torch.Size([1, 1, 140520])

In [9]:
# Encode inputs
encoder_outputs = model.encode(inputs["input_values"].cuda(), inputs["padding_mask"].cuda(), bandwidth=24.0)

In [10]:
# Decode inputs
audio_values = model.decode(encoder_outputs.audio_codes, encoder_outputs.audio_scales, inputs["padding_mask"].cuda())[0]

In [11]:
# Save the audio clip
torchaudio.save("test2.wav", audio_values.reshape(-1).cpu().squeeze().unsqueeze(0).float(), sample_rate=processor.sampling_rate)

In [12]:
# or the equivalent with a forward pass
audio_values = model(inputs["input_values"].cuda(), inputs["padding_mask"].cuda(), bandwidth=24.0).audio_values

# you can also extract the discrete codebook representation for LM tasks
# output: concatenated tensor of all the representations
audio_codes = model(inputs["input_values"].cuda(), inputs["padding_mask"].cuda(), bandwidth=24.0).audio_codes

In [13]:
audio_values.shape

torch.Size([1, 1, 140520])

In [14]:
# Output is of shape (batch_size, num_features/num_filters/number of codebooks, num_timesteps)
# So we have 32 codebooks. Each codebook has an index between [0, 1023]. So this is represented
# by a matrix where each token is 32 values long and there are T tokens.
audio_codes.shape

torch.Size([1, 1, 32, 440])

In [15]:
audio_codes

tensor([[[[  62,  835,  835,  ...,  835,  835,  835],
          [1007, 1007, 1007,  ...,  424,  518,  518],
          [ 786,  678,  821,  ...,   36,  653,   36],
          ...,
          [ 387,  764,  557,  ...,  640,  978,  652],
          [ 659,  804,  947,  ...,  799,  610,  525],
          [ 567,  925,  657,  ...,  665,  683,  679]]]], device='cuda:0')

In [16]:
# Get the quantizer from the model
quantizer = model.quantizer

In [25]:
audio_codes.shape

torch.Size([1, 1, 32, 440])

In [26]:
# Dequantize the outputs
audio_values = quantizer.decode(audio_codes.reshape(32, 1, 440))

In [27]:
audio_values[0].shape

torch.Size([128, 440])

In [28]:
# How do the dequantize outptus compare with the outptus before quantization?
print(((quantizer.decode(audio_codes.reshape(32, 1, 440)) - model.encoder(inputs["input_values"].cuda()))**2).mean())

tensor(0.0144, device='cuda:0', grad_fn=<MeanBackward0>)


In [29]:
# What about batching?
print(((quantizer.decode(audio_codes.reshape(32, 1, 440).repeat(1, 2, 1)) - model.encoder(inputs["input_values"].cuda()).repeat(2, 1, 1))**2).mean())

tensor(0.0144, device='cuda:0', grad_fn=<MeanBackward0>)


In [None]:
### So we know the decoder is expecting shape (CB, B, T)

In [None]:
import torch
import torchaudio
import matplotlib.pyplot as plt
import matplotlib

In [None]:
import sys
sys.path.append('../')
from src.encodec.model import EncodecModel
from src.encodec.modules.seanet import SEANetEncoder, SEANetDecoder
from src.encodec.quantization.vq import ResidualVectorQuantizer

In [3]:
# Let's say we have a batch of audio, each 10 seconds each
batch = torch.randn(20, 1, 16000*10)

In [4]:
# Create the model
dim = 128
encoder, decoder, vq = SEANetEncoder(dimension=dim), SEANetDecoder(dimension=dim), ResidualVectorQuantizer(dim)
Model = EncodecModel(
    encoder=encoder,
    decoder=decoder,
    quantizer=vq,
    target_bandwidths=[6.0, 8.0, 10.0, 12.0, 14.0, 16.0],
    sample_rate=16000,
    channels=1
)

In [5]:
# Put batch and model on GPU
batch = batch.cuda()
Model = Model.cuda()

In [6]:
Model(batch)

tensor([[[-0.0789, -0.0704, -0.0792,  ..., -0.0790, -0.0896, -0.0600]],

        [[-0.0789, -0.0704, -0.0792,  ..., -0.0790, -0.0896, -0.0600]],

        [[-0.0789, -0.0704, -0.0792,  ..., -0.0790, -0.0896, -0.0600]],

        ...,

        [[-0.0789, -0.0704, -0.0792,  ..., -0.0790, -0.0896, -0.0600]],

        [[-0.0789, -0.0704, -0.0792,  ..., -0.0790, -0.0896, -0.0600]],

        [[-0.0789, -0.0704, -0.0792,  ..., -0.0790, -0.0896, -0.0600]]],
       device='cuda:0', grad_fn=<SliceBackward0>)

In [7]:
# Adam optimizer
optimizer = torch.optim.Adam(Model.parameters(), lr=1e-4)

In [8]:
# Do a single pass and loss step to see gpu memory usage
optimizer.zero_grad()
loss = Model(batch).mean()
loss.backward()
optimizer.step()

In [10]:
# Get memory usage
print(f"{torch.cuda.memory_allocated() / 1e9} GB")

9.257893376 GB
