This notebook showcases how to use the batched voice encoder and decoder based on `encodec`.

In [None]:
import sys
# Add the parent directory to the module search path; presumably this is the
# repository root, which lets the `src.*` imports in the next cell resolve — confirm.
sys.path.append('..')

In [None]:
from queue import Queue

import torch
from encodec.utils import save_audio
from IPython.display import Audio

from src.decoder import VoiceDecoder
from src.encoder import VoiceEncoder
from src.utils import read_audio

In [None]:
# Keyword arguments passed unchanged to both the encoder and the decoder.
# Keeping them in one place ensures both objects are built with the same values.
shared_settings = dict(
    bandwidth=6.0,
    single_segment_duration=5,
    overlap=0,
    device='cuda:0',
)

# The encoder additionally takes a batch size.
voice_encoder = VoiceEncoder(batch_size=128, **shared_settings)

voice_decoder = VoiceDecoder(**shared_settings)

In [None]:
audio_files_path = [
    '../data/test-clean/LibriSpeech/test-clean/260/123286/260-123286-0019.flac',
]

# Value passed to `read_audio` (presumably the target sample rate in Hz) and the
# maximum number of samples kept per clip along the last axis (30 s at that rate).
sample_rate = 24_000
max_samples = sample_rate * 30

# Load every clip once and truncate it along its second axis.
original_audio = [
    read_audio(path, sample_rate)[:, :max_samples]
    for path in audio_files_path
]

# Prepare a Queue for sending to the voice encoder: one (waveform, path) pair per clip
audio_files = Queue()
for path, clip in zip(audio_files_path, original_audio):
    audio_files.put((clip, path))

### Batched encodec - encoder and decoder

Both the encoder and the decoder require a `Queue`:
- The encoder requires a queue where each element is an `(audio tensor, file path)` tuple
- The decoder requires a queue where each element is a batch of shape `(B, K, T)`, where `B` is the batch size, `K` is the number of codebooks and `T` is the sequence length

In [None]:
# Calling the encoder on the input queue gives an iterable of encoded batches.
encoder = voice_encoder(audio_files)

# Queue that hands the encoded batches over to the decoder.
encoded_q = Queue()

for encoded_batch in encoder:
    # Element 0 is forwarded to the decoder; element 1 is only reported
    # (per the message below: filename plus start/end indices).
    encoded_q.put(encoded_batch[0])
    segment_info = encoded_batch[1]
    print(f'Filename and start idx, end idx: {segment_info}')

# Drain the decoder, keeping every batch it yields.
decoder = voice_decoder(encoded_q)
out_audio = list(decoder)

### Naive encodec

By default, encodec takes a single audio file and encodes/decodes it in one go.

In [None]:
# Feed the whole clip through the encoder's underlying model in a single call.
# `unsqueeze(0)` adds a leading dimension (presumably the batch axis) before the
# clip is moved to the GPU.
whole_clip = original_audio[0].unsqueeze(0).to('cuda:0')

# Inference only: disable autograd so no computation graph is retained for the
# full-length clip, which would otherwise waste GPU memory.
with torch.no_grad():
    original_impl = voice_encoder.model(whole_clip)

Play all three audio clips and compare:

1. Original audio
2. Naive encoding and decoding
3. Batched encoding and decoding

In [None]:
# Original audio: the first (truncated) clip exactly as it was loaded
original_clip = original_audio[0]
Audio(data=original_clip, rate=24_000)

In [None]:
# Reconstructed audio all at one go
# Detach from autograd, move to host memory and flatten to 1-D before playback.
naive_waveform = original_impl.detach().cpu().numpy().flatten()
Audio(data=naive_waveform, rate=24_000)

In [None]:
# Reconstructed audio in segments
# Detach before converting so playback also works if the decoder output is still
# attached to an autograd graph; this mirrors how the naive reconstruction and the
# saved files are prepared. The explicit NumPy conversion avoids relying on
# implicit tensor-to-array coercion inside `Audio`.
batched_waveform = out_audio[0].detach().cpu().numpy().flatten()
Audio(batched_waveform, rate=24_000)

In [None]:
# Write both reconstructions to disk so they can be compared outside the notebook.

# Naive output: drop its leading dimension before saving.
naive_wav = original_impl.detach().cpu().squeeze(0)
save_audio(naive_wav, 'naive_encodec.wav', sample_rate=24_000)

# Batched output: add a leading dimension before saving.
batched_wav = out_audio[0].detach().cpu().unsqueeze(0)
save_audio(batched_wav, 'batched_encodec.wav', sample_rate=24_000)