In [None]:
from pathlib import Path
from IPython.display import display, Audio

# Only 1 import required from AudioToken
from audiotoken import AudioToken, Tokenizers

In [None]:
# Device string passed to every tokenizer constructed in this notebook.
device = "cuda:0"
# Location of the input audio.
# NOTE(review): '..' looks like a placeholder — point this at a real audio file before running.
audio_path = ".."

## Acoustic tokenizer

We can use an acoustic tokenizer to encode audio and then use its decoder to reconstruct the audio. This works as a form of lossy compression: the reconstruction is not identical to the input, but it preserves audio characteristics such as speech style, loudness, and pitch.

Acoustic tokens are very difficult to model directly with sequence-to-sequence models.

In [None]:
# Instantiate the acoustic variant of AudioToken on the configured device.
acoustic_tokenizer = AudioToken(
    device=device,
    tokenizer=Tokenizers.acoustic,
)

In [None]:
# Round-trip the input file through the acoustic tokenizer:
# encode it to tokens, then decode those tokens straight back.
encoded_audio = acoustic_tokenizer.encode(
    Path(audio_path).expanduser(),  # expand a leading "~" before handing the path over
)
decoded_audio = acoustic_tokenizer.decode(encoded_audio)

Compare the original audio with the reconstructed audio:

In [None]:
# A/B listen: the source file first, then the acoustic round-trip result.
print('Original audio')
display(
    Audio(data=Path(audio_path).expanduser())
)

print('Reconstructed audio')
display(
    # `rate` is the sampling rate (Hz) the player uses for raw sample data.
    # NOTE(review): 24_000 is presumably the acoustic decoder's output rate — confirm.
    Audio(data=decoded_audio, rate=24_000)
)

## Semantic tokenizer

We can use a semantic tokenizer to encode audio and extract semantic meaning from it. Although this tokenization loses many of the acoustic characteristics that an acoustic tokenizer preserves, semantic tokens are useful for modeling long-term audio content.
To reconstruct the original audio from semantic tokens, we follow two steps:

1. We use an autoregressive sequence-to-sequence model to translate semantic tokens into acoustic tokens (with 2 codebooks).
2. We use a non-autoregressive model to generate acoustic tokens for 6 additional codebooks.

Once we have acoustic tokens for all 8 codebooks, we can use the acoustic tokenizer to decode these tokens back into audio.

In [None]:
# Instantiate the `semantic_m` variant of AudioToken on the configured device.
semantic_tokenizer = AudioToken(
    device=device,
    tokenizer=Tokenizers.semantic_m,
)

In [None]:
# Semantic tokens for the input file.
# NOTE(review): chunk_size=30 is forwarded unchanged; its unit is presumably
# seconds — confirm against the audiotoken documentation.
audio_semantic_tokens = semantic_tokenizer.encode(
    Path(audio_path).expanduser(),
    chunk_size=30,
)

# Decoding with the semantic tokenizer produces acoustic tokens rather than
# audio; the acoustic tokenizer decodes them in a later cell.
audio_acoustic_tokens = semantic_tokenizer.decode(audio_semantic_tokens)

In [None]:
# Show both token shapes side by side: semantic tokens first, then the
# acoustic tokens derived from them.
(
    audio_semantic_tokens.shape,
    audio_acoustic_tokens.shape,
)

In [None]:
# Decode the acoustic tokens obtained from the semantic tokenizer back
# into audio using the acoustic tokenizer's decoder.
reconstructed_audio_semantic = acoustic_tokenizer.decode(
    audio_acoustic_tokens,
)

In [None]:
# A/B listen: the source file first, then the audio rebuilt via semantic tokens.
print('Original audio')
display(
    Audio(data=Path(audio_path).expanduser())
)

print('Reconstructed audio')
display(
    # `rate` is the sampling rate (Hz) the player uses for raw sample data.
    # NOTE(review): 24_000 is presumably the acoustic decoder's output rate — confirm.
    Audio(data=reconstructed_audio_semantic, rate=24_000)
)