In [1]:
from huggingface_hub import hf_hub_download
import torch
from moshi.models import loaders, LMGen

  from .autonotebook import tqdm as notebook_tqdm


### Mimi

In [2]:
# Download the Mimi codec weights and build the model on CPU.
mimi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MIMI_NAME)
mimi = loaders.get_mimi(mimi_weight, device='cpu')
mimi.set_num_codebooks(8)  # up to 32 for mimi, but limited to 8 for moshi.

# Seed the RNG so the random test signal is the same on every Restart & Run All.
torch.manual_seed(0)
# 10 seconds of noise, sized from the codec's own sample rate rather than a
# hard-coded 24000 so it stays consistent with `frame_size` below.
# NOTE(review): presumably mimi.sample_rate is 24000 for this checkpoint - confirm.
wav = torch.randn(1, 1, int(mimi.sample_rate * 10))  # should be [B, C=1, T]
with torch.no_grad():
    # One-shot encode/decode of the whole clip.
    codes = mimi.encode(wav)  # [B, K = 8, T]
    decoded = mimi.decode(codes)

    # Supports streaming too: feed the same clip one codec frame at a time.
    frame_size = int(mimi.sample_rate / mimi.frame_rate)
    all_codes = []
    with mimi.streaming(batch_size=1):
        for offset in range(0, wav.shape[-1], frame_size):
            frame = wav[:, :, offset: offset + frame_size]
            # Use a distinct name so the whole-clip `codes` above is not
            # overwritten by the last single-frame result.
            frame_codes = mimi.encode(frame)
            # Each streamed frame must produce exactly one time step of codes.
            assert frame_codes.shape[-1] == 1, frame_codes.shape
            all_codes.append(frame_codes)
            

In [5]:
# Display the codes collected by the streaming loop: a list with one tensor
# appended per streamed frame.
all_codes

[tensor([[[ 430],
          [1213],
          [1513],
          [ 991],
          [ 377],
          [ 173],
          [ 462],
          [1232]]]),
 tensor([[[ 605],
          [  49],
          [ 569],
          [ 991],
          [ 322],
          [ 173],
          [ 462],
          [1232]]]),
 tensor([[[1964],
          [  49],
          [ 569],
          [ 991],
          [1636],
          [ 173],
          [ 462],
          [1966]]]),
 tensor([[[1964],
          [  49],
          [1513],
          [ 991],
          [ 713],
          [ 173],
          [1496],
          [ 607]]]),
 tensor([[[1464],
          [  49],
          [1513],
          [ 991],
          [ 713],
          [ 173],
          [1163],
          [1793]]]),
 tensor([[[2015],
          [  49],
          [1513],
          [ 991],
          [ 713],
          [ 173],
          [ 462],
          [ 607]]]),
 tensor([[[2015],
          [  49],
          [1513],
          [ 991],
          [ 713],
          [ 173],
          

### Moshi

In [6]:
# Run Moshi on the GPU, feeding it the Mimi codes frame by frame and decoding
# its audio tokens back to a waveform with Mimi as they are produced.
# NOTE(review): the recorded run of this cell ended in a CUDA OutOfMemoryError
# on a GPU with 14.75 GiB total capacity, so it needs a larger GPU than that.
mimi.cuda()
moshi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MOSHI_NAME)
moshi = loaders.get_moshi_lm(moshi_weight, device='cuda')
lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)  # this handles sampling params etc.
out_wav_chunks = []
# Now we will stream over both Moshi I/O, and decode on the fly with Mimi.
with torch.no_grad(), lm_gen.streaming(1), mimi.streaming(1):
    for idx, code in enumerate(all_codes):
        tokens_out = lm_gen.step(code.cuda())
        # tokens_out is [B, 1 + 8, 1]: index 0 along dim 1 is the text token and
        # indices 1..8 are the audio codebooks, which is why only
        # tokens_out[:, 1:] is passed to Mimi below.
        # A step may return None; those steps produce no audio and are skipped.
        if tokens_out is not None:
            wav_chunk = mimi.decode(tokens_out[:, 1:])
            out_wav_chunks.append(wav_chunk)
        print(idx, end='\r')
# torch.cat fails with an opaque message on an empty list; fail explicitly if
# no step yielded tokens (e.g. `all_codes` is empty or very short).
if not out_wav_chunks:
    raise RuntimeError(
        "Moshi produced no audio tokens for the given input; "
        "`all_codes` is empty or too short."
    )
out_wav = torch.cat(out_wav_chunks, dim=-1)

OutOfMemoryError: CUDA out of memory. Tried to allocate 250.00 MiB. GPU 0 has a total capacity of 14.75 GiB of which 49.06 MiB is free. Including non-PyTorch memory, this process has 14.70 GiB memory in use. Of the allocated memory 14.36 GiB is allocated by PyTorch, and 229.00 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)