In [10]:
import torchaudio
from audiocraft.models import MusicGen
from audiocraft.data.audio import audio_write

import os
import typing as tp

import torch

from audiocraft.models.encodec import CompressionModel
from audiocraft.models.lm import LMModel
from audiocraft.models.builders import get_debug_compression_model, get_debug_lm_model
from audiocraft.models.loaders import load_compression_model, load_lm_model, HF_MODEL_CHECKPOINTS_MAP
from audiocraft.data.audio_utils import convert_audio
from audiocraft.modules.conditioners import ConditioningAttributes, WavCondition
from audiocraft.utils.autocast import TorchAutocast

# Load the pretrained "small" MusicGen checkpoint.
# `self` is bound to the same object so the cells below can use `self.<attr>`
# exactly as code written inside the model class would.
self = model = MusicGen.get_pretrained('small')

In [18]:
# Replace the LM weights with the checkpoint stored on disk.
# map_location='cpu' deserializes the tensors onto the host first, so the load does not
# depend on the device the checkpoint was saved from and does not create a second copy of
# the weights on the GPU; load_state_dict then copies the values into the existing parameters.
# NOTE(review): torch.load unpickles the file -- only load checkpoints from a trusted source.
self.lm.load_state_dict(torch.load('/home/ubuntu/saved_models/lm_300.pt', map_location='cpu'))

<All keys matched successfully>

In [19]:
# Build the conditioning attributes for a single text description; no audio prompt is
# supplied (second argument is None).
attributes, prompt_tokens = self._prepare_tokens_and_attributes(
    ["jazz, bossa nova, by jobim, barzil music, peacefull, pleasant"],
    None,
)
print(f"attributes: {attributes!s}", f"prompt_tokens: {prompt_tokens!s}", sep="\n")

attributes: [ConditioningAttributes(text={'description': 'jazz, bossa nova, by jobim, barzil music, peacefull, pleasant'}, wav={'self_wav': WavCondition(wav=tensor([[0.]], device='cuda:0'), length=tensor([0], device='cuda:0'), path='null_wav')})]
prompt_tokens: None


In [20]:
# Generation settings, kept as plain variables so each one can be tweaked on its own.
duration = 30

use_sampling = True
top_k, top_p = 250, 0.0
temperature = 1.0
cfg_coef = 3.0
two_step_cfg = False

assert duration <= 30, "The MusicGen cannot generate more than 30 seconds"

# Store the settings on the model object; the requested duration is converted to a
# number of generation steps using the model's frame rate.
self.generation_params = dict(
    max_gen_len=int(duration * self.frame_rate),
    use_sampling=use_sampling,
    temp=temperature,
    top_k=top_k,
    top_p=top_p,
    cfg_coef=cfg_coef,
    two_step_cfg=two_step_cfg,
)

In [21]:
# Seed torch's global RNG immediately before sampling so that re-running this cell
# starts from the same random state instead of producing a different clip every time.
# NOTE(review): exact reproducibility still depends on the backend kernels being
# deterministic -- confirm if bit-identical output matters.
SEED = 0
torch.manual_seed(SEED)

# Run the language model under the model's autocast context to produce the token
# sequence for the prepared attributes, using the stored generation parameters.
with self.autocast:
    gen_tokens = self.lm.generate(prompt_tokens, attributes, callback=None, **self.generation_params)

In [22]:
# Sanity check: the generated tokens are expected to be a rank-3 tensor.
assert gen_tokens.ndim == 3

# Summarize the generated tokens, one property per line.
print(
    "gen_tokens information",
    f"Shape: {gen_tokens.shape!s}",
    f"Dtype: {gen_tokens.dtype!s}",
    f"Contents: {gen_tokens!s}",
    sep="\n",
)

gen_tokens information
Shape: torch.Size([1, 4, 1500])
Dtype: torch.int64
Contents: tensor([[[1978,  328,  480,  ..., 1073, 1306, 1064],
         [1386, 1968, 1802,  ..., 1732, 1757, 1732],
         [ 114, 1945, 1708,  ...,   47,   77,   31],
         [ 866, 1670, 1434,  ...,  327, 1992, 1407]]], device='cuda:0')


In [23]:
# Decode the generated tokens back into a waveform; gradients are not needed here.
with torch.no_grad():
    gen_audio = self.compression_model.decode(gen_tokens, None)

# Summarize the decoded audio, one property per line.
print(
    "gen_audio information",
    f"Shape: {gen_audio.shape!s}",
    f"Dtype: {gen_audio.dtype!s}",
    f"Contents: {gen_audio!s}",
    sep="\n",
)


gen_audio information
Shape: torch.Size([1, 1, 960000])
Dtype: torch.float32
Contents: tensor([[[0.0397, 0.0378, 0.0393,  ..., 0.0136, 0.0029, 0.0177]]],
       device='cuda:0')


In [24]:
# Bring the decoded audio back to host memory, then write the first (only) batch item
# to disk at the model's sample rate.
gen_audio = gen_audio.to("cpu")
torchaudio.save(
    "/home/ubuntu/test.wav",
    gen_audio[0],
    self.sample_rate,
)

In [11]:
from audiocraft.data.audio import normalize_audio

# Write a normalized copy of the generated clip (normalize_audio with its default settings).
# The explicit .cpu() makes this cell independent of execution order: it is a no-op when
# gen_audio already lives on the host, and moves the data there if it is still on the GPU.
torchaudio.save("/home/ubuntu/test_norm.wav", normalize_audio(gen_audio[0].cpu()), self.sample_rate)

In [14]:
# Load one dataset segment and look at the shape of its waveform tensor.
# torchaudio.load returns (waveform, sample_rate); only the waveform is inspected here.
wav, segment_sr = torchaudio.load("/home/ubuntu/archivo/segment_008.wav")
wav.shape

torch.Size([2, 1323008])

In [18]:
# Round-trip a reference recording through the compression model (encode -> decode)
# and save the result, so the codec's reconstruction can be listened to on its own.
wav, sr = torchaudio.load("/home/ubuntu/dataset/child.wav")
wav = torchaudio.functional.resample(wav, sr, self.sample_rate)

# Average over the first (channel) axis to get mono, then add a leading batch axis.
wav = wav.mean(dim=0, keepdim=True).unsqueeze(0)

# Inference only: run under no_grad so no autograd graph is built and the decoded
# tensor does not require grad when it is handed to torchaudio.save.
with torch.no_grad():
    # encode returns a pair; its second item is forwarded to decode instead of a
    # hard-coded None so any scale produced by the encoder is honored.
    # NOTE(review): assumes encode returns exactly (codes, scale) -- confirm for this
    # audiocraft version.
    codes, scale = model.compression_model.encode(wav.cuda())
    tmp = model.compression_model.decode(codes, scale)
tmp = tmp.cpu()

torchaudio.save("/home/ubuntu/child_decenc.wav", tmp[0], self.sample_rate)
