In [1]:
from open_musiclm.open_musiclm import MusicLM

from open_musiclm.config import create_musiclm_from_config

from open_musiclm.config import load_model_config

In [2]:
import torch

# Проверка доступных GPU
if torch.cuda.is_available():
    print(f"Доступно GPU: {torch.cuda.device_count()}")
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
else:
    print("GPU недоступны")

Доступно GPU: 2
GPU 0: NVIDIA GeForce RTX 3090
GPU 1: NVIDIA GeForce GTX 1050 Ti


In [3]:
# Текущая память
props = torch.cuda.get_device_properties(0)

allocated = torch.cuda.memory_allocated(0) / 1024 ** 2
reserved = torch.cuda.memory_reserved(0) / 1024 ** 2
free_memory = props.total_memory / 1024 ** 2 - reserved

print(f"Allocated Memory: {allocated:.2f} MB")
print(f"Reserved Memory: {reserved:.2f} MB")
print(f"Free Memory: {free_memory:.2f} MB")


Allocated Memory: 0.00 MB
Reserved Memory: 0.00 MB
Free Memory: 24257.56 MB


In [4]:
semantic_path = "checkpoints_new/semantic.transformer.14000.pt"
coarse_path = "checkpoints_new/coarse.transformer.18000.pt"
fine_path = "checkpoints_new/fine.transformer.24000.pt"
rvq_path = 'checkpoints_new/clap.rvq.950_no_fusion.pt'
kmeans_path = "checkpoints_new/kmeans_10s_no_fusion.joblib"
model_config = "./configs/model/musiclm_large_small_context.json"
results_folder = "results"

In [5]:
from open_musiclm.config import load_model_config

model_config = load_model_config(model_config)

In [6]:
model_config

MusicLMModelConfig(clap_rvq_cfg=ClapRVQConfig(rq_num_quantizers=12, codebook_size=1024, enable_fusion=False, rq_ema_decay=0.95, threshold_ema_dead_code=0.5, checkpoint_path=None, amodel_type='HTSAT-tiny'), hubert_kmeans_cfg=HubertKmeansConfig(model_name='m-a-p/MERT-v0', normalize_embeds=True, embed_layer=7, target_sample_hz=16000, seq_len_multiple_of=320, codebook_size=1024, output_hz=50), encodec_cfg=EncodecConfig(bandwidth=6.0, codebook_size=1024, output_hz=75), semantic_cfg=SemanticConfig(dim=1024, depth=24, heads=16, attn_dropout=0.0, ff_dropout=0.1, use_conv_ff=True, grad_shrink_alpha=0.1, non_causal_prefix_size=0, relative_position_bias_type='continuous', use_memory_efficient_attention=False, use_absolute_position_embeddings=False, max_absolute_position_embeddings=262), coarse_cfg=CoarseConfig(dim=1024, depth=24, heads=16, attn_dropout=0.0, ff_dropout=0.1, use_conv_ff=True, grad_shrink_alpha=0.1, non_causal_prefix_size=0, relative_position_bias_type='continuous', use_memory_effic

In [7]:
import torch
from open_musiclm.config import (create_semantic_transformer_from_config, 
                                 create_coarse_transformer_from_config, 
                                 create_fine_transformer_from_config,
                                 create_clap_quantized_from_config,
                                 create_hubert_kmeans_from_config,
                                 create_encodec_from_config
)
                                 
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [8]:
device

'cuda:0'

In [9]:
semantic_transformer = create_semantic_transformer_from_config(model_config, semantic_path, device)
print("semantic успешно загружен")
coarse_transformer = create_coarse_transformer_from_config(model_config, coarse_path, device)
print("coarse успешно загружен")
fine_transformer = create_fine_transformer_from_config(model_config, fine_path, device)
print("fine успешно загружен")

semantic успешно загружен
coarse успешно загружен
fine успешно загружен


In [10]:
clap = create_clap_quantized_from_config(model_config, rvq_path, device)
print("clap успешно загружен")
wav2vec = create_hubert_kmeans_from_config(model_config, kmeans_path, device)
print("wav2vec успешно загружен")
encodec_wrapper = create_encodec_from_config(model_config, device)
print("encodec успешно загружен")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


clap успешно загружен


You are using a model of type mert_model to instantiate a model of type hubert. This is not supported for all configurations of models and can yield errors.
  return self.fget.__get__(instance, owner)()
Some weights of HubertModel were not initialized from the model checkpoint at m-a-p/MERT-v0 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


wav2vec успешно загружен
encodec успешно загружен


In [11]:
clap_token_ids = clap(text_input="woman singing jazz melody")
clap_token_ids

tensor([[[211],
         [517],
         [459],
         [853],
         [542],
         [263],
         [ 90],
         [396],
         [856],
         [359],
         [343],
         [442]]], device='cuda:0')

In [11]:
from open_musiclm.open_musiclm import SemanticStage, CoarseStage, FineStage

semantic = SemanticStage(
            semantic_transformer=semantic_transformer,
            wav2vec=wav2vec,
            clap=clap,
        )
coarse = CoarseStage(
            coarse_transformer=coarse_transformer,
            wav2vec=wav2vec,
            clap=clap,
            neural_codec=encodec_wrapper
        )
fine = FineStage(
            fine_transformer=fine_transformer,
            clap=clap,
            neural_codec=encodec_wrapper
        )

In [12]:
output_seconds = 10
semantic_window_seconds = 10
coarse_window_seconds = 8
fine_window_seconds = 4
semantic_steps_per_second = 50
semantic_sliding_window_step_percent = 0.5
semantic_token_adjustment = 0

all_semantic_token_ids = semantic.generate(
            clap_token_ids=clap_token_ids,
            semantic_token_ids=None,
            max_time_steps=int(min(output_seconds, semantic_window_seconds) * semantic_steps_per_second),
            include_eos_in_output=False,
            append_eos_to_conditioning_tokens=True,
        )

while all_semantic_token_ids.shape[1] < int(output_seconds * semantic_steps_per_second):
    condition_length = int(semantic_window_seconds * semantic_steps_per_second * (1 - semantic_sliding_window_step_percent))
    condition_semantic_token_ids = all_semantic_token_ids[:, -condition_length:]
    pred_semantic_token_ids = semantic.generate(
                clap_token_ids=clap_token_ids,
                semantic_token_ids=condition_semantic_token_ids,
                max_time_steps=int(semantic_window_seconds * semantic_steps_per_second),
                include_eos_in_output=False,
                append_eos_to_conditioning_tokens=True,
            )
    pred_semantic_token_ids = pred_semantic_token_ids[:, condition_length:]
    all_semantic_token_ids = torch.cat([all_semantic_token_ids, pred_semantic_token_ids], dim=1)

        # crop semantic tokens to line up with coarse tokens
all_semantic_token_ids = all_semantic_token_ids[:, semantic_token_adjustment:]

generating predicted tokens: 100%|██████████| 500/500 [00:17<00:00, 27.95it/s]


In [13]:
all_semantic_token_ids.shape

torch.Size([1, 500, 1])

In [14]:
from open_musiclm.utils import exists
from einops import rearrange


In [15]:
semantic_steps_per_second = 50
acoustic_steps_per_second = 75
semantic_sliding_window_step_percent = 0.5
coarse_sliding_window_step_percent = 0.5

window_size = int(coarse_window_seconds * semantic_steps_per_second - 1)
step_size = int(window_size * coarse_sliding_window_step_percent)
all_semantic_token_ids = all_semantic_token_ids.unfold(1, window_size, step_size)
all_semantic_token_ids = rearrange(all_semantic_token_ids, 'b n q w -> n b w q')

all_coarse_token_ids = None
for semantic_token_ids in all_semantic_token_ids:
    if exists(all_coarse_token_ids):
        condition_length = int(coarse_window_seconds * acoustic_steps_per_second * (1 - coarse_sliding_window_step_percent))
        condition_coarse_token_ids = all_coarse_token_ids[:, -condition_length:]
    else:
        condition_coarse_token_ids = None

        pred_coarse_token_ids = coarse.generate(
                clap_token_ids=clap_token_ids,
                semantic_token_ids=semantic_token_ids,
                coarse_token_ids=condition_coarse_token_ids,
                max_time_steps=int(coarse_window_seconds * acoustic_steps_per_second),
                reconstruct_wave=False,
                include_eos_in_output=False,
                append_eos_to_conditioning_tokens=True,
                temperature=0.95,
            )

    if not exists(all_coarse_token_ids):
            all_coarse_token_ids = pred_coarse_token_ids
    else:
            pred_coarse_token_ids = pred_coarse_token_ids[:, condition_length:]
            all_coarse_token_ids = torch.cat([all_coarse_token_ids, pred_coarse_token_ids], dim=1)

        # crop coarse tokens to line up with fine tokens
all_coarse_token_ids = all_coarse_token_ids[:, 0:]

generating predicted tokens: 100%|██████████| 600/600 [05:52<00:00,  1.70it/s]


In [16]:
all_coarse_token_ids.shape

torch.Size([1, 600, 3])

In [17]:
fine_sliding_window_step_percent = 1
audio_condition_fine_token_ids = None
fine_token_adjustment = 0

fine_window_size = int(fine_window_seconds * acoustic_steps_per_second)
fine_step_size = int(fine_window_size * fine_sliding_window_step_percent)
all_coarse_token_ids_unfolded = all_coarse_token_ids.unfold(1, fine_window_size, fine_step_size)
all_coarse_token_ids_unfolded = rearrange(all_coarse_token_ids_unfolded, 'b n q w -> n b w q')

all_fine_token_ids = None
for coarse_token_ids in all_coarse_token_ids_unfolded:
    if exists(all_fine_token_ids):
                condition_length = int(fine_window_size * (1 - fine_sliding_window_step_percent))
                condition_fine_token_ids = all_fine_token_ids[:, -condition_length:] if condition_length > 0 else None
    else:
        condition_fine_token_ids = audio_condition_fine_token_ids

        pred_fine_token_ids = fine.generate(
                clap_token_ids=clap_token_ids,
                coarse_token_ids=coarse_token_ids,
                fine_token_ids=condition_fine_token_ids,
                max_time_steps=fine_window_size,
                reconstruct_wave=False,
                include_eos_in_output=False,
                append_eos_to_conditioning_tokens=True,
                temperature=0.4,
            )
    if not exists(all_fine_token_ids):
                all_fine_token_ids = pred_fine_token_ids
    else:
        pred_fine_token_ids = pred_fine_token_ids[:, condition_length:]
        all_fine_token_ids = torch.cat([all_fine_token_ids, pred_fine_token_ids], dim=1)

        # crop fine tokens to remove conditioning audio
all_fine_token_ids = all_fine_token_ids[:, fine_token_adjustment:]

generating predicted tokens: 100%|██████████| 300/300 [06:55<00:00,  1.38s/it]


In [18]:
all_fine_token_ids.shape

torch.Size([1, 600, 5])

In [19]:
all_coarse_token_ids.shape

torch.Size([1, 600, 3])

In [20]:
all_audio_condition_coarse_token_ids = None
all_audio_condition_fine_token_ids = None

if exists(all_audio_condition_coarse_token_ids) and exists(all_audio_condition_fine_token_ids):
    all_fine_token_ids = torch.cat([all_audio_condition_fine_token_ids, all_fine_token_ids], dim=1)
    all_coarse_token_ids = torch.cat([all_audio_condition_coarse_token_ids, all_coarse_token_ids], dim=1)

    print(all_fine_token_ids.shape)
    print(all_coarse_token_ids.shape)

all_acoustic_token_ids = torch.cat([all_coarse_token_ids, all_fine_token_ids], dim=-1)
wave = encodec_wrapper.decode_from_codebook_indices(all_acoustic_token_ids)
wave = rearrange(wave, 'b 1 n -> b n')
wave

tensor([[-0.0251, -0.0239, -0.0233,  ..., -0.0014, -0.0013,  0.0003]],
       device='cuda:0')

In [21]:
wave.shape

torch.Size([1, 192000])

In [22]:
generated_wave = rearrange(wave, 'b n -> b 1 n')

In [23]:
generated_wave

tensor([[[-0.0251, -0.0239, -0.0233,  ..., -0.0014, -0.0013,  0.0003]]],
       device='cuda:0')

In [24]:
generated_wave.shape

torch.Size([1, 1, 192000])

In [25]:
import IPython.display as ipd  # type: ignore

def display_audio(samples: torch.Tensor, sample_rate: int):
    """Renders an audio player for the given audio samples.

    Args:
        samples (torch.Tensor): a Tensor of decoded audio samples
            with shapes [B, C, T] or [C, T]
        sample_rate (int): sample rate audio should be displayed with.
    """
    assert samples.dim() == 2 or samples.dim() == 3

    samples = samples.detach().cpu()
    if samples.dim() == 2:
        samples = samples[None, ...]

    for audio in samples:
        ipd.display(ipd.Audio(audio, rate=sample_rate))

In [30]:
display_audio(wave.detach().cpu(), sample_rate=32000)

In [34]:
all_acoustic_token_ids

tensor([[[1022,  820,  615,  ...,  975,  696,  252],
         [ 495,  455,  513,  ...,  851,  944,  936],
         [ 901,  228,  656,  ...,  634,  944,  828],
         ...,
         [ 257,  913,  786,  ...,  349,  526,   31],
         [ 257,  913,  678,  ...,  349,  526,   31],
         [ 257,  913,  706,  ...,  349,  526,   31]]], device='cuda:0')

In [27]:
# import torchaudio
# torchaudio.save('test_wave.wav', wave.detach().cpu(), 24000)

In [12]:
from open_musiclm.config import load_model_config, create_musiclm_from_config

In [13]:
musiclm = create_musiclm_from_config(
        model_config=model_config,
        semantic_path=semantic_path,
        coarse_path=coarse_path,
        fine_path=fine_path,
        rvq_path=rvq_path,
        kmeans_path=kmeans_path,
        device=device)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You are using a model of type mert_model to instantiate a model of type hubert. This is not supported for all configurations of models and can yield errors.
Some weights of HubertModel were not initialized from the model checkpoint at m-a-p/MERT-v0 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [17]:
musiclm()

AssertionError: text needs to be passed in if one of the transformer requires conditioning