In [1]:
from transformers import pipeline
import soundfile as sf
#from transformers import AutoProcessor, MusicgenMelodyForConditionalGeneration, MusicgenForConditionalGeneration
from diffusers import StableAudioPipeline
import numpy as np
import sys, torch
from tqdm import tqdm as notebook_tqdm

# Token budget corresponding to 30 seconds of audio.
# NOTE(review): not referenced by any cell shown here; presumably left over from the
# commented-out MusicGen import above — confirm before removing.
MAX_TOKENS: int = 1503 # 30 seconds of tokens

In [None]:
# Load the Stable Audio Open checkpoint in half precision and move it to CUDA device 1.
pipe = StableAudioPipeline.from_pretrained(
    "stabilityai/stable-audio-open-1.0",
    torch_dtype=torch.float16,
).to("cuda:1")

# Text conditioning: what to generate, and what to steer away from.
prompt = "a calm creek with a gentle stream, birds chirping, and a soft breeze"
negative_prompt = "Low quality."

# Fixed-seed random generator living on the same device as the pipeline.
generator = torch.Generator(device="cuda:1")
generator.manual_seed(0)

# How many waveforms to sample for the prompt.
num_waveforms: int = 1

In [None]:
# Run text-conditioned generation: 200 denoising steps, 10 seconds of audio.
audio = pipe(
    prompt,
    negative_prompt=negative_prompt,
    num_inference_steps=200,
    audio_end_in_s=10.0,
    num_waveforms_per_prompt=num_waveforms,
    generator=generator,
).audios

# Expected to be (num_waveforms, channels, samples) — see the diffusers pipeline docs.
print(audio.shape)

# Save each generated waveform to its own file.
# The previous loop ignored the index: it squeezed the whole batch and rewrote the
# same path every iteration, which only worked for a single waveform.
for i, waveform in enumerate(audio):
    # soundfile expects (frames, channels) float data on the CPU, hence the
    # transpose and the upcast from float16.
    output = waveform.T.float().cpu().numpy()
    # Keep the original filename when there is a single waveform so downstream
    # paths do not change; otherwise add the index to avoid overwriting.
    filename = "sounds/test.wav" if len(audio) == 1 else f"sounds/test_{i}.wav"
    sf.write(filename, output, pipe.vae.sampling_rate)

In [2]:
# Read a saved clip back as float32 samples, then show its array shape and sample rate.
a, sr = sf.read("sounds/sound_0_1.wav", dtype="float32")
print(f"{a.shape} {sr}")

(441000, 2) 44100


## Embeddings + textual inversion

In [None]:
import torch
from diffusers import StableAudioPipeline
import laion_clap
import soundfile as sf
import accelerate
import torchaudio

In [None]:
# Build the CLAP wrapper (HTSAT-base audio model, fusion disabled) and move it to
# CUDA device 1. No weights are loaded at this point.
model = laion_clap.CLAP_Module(amodel="HTSAT-base", enable_fusion=False)
model = model.to("cuda:1")

# Load checkpoint weights. Called without arguments, so the library chooses the
# checkpoint (presumably its default pretrained one — see the laion_clap docs).
model.load_ckpt()

# Disabled alternative, kept for reference: load a local checkpoint by hand and drop
# the "text_branch.embeddings.position_ids" entry before a non-strict state-dict load.
#
# ckpt = torch.load("630k-audioset-best.pt", map_location="cuda:1")
#
# ckpt_clean = {k: v for k, v in ckpt.items()
#               if "text_branch.embeddings.position_ids" not in k}
#
# model.model.load_state_dict(ckpt_clean, strict=False)

In [None]:
from diffusers.models.transformers.stable_audio_transformer import StableAudioDiTModel
import sys
# NOTE(review): `diffusers` is already imported by the line above, and `append` adds
# './src/' at the END of the search path, so this likely does not redirect the import
# below to a local copy — confirm the intent.
sys.path.append('./src/')
from diffusers.pipelines.stable_audio import StableAudioPipeline

# Load the DiT transformer on its own (half precision) so it can be handed to the
# pipeline explicitly.
transformer = StableAudioDiTModel.from_pretrained(
    "stabilityai/stable-audio-open-1.0",
    torch_dtype=torch.float16,
    subfolder="transformer",
)

# Assemble the full pipeline around that transformer and move it to CUDA device 1.
pipe = StableAudioPipeline.from_pretrained(
    "stabilityai/stable-audio-open-1.0",
    transformer=transformer,
    torch_dtype=torch.float16,
).to("cuda:1")

In [None]:
# Linear map from 512-d inputs to 768-d outputs, applied to the CLAP embeddings below.
# It is freshly constructed here: no trained weights are loaded in this cell.
projector = torch.nn.Linear(512, 768).to("cuda:1")

# --- Audio embedding ---------------------------------------------------------
audio_file = ['sounds/sound_0_1.wav']
clap_audio = model.get_audio_embedding_from_filelist(x=audio_file, use_tensor=True)
# Project, add a leading axis, and cast to float16 to match the pipeline dtype.
audio_embed = projector(clap_audio).unsqueeze(0).to(torch.float16)
print('Audio -> ', audio_embed.shape)

# --- Text embedding ----------------------------------------------------------
text_data = ["high quality version of X"]
clap_text = model.get_text_embedding(text_data, use_tensor=True)
# Same projection / reshape / cast as for the audio embedding.
text_embed = projector(clap_text).unsqueeze(0).to(torch.float16)
print('Text -> ', text_embed.shape)

print(text_embed.dtype, audio_embed.dtype)

In [None]:
# Make sure the pipeline and both conditioning tensors are on the same device.
_ = pipe.to("cuda:1")

text_embed = text_embed.to("cuda:1")
audio_embed = audio_embed.to("cuda:1")
print(text_embed.shape, audio_embed.shape)

# Define the sampling settings in this cell instead of relying on variables left
# over from the earlier text-to-audio section. A freshly seeded generator makes
# re-running this cell produce the same result.
num_waveforms: int = 1
generator = torch.Generator("cuda:1").manual_seed(0)

# Condition the pipeline directly on precomputed embeddings (no text prompt).
audio = pipe(
    num_inference_steps=200,
    audio_end_in_s=10.0,
    num_waveforms_per_prompt=num_waveforms,
    generator=generator,
    prompt_embeds=audio_embed + text_embed,  # sum of the audio and text embeddings
).audios

# `audio` is a single batched tensor; take the first waveform. Indexing (rather than
# squeeze(0)) also works when more than one waveform is generated.
print(audio.shape)
generated = audio[0].T.float().cpu().numpy()

# Write the waveform at the VAE's sampling rate.
sf.write("sounds/something.wav", generated, samplerate=pipe.vae.sampling_rate)