In [2]:
import torch
from transformers import ClapProcessor, ClapModel
from diffusers import StableAudioPipeline
import soundfile as sf

# Pick the compute device. GPU 1 is preferred (the original choice), but the
# index is only used when that GPU actually exists; otherwise fall back to
# GPU 0, and to the CPU when CUDA is unavailable. A hard-coded "cuda:1" on a
# single-GPU machine would only fail later, on the first `.to(device)`.
PREFERRED_GPU_INDEX = 1
if torch.cuda.is_available():
    gpu_index = PREFERRED_GPU_INDEX if PREFERRED_GPU_INDEX < torch.cuda.device_count() else 0
    device = torch.device(f"cuda:{gpu_index}")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=1)

In [3]:
# 1. Load the CLAP checkpoint: the processor prepares raw audio/text inputs,
#    the model turns them into audio and text embeddings.
clap_checkpoint = "laion/clap-htsat-fused"
processor = ClapProcessor.from_pretrained(clap_checkpoint)
clap_model = ClapModel.from_pretrained(clap_checkpoint)
clap_model = clap_model.to(device)

In [4]:
# 2. Load StableAudioPipeline.
#    Half precision is only used on CUDA; when `device` fell back to the CPU
#    the weights are kept in float32, since float16 inference on CPU is
#    poorly supported.
pipe_dtype = torch.float16 if device.type == "cuda" else torch.float32
pipe = StableAudioPipeline.from_pretrained(
    "stabilityai/stable-audio-open-1.0",
    torch_dtype=pipe_dtype,
).to(device)
# Trade a little speed for lower peak memory in the attention layers.
pipe.enable_attention_slicing()

Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]



In [5]:
# 3. Get your CLAP audio embedding
audio, sr = sf.read("sounds/test_48k.wav")
# soundfile returns (frames, channels) for multi-channel files; the CLAP
# feature extractor expects a 1-D mono waveform and would otherwise treat the
# first axis as a batch dimension, so downmix to mono first.
if audio.ndim > 1:
    audio = audio.mean(axis=1)
clap_inputs = processor(audios=audio, sampling_rate=sr, return_tensors="pt").to(device)
with torch.no_grad():
    audio_features = clap_model.get_audio_features(**clap_inputs)
# In transformers 4.x `get_audio_features` returns the projected embedding
# tensor itself (there is no `.audio_embeds` attribute on it).
# NOTE(review): newer transformers releases may wrap the result in a model
# output whose `pooler_output` holds the embedding - confirm against the
# installed version.
audio_emb_clap = (
    audio_features if torch.is_tensor(audio_features) else audio_features.pooler_output
)  # (1, clap_dim)

KeyboardInterrupt: 

In [None]:
# 4. Get your CLAP text embedding for the prompt
txt = ["A calm river at sunrise"]  # example prompt
txt_inputs = processor(text=txt, return_tensors="pt", padding=True).to(device)
with torch.no_grad():
    text_features = clap_model.get_text_features(**txt_inputs)
# In transformers 4.x `get_text_features` returns the projected embedding
# tensor itself (there is no `.text_embeds` attribute on it).
# NOTE(review): newer transformers releases may wrap the result in a model
# output whose `pooler_output` holds the embedding - confirm against the
# installed version.
text_emb_clap = (
    text_features if torch.is_tensor(text_features) else text_features.pooler_output
)  # (1, clap_dim)



In [None]:
# 5. Project CLAP's embed_dim -> the width of the pipeline's text-encoder
#    hidden states, so the result can be added to prompt_embeds.
#    The target width is taken from the text encoder (T5 `d_model`), because
#    prompt_embeds are text-encoder outputs; the diffusion transformer's
#    config has no `hidden_size` field.
#    NOTE: `proj` is freshly (randomly) initialised and untrained, so the
#    projected vectors are not semantically aligned with the text-encoder
#    space until this layer is trained or replaced by a learned mapping.
text_hidden_dim = pipe.text_encoder.config.d_model
proj = torch.nn.Linear(audio_emb_clap.size(-1), text_hidden_dim).to(device)
with torch.no_grad():  # inference only: do not build an autograd graph
    # Compute in float32 (the layer's dtype), then cast to the dtype used by
    # the pipeline's text encoder so the later sum keeps a single dtype.
    proj_audio = proj(audio_emb_clap.float()).to(pipe.text_encoder.dtype)  # (1, hidden)
    proj_text = proj(text_emb_clap.float()).to(pipe.text_encoder.dtype)    # (1, hidden)



In [None]:
# 6. Build the normal text prompt_embeds via the pipeline's tokenizer+text_encoder
#    The prompt is padded/truncated to the tokenizer's maximum length, so
#    `tok.attention_mask` marks which positions are real tokens.
tok = pipe.tokenizer(
    txt,
    return_tensors="pt",
    padding="max_length",
    truncation=True,
    max_length=pipe.tokenizer.model_max_length,
).to(device)
# Inference only: without no_grad the encoder forward keeps an autograd graph
# alive on the device for as long as `enc` / `prompt_embeds` exist.
with torch.no_grad():
    enc = pipe.text_encoder(**tok)
prompt_embeds = enc.last_hidden_state  # (1, seq_len, hidden)



In [None]:
# 7. Fuse in your CLAP audio information by simple addition
#    (you could also concat+linear, or more complex fusion)
#    The CLAP offset is broadcast over every token position and cast to the
#    dtype of prompt_embeds, so the sum is not promoted to float32 when the
#    pipeline runs in half precision.
clap_offset = (proj_audio + proj_text).unsqueeze(1).to(prompt_embeds.dtype)
fused_prompt_embeds = prompt_embeds + clap_offset

# 8. Generate, passing only prompt_embeds (no `prompt` string).
#    The tokenizer's attention mask is forwarded so that padded positions are
#    zeroed by the pipeline, as it does when it encodes a string prompt itself.
out = pipe(
    prompt_embeds=fused_prompt_embeds,
    attention_mask=tok.attention_mask,
    audio_end_in_s=5.0,
    num_inference_steps=50,
    guidance_scale=7.0,
    output_type="np",
)
# Each generated clip is laid out as (channels, samples); soundfile expects
# (frames, channels), hence the transpose.
generated = out.audios[0].T  # numpy array, (samples, channels)

# 9. Save your result
sf.write("clap_plus_text.wav", generated, samplerate=pipe.vae.config.sampling_rate)