# Set-up environment

In [None]:
# Mount Google Drive at /content/drive (later cells mention saving audio
# "after connecting to the drive").
from google.colab import drive
drive.mount("/content/drive")

In [None]:
# Run nvidia-smi to report the GPU attached to this runtime.
!nvidia-smi

In [None]:
!pip install --quiet --upgrade git+https://github.com/huggingface/diffusers.git git+https://github.com/huggingface/transformers.git accelerate

In [None]:
import scipy
import torch
import random
import numpy as np
from diffusers import AudioLDM2Pipeline
from diffusers import DPMSolverMultistepScheduler
from IPython.display import Audio
import scipy.io.wavfile as wavfile # Save the generated audio to a file

In [None]:
# Base AudioLDM2 checkpoint, loaded with half-precision (float16) weights.
repo_audioldm2 = "cvssp/audioldm2"
pipe_audioldm2 = AudioLDM2Pipeline.from_pretrained(
    repo_audioldm2,
    torch_dtype=torch.float16,
)

In [None]:
# "Large" AudioLDM2 checkpoint, loaded with half-precision (float16) weights.
repo_audioldm2_large = "cvssp/audioldm2-large"
pipe_audioldm2_large = AudioLDM2Pipeline.from_pretrained(
    repo_audioldm2_large,
    torch_dtype=torch.float16,
)

In [None]:
# "Music" AudioLDM2 checkpoint, loaded with half-precision (float16) weights.
repo_audioldm2_music = "cvssp/audioldm2-music"
pipe_audioldm2_music = AudioLDM2Pipeline.from_pretrained(
    repo_audioldm2_music,
    torch_dtype=torch.float16,
)

# Loop & Optimize inference speed and GPU memory (template)


In [None]:
### Without loop over pipes and seeds ###

# Put the base checkpoint on the GPU and replace its scheduler with a
# DPM-Solver multistep scheduler built from the existing scheduler config.
pipe = pipe_audioldm2.to("cuda")
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
# pipe.enable_model_cpu_offload()

# Text conditioning: what to generate, and what to steer away from.
prompt = "The sound of Brazilian samba drums with waves gently crashing in the background"
negative_prompt = "Low quality, average quality."

# Fixed seed (0) on a CUDA generator so the run can be repeated.
generator = torch.Generator("cuda")
generator.manual_seed(0)

# Run the generation and keep the first of the returned waveforms.
pipe_output = pipe(
    prompt,
    negative_prompt=negative_prompt,
    num_inference_steps=20,
    audio_length_in_s=150,
    num_waveforms_per_prompt=4,
    generator=generator,
)
audio = pipe_output.audios[0]

# Last expression of the cell: renders an audio player at 16 kHz.
Audio(audio, rate=16000)

In [None]:
### Loop over pipes and seeds ###

# For every checkpoint, generate `num_seeds` clips of the same prompt, each
# with a freshly drawn random seed, and display a player for each clip.

# Controllable parameters
pipes_dict = {"audioldm2": pipe_audioldm2, "audioldm2_large": pipe_audioldm2_large, "audioldm2_music": pipe_audioldm2_music}
prompt = "The sound of Brazilian samba drums with waves gently crashing in the background"
negative_prompt = "Low quality, average quality."
num_seeds = 3  # number of random seeds tried per checkpoint
audio_length_in_s = 10  # length of each generated clip, in seconds
num_inference_steps = 20
num_waveforms_per_prompt = 3  # waveforms generated per call; only `.audios[0]` is kept
guidance_scale = 3.5
# Sample rate used for playback (and for saving, if enabled). AudioLDM2 audio
# is 16 kHz — the template cell above also plays it with rate=16000. Labelling
# it as 44.1 kHz plays it back almost three times too fast and pitched up,
# which the previous `audio_length_in_s*num_seeds` multiplier appears to have
# been compensating for.
framerate = 16000 # sample rate


for pipe_name, pipe in pipes_dict.items():
  # Move the checkpoint to the GPU and switch to the DPM-Solver multistep scheduler.
  pipe = pipe.to("cuda")
  pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)

  for idx_seed in range(num_seeds):
    # Draw a seed in [0, 10000]; it is displayed below so a clip can be reproduced.
    seed = random.randint(0, 10000)

    audio = pipe(
        prompt=prompt,
        negative_prompt=negative_prompt,
        num_inference_steps=num_inference_steps,
        audio_length_in_s=audio_length_in_s,
        guidance_scale=guidance_scale,
        num_waveforms_per_prompt=num_waveforms_per_prompt,
        generator=torch.Generator("cuda").manual_seed(seed),
    ).audios[0]

    display(f"Checkpoint: {pipe_name}")
    display(f"Seed value: {seed}")
    display(Audio(data=audio, rate=framerate))
    # Save the audio after connecting to the drive
    # wavfile.write("generated_audio.wav", rate=framerate, data=audio)


In [None]:
### Loop over pipes and seeds and prompts ###

# For every prompt and every checkpoint, generate `num_seeds` clips, each with
# a freshly drawn random seed, and display a player for each clip.

# Controllable parameters
pipes_dict = {"audioldm2": pipe_audioldm2, "audioldm2_music": pipe_audioldm2_music}
prompts = ["The sound of dreams colliding, creating a surreal and wondrous composition.",
          "A traditional Japanese taiko drum ensemble, exuding power and precision.",
          "A modern hip-hop beat with punchy bass and rapid-fire lyrics."]
negative_prompt = "Low quality, average quality."
num_seeds = 2  # number of random seeds tried per (prompt, checkpoint) pair
audio_length_in_s = 10  # length of each generated clip, in seconds
num_inference_steps = 20
num_waveforms_per_prompt = 3  # waveforms generated per call; only `.audios[0]` is kept
guidance_scale = 3.5
# Sample rate used for playback (and for saving, if enabled). AudioLDM2 audio
# is 16 kHz — the template cell above also plays it with rate=16000. Labelling
# it as 44.1 kHz plays it back almost three times too fast and pitched up,
# which the previous `audio_length_in_s*num_waveforms_per_prompt` multiplier
# appears to have been compensating for.
framerate = 16000 # sample rate

# GPU placement and the scheduler swap do not depend on the prompt, so do them
# once per checkpoint instead of once per (prompt, checkpoint) pair.
for pipe in pipes_dict.values():
  pipe.to("cuda")
  pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)

for prompt in prompts:
  for pipe_name, pipe in pipes_dict.items():
    for idx_seed in range(num_seeds):
      # Draw a seed in [0, 10000]; it is displayed below so a clip can be reproduced.
      seed = random.randint(0, 10000)

      audio = pipe(
          prompt=prompt,
          negative_prompt=negative_prompt,
          num_inference_steps=num_inference_steps,
          audio_length_in_s=audio_length_in_s,
          guidance_scale=guidance_scale,
          num_waveforms_per_prompt=num_waveforms_per_prompt,
          generator=torch.Generator("cuda").manual_seed(seed),
      ).audios[0]

      display(f"Prompt: {prompt}")
      display(f"Checkpoint: {pipe_name}")
      display(f"Seed value: {seed}")
      display(Audio(data=audio, rate=framerate))
      # Save the audio after connecting to the drive
      # wavfile.write("generated_audio.wav", rate=framerate, data=audio)


# Generation from Prelude to Listening text - 20231104

## Structured Prompt Version 0.2 (Selections of Segments 2 & 3; Splitting paragraphs for Segments 1 & 4)

In [None]:
# Text prompts for the generation cells below. Segments 1 and 4 are split into
# several paragraphs (prompt_1_1..prompt_1_3, prompt_4_1..prompt_4_4);
# segments 2 and 3 are single selections. The strings are passed to the
# pipeline verbatim, including their leading and trailing newlines.

# --- Segment 1, paragraph 1 ---
prompt_1_1 = """
 (Introduction)

Today we are embarking on
an exercise in listening.
We are in the present, but
thinking of the past and the future.
We are listening across time.
"""

# --- Segment 1, paragraph 2 ---
prompt_1_2 = """
When we listen to a piece of music,
sound operates across multiple time-scales.
Phrases may be repeated,
with or without variation.
We may recall melodies and themes
which are revisited,
forming larger structures.
At shorter time-scales,
we also have rhythm, pitch and timbre
which are repeated and varied.
"""

# --- Segment 1, paragraph 3 ---
prompt_1_3 = """
Today we are listening for sounds
which either occurred
before the start of this piece
or will occur after the end of this piece –
or to be more precise,
before or after the time at which
these field recordings were made.
We are operating across
much longer time-scales
than we might otherwise be used to.
"""

# --- Segment 2 (selection) ---
prompt_2 = """
The sounds originate from a series of ritual performances.
These include Cantonese operas like "Six States Installation of Minister,"
which are performed to honor the Tin Hau Temple during the Tin Hau Festival.
"""

# --- Segment 3 (selection) ---
prompt_3 = """
The Dragon Boat Race takes place in Picnic Bay on Lamma Island during the Dragon Boat Festival.
One can imagine the raucous sounds that must have accompanied the intense rowing of the competitors.
"""

# --- Segment 4, paragraph 1 ---
prompt_4_1 = """
(Scenes around and inside the Kamikaze Caves)

If we can hear a few weeks into the past,
or a week into the future,
what about 80 years into the past?
Despite the numerous activities which take place here,
the story which appears to fascinate visitors the most
involves what are known as the Kamikaze Caves.
"""

# --- Segment 4, paragraph 2 ---
prompt_4_2 = """
During the Japanese occupation of Hong Kong
during World War II
the occupying forces constructed short tunnels
into the side of the hills facing the bay.

Boats laden with explosives
were housed here,
primed to target incoming enemy ships
on Kamikaze missions.
"""

# --- Segment 4, paragraph 3 ---
prompt_4_3 = """
The long tunnels create a curious auditory effect whereby one feels acoustically isolated from the vicinity
but the muffling of nearby sounds results in certain distant acoustic events being amplified.
The resonance is an indelible mark from the War as is the curious construction responsible.
Whilst it is impossible to ignore the historical context of the Caves, there is also a significant material dimension.
"""

# --- Segment 4, paragraph 4 ---
prompt_4_4 = """
The constituent stone forming the tunnel walls, the surface of the interiors due to the haphazard procedures used in their construction and the dimensions determined by the Japanese motorboats are all audible, being a manifestation of the strange architecture in vibrational form.
The Caves and their acoustic resonance now appear
as curious remnants of Hong Kong’s past
and a portal to a previous era.
"""

## Version 0.2 - LLM (audioldm2_music) - Seeds 1st (First set of seeds)

In [None]:
### Loop over pipes and seeds and prompts ###

# Generate one clip per prompt segment with the music checkpoint and collect
# the clips, in prompt order, in `audio_list_1` for the cells below.

# Controllable parameters
pipes_dict = {"audioldm2_music": pipe_audioldm2_music}
prompts = [prompt_1_1, prompt_1_2, prompt_1_3, prompt_2, prompt_3, prompt_4_1, prompt_4_2, prompt_4_3, prompt_4_4]
negative_prompt = "Low quality, average quality."

num_seeds = 1  # number of random seeds tried per (prompt, checkpoint) pair
audio_length_in_s = 30  # length of each generated clip, in seconds
num_inference_steps = 50
num_waveforms_per_prompt = 3  # waveforms generated per call; only `.audios[0]` is kept
guidance_scale = 3.5
# Sample rate used for playback and for the wav files written further down.
# AudioLDM2 audio is 16 kHz — the template cell near the top of the notebook
# also plays it with rate=16000. Labelling it as 48 kHz plays it back three
# times too fast and pitched up, which the previous
# `audio_length_in_s*num_waveforms_per_prompt` multiplier appears to have been
# compensating for.
framerate = 16000 # sample rate

audio_list_1 = []

# GPU placement and the scheduler swap do not depend on the prompt, so do them
# once per checkpoint instead of once per (prompt, checkpoint) pair.
for pipe in pipes_dict.values():
  pipe.to("cuda")
  pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)

for prompt in prompts:
  for pipe_name, pipe in pipes_dict.items():
    for idx_seed in range(num_seeds):
      # Draw a seed in [0, 10000]; it is displayed below so a clip can be reproduced.
      seed = random.randint(0, 10000)

      audio = pipe(
          prompt=prompt,
          negative_prompt=negative_prompt,
          num_inference_steps=num_inference_steps,
          audio_length_in_s=audio_length_in_s,
          guidance_scale=guidance_scale,
          num_waveforms_per_prompt=num_waveforms_per_prompt,
          generator=torch.Generator("cuda").manual_seed(seed),
      ).audios[0]

      audio_list_1.append(audio)

      display(f"Prompt: {prompt}")
      display(f"Checkpoint: {pipe_name}")
      display(f"Seed value: {seed}")
      display(Audio(data=audio, rate=framerate))
      # Save the audio after connecting to the drive
      # wavfile.write("generated_audio.wav", rate=framerate, data=audio)

In [None]:
# Join the collected clips along their first axis, in the order they were
# generated, and play the result at the current `framerate`.
combined_audio_data_1 = np.concatenate(audio_list_1, axis=0)

display(Audio(data=combined_audio_data_1, rate=framerate))
# Save the audio after connecting to the drive
# wavfile.write("combined_audio_data_1.wav", rate=framerate, data=combined_audio_data_1)

In [None]:
# Write the combined track and each individual segment as wav files.

# `result_dir` is not defined in any earlier cell of this notebook, so a fresh
# "Restart & Run All" would stop here with a NameError. Keep a value that was
# set elsewhere; otherwise fall back to the root of the mounted Google Drive.
# NOTE(review): the fallback location is an assumption — point `result_dir`
# at the intended output folder (it must end with "/").
result_dir = globals().get("result_dir", "/content/drive/MyDrive/")

wavfile.write(result_dir + "v02_combined_audio_music.wav", rate=framerate, data=combined_audio_data_1)

# File names in the same order as the prompts that filled `audio_list_1`.
segment_file_names = [
    "v02_audio_seg_1_1_music.wav",
    "v02_audio_seg_1_2_music.wav",
    "v02_audio_seg_1_3_music.wav",
    "v02_audio_seg_2_music.wav",
    "v02_audio_seg_3_music.wav",
    "v02_audio_seg_4_1_music.wav",
    "v02_audio_seg_4_2_music.wav",
    "v02_audio_seg_4_3_music.wav",
    "v02_audio_seg_4_4_music.wav",
]

# Index explicitly (rather than zip) so a short `audio_list_1` still raises
# IndexError instead of silently skipping files.
for segment_idx, segment_file_name in enumerate(segment_file_names):
  wavfile.write(result_dir + segment_file_name, rate=framerate, data=audio_list_1[segment_idx])

## Version 0.2 - LLM (audioldm2_music) - Seeds 2nd (Second set of seeds)

In [None]:
### Loop over pipes and seeds and prompts ###

# Second pass over the same prompts with newly drawn seeds.
# NOTE(review): this cell reuses the name `audio_list_1`, so it replaces the
# clips collected by the first pass — save those first if they are needed.

# Controllable parameters
pipes_dict = {"audioldm2_music": pipe_audioldm2_music}
prompts = [prompt_1_1, prompt_1_2, prompt_1_3, prompt_2, prompt_3, prompt_4_1, prompt_4_2, prompt_4_3, prompt_4_4]
negative_prompt = "Low quality, average quality."

num_seeds = 1  # number of random seeds tried per (prompt, checkpoint) pair
audio_length_in_s = 30  # length of each generated clip, in seconds
num_inference_steps = 50
num_waveforms_per_prompt = 3  # waveforms generated per call; only `.audios[0]` is kept
guidance_scale = 3.5
# Sample rate used for playback and for any wav files written afterwards.
# AudioLDM2 audio is 16 kHz — the template cell near the top of the notebook
# also plays it with rate=16000. Labelling it as 48 kHz plays it back three
# times too fast and pitched up, which the previous
# `audio_length_in_s*num_waveforms_per_prompt` multiplier appears to have been
# compensating for.
framerate = 16000 # sample rate

audio_list_1 = []

# GPU placement and the scheduler swap do not depend on the prompt, so do them
# once per checkpoint instead of once per (prompt, checkpoint) pair.
for pipe in pipes_dict.values():
  pipe.to("cuda")
  pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)

for prompt in prompts:
  for pipe_name, pipe in pipes_dict.items():
    for idx_seed in range(num_seeds):
      # Draw a seed in [0, 10000]; it is displayed below so a clip can be reproduced.
      seed = random.randint(0, 10000)

      audio = pipe(
          prompt=prompt,
          negative_prompt=negative_prompt,
          num_inference_steps=num_inference_steps,
          audio_length_in_s=audio_length_in_s,
          guidance_scale=guidance_scale,
          num_waveforms_per_prompt=num_waveforms_per_prompt,
          generator=torch.Generator("cuda").manual_seed(seed),
      ).audios[0]

      audio_list_1.append(audio)

      display(f"Prompt: {prompt}")
      display(f"Checkpoint: {pipe_name}")
      display(f"Seed value: {seed}")
      display(Audio(data=audio, rate=framerate))
      # Save the audio after connecting to the drive
      # wavfile.write("generated_audio.wav", rate=framerate, data=audio)

In [None]:
# Join the clips from the second pass along their first axis, in the order
# they were generated, and play the result at the current `framerate`.
combined_audio_data_1 = np.concatenate(audio_list_1, axis=0)

display(Audio(data=combined_audio_data_1, rate=framerate))
# Save the audio after connecting to the drive
# wavfile.write("combined_audio_data_1.wav", rate=framerate, data=combined_audio_data_1)

In [None]:
# wavfile.write(result_dir + "v01_combined_audio_music.wav", rate=framerate, data=combined_audio_data_1)

# wavfile.write(result_dir + "v01_audio_seg_1_music.wav", rate=framerate, data=audio_list_1[0])
# wavfile.write(result_dir + "v01_audio_seg_2_music.wav", rate=framerate, data=audio_list_1[1])
# wavfile.write(result_dir + "v01_audio_seg_3_music.wav", rate=framerate, data=audio_list_1[2])
# wavfile.write(result_dir + "v01_audio_seg_4_music.wav", rate=framerate, data=audio_list_1[3])