# Set up the environment

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): presumably the export cells further down write their .wav
# files under this mount point — confirm once `result_dir` is set.
from google.colab import drive
drive.mount("/content/drive")

In [None]:
# Shell out to nvidia-smi to check which GPU (if any) is attached to the
# runtime; the generation cells below place the pipeline on "cuda".
!nvidia-smi

In [None]:
# Install diffusers and transformers straight from their GitHub repositories,
# plus accelerate; --upgrade replaces any versions already present.
# NOTE(review): nothing is pinned to a tag or commit, so the installed code can
# differ between runs — consider pinning for reproducible results.
!pip install --quiet --upgrade git+https://github.com/huggingface/diffusers.git git+https://github.com/huggingface/transformers.git accelerate

In [None]:
import os
import random

import numpy as np
import scipy
import scipy.io.wavfile as wavfile # Save the generated audio to a file
import torch
from diffusers import AudioLDM2Pipeline
from diffusers import DPMSolverMultistepScheduler
from IPython.display import Audio

In [None]:
# Hub identifier of the checkpoint used throughout this notebook.
repo_audioldm2_music = "cvssp/audioldm2-music"

# Load the pipeline with float16 weights; the generation cells later move it
# to the GPU with .to("cuda").
pipe_audioldm2_music = AudioLDM2Pipeline.from_pretrained(
    repo_audioldm2_music,
    torch_dtype=torch.float16,
)

# Generation from Prelude to Listening text

## Structured Prompt Version 0.3 (Splitting paragraphs for Segments 2 & 3)

In [None]:
# Segment 1 — "(Introduction)": three prompt paragraphs.
# Each string is passed verbatim to the pipeline as `prompt=` by the generation
# cells, so its exact text (line breaks and the newline right after/before the
# triple quotes included) is model input — do not reflow or re-indent it.
# NOTE(review): the "(Introduction)" heading carries a leading space that the
# headings of segments 2–4 do not have — confirm whether that is intentional.
prompt_1_1 = """
 (Introduction)

Today we are embarking on
an exercise in listening.
We are in the present, but
thinking of the past and the future.
We are listening across time.
"""

prompt_1_2 = """
When we listen to a piece of music,
sound operates across multiple time-scales.
Phrases may be repeated,
with or without variation.
We may recall melodies and themes
which are revisited,
forming larger structures.
At shorter time-scales,
we also have rhythm, pitch and timbre
which are repeated and varied.
"""

prompt_1_3 = """
Today we are listening for sounds
which either occurred
before the start of this piece
or will occur after the end of this piece –
or to be more precise,
before or after the time at which
these field recordings were made.
We are operating across
much longer time-scales
than we might otherwise be used to.
"""

# Segment 2 — "(Scenes from Bamboo theatre)": four prompt paragraphs.
# Each string is passed verbatim to the pipeline as `prompt=`, so the text and
# its line breaks are model input and must stay exactly as written.
prompt_2_1 = """
(Scenes from Bamboo theatre)

Here, we are listening for sounds
from a series of [ritual] performances
which ended several weeks earlier.
"""

prompt_2_2 = """
Looking carefully
at the bamboo structure
which is all that remains,
we may be able to imagine
the vividly decorated theatre set
which it temporarily housed.
"""

prompt_2_3 = """
Knowing that Cantonese Chinese operas such as "Six States Installation of Minister" were performed for the benefit of the Tin Hau Temple behind us
may colour how we listen to this field recording.
"""

prompt_2_4 = """
We may even be able to hear
the cacophony from the previous weeks.
"""

# Segment 3 — "(Scenes from the bay)": three prompt paragraphs.
# Each string is passed verbatim to the pipeline as `prompt=`, so the text and
# its line breaks are model input and must stay exactly as written.
prompt_3_1 = """
(Scenes from the bay)

Here, we are listening for sounds
from the Dragon Boat Race
due to take place
several days after this was recorded.
"""

prompt_3_2 = """
Whilst recording,
I was not aware of
what was in store at the weekend here
in Picnic Bay as it’s known in English.
But listening back to it now,
the serene calm appears like
an ominous prelude
to the raucous sounds
which must have accompanied
the rowing of the competitors.
"""

prompt_3_3 = """
Or in other words,
the sounds we might imagine from the Dragon Boat Race
appear as a fitting resolution to what we are hearing now.
"""

# Segment 4 — "(Scenes around and inside the Kamikaze Caves)": four prompt
# paragraphs.
# Each string is passed verbatim to the pipeline as `prompt=`, so the text and
# its line breaks are model input and must stay exactly as written.
prompt_4_1 = """
(Scenes around and inside the Kamikaze Caves)

If we can hear a few weeks into the past,
or a week into the future,
what about 80 years into the past?
Despite the numerous activities which take place here,
the story which appears to fascinate visitors the most
involves what are known as the Kamikaze Caves.
"""

prompt_4_2 = """
During the Japanese occupation of Hong Kong
during World War II
the occupying forces constructed short tunnels
into the side of the hills facing the bay.

Boats laden with explosives
were housed here,
primed to target incoming enemy ships
on Kamikaze missions.
"""

prompt_4_3 = """
The long tunnels create a curious auditory effect whereby one feels acoustically isolated from the vicinity
but the muffling of nearby sounds results in certain distant acoustic events being amplified.
The resonance is an indelible mark from the War as is the curious construction responsible.
Whilst it is impossible to ignore the historical context of the Caves, there is also a significant material dimension.
"""

prompt_4_4 = """
The constituent stone forming the tunnel walls, the surface of the interiors due to the haphazard procedures used in their construction and the dimensions determined by the Japanese motorboats are all audible, being a manifestation of the strange architecture in vibrational form.
The Caves and their acoustic resonance now appear
as curious remnants of Hong Kong’s past
and a portal to a previous era.
"""

## Version 0.3 - LLM (audioldm2_music) - First set of seeds

In [None]:
# Controllable parameters shared by the generation and export cells below.

# Checkpoints to sample from, keyed by the label printed next to each clip.
pipes_dict = {"audioldm2_music": pipe_audioldm2_music}

# Passed to the pipeline as `negative_prompt`.
negative_prompt = "Low quality, average quality."

num_seeds = 1  # random seeds drawn per (prompt, checkpoint) pair
# Base clip length in seconds; the generation loops multiply it by
# num_waveforms_per_prompt before passing it to the pipeline.
audio_length_in_s = 60
num_inference_steps = 30
# Candidates requested per pipeline call; the loops keep only `.audios[0]`.
num_waveforms_per_prompt = 3
guidance_scale = 3.5
# NOTE(review): the diffusers AudioLDM 2 examples write their output with
# rate=16000. If the pipeline returns 16 kHz samples, playing/saving them at
# 48 kHz makes every clip three times faster and higher-pitched — confirm
# that this is intended.
framerate = 48000 # sample rate

# Directory that receives the exported .wav files. It was referenced by the
# export cells but never defined, which made them fail with NameError.
# It must end with "/" because those cells build paths with `result_dir + name`.
# NOTE(review): the folder name is a placeholder under the Drive mount point —
# adjust it to wherever the results should live.
result_dir = "/content/drive/MyDrive/audioldm2_results/"
os.makedirs(result_dir, exist_ok=True)

# Generated clips accumulate here in prompt order; the export cells index into
# this list by position.
audio_list_1 = []

In [None]:
# Generate one clip per prompt paragraph for segments 1 and 2 and append each
# to audio_list_1 (prompt order), previewing it inline.
prompts = [
    prompt_1_1, prompt_1_2, prompt_1_3,
    prompt_2_1, prompt_2_2, prompt_2_3, prompt_2_4,
]

# Moving a checkpoint to the GPU and swapping its scheduler do not depend on
# the prompt, so do both once per checkpoint rather than once per prompt.
for pipe in pipes_dict.values():
  pipe = pipe.to("cuda")
  pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)

for prompt in prompts:
  for pipe_name, pipe in pipes_dict.items():
    for _ in range(num_seeds):
      # A fresh random seed per clip; it is printed below so that a clip can
      # be regenerated later.
      seed = random.randint(0, 10000)

      # Several candidate waveforms are requested per call, but only the
      # first one returned is kept.
      audio = pipe(
          prompt=prompt,
          negative_prompt=negative_prompt,
          num_inference_steps=num_inference_steps,
          audio_length_in_s=audio_length_in_s*num_waveforms_per_prompt,
          guidance_scale=guidance_scale,
          num_waveforms_per_prompt=num_waveforms_per_prompt,
          generator=torch.Generator("cuda").manual_seed(seed),
      ).audios[0]

      audio_list_1.append(audio)

      # print() rather than display(): display() of a plain string shows its
      # repr, which collapses the multi-line prompt into one line of "\n"s.
      print(f"Prompt: {prompt}")
      print(f"Checkpoint: {pipe_name}")
      print(f"Seed value: {seed}")
      display(Audio(data=audio, rate=framerate))

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Export the segment 1 and segment 2 clips, one .wav file per prompt paragraph.
# Assumes audio_list_1 holds exactly one clip per prompt in prompt order
# (one checkpoint, num_seeds = 1), so list position i belongs to label i.
segment_labels = ["1_1", "1_2", "1_3", "2_1", "2_2", "2_3", "2_4"]

for clip_index, label in enumerate(segment_labels):
  wavfile.write(
      result_dir + "v03_audio_seg_" + label + "_music.wav",
      rate=framerate,
      data=audio_list_1[clip_index],
  )

In [None]:
# Generate one clip per prompt paragraph for segments 3 and 4 and append each
# to audio_list_1 after the segment 1–2 clips, previewing it inline.
prompts = [
    prompt_3_1, prompt_3_2, prompt_3_3,
    prompt_4_1, prompt_4_2, prompt_4_3, prompt_4_4,
]

# Moving a checkpoint to the GPU and swapping its scheduler do not depend on
# the prompt, so do both once per checkpoint rather than once per prompt.
for pipe in pipes_dict.values():
  pipe = pipe.to("cuda")
  pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)

for prompt in prompts:
  for pipe_name, pipe in pipes_dict.items():
    for _ in range(num_seeds):
      # A fresh random seed per clip; it is printed below so that a clip can
      # be regenerated later.
      seed = random.randint(0, 10000)

      # Several candidate waveforms are requested per call, but only the
      # first one returned is kept.
      audio = pipe(
          prompt=prompt,
          negative_prompt=negative_prompt,
          num_inference_steps=num_inference_steps,
          audio_length_in_s=audio_length_in_s*num_waveforms_per_prompt,
          guidance_scale=guidance_scale,
          num_waveforms_per_prompt=num_waveforms_per_prompt,
          generator=torch.Generator("cuda").manual_seed(seed),
      ).audios[0]

      audio_list_1.append(audio)

      # print() rather than display(): display() of a plain string shows its
      # repr, which collapses the multi-line prompt into one line of "\n"s.
      print(f"Prompt: {prompt}")
      print(f"Checkpoint: {pipe_name}")
      print(f"Seed value: {seed}")
      display(Audio(data=audio, rate=framerate))

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Export the segment 3 and segment 4 clips, one .wav file per prompt paragraph.
# Assumes the seven segment 1–2 clips occupy positions 0–6 of audio_list_1, so
# the clips written here start at position 7.
segment_labels = ["3_1", "3_2", "3_3", "4_1", "4_2", "4_3", "4_4"]

for clip_index, label in enumerate(segment_labels, start=7):
  wavfile.write(
      result_dir + "v03_audio_seg_" + label + "_music.wav",
      rate=framerate,
      data=audio_list_1[clip_index],
  )

In [None]:
# Join every clip in audio_list_1 end to end (in list order) into one array.
combined_audio_data_1 = np.concatenate(audio_list_1, axis=0)

# Optional inline preview of the joined audio:
# display(Audio(data=combined_audio_data_1, rate=framerate))

# Write the joined audio next to the per-paragraph files.
combined_path_1 = result_dir + "v03_combined_audio_music.wav"
wavfile.write(combined_path_1, rate=framerate, data=combined_audio_data_1)

## Version 0.3 - LLM (audioldm2_music) - Second set of seeds

In [None]:
### Loop over pipes and seeds and prompts ###
# Second pass over all fourteen prompt paragraphs with newly drawn seeds; the
# clips are collected in audio_list_2 in prompt order.

# Controllable parameters
pipes_dict = {"audioldm2_music": pipe_audioldm2_music}
prompts = [
    prompt_1_1, prompt_1_2, prompt_1_3,
    prompt_2_1, prompt_2_2, prompt_2_3, prompt_2_4,
    prompt_3_1, prompt_3_2, prompt_3_3,
    prompt_4_1, prompt_4_2, prompt_4_3, prompt_4_4,
]
negative_prompt = "Low quality, average quality."

num_seeds = 1  # random seeds drawn per (prompt, checkpoint) pair
# Base clip length in seconds; multiplied by num_waveforms_per_prompt below.
audio_length_in_s = 60
num_inference_steps = 25
# Candidates requested per pipeline call; only `.audios[0]` is kept.
num_waveforms_per_prompt = 3
guidance_scale = 3.5
# NOTE(review): the diffusers AudioLDM 2 examples write their output with
# rate=16000; at 48000 the same samples would play three times faster —
# confirm that this is intended.
framerate = 48000 # sample rate

audio_list_2 = []

# Moving a checkpoint to the GPU and swapping its scheduler do not depend on
# the prompt, so do both once per checkpoint rather than once per prompt.
for pipe in pipes_dict.values():
  pipe = pipe.to("cuda")
  pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)

for prompt in prompts:
  for pipe_name, pipe in pipes_dict.items():
    for _ in range(num_seeds):
      # A fresh random seed per clip; it is printed below so that a clip can
      # be regenerated later.
      seed = random.randint(0, 10000)

      audio = pipe(
          prompt=prompt,
          negative_prompt=negative_prompt,
          num_inference_steps=num_inference_steps,
          audio_length_in_s=audio_length_in_s*num_waveforms_per_prompt,
          guidance_scale=guidance_scale,
          num_waveforms_per_prompt=num_waveforms_per_prompt,
          generator=torch.Generator("cuda").manual_seed(seed),
      ).audios[0]

      audio_list_2.append(audio)

      # print() rather than display(): display() of a plain string shows its
      # repr, which collapses the multi-line prompt into one line of "\n"s.
      print(f"Prompt: {prompt}")
      print(f"Checkpoint: {pipe_name}")
      print(f"Seed value: {seed}")
      display(Audio(data=audio, rate=framerate))

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Join every clip from the second pass end to end (in list order) and preview
# the result inline.
combined_audio_data_2 = np.concatenate(audio_list_2, axis=0)
combined_preview_2 = Audio(data=combined_audio_data_2, rate=framerate)
display(combined_preview_2)

# Nothing is written to disk here. To keep this take, save
# combined_audio_data_2 with wavfile.write(...) once Drive is mounted.

In [None]:
# wavfile.write(result_dir + "v01_combined_audio_music.wav", rate=framerate, data=combined_audio_data_1)

# wavfile.write(result_dir + "v01_audio_seg_1_music.wav", rate=framerate, data=audio_list_1[0])
# wavfile.write(result_dir + "v01_audio_seg_2_music.wav", rate=framerate, data=audio_list_1[1])
# wavfile.write(result_dir + "v01_audio_seg_3_music.wav", rate=framerate, data=audio_list_1[2])
# wavfile.write(result_dir + "v01_audio_seg_4_music.wav", rate=framerate, data=audio_list_1[3])