## Hugging Face issue: `AudioLDM2Pipeline` rejects the `transcription` argument

https://huggingface.co/docs/diffusers/main/en/api/pipelines/audioldm2 

In [4]:
# Text-to-speech attempt with the GigaSpeech AudioLDM2 checkpoint, following the
# AudioLDM2 pipeline documentation linked above this cell.
# NOTE(review): the recorded run of this cell ends with
# "TypeError: AudioLDM2Pipeline.__call__() got an unexpected keyword argument
# 'transcription'". Presumably the installed diffusers release predates the
# `transcription` parameter; confirm and upgrade diffusers before re-running.
import scipy
import torch
from diffusers import AudioLDM2Pipeline

# Load the checkpoint in half precision and move the pipeline to the GPU.
repo_id = "anhnct/audioldm2_gigaspeech"
pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
pipe = pipe.to("cuda")

# define the prompts: `prompt` is the text prompt, `transcript` is the text
# passed to the pipeline as `transcription`
prompt = "A female reporter is speaking"
transcript = "wish you have a good day"

# set the seed for generator (fixed seed 0 on the CUDA generator)
generator = torch.Generator("cuda").manual_seed(0)

# run the generation and keep only the `.audios` field of the pipeline output
audio = pipe(
    prompt,
    transcription=transcript,
    num_inference_steps=200,
    audio_length_in_s=10.0,
    num_waveforms_per_prompt=2,
    generator=generator,
    max_new_tokens=512,          # max_new_tokens must be set to 512 for TTS
).audios

# save the best audio sample (index 0) as a .wav file, written at rate=16000
scipy.io.wavfile.write("tts.wav", rate=16000, data=audio[0])

unet/diffusion_pytorch_model.safetensors not found
Loading pipeline components...:   0%|          | 0/11 [00:00<?, ?it/s]The config attributes {'max_seq_length': 310, 'use_learned_position_embedding': True} were passed to AudioLDM2ProjectionModel, but are not expected and will be ignored. Please verify your config.json configuration file.
Some weights of the model checkpoint were not used when initializing AudioLDM2ProjectionModel: 
 ['learnable_positional_embedding']
Loading pipeline components...: 100%|██████████| 11/11 [00:03<00:00,  3.11it/s]


TypeError: AudioLDM2Pipeline.__call__() got an unexpected keyword argument 'transcription'

## Generating without transcription (text-to-audio diffusion model)

In [9]:
import scipy
import torch
from diffusers import AudioLDM2Pipeline

# Text prompt plus a negative prompt for the base AudioLDM2 checkpoint.
prompt = "The sound of a hammer hitting a wooden surface."
negative_prompt = "Low quality."

# Half-precision pipeline, placed on the GPU in a single chained call.
repo_id = "cvssp/audioldm2"
pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16).to("cuda")

# CUDA generator seeded with 0.
generator = torch.Generator("cuda").manual_seed(0)

# Generate 3 candidate waveforms of 10 s each using 200 inference steps and
# keep the `.audios` field of the pipeline output.
audio = pipe(
    prompt,
    negative_prompt=negative_prompt,
    num_inference_steps=200,
    audio_length_in_s=10.0,
    num_waveforms_per_prompt=3,
    generator=generator,
).audios

# Index 0 is treated as the best sample; write it as a .wav file at rate 16000.
scipy.io.wavfile.write("hammer-hitting.wav", 16000, audio[0])

Loading pipeline components...: 100%|██████████| 11/11 [00:02<00:00,  4.49it/s]
100%|██████████| 200/200 [03:34<00:00,  1.07s/it]
  return F.conv1d(input, weight, bias, self.stride,


In [15]:
import scipy
import torch
from diffusers import AudioLDM2Pipeline

# Describe a spoken word through the text prompt only (no transcription input).
prompt = "a female voice saying the word 'cap' in a clear, neutral tone."
negative_prompt = 'low quality'

# Half-precision pipeline, placed on the GPU in a single chained call.
repo_id = "cvssp/audioldm2"
pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float16).to("cuda")

# CUDA generator seeded with 0.
generator = torch.Generator("cuda").manual_seed(0)

# Generate 3 candidate waveforms of 5 s each using 200 inference steps and
# keep the `.audios` field of the pipeline output.
audio = pipe(
    prompt,
    negative_prompt=negative_prompt,
    num_inference_steps=200,
    audio_length_in_s=5.0,
    num_waveforms_per_prompt=3,
    generator=generator,
).audios

# Write the first returned waveform as a .wav file at rate 16000.
scipy.io.wavfile.write("text-speech-cap.wav", 16000, audio[0])

Loading pipeline components...: 100%|██████████| 11/11 [00:02<00:00,  4.13it/s]
100%|██████████| 200/200 [01:06<00:00,  3.00it/s]


## Installations

In [5]:
! pip install -q \
  diffusers \
  transformers \
  safetensors \
  sentencepiece \
  accelerate \
  bitsandbytes \
  einops \
  mediapy

In [5]:
!pip install phonemizer

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting phonemizer
  Downloading phonemizer-3.2.1-py3-none-any.whl.metadata (7.4 kB)
Collecting joblib (from phonemizer)
  Downloading joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting segments (from phonemizer)
  Downloading segments-2.2.1-py2.py3-none-any.whl.metadata (3.3 kB)
Collecting attrs>=18.1 (from phonemizer)
  Downloading attrs-23.2.0-py3-none-any.whl.metadata (9.5 kB)
Collecting dlinfo (from phonemizer)
  Downloading dlinfo-1.2.1-py3-none-any.whl.metadata (1.1 kB)
Collecting clldutils>=1.7.3 (from segments->phonemizer)
  Downloading clldutils-3.22.2-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting csvw>=1.5.6 (from segments->phonemizer)
  Downloading csvw-3.3.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting tabulate>=0.7.7 (from clldutils>=1.7.3->segments->phonemizer)
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Collecting colorlog (from clldutils>=1.7.3->segments->phonemizer)
  Downloading colorlog-6.8.2-py3-none-any.whl.metadata (10 kB)
Co

In [6]:
!pip install torch



In [3]:
! pip install huggingface_hub

Collecting huggingface_hub
  Downloading huggingface_hub-0.23.0-py3-none-any.whl.metadata (12 kB)
Collecting filelock (from huggingface_hub)
  Downloading filelock-3.14.0-py3-none-any.whl.metadata (2.8 kB)
Collecting fsspec>=2023.5.0 (from huggingface_hub)
  Downloading fsspec-2024.3.1-py3-none-any.whl.metadata (6.8 kB)
Collecting pyyaml>=5.1 (from huggingface_hub)
  Using cached PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.1 kB)
Collecting requests (from huggingface_hub)
  Using cached requests-2.31.0-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.42.1 (from huggingface_hub)
  Downloading tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m942.4 kB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
Collecting charset-normalizer<4,>=2 (from requests->huggingface_hub)
  Downloading charset_normalizer-3.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metada

## Helper functions

In [2]:
from PIL import Image
import torch 

class BaseView:
    '''
    Common interface shared by every view.

    The three methods below (`view`, `inverse_view`, `make_frame`) are
    placeholders: each raises `NotImplementedError` until a subclass
    overrides it.
    '''

    def __init__(self):
        '''The base class keeps no state.'''

    def view(self, im):
        '''
        Transform an image into this view.

        im (`torch.tensor`):
            Stage 1: a noisy image of shape (3, H, W).
            Stage 2: a tensor of shape (6, H, W), i.e. a noisy image
            concatenated with the upsampled conditioning image produced
            by stage 1.
        '''
        raise NotImplementedError

    def inverse_view(self, noise):
        '''
        Map noise estimates back through the inverse of this view.

        DeepFloyd predicts a variance alongside the noise, so the inverse
        has to be applied to the variance as well.

        noise (`torch.tensor`):
            Tensor of shape (6, H, W): the first three channels hold the
            noise estimate, the last three hold the variance estimate.
        '''
        raise NotImplementedError

    def make_frame(self, im, t):
        '''
        Produce a single animation frame that moves linearly from the
        identity view (t=0) to this view (t=1).

        im (`PIL.Image`):
            PIL Image of the illusion.

        t (float):
            Animation time in [0,1]. The frame should equal the identity
            view at t=0 and change continuously until it reaches this
            view at t=1.
        '''
        raise NotImplementedError

In [3]:
class FlipView(BaseView):
    '''
    View that flips its input along dimension 1. The animation frame is
    rendered as a rotation of the image on a white canvas.
    '''

    def __init__(self):
        # Nothing to configure for this view.
        pass

    def view(self, im):
        '''Return `im` flipped along dimension 1.'''
        return torch.flip(im, dims=[1])

    def inverse_view(self, noise):
        '''Return `noise` flipped along dimension 1 (same operation as `view`).'''
        return torch.flip(noise, dims=[1])

    def make_frame(self, im, t):
        '''
        Render one animation frame for time `t`.

        `im` is pasted at the centre of a white square canvas whose side is
        1.5x the image width, and the canvas is then rotated by -t * 180
        degrees with bilinear resampling.
        '''
        side = im.size[0]
        canvas_side = int(side * 1.5)
        offset = (canvas_side - side) // 2
        white = (255, 255, 255)

        # TODO: Technically not a flip, change this to a homography later
        canvas = Image.new('RGB', (canvas_side, canvas_side), white)
        canvas.paste(im, (offset, offset))

        return canvas.rotate(
            -t * 180,
            resample=Image.Resampling.BILINEAR,
            expand=False,
            fillcolor=white,
        )

In [4]:
class IdentityView(BaseView):
    '''
    View that leaves its input untouched. `make_frame` is not overridden
    here, so the `BaseView` implementation is used for it.
    '''

    def __init__(self):
        # Nothing to configure for this view.
        pass

    def view(self, im):
        '''Return `im` unchanged.'''
        return im

    def inverse_view(self, noise):
        '''Return `noise` unchanged.'''
        return noise

## Pipeline

In [1]:
import os

from huggingface_hub import login

# Authenticate against the Hugging Face Hub without embedding a secret in the
# notebook. The access token is read from the HF_TOKEN environment variable;
# when it is unset, `token` is None and `login` falls back to its interactive
# prompt.
# NOTE(review): a literal token was previously committed in this cell. It
# should be revoked in the Hugging Face account settings and replaced.
token = os.environ.get("HF_TOKEN")
login(token=token)

  from .autonotebook import tqdm as notebook_tqdm


The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home2/esh/.cache/huggingface/token
Login successful


Loading the audio diffusion model on 4 ADA GPUs

In [2]:
from diffusers import AudioLDM2Pipeline
import torch

# Load the base AudioLDM2 checkpoint; the float16 override is commented out,
# so no explicit torch_dtype is passed.
stage_1 =  AudioLDM2Pipeline.from_pretrained(
                "cvssp/audioldm2",
                # torch_dtype=torch.float16,
              )
# Enable diffusers' model CPU offload on the pipeline.
stage_1.enable_model_cpu_offload()
# Wrap the pipeline so that it is reachable as `stage_1.module` (the prompt
# encoding cell calls `stage_1.module.encode_prompt`).
# NOTE(review): torch.nn.DataParallel is meant to wrap an nn.Module; confirm
# that wrapping a diffusers pipeline here actually spreads work across GPUs,
# and that it is compatible with the CPU offload enabled above.
stage_1 = torch.nn.DataParallel(stage_1)


Fetching 26 files: 100%|██████████| 26/26 [01:26<00:00,  3.34s/it]
Loading pipeline components...: 100%|██████████| 11/11 [00:36<00:00,  3.36s/it]


In [3]:
import os

# Pin TOKENIZERS_PARALLELISM to "false" for this process; the huggingface
# tokenizers fork warning recorded earlier in this notebook asks for the
# variable to be set explicitly.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})


In [4]:
# Use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# The two text prompts, one per entry of `prompts`.
prompt_1 = 'a lady saying the word brainstorm'
prompt_2 = 'a lady saying the word greenneedle'
prompts = [prompt_1, prompt_2]

# Encode every prompt separately through the wrapped pipeline
# (`stage_1.module`), requesting 3 waveforms per prompt and no
# classifier-free guidance. `prompt` ends up holding one `encode_prompt`
# result per entry of `prompts`, in the same order.
prompt = [
    stage_1.module.encode_prompt(
        text,
        device=device,
        num_waveforms_per_prompt=3,
        do_classifier_free_guidance=False,
    )
    for text in prompts
]

In [None]:
# Shape of the first element returned by encode_prompt for the second prompt.
prompt[1][0].shape

torch.Size([3, 11, 1024])

In [None]:
# Regroup the per-prompt encode_prompt results into three tuples, one per
# returned position.
# NOTE(review): the AudioLDM2 encode_prompt documentation lists its returns as
# (prompt_embeds, attention_mask, generated_prompt_embeds); confirm that the
# third element really is a negative/null embedding before relying on the name
# `negative_prompt_embeds`.
prompt_embeds, att_mask, negative_prompt_embeds = zip(*prompt)

# Concatenate the per-prompt tensors with torch.cat (default dim).
prompt_embeds = torch.cat(prompt_embeds)
negative_prompt_embeds = torch.cat(negative_prompt_embeds)  # Originally described as "just null embeds" - TODO confirm for AudioLDM2

In [None]:
# Shape of the concatenated prompt embeddings.
prompt_embeds.shape

torch.Size([19, 1024])

In [12]:
# Shape of the concatenated tensor stored as `negative_prompt_embeds`.
negative_prompt_embeds.shape

torch.Size([19, 1024])

In [None]:
# One identity view and one flip view, in that order.
views = [IdentityView(), FlipView()]
