## Cloning the Repository

In [1]:
# All vid gen techniques are described in
# https://huggingface.co/docs/diffusers/main/en/using-diffusers/text-img2vid
# https://huggingface.co/docs/diffusers/v0.31.0/using-diffusers/text-img2vid

# clone our repo and install requirements in colab
%cd /content/
!rm -rf LSVD
!git clone -b computerVision https://github.com/davidg-h/LSVD.git
%cd LSVD
%pip install -r requirements.txt
!apt install ffmpeg
!git pull origin Inference

/content
Cloning into 'LSVD'...
remote: Enumerating objects: 118, done.[K
remote: Counting objects: 100% (118/118), done.[K
remote: Compressing objects: 100% (87/87), done.[K
remote: Total 118 (delta 41), reused 95 (delta 29), pack-reused 0 (from 0)[K
Receiving objects: 100% (118/118), 6.06 MiB | 15.17 MiB/s, done.
Resolving deltas: 100% (41/41), done.
/content/LSVD
Looking in indexes: https://pypi.org/simple, https://download.pytorch.org/whl/cu124
Collecting datasets (from -r requirements.txt (line 9))
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate (from -r requirements.txt (line 11))
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting ctransformers[cuda] (from -r requirements.txt (line 16))
  Downloading ctransformers-0.2.27-py3-none-any.whl.metadata (17 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets->-r requirements.txt (line 9))
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from dataset

## Setting variables for audio/video generation

In [2]:
fps = 7
num_frames = 49 # 49 / 7fps = ~6s
decode_chunk_size=2
prompt = "A space rocket with trails of smoke behind it launching into space from the desert"
negt = "bad quality, worse quality, low resolution"

## Text to Audio Generation

In [5]:
from transformers import pipeline, AutoProcessor, MusicgenForConditionalGeneration
import scipy

classifier = pipeline("text-classification", model="j-hartmann/emotion-english-distilroberta-base", return_all_scores=True)

# Input from user is stored into userInput
userInput = prompt

# analysis stores the classified user input for with the different confidence level of the emotions detected
analysis = classifier(userInput)
filtered_analysis = sorted([item for item in analysis[0] if item['score'] > 0.4], key=lambda x: x['score'], reverse=True)

labels_string = "mood: " + ", ".join([item['label'] for item in filtered_analysis])
print(labels_string)

edited_user_input = "\n".join([userInput, labels_string])
print(edited_user_input)

processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")

inputs = processor(
    text=[edited_user_input],
    padding=True,
    return_tensors="pt",
)

audio_values = model.generate(**inputs, max_new_tokens=400)

# Output the final audio into a .wav file
sampling_rate = model.config.audio_encoder.sampling_rate
scipy.io.wavfile.write("musicgen_out.wav", rate=sampling_rate, data=audio_values[0, 0].numpy())

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


mood: fear
A space rocket with trails of smoke behind it launching into space from the desert
mood: fear


In [6]:
from IPython.display import Audio

In [7]:
sampling_rate = model.config.audio_encoder.sampling_rate
Audio(audio_values[0].numpy(), rate=sampling_rate)

## Text to Image Generation

In [8]:
# https://huggingface.co/tasks/text-to-image
# https://huggingface.co/black-forest-labs/FLUX.1-dev
# https://huggingface.co/docs/diffusers/using-diffusers/conditional_image_generation
# https://huggingface.co/blog/if
# https://huggingface.co/docs/diffusers/en/api/pipelines/stable_diffusion/text2img
# https://huggingface.co/docs/diffusers/v0.16.0/en/api/pipelines/stable_diffusion/text2img

import torch
from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler

model_id = "stabilityai/stable-diffusion-2"
scheduler = EulerDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")
pipe = StableDiffusionPipeline.from_pretrained(model_id, scheduler=scheduler, torch_dtype=torch.float16)
pipe = pipe.to("cuda")
pipe.enable_model_cpu_offload()

image = pipe(prompt).images[0]

image.save("reference.png")

# cleanup
pipe.to("cpu")
del pipe
torch.cuda.empty_cache()

scheduler/scheduler_config.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

model_index.json:   0%|          | 0.00/537 [00:00<?, ?B/s]

Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

(…)ature_extractor/preprocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

tokenizer/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

text_encoder/config.json:   0%|          | 0.00/633 [00:00<?, ?B/s]

tokenizer/vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

unet/config.json:   0%|          | 0.00/909 [00:00<?, ?B/s]

tokenizer/tokenizer_config.json:   0%|          | 0.00/824 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.36G [00:00<?, ?B/s]

tokenizer/special_tokens_map.json:   0%|          | 0.00/460 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.46G [00:00<?, ?B/s]

vae/config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]



  0%|          | 0/50 [00:00<?, ?it/s]

## Text + Image to Video Stable Diffusion

In [9]:
import torch
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import load_image, export_to_video

pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
)

# Optimize
pipe.enable_model_cpu_offload()
pipe.unet.enable_forward_chunking()

image = load_image(
    "/content/LSVD/reference.png"
    #"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png"
    ) #TODO change
image = image.resize((1024, 576))

generator = torch.manual_seed(42)
try:
  frames = pipe(image, num_frames=num_frames, decode_chunk_size=8, generator=generator).frames[0]
  export_to_video(frames, "svd.mp4", fps=fps)
except Exception as e:
  # cleanup

  pipe.to("cpu")
  del pipe
  torch.cuda.empty_cache()
  raise Exception(e)

# Worked in colab

model_index.json:   0%|          | 0.00/496 [00:00<?, ?B/s]

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

image_encoder/config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

(…)ature_extractor/preprocessor_config.json:   0%|          | 0.00/518 [00:00<?, ?B/s]

scheduler/scheduler_config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

vae/config.json:   0%|          | 0.00/607 [00:00<?, ?B/s]

diffusion_pytorch_model.fp16.safetensors:   0%|          | 0.00/3.05G [00:00<?, ?B/s]

unet/config.json:   0%|          | 0.00/984 [00:00<?, ?B/s]

model.fp16.safetensors:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

diffusion_pytorch_model.fp16.safetensors:   0%|          | 0.00/196M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/25 [00:00<?, ?it/s]

## Text to Video AnimateDiff

In [10]:
# import torch
# from diffusers import AnimateDiffPipeline, DDIMScheduler, MotionAdapter
# from diffusers.utils import export_to_video

# adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16)

# pipe = AnimateDiffPipeline.from_pretrained("emilianJR/epiCRealism", motion_adapter=adapter, torch_dtype=torch.float16)
# scheduler = DDIMScheduler.from_pretrained(
#     "emilianJR/epiCRealism",
#     subfolder="scheduler",
#     clip_sample=False,
#     timestep_spacing="linspace",
#     beta_schedule="linear",
#     steps_offset=1,
# )
# pipe.scheduler = scheduler
# pipe.enable_vae_slicing()

# # Optimize
# pipe.enable_model_cpu_offload()
# pipe.unet.enable_forward_chunking()

# try:
#   output = pipe(
#       prompt=prompt,
#       negative_prompt=negt,
#       num_frames=32,
#       decode_chunk_size=decode_chunk_size,
#       guidance_scale=7.5,
#       num_inference_steps=50,
#       generator=torch.Generator("cpu").manual_seed(49),
#   )
#   frames = output.frames[0]
#   export_to_video(frames, "animateDiff.mp4", fps=fps)
# except Exception as e:
#   # cleanup

#   pipe.to("cpu")
#   del pipe
#   torch.cuda.empty_cache()
#   raise Exception(e)

# # Worked in colab

## Combining the Audio and Video

In [11]:
import os
import subprocess

In [12]:
def install_ffmpeg():
    try:
        # Update the package list
        print("Updating package list...")
        subprocess.run(["sudo", "apt", "update"], check=True)

        # Install FFmpeg
        print("Installing FFmpeg...")
        subprocess.run(["sudo", "apt", "install", "-y", "ffmpeg"], check=True)

        print("FFmpeg installed successfully!")
    except subprocess.CalledProcessError as e:
        print(f"An error occurred while installing FFmpeg: {e}")

def fuse_video_audio(video_file, audio_file, output_file):
    try:
        # Use FFmpeg to merge video and audio
        print(f"Merging {video_file} and {audio_file} into {output_file}...")
        subprocess.run(["ffmpeg", "-i", video_file, "-i", audio_file, "-c:v", "copy", "-c:a", "aac", "-strict", "experimental", output_file], check=True)
        print(f"Successfully merged into {output_file}")
    except subprocess.CalledProcessError as e:
        print(f"An error occurred while merging video and audio: {e}")

In [13]:
install_ffmpeg()
# Example usage
fuse_video_audio("svd.mp4", "musicgen_out.wav", "advertisement.mp4")

Updating package list...
Installing FFmpeg...
FFmpeg installed successfully!
Merging svd.mp4 and musicgen_out.wav into advertisement.mp4...
Successfully merged into advertisement.mp4
