# Music Video Synthesis
* Extract lyrics from song with timestamps
* Compose scenes, include timestamps
* Construct video text prompt for each scene
* Build videos for each scene
* Stitch together

# We will use openai whipser for stability

In [None]:
#!pip install --quiet --upgrade pip
#!pip install --quiet --upgrade openai-whisper
# Ubuntu or Debian
#!sudo apt update && sudo apt install ffmpeg
#!pip install setuptools-rust

In [1]:
import cv2
import gc 
import diffusers
import imageio
import imageio_ffmpeg
import json
import math
import moviepy.editor as mp
import numpy as np
import os
import random
import tempfile
import threading
import time
import transformers
import torch
import utils
import whisper

from datetime import datetime, timedelta
from diffusers import AutoencoderKL, DPMSolverMultistepScheduler, FlowMatchEulerDiscreteScheduler, KDPM2DiscreteScheduler, StableDiffusionXLPipeline
from diffusers.models.transformers.transformer_flux import FluxTransformer2DModel
from diffusers.pipelines.flux.pipeline_flux import FluxPipeline
from huggingface_hub import hf_hub_download, snapshot_download
from openai import OpenAI
# pip install diffusers optimum-quanto
from optimum.quanto import freeze, qfloat8, quantize, requantize
from PIL import Image
from safetensors.torch import load_file as load_safetensors
from sd_embed.embedding_funcs import get_weighted_text_embeddings_flux1
from torchao.quantization import quantize_, int8_weight_only, int8_dynamic_activation_int8_weight
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoImageProcessor, CLIPConfig, CLIPFeatureExtractor, CLIPModel, CLIPProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModel, Swin2SRForImageSuperResolution, T5EncoderModel, T5TokenizerFast

dtype = torch.bfloat16
MAX_SEED = np.iinfo(np.int32).max
device = "cuda" if torch.cuda.is_available() else "cpu"
retry_limit = 3

openai_api_key = ""
openai_model = 'gpt-4o-mini'
openai_model_large = 'gpt-4o'
hg_token = ''
working_dir = './images2'
video_dir = "./output2"
audio_file = '//mnt/d/audio/VampireLament.mp3'

os.makedirs(working_dir, exist_ok=True)
os.makedirs(video_dir, exist_ok=True)


2024-10-08 06:21:47.187749: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-08 06:21:47.220964: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-08 06:21:47.234675: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-08 06:21:47.280973: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def get_openai_prompt_response(prompt: str, max_tokens: int = 6000, temperature: float = 0.33, openai_model: str = ''):
 
    client = OpenAI(api_key=openai_api_key)
    response = client.chat.completions.create(
        max_tokens = max_tokens,
        messages=[
            {"role": "system", "content": f"""Act as a helpful assistant, you are an expert editor."""},
            {"role": "user", "content": f"""{prompt}"""}
        ],
        model=openai_model,
        temperature = temperature
    )

    retry_count = 0
    while retry_count < retry_limit:
        try:
            message_content = response.choices[0].message.content
            break  # If successful, break out of the retry loop
        except Exception as e:
            print(f"Error occurred: {e}")
            retry_count += 1
            if retry_count == retry_limit:
                print("Retry limit reached. Moving to the next iteration.")
            else:
                print(f"Retrying... (Attempt {retry_count}/{retry_limit})")
    
    return message_content
    
def load_quanto_transformer(repo_path):
    with open(hf_hub_download(repo_path, "transformer/quantization_map.json"), "r") as f:
        quantization_map = json.load(f)
    with torch.device("meta"):
        transformer = diffusers.FluxTransformer2DModel.from_config(hf_hub_download(repo_path, "transformer/config.json")).to(dtype)
    state_dict = load_safetensors(hf_hub_download(repo_path, "transformer/diffusion_pytorch_model.safetensors"))
    requantize(transformer, state_dict, quantization_map, device=torch.device("cuda"))
    return transformer


def load_quanto_text_encoder_2(repo_path):
    with open(hf_hub_download(repo_path, "text_encoder_2/quantization_map.json"), "r") as f:
        quantization_map = json.load(f)
    with open(hf_hub_download(repo_path, "text_encoder_2/config.json")) as f:
        t5_config = transformers.T5Config(**json.load(f))
    with torch.device("meta"):
        text_encoder_2 = transformers.T5EncoderModel(t5_config).to(dtype)
    state_dict = load_safetensors(hf_hub_download(repo_path, "text_encoder_2/model.safetensors"))
    requantize(text_encoder_2, state_dict, quantization_map, device=torch.device("cuda"))
    return text_encoder_2

def load_quanto_text_encoder_2_longer(repo_path, max_length=512):
    with open(hf_hub_download(repo_path, "text_encoder_2/quantization_map.json"), "r") as f:
        quantization_map = json.load(f)
    with open(hf_hub_download(repo_path, "text_encoder_2/config.json")) as f:
        t5_config = transformers.T5Config(**json.load(f))
    
    # Update the config for longer sequence length
    t5_config.max_position_embeddings = max_length
    
    with torch.device("meta"):
        text_encoder_2 = transformers.T5EncoderModel(t5_config).to(dtype)
    
    state_dict = load_safetensors(hf_hub_download(repo_path, "text_encoder_2/model.safetensors"))
    requantize(text_encoder_2, state_dict, quantization_map, device=torch.device("cuda"))
    
    return text_encoder_2
    
def load_flux_pipe():
    # Load the main pipeline without the transformer or text_encoder_2 initially
    pipe = None
    clip_repo = "zer0int/CLIP-GmP-ViT-L-14"
    text_encoder = CLIPTextModel.from_pretrained(clip_repo, torch_dtype=dtype)
    
    pipe = diffusers.AutoPipelineForText2Image.from_pretrained(
        "Disty0/FLUX.1-dev-qint8", 
        text_encoder=text_encoder,
        transformer=None, 
        text_encoder_2=None, 
        torch_dtype=dtype
    )
    
    # Load custom transformer and text encoder with specific configurations
    pipe.transformer = load_quanto_transformer("Disty0/FLUX.1-dev-qint8")
    pipe.text_encoder_2 = load_quanto_text_encoder_2_longer(
        "Disty0/FLUX.1-dev-qint8", 
        max_length=512
    )

    
    # Move the pipeline to CUDA with bfloat16 precision for performance
    pipe = pipe.to("cuda", dtype=dtype)

    # Enable memory optimizations (attention slicing)
    pipe.enable_attention_slicing()

    return pipe

def gen_flux_image(pipe, prompt, height=1024, width=1024, guidance_scale=3.5, num_inference_steps=32, max_sequence_length=512, seed=-1):
    if seed == -1:
        seed = random.randint(0, MAX_SEED)
        
    prompt_embeds, pooled_prompt_embeds = get_weighted_text_embeddings_flux1(
        pipe        = pipe,
        prompt    = prompt
    )
    
    image = pipe(
        prompt_embeds               = prompt_embeds,
        pooled_prompt_embeds      = pooled_prompt_embeds,
        height=height,
        width=width,
        guidance_scale=guidance_scale,
        output_type="pil",
        num_inference_steps=num_inference_steps,
        max_sequence_length=max_sequence_length,
        generator=torch.Generator("cpu").manual_seed(seed)
    ).images[0]
    return image

In [3]:
model = whisper.load_model("turbo")
result = model.transcribe(audio_file)
print(result["text"])
# done with model
del model
# If using GPU, clear the GPU cache
if torch.cuda.is_available():
    torch.cuda.empty_cache()
gc.collect()

  checkpoint = torch.load(fp, map_location=device)



 Music Meet that streaks that roam the streets Hunting for some youthful treats See the teens all gathered round My heartbeats slowing down Oh, their blood is full of fries Energy drinks And sugary pies I can't take a single bite Their poor health just isn't right Used to savor youthful veins Now it's all polluted stains Fast food feasts and lack of sleep Makes their life buds are too cheap Craving pure and wholesome flow But where did all the healthy go? A healthy go Oh, their blood is full of fries Video games and neon skies Fast food feasts and lack of food I can't take a single bite Their poor health just isn't right Graving pure and wholesome flow But where did all the healthy go? Wow Wow Wow Wow Oh, their blood is full of fries Oh, their blood is full of fries Video games and neon skies Oh, their blood is full of fries Video games and neon skies Fast food feasts and lack of sleep Makes their life blood far too cheap Small energy Cricks like Enth various Leuten Rooms and Perdice H

0

In [4]:
segments = result['segments']

In [5]:
# Extracting list of start times and texts
segment_texts_and_start_times = [(segment['text'].strip(), segment['start']) for segment in segments]

# Assuming 'segment_texts_and_start_times' is a list of tuples (text, start)
# Printing the results
text = ""
for segment_text, start in segment_texts_and_start_times:
    text += f"Start: {start}, Text: {segment_text}\n"
    #print(f"Start: {start}, Text: {segment_text}")

last_end_value = segments[-1]['end']
#print(f"The last end value is: {last_end_value}")

In [6]:
video_summary_prompt = f'Create a short summary to describe an overall scene for a music video based on these lyrics: {text}'
video_summary = get_openai_prompt_response(video_summary_prompt, openai_model=openai_model)

In [7]:
prompt = f'''Create a json list of scenes from the following text.  scenes should be groups of similar lyrics with new scenes when the context changes.  Text: {text}   The json list should have the start value for the first item in the scene and the text that is combined for all items in the same scene.  Return only the json list, less jargon. '''
result = get_openai_prompt_response(prompt, openai_model=openai_model)
result = result.replace("```", "").replace("```json\n", "").replace("json\n", "").replace("\n", "")

In [8]:
scenes = json.loads(result)

In [9]:
# iterate through scenes
for scene in scenes:
    # synthesize the scene composition
    prompt = f'''Create an imaginative and vivid video scene descriptive caption based on the following overall music video description and scene lyrics. Include rich details such as attire, setting, mood, lighting, and any significant movements or expressions, painting a clear visual scene.  An example of a scene description is: "A radiant woman stands on a deserted beach, arms outstretched, wearing a beige trench coat, white blouse, light blue jeans, and chic boots, against a backdrop of soft sky and sea. Moments later, she is seen mid-twirl, arms exuberant, with the lighting suggesting dawn or dusk. Then, she runs along the beach, her attire complemented by an off-white scarf and black ankle boots, the tranquil sea behind her. Finally, she holds a paper airplane, her pose reflecting joy and freedom, with the ocean's gentle waves and the sky's soft pastel hues enhancing the serene ambiance."
Another example is: "A determined man in athletic attire, including a blue long-sleeve shirt, black shorts, and blue socks, jogs around a snow-covered soccer field, showcasing his solitary exercise in a quiet, overcast setting. His long dreadlocks, focused expression, and the serene winter backdrop highlight his dedication to fitness. As he moves, his attire, consisting of a blue sports sweatshirt, black athletic pants, gloves, and sneakers, grips the snowy ground. He is seen running past a chain-link fence enclosing the playground area, with a basketball hoop and children's slide, suggesting a moment of solitary exercise amidst the empty field."
Music video description: {video_summary}
Scene lyrics: {scene["text"]}
Return only the scene description, less jargon.'''
    result = get_openai_prompt_response(prompt, openai_model=openai_model)
    scene["scene_description"] = result
    # synthesize the action sequence
    prompt = f'''This is the overall theme for a music video: {video_summary}
This is a scene description for a scene of the music video, which we have a starting image for: {scene["scene_description"]}
We want to create a short animation prompt for the starting image, please create this short animation prompt.  Return only the prompt, less jargon.  For example, an image of a woman riding a bike might have the short animation prompt: "A woman is riding a bicycle at high speed. Focused, detailed, realistic."  An image of a starry night painting might have the short animation prompt: "Starry sky slowly rotating."'''
    result = get_openai_prompt_response(prompt, openai_model=openai_model)
    scene["action_sequence"] = result


In [10]:
scenes

[{'start': 16.0,
  'text': "Meet that streaks that roam the streets Hunting for some youthful treats See the teens all gathered round My heartbeats slowing down Oh, their blood is full of fries Energy drinks And sugary pies I can't take a single bite Their poor health just isn't right Used to savor youthful veins Now it's all polluted stains Fast food feasts and lack of sleep Makes their life buds are too cheap Craving pure and wholesome flow But where did all the healthy go? A healthy go",
  'scene_description': "In a vibrant urban landscape alive with pulsating beats, a vampire stands at the edge of a bustling street, cloaked in a sleek, dark trench coat that contrasts sharply with the neon lights flickering around them. Their pale skin glows under the electric hues of pink and blue, while sharp, expressive eyes scan the scene filled with groups of teens. The air is thick with laughter and the enticing aroma of fast food, as the young crowd indulges in greasy fries and sugary drinks,

In [11]:
pipe = load_flux_pipe()
height = 480
width = 720
guidance_scale=3.9
num_inference_steps=24
max_sequence_length=512  
seed=-1

Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
  deprecate("config-passed-as-path", "1.0.0", deprecation_message, standard_warn=False)



In [None]:
# create starting images for video
image_num = 1
for scene in scenes:
    image_prompt = scene['scene_description']
    image = gen_flux_image(pipe, prompt, height, width, guidance_scale, num_inference_steps, max_sequence_length, seed)
    filename = "image_" + str(image_num).zfill(2) + ".jpg"
    image_name = working_dir + "/" + filename
    image.save(image_name,  dpi=(300, 300))
    image_num += 1

Token indices sequence length is longer than the specified maximum sequence length for this model (826 > 77). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (960 > 512). Running this sequence through the model will result in indexing errors


  0%|          | 0/24 [00:00<?, ?it/s]

In [None]:
del diffusers
del pipe
del transformers
# If using GPU, clear the GPU cache
if torch.cuda.is_available():
    torch.cuda.empty_cache()
gc.collect()

In [None]:
import diffusers
import transformers

from diffusers import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel, CogVideoXPipeline, CogVideoXDPMScheduler
from diffusers import CogVideoXVideoToVideoPipeline, CogVideoXImageToVideoPipeline
from diffusers.image_processor import VaeImageProcessor
from diffusers.utils import export_to_video, load_video, load_image
from transformers import T5EncoderModel

In [None]:
# Helper functions
def convert_to_gif(video_path):
    clip = mp.VideoFileClip(video_path)
    clip = clip.set_fps(8)
    clip = clip.resize(height=240)
    gif_path = video_path.replace(".mp4", ".gif")
    clip.write_gif(gif_path, fps=8)
    return gif_path

def resize_if_unfit(input_video):
    width, height = get_video_dimensions(input_video)

    if width == 720 and height == 480:
        processed_video = input_video
    else:
        processed_video = center_crop_resize(input_video)
    return processed_video


def get_video_dimensions(input_video_path):
    reader = imageio_ffmpeg.read_frames(input_video_path)
    metadata = next(reader)
    return metadata["size"]


def center_crop_resize(input_video_path, target_width=720, target_height=480):
    cap = cv2.VideoCapture(input_video_path)

    orig_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    orig_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    orig_fps = cap.get(cv2.CAP_PROP_FPS)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    width_factor = target_width / orig_width
    height_factor = target_height / orig_height
    resize_factor = max(width_factor, height_factor)

    inter_width = int(orig_width * resize_factor)
    inter_height = int(orig_height * resize_factor)

    target_fps = 8
    ideal_skip = max(0, math.ceil(orig_fps / target_fps) - 1)
    skip = min(5, ideal_skip)  # Cap at 5

    while (total_frames / (skip + 1)) < 49 and skip > 0:
        skip -= 1

    processed_frames = []
    frame_count = 0
    total_read = 0

    while frame_count < 49 and total_read < total_frames:
        ret, frame = cap.read()
        if not ret:
            break

        if total_read % (skip + 1) == 0:
            resized = cv2.resize(frame, (inter_width, inter_height), interpolation=cv2.INTER_AREA)

            start_x = (inter_width - target_width) // 2
            start_y = (inter_height - target_height) // 2
            cropped = resized[start_y : start_y + target_height, start_x : start_x + target_width]

            processed_frames.append(cropped)
            frame_count += 1

        total_read += 1

    cap.release()

    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_file:
        temp_video_path = temp_file.name
        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        out = cv2.VideoWriter(temp_video_path, fourcc, target_fps, (target_width, target_height))

        for frame in processed_frames:
            out.write(frame)

        out.release()

    return temp_video_path

def extract_last_frame(video_filename, output_image_filename):
    """
    Extracts the last frame from a video file and saves it as an image.

    Parameters:
    - video_filename (str): Path to the input video file.
    - output_image_filename (str): Path where the last frame image will be saved.
    """
    try:
        # Create a reader object for the video
        reader = imageio.get_reader(video_filename, 'ffmpeg')
        
        # Initialize last_frame to None
        last_frame = None
        
        # Iterate through all frames to get the last one
        for frame in reader:
            last_frame = frame
        
        # Close the reader to free resources
        reader.close()
        
        if last_frame is not None:
            # Save the last frame as an image
            imageio.imwrite(output_image_filename, last_frame)
            print(f"Last frame saved successfully as '{output_image_filename}'.")
        else:
            print("The video contains no frames.")
    
    except FileNotFoundError:
        print(f"Error: The file '{video_filename}' was not found.")
    except ValueError as ve:
        print(f"ValueError: {ve}")
    except RuntimeError as re:
        print(f"RuntimeError: {re}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

In [None]:
quantization = int8_weight_only

text_encoder = T5EncoderModel.from_pretrained("THUDM/CogVideoX-5b", subfolder="text_encoder", torch_dtype=torch.bfloat16)
quantize_(text_encoder, quantization())

transformer = CogVideoXTransformer3DModel.from_pretrained("THUDM/CogVideoX-5b", subfolder="transformer", torch_dtype=torch.bfloat16)
quantize_(transformer, quantization())

i2v_transformer = CogVideoXTransformer3DModel.from_pretrained(
    "THUDM/CogVideoX-5b-I2V", subfolder="transformer", torch_dtype=torch.bfloat16
)

vae = AutoencoderKLCogVideoX.from_pretrained("THUDM/CogVideoX-5b", subfolder="vae", torch_dtype=torch.bfloat16)
quantize_(vae, quantization())

# Create pipeline and run inference
pipe = CogVideoXPipeline.from_pretrained(
    "THUDM/CogVideoX-5b",
    text_encoder=text_encoder,
    transformer=transformer,
    vae=vae,
    torch_dtype=torch.bfloat16,
)
#pipe.enable_model_cpu_offload()
pipe.vae.enable_tiling()

pipe.scheduler = CogVideoXDPMScheduler.from_config(pipe.scheduler.config, timestep_spacing="trailing")

i2v_vae=pipe.vae
i2v_scheduler=pipe.scheduler
i2v_tokenizer=pipe.tokenizer
i2v_text_encoder=pipe.text_encoder

del pipe
gc.collect()

In [None]:
# Load the pipeline once before the loop
pipe_image = CogVideoXImageToVideoPipeline.from_pretrained(
    "THUDM/CogVideoX-5b-I2V",
    transformer=i2v_transformer,
    vae=i2v_vae,
    scheduler=i2v_scheduler,
    tokenizer=i2v_tokenizer,
    text_encoder=i2v_text_encoder,
    torch_dtype=torch.bfloat16,
).to(device)

def infer(
        pipe_image,
        prompt: str,
        image_input: str,
        num_inference_steps: int,
        guidance_scale: float,
        seed: int = -1,
        num_frames: int = 49,
    ):
    if seed == -1:
        seed = random.randint(0, 2**8 - 1)

    image_input = Image.open(image_input).resize(size=(720, 480))  # Convert to PIL
    image = load_image(image_input)

    video_pt = pipe_image(
        image=image,
        prompt=prompt,
        num_inference_steps=num_inference_steps,
        num_videos_per_prompt=1,
        use_dynamic_cfg=True,
        output_type="pt",
        guidance_scale=guidance_scale,
        generator=torch.Generator(device="cpu").manual_seed(seed),
        num_frames=num_frames,
    ).frames
    return (video_pt, seed)

def generate(
    pipe_image,
    prompt,
    image_input,
    seed_value: int = -1,
    video_filename: str = "",
    num_frames: int = 49,
):
    latents, seed = infer(
        pipe_image,
        prompt,
        image_input,
        num_inference_steps=50, 
        guidance_scale=7.0,
        seed=seed_value,
        num_frames=num_frames,
    )
    batch_size = latents.shape[0]
    batch_video_frames = []
    for batch_idx in range(batch_size):
        pt_image = latents[batch_idx]
        pt_image = torch.stack([pt_image[i] for i in range(pt_image.shape[0])])
        image_np = VaeImageProcessor.pt_to_numpy(pt_image)
        image_pil = VaeImageProcessor.numpy_to_pil(image_np)
        batch_video_frames.append(image_pil)
    video_path = utils.save_video(
        batch_video_frames[0],
        fps=math.ceil((len(batch_video_frames[0]) - 1) / 6),
        filename=video_filename
    )
    return video_path

In [None]:
video_num = 1
temp_image = f'{working_dir}/temp_image.jpg'

for i, scene in enumerate(scenes):
    prompt = scene["action_sequence"]
    
    # Use the initial image for each scene
    image_input = f"{working_dir}/image_{str(i+1).zfill(2)}.jpg"  # First time use an initial image
    
    # Calculate duration to keep the video in 6-second increments
    if i + 1 < len(scenes):
        next_start_time = scenes[i + 1]["start"]
    else:
        next_start_time = last_end_value  # Use the final ending time for the last scene
    
    duration = next_start_time - scene["start"]
    num_video_segments = int((duration + 3) // 6)

    print(f'scene {i} has {num_video_segments} segments')
    for j in range(num_video_segments):
        video_name = f"video_{str(video_num).zfill(2)}_{str(j).zfill(2)}"
        video_output = generate(pipe_image, prompt, image_input, -1, video_dir+"/"+video_name)
        time.sleep(1)  # Pause for 1 second
        # After generating the video, extract the last frame to use as input for the next segment
        extract_last_frame(f'{video_dir}/{video_name}.mp4', temp_image)
        
        # Use the last frame in as input for the next video segment in the same scene
        image_input = temp_image
 
        video_num += 1  # Increment video number for the next segment

# Clean up the pipeline after use
del diffusers
del pipe_image
del transformers
# If using GPU, clear the GPU cache
if torch.cuda.is_available():
    torch.cuda.empty_cache()
gc.collect()