## Setup

In [None]:
!bash setup.sh
import os
os._exit(00)

In [None]:
# Log in to the Hugging Face Hub through its command-line tool.
# NOTE(review): presumably only required when the snapshot downloaded in the
# next section needs authentication — confirm whether this step can be skipped.
!huggingface-cli login

## Generate video from text

In [None]:
from huggingface_hub import snapshot_download
import os, subprocess
from modelscope.pipelines import pipeline
from modelscope.outputs import OutputKeys
import pathlib
import torch
# Seed torch's global RNG before any sampling happens.
torch.manual_seed(468)

model_dir = pathlib.Path('/notebooks/modelscope-damo-text-to-video-synthesis')
video_path = 'outs/video.mp4'

# Download the weights only once. The check is made against `model_dir` itself
# (not a cwd-relative directory name) so it looks in the same place the
# snapshot is written to, regardless of the kernel's working directory.
if not model_dir.exists():
    snapshot_download('damo-vilab/modelscope-damo-text-to-video-synthesis', repo_type='model', local_dir=model_dir)
    # Copy the local configuration.json into the model directory.
    # check=True raises if the copy fails instead of silently continuing.
    subprocess.run(
        ['cp', 'configuration.json', (model_dir / 'configuration.json').as_posix()],
        check=True,
    )

# The pipeline is asked to write into outs/; make sure that folder exists.
pathlib.Path(video_path).parent.mkdir(parents=True, exist_ok=True)

pipe = pipeline('text-to-video-synthesis', model_dir.as_posix(), output_video=video_path)
test_text = {
    'text': 'Alice in Wonderland animated disney princess dancing',
    'output_video_path': video_path,
}
# The call returns a mapping; the path of the rendered video is stored under
# OutputKeys.OUTPUT_VIDEO.
output_video_path = pipe(test_text, output_video=video_path)[OutputKeys.OUTPUT_VIDEO]
print('output_video_path:', output_video_path)



## Generate speech from a voice sample and text

In [2]:
## Alice
!yt-dlp --extract-audio --audio-format wav https://www.youtube.com/watch?v=Srn0xkXTSgs --output TTS/audio_samps/alice.wav


[youtube] Extracting URL: https://www.youtube.com/watch?v=Srn0xkXTSgs
[youtube] Srn0xkXTSgs: Downloading webpage
[youtube] Srn0xkXTSgs: Downloading android player API JSON
[info] Srn0xkXTSgs: Downloading 1 format(s): 251
[download] TTS/audio_samps/alice.wav has already been downloaded
[ExtractAudio] Destination: TTS/audio_samps/alice.wav
Deleting original file TTS/audio_samps/alice.orig.wav (pass -k to keep)


In [None]:
from TTS.api import TTS


# Load the multilingual YourTTS model (GPU enabled, progress bar disabled).
tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=True)

# Use the same cwd-relative path that the yt-dlp cell writes the sample to,
# rather than a hard-coded absolute /notebooks/... location, so both cells
# agree on where the file lives whatever the working directory is.
speaker_sample = "TTS/audio_samps/alice.wav"

# Synthesize the sentence in English, conditioned on the speaker sample, and
# write the result to outs/speech.wav.
tts.tts_to_file(
    'Oh what a lovely day to be outside!',
    speaker_wav=speaker_sample,
    language="en",
    file_path="outs/speech.wav",
)


## Generate background music from text

In [None]:
import torch

from PIL import Image
import numpy as np
from spectro import wav_bytes_from_spectrogram_image

from diffusers import StableDiffusionPipeline
from diffusers import StableDiffusionImg2ImgPipeline
import gradio as gr
device = "cuda"
MODEL_ID = "riffusion/riffusion-model-v1"

# Text-to-image pipeline used by classic(): loaded in half precision and moved
# to the GPU.
pipe = StableDiffusionPipeline.from_pretrained(MODEL_ID, torch_dtype=torch.float16)
pipe = pipe.to(device)

# Image-to-image pipeline used by style_transfer(). It is assembled from the
# components of `pipe` instead of a second from_pretrained() call, so the same
# checkpoint is not loaded into GPU memory twice. The shared components are
# already on `device`, so no extra .to() is needed.
pipe2 = StableDiffusionImg2ImgPipeline(**pipe.components)

# Remote Space that style_transfer() calls with an audio input to obtain a
# spectrogram image.
spectro_from_wav = gr.Interface.load("spaces/fffiloni/audio-to-spectrogram")

def predict(prompt, negative_prompt, audio_input, duration):
    """Generate music from a text prompt, optionally guided by an audio clip.

    Args:
        prompt: Text prompt forwarded to the selected generator.
        negative_prompt: Negative prompt forwarded to the selected generator.
        audio_input: Reference audio for style transfer, or None to generate
            from the prompt alone.
        duration: Forwarded to `classic`; unused when `audio_input` is given.

    Returns:
        Whatever `classic` (no audio) or `style_transfer` (audio given) returns.
    """
    # Identity check on purpose: `== None` is evaluated element-wise for
    # array-like inputs and then fails when used as an `if` condition.
    if audio_input is None:
        return classic(prompt, negative_prompt, duration)
    return style_transfer(prompt, negative_prompt, audio_input)

def classic(prompt, negative_prompt, duration):
    """Render a spectrogram image from the prompts and save it as audio.

    The image is generated with the module-level `pipe` at a height of 512.
    Its width is 512 when `duration` equals 5, otherwise 512 plus 128 for each
    whole unit by which `int(duration)` differs from 5. The image is converted
    with `wav_bytes_from_spectrogram_image` and the first element of the result
    is written to outs/music.wav.

    Returns:
        A 5-tuple: the spectrogram image, the path 'outs/music.wav', and three
        `gr.update(visible=True)` values.
    """
    wav_path = "outs/music.wav"

    width_duration = 512 if duration == 5 else 512 + ((int(duration) - 5) * 128)

    generated = pipe(
        prompt,
        negative_prompt=negative_prompt,
        height=512,
        width=width_duration,
    )
    spec = generated.images[0]
    print(spec)

    audio = wav_bytes_from_spectrogram_image(spec)
    with open(wav_path, "wb") as out_file:
        out_file.write(audio[0].getbuffer())

    return (spec, wav_path, *(gr.update(visible=True) for _ in range(3)))

def style_transfer(prompt, negative_prompt, audio_input):
    """Re-render a reference audio clip in the style described by the prompts.

    The clip is sent to `spectro_from_wav`, the returned image is opened and
    passed through `image_from_spectrogram` (with max_volume=1), and the result
    is used as the init image for the img2img pipeline `pipe2`. The first
    generated image is converted with `wav_bytes_from_spectrogram_image` and
    written to outs/music.wav.

    Args:
        prompt: Text prompt for the img2img pipeline.
        negative_prompt: Negative prompt for the img2img pipeline.
        audio_input: Audio input handed to `spectro_from_wav`.

    Returns:
        A 5-tuple: the generated spectrogram image, the path 'outs/music.wav',
        and three `gr.update(visible=True)` values.
    """
    spec = spectro_from_wav(audio_input)
    print(spec)

    # Read the spectrogram image into an array (image_from_spectrogram is
    # declared to take an ndarray) and release the file handle right away.
    with Image.open(spec) as spec_file:
        spec_pixels = np.asarray(spec_file)

    # Turn the pixel data into the RGB init image for the diffusion pipeline.
    im = image_from_spectrogram(spec_pixels, 1)

    # negative_prompt is forwarded so it is honoured here the same way it is
    # in classic(); it used to be accepted but silently ignored.
    new_spectro = pipe2(
        prompt=prompt,
        negative_prompt=negative_prompt,
        image=im,
        strength=0.5,
        guidance_scale=7,
    ).images

    wav = wav_bytes_from_spectrogram_image(new_spectro[0])
    with open("outs/music.wav", "wb") as f:
        f.write(wav[0].getbuffer())
    return new_spectro[0], 'outs/music.wav', gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)

def image_from_spectrogram(
    spectrogram: np.ndarray, max_volume: float = 50, power_for_image: float = 0.25
) -> Image.Image:
    """
    Turn an array of spectrogram magnitudes into an RGB PIL image.

    The values are raised to `power_for_image`, scaled by 255 / `max_volume`,
    inverted (255 - value) and cast to uint8 without clipping. The resulting
    image is flipped top-to-bottom and converted to RGB.
    """
    # Power curve followed by rescaling; same evaluation order as (x * 255) / max_volume.
    scaled = np.power(spectrogram, power_for_image) * 255 / max_volume

    # Invert so that larger magnitudes map to smaller pixel values.
    inverted = 255 - scaled
    pixels = inverted.astype(np.uint8)

    # Build the image, flip the Y axis, and expand to three channels.
    return (
        Image.fromarray(pixels)
        .transpose(Image.FLIP_TOP_BOTTOM)
        .convert("RGB")
    )


# Inputs for one generation run.
prompt_input = 'a disney theme song'
negative_prompt = ''
audio_input = None  # no reference clip, so predict() takes the classic() path
duration_input = 5

# predict() returns five values: the spectrogram image, the path of the audio
# file it wrote, and three gr.update(...) objects.
(
    spectrogram_output,
    sound_output,
    share_button,
    community_icon,
    loading_icon,
) = predict(
    prompt=prompt_input,
    negative_prompt=negative_prompt,
    audio_input=audio_input,
    duration=duration_input,
)



## Composite audio and video files

In [None]:
from moviepy.editor import *
# Open the video and both audio tracks as context managers so their underlying
# readers are closed once the final file has been written, even if rendering
# raises. Previously the clips were left open for the lifetime of the kernel.
with VideoFileClip('outs/video.mp4') as video_clip, \
        AudioFileClip('outs/music.wav') as music_clip, \
        AudioFileClip('outs/speech.wav') as speech_clip:
    # Mix the background music and the speech into a single audio track.
    new_audioclip = CompositeAudioClip([music_clip, speech_clip])

    # Attach the mixed track to the video and render the result.
    # NOTE(review): the mixed audio is not trimmed to the video's duration;
    # confirm whether audio running past the last frame is intended.
    video_clip.audio = new_audioclip
    video_clip.write_videofile("outs/final.mp4")

In [None]:
from IPython.display import Video

# Last expression of the cell: display the file written by the compositing cell.
Video("outs/final.mp4")

## Gradio

In [None]:
# Run app.py in a shell subprocess.
# NOTE(review): presumably this starts the Gradio UI named in the section
# header and blocks the cell until it is stopped — confirm against app.py.
!python app.py