<a href="https://colab.research.google.com/github/georgedouzas/avatar-poc/blob/main/environment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup & Installation

In [3]:
# Setup environment
!sudo apt-get update
!apt install software-properties-common
!sudo apt-get install python3.8 python3.8-distutils
!curl -sS https://bootstrap.pypa.io/pip/3.8/get-pip.py -o get-pip.py
!sudo python3.8 get-pip.py
!python3.8 -m pip install -U setuptools wheel
!sudo apt-get install python3.9 python3.9-distutils
!curl -sS https://bootstrap.pypa.io/get-pip.py -o get-pip.py
!sudo python3.9 get-pip.py
!python3.9 -m pip install -U setuptools wheel

# Install SadTalker
!git clone https://github.com/cedro3/SadTalker.git &> /dev/null
%cd SadTalker
!export PYTHONPATH=/content/SadTalker:$PYTHONPATH
!python3.8 -m pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113
!apt update
!apt install ffmpeg &> /dev/null
!python3.8 -m pip install -r requirements.txt
!rm -rf checkpoints
!bash scripts/download_models.sh

# Install piper
!python3.9 -m pip install piper-tts
!python3.9 -m piper.download_voices en_US-lessac-medium
!python3.9 -m piper.download_voices el_GR-rapunzelina-low

0% [Working]            Get:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
0% [Connecting to archive.ubuntu.com] [Waiting for headers] [Waiting for header0% [Connecting to archive.ubuntu.com] [Waiting for headers] [Waiting for header                                                                               Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,806 kB]
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:6 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:8 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,759 kB]
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:10 http://se

# Gradio App

In [10]:
languages_mapping = {
    'English': 'en_US-lessac-medium',
    'Greek': 'el_GR-rapunzelina-low'
}
language = languages_mapping['English']
text = 'My name is Quento. I am a great company'
driven_audio = 'examples/driven_audio/output.wav'
source_image = 'examples/source_image/full3.png'

!python3.9 -m piper -m $language -f $driven_audio -- $text
!python3.8 inference.py --driven_audio $driven_audio --source_image $source_image --result_dir ./results

using safetensor as default
3DMM Extraction for source image
landmark Det:: 100% 1/1 [00:00<00:00, 16.41it/s]
3DMM Extraction In Video:: 100% 1/1 [00:00<00:00, 54.99it/s]
mel:: 100% 64/64 [00:00<00:00, 41183.72it/s]
audio2exp:: 100% 7/7 [00:00<00:00, 126.29it/s]
Face Renderer:: 100% 32/32 [00:17<00:00,  1.86it/s]
The generated video is named ./results/2025_07_15_07.21.22/full3##output.mp4
The generated video is named: ./results/2025_07_15_07.21.22.mp4


In [None]:
import gradio as gr
import subprocess
import os
import datetime
import shutil

# UI language name -> piper voice model passed to `piper -m`.
# Insertion order is the order the choices appear in the language dropdown.
languages_mapping = dict(
    English='en_US-lessac-medium',
    Greek='el_GR-rapunzelina-low',
)

def _run_step(command, step_name):
    """Run one pipeline command and report a failure through the Gradio UI.

    Args:
        command: Argument list handed to ``subprocess.run`` (no shell involved).
        step_name: Human-readable name of the step, used in the error message.

    Raises:
        gr.Error: If the command exits with a non-zero status.
    """
    completed = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    if completed.returncode != 0:
        # Only the tail of stderr is shown; it normally holds the actual error.
        details = completed.stderr.decode('utf-8', errors='replace').strip()[-1000:]
        raise gr.Error(f"{step_name} failed (exit code {completed.returncode}): {details}")


def generate_video(language, text, image):
    """Synthesize speech for ``text`` and animate ``image`` with it.

    The function works with paths relative to the current working directory,
    so it assumes the notebook has already changed into the SadTalker checkout
    (it runs ``inference.py`` from there).

    Args:
        language: Key of ``languages_mapping``; unknown values fall back to the
            English voice.
        text: Text to be spoken.
        image: File path of the uploaded source image.

    Returns:
        Path of the video file produced by this call.

    Raises:
        gr.Error: If an input is missing, a pipeline step fails, or no new
            video appears in the results directory.
    """
    # Fail early with a readable message instead of a TypeError from shutil.copy
    # or a pointless model run on empty text.
    if not image:
        raise gr.Error("Please upload an image.")
    if not text or not text.strip():
        raise gr.Error("Please enter some text.")

    # Set paths
    language_model = languages_mapping.get(language, 'en_US-lessac-medium')
    driven_audio_path = 'examples/driven_audio/output.wav'
    source_image_path = 'examples/source_image/input_image.png'
    results_dir = './results'

    # Save uploaded image to source path
    for target in (driven_audio_path, source_image_path):
        os.makedirs(os.path.dirname(target), exist_ok=True)
    shutil.copy(image, source_image_path)

    # Remember what was already in the results directory so that a video left
    # over from an earlier request is never returned as the result of this one.
    previous_entries = set(os.listdir(results_dir)) if os.path.isdir(results_dir) else set()

    # Run TTS to generate audio
    _run_step([
        'python3.9', '-m', 'piper',
        '-m', language_model,
        '-f', driven_audio_path,
        '--', text
    ], 'Text-to-speech')

    # Run inference
    _run_step([
        'python3.8', 'inference.py',
        '--driven_audio', driven_audio_path,
        '--source_image', source_image_path,
        '--result_dir', results_dir
    ], 'Video generation')

    if not os.path.isdir(results_dir):
        raise gr.Error("Error: No output video generated.")

    # The nested file is named "<image stem>##<audio stem>.mp4".
    image_stem = os.path.splitext(os.path.basename(source_image_path))[0]
    audio_stem = os.path.splitext(os.path.basename(driven_audio_path))[0]
    nested_name = f'{image_stem}##{audio_stem}.mp4'

    # Entries are timestamp-named, so reverse lexicographic order is newest first.
    # Accept either layout: "<results>/<stamp>.mp4" or "<results>/<stamp>/<nested_name>".
    new_entries = sorted(set(os.listdir(results_dir)) - previous_entries, reverse=True)
    for entry in new_entries:
        entry_path = os.path.join(results_dir, entry)
        if entry.endswith('.mp4') and os.path.isfile(entry_path):
            return entry_path
        nested_path = os.path.join(entry_path, nested_name)
        if os.path.exists(nested_path):
            return nested_path

    # Raising (rather than returning the message) keeps the text from being
    # treated as a video file path by the gr.Video output component.
    raise gr.Error("Error: No output video generated.")

# Gradio UI
# Layout, top to bottom: heading, one row holding the three inputs,
# the trigger button, and the player for the resulting video.
with gr.Blocks() as demo:
    gr.Markdown(value="# Text-to-Animated Face Video")

    with gr.Row():
        language_input = gr.Dropdown(
            label="Select Language",
            choices=list(languages_mapping),
            value="English",
        )
        text_input = gr.Textbox(label="Enter Text", lines=4)
        # "filepath" hands generate_video a path it can copy with shutil.
        image_input = gr.Image(type="filepath", label="Upload Image")

    generate_button = gr.Button(value="Generate Video")
    video_output = gr.Video(label="Generated Video")

    # Wire the button: the three inputs go to generate_video, whose return
    # value is shown in the video component.
    generate_button.click(
        generate_video,
        [language_input, text_input, image_input],
        video_output,
    )

demo.launch()