# 🎭 SadTalker: Single Image + Voice from Text (Colab)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/OpenTalker/SadTalker/blob/main/colab_single_image_voice_text.ipynb) *(Upload this notebook to Colab if the link points elsewhere.)*

**One image, one voice, your text.** The avatar reads your text with lip sync. No voice stripping, no extra variation cost.

1. Upload **one face image** (avatar)
2. Enter **text** for the avatar to speak
3. Pick a **voice** (text-to-speech, free)
4. Generate → talking head video with lip sync

## Step 1: Enable GPU

**Runtime → Change runtime type → Hardware accelerator → GPU**

In [None]:
# Print the GPU name plus total and free memory as headerless CSV.
!nvidia-smi --query-gpu=name,memory.total,memory.free --format=csv,noheader

## Step 2: Clone SadTalker & install

In [None]:
import os
# Clone the repository only when no "SadTalker" entry exists in the current
# working directory.
# NOTE(review): the check is relative to the current directory and the cell
# then changes into SadTalker, so re-running the cell from inside the checkout
# would clone a nested copy - confirm whether re-runs need to be supported.
if not os.path.exists('SadTalker'):
    !git clone https://github.com/OpenTalker/SadTalker.git
# Switch into the checkout so the following commands use relative paths.
%cd SadTalker
# Install the packages listed in the repository's requirements file.
!pip install -q -r requirements.txt
# edge-tts is imported by the Step 4 cell for text-to-speech.
!pip install -q edge-tts

## Step 3: Download checkpoints

In [None]:
# Run the repository's model download script (path is relative to the
# current working directory).
!bash scripts/download_models.sh

## Step 4: Single image + text → talking video

Upload **one** face image, type the text, choose a voice, then run generation.

In [None]:
import asyncio
import edge_tts
import os
import sys
import subprocess
from pathlib import Path
from datetime import datetime
from pydub import AudioSegment
import ipywidgets as widgets
from IPython.display import display, HTML, clear_output
from base64 import b64encode

# Location of the SadTalker checkout and the folder that collects the
# generated audio, uploaded images and output videos.
BASE_DIR = "/content/SadTalker"
RESULT_DIR = os.path.join(BASE_DIR, "results")

# Work from the checkout root and put it first on the import path.
os.chdir(BASE_DIR)
sys.path.insert(0, BASE_DIR)
os.makedirs(RESULT_DIR, exist_ok=True)

# Selectable TTS voices as (dropdown label, edge-tts voice id) pairs.
# The first entry is the default.
VOICES = [
    ("en-US-JennyNeural (Female)", "en-US-JennyNeural"),
    ("en-US-GuyNeural (Male)", "en-US-GuyNeural"),
    ("en-GB-SoniaNeural (Female, UK)", "en-GB-SoniaNeural"),
    ("en-GB-RyanNeural (Male, UK)", "en-GB-RyanNeural"),
]


async def text_to_speech_async(text: str, voice: str, out_path: str):
    """Synthesize *text* with edge-tts and store the result as a WAV file.

    edge-tts writes an intermediate MP3 next to the target, which is then
    converted to WAV with pydub and removed again.

    Args:
        text: The text to speak.
        voice: edge-tts voice identifier (for example "en-US-JennyNeural").
        out_path: Destination path of the WAV file.

    Returns:
        ``out_path``, unchanged.
    """
    # Derive the temporary MP3 name from the file extension only. A plain
    # str.replace(".wav", ".mp3") would also rewrite ".wav" inside directory
    # names and would leave the path untouched for e.g. "x.WAV" or "x".
    stem, _ext = os.path.splitext(out_path)
    mp3_path = stem + ".mp3"
    if mp3_path == out_path:
        # Never let the temporary file alias the final output, otherwise the
        # cleanup below would delete the result.
        mp3_path = stem + ".tmp.mp3"

    try:
        communicate = edge_tts.Communicate(text, voice)
        await communicate.save(mp3_path)
        # Convert to WAV for SadTalker
        audio = AudioSegment.from_mp3(mp3_path)
        exported = audio.export(out_path, format="wav")
        # export() hands back the file object it wrote to; close it so the
        # handle is not left open.
        if hasattr(exported, "close"):
            exported.close()
    finally:
        # Remove the intermediate MP3 even when synthesis or conversion fails.
        if os.path.exists(mp3_path):
            os.remove(mp3_path)
    return out_path


def text_to_speech(text: str, voice: str, out_path: str):
    """Blocking wrapper around ``text_to_speech_async``.

    Args:
        text: The text to speak.
        voice: edge-tts voice identifier.
        out_path: Destination path of the WAV file.

    Returns:
        Whatever ``text_to_speech_async`` returns (the output path).
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No event loop is running in this thread, so asyncio.run is safe.
        return asyncio.run(text_to_speech_async(text, voice, out_path))

    # asyncio.run() raises RuntimeError when this thread already has a running
    # loop (likely the case for widget callbacks in a notebook kernel - TODO
    # confirm for the target runtime). Run the coroutine on a worker thread
    # with its own loop instead and wait for the result.
    from concurrent.futures import ThreadPoolExecutor

    def _run_in_fresh_loop():
        return asyncio.run(text_to_speech_async(text, voice, out_path))

    with ThreadPoolExecutor(max_workers=1) as pool:
        return pool.submit(_run_in_fresh_loop).result()


def run_sadtalker(image_path: str, audio_path: str, result_dir: str) -> str:
    """Run SadTalker's ``inference.py`` and return the video it produced.

    The script is started with the current Python interpreter from
    ``BASE_DIR`` using the ``--still``, ``--preprocess full`` and
    ``--enhancer gfpgan`` options.

    Args:
        image_path: Path of the source face image.
        audio_path: Path of the driving audio file.
        result_dir: Directory handed to the script as ``--result_dir`` and
            searched (recursively) for the output MP4.

    Returns:
        Path of the most recently modified MP4 that was created or changed
        by this run.

    Raises:
        subprocess.CalledProcessError: If the script exits with a non-zero
            status.
        FileNotFoundError: If this run left no new or modified MP4 behind.
    """
    out_root = Path(result_dir)

    def _snapshot():
        # Map every MP4 below result_dir to its modification time.
        if not out_root.is_dir():
            return {}
        return {p: p.stat().st_mtime_ns for p in out_root.rglob("*.mp4")}

    # Remember what was already there so that a video from an earlier run is
    # never mistaken for the output of this one.
    before = _snapshot()

    cmd = [
        sys.executable, "inference.py",
        "--driven_audio", audio_path,
        "--source_image", image_path,
        "--result_dir", result_dir,
        "--still", "--preprocess", "full", "--enhancer", "gfpgan"
    ]
    subprocess.run(cmd, check=True, cwd=BASE_DIR)

    after = _snapshot()
    fresh = [p for p, mtime in after.items() if before.get(p) != mtime]
    if not fresh:
        raise FileNotFoundError("No output video found.")
    # Several files may have been written; the newest one wins.
    return str(max(fresh, key=after.get))


def generate(image_path: str, text: str, voice_name: str):
    """Turn a face image plus text into a talking-head video.

    Validates the inputs, synthesizes the speech into a timestamped WAV in
    ``RESULT_DIR`` and feeds image and audio to SadTalker.

    Args:
        image_path: Path of the face image; must exist.
        text: Text to be spoken; must contain non-whitespace characters.
        voice_name: Dropdown label from ``VOICES``; unknown labels fall back
            to the first voice.

    Returns:
        Path of the generated video, or ``None`` when validation fails (a
        hint is printed in that case).
    """
    script = text.strip() if text else ""
    if not script:
        print("Please enter some text.")
        return None

    if not (image_path and os.path.exists(image_path)):
        print("Please upload a face image.")
        return None

    # Translate the dropdown label into the edge-tts voice id.
    label_to_id = dict(VOICES)
    if voice_name in label_to_id:
        voice_id = label_to_id[voice_name]
    else:
        voice_id = VOICES[0][1]

    stamp = datetime.now().strftime("%Y_%m_%d_%H.%M.%S")
    audio_path = os.path.join(RESULT_DIR, f"tts_{stamp}.wav")

    print("Generating speech from text...")
    text_to_speech(script, voice_id, audio_path)

    print("Running SadTalker (lip sync)...")
    video_path = run_sadtalker(image_path, audio_path, RESULT_DIR)

    print("Done:", video_path)
    return video_path


# Form controls: image picker, text box, voice selector, trigger button and
# the area that receives log lines and the finished video.
upload = widgets.FileUpload(
    accept=".png,.jpg,.jpeg",
    multiple=False,
    description="Face image",
)
text_in = widgets.Textarea(
    placeholder="Enter the text for the avatar to read...",
    rows=4,
    layout=widgets.Layout(width="100%"),
)
voice_drop = widgets.Dropdown(
    options=[label for label, _voice_id in VOICES],
    value=VOICES[0][0],
    description="Voice:",
)
go_btn = widgets.Button(description="Generate video", button_style="primary")
out_area = widgets.Output()

def _get_uploaded_file(upload_widget):
    """Get (filename, bytes) from Colab/ipywidgets FileUpload."""
    val = upload_widget.value
    if not val:
        return None, None
    if isinstance(val, list) and len(val) > 0:
        # Colab sometimes: [{"name": "x.png", "content": b"..."}]
        item = val[0]
        name = item.get("name", "image.png")
        data = item.get("content", b"")
        if hasattr(data, "tobytes"):
            data = data.tobytes()
        return name, data
    if isinstance(val, dict):
        name = list(val.keys())[0]
        data = val[name].get("content", val[name]) if isinstance(val[name], dict) else val[name]
        if hasattr(data, "tobytes"):
            data = data.tobytes()
        return name, data
    return None, None


def on_click(btn):
    """Button handler: save the upload, generate the video and show it.

    All output (messages and the embedded video player) is routed into
    ``out_area``, which is cleared at the start of every click.

    Args:
        btn: The button instance passed by ipywidgets; not used.
    """
    with out_area:
        clear_output(wait=True)
        name, data = _get_uploaded_file(upload)
        if not name or not data:
            print("Upload a face image first.")
            return

        # Keep only the final path component so a crafted file name cannot
        # make the image land outside RESULT_DIR.
        safe_name = os.path.basename(name) or "image.png"
        image_path = os.path.join(RESULT_DIR, safe_name)
        with open(image_path, "wb") as f:
            f.write(data)

        video_path = generate(image_path, text_in.value, voice_drop.value)
        if not video_path:
            return

        # Read through a context manager so the file handle is closed.
        with open(video_path, "rb") as video_file:
            mp4 = video_file.read()
        # Embed the video inline as a base64 data URL.
        data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
        display(HTML(f"""
            <p><b>Output:</b> {os.path.basename(video_path)}</p>
            <video width=400 controls><source src="{data_url}" type="video/mp4"></video>
            """))

# Wire the button to its handler.
go_btn.on_click(on_click)

# Rows of the form, top to bottom: a heading followed by its control.
_form_rows = [
    widgets.HTML("<b>1. Upload one face image (PNG/JPG)</b>"),
    upload,
    widgets.HTML("<b>2. Text for the avatar to read</b>"),
    text_in,
    widgets.HTML("<b>3. Voice (TTS)</b>"),
    voice_drop,
    go_btn,
    widgets.HTML("<b>4. Output</b>"),
    out_area,
]
display(widgets.VBox(_form_rows))