In [1]:
from collections import OrderedDict

import cv2
import librosa
import numpy as np
import torch
from models.Wav2Lip.model import wav2lip
from moviepy.editor import AudioFileClip, VideoFileClip

# Preparing samples for testing

In [None]:
input_video = "../samples/hd.mp4"
output_video = "../samples/clip.mp4"

# Define start and end time (in seconds)
start_time = 480
end_time = 780

clip = VideoFileClip(input_video)
trimmed_clip = clip.subclip(start_time, end_time)
trimmed_clip.write_videofile(output_video, fps=clip.fps)

In [None]:
clip = VideoFileClip("../samples/audio_sample.mp4")
clip = clip.subclip(600, 900)
audioclip = clip.audio
audioclip.write_audiofile("../samples/audio_clip.mp3")

# Loading model

In [2]:
model = wav2lip.Wav2Lip()
checkpoint = torch.load("models/Wav2Lip/weights/wav2lip.pth", map_location="cpu")
new_state_dict = OrderedDict()
for k, v in checkpoint["state_dict"].items():
    new_key = k.replace("module.", "")
    new_state_dict[new_key] = v
model.load_state_dict(new_state_dict)
model.eval()
None

# Preparing samples

In [None]:
input_video = "../samples/clip.mp4"
output_video = "../samples/subclip.mp4"

input_audio = "../samples/audio_clip.mp3"
output_audio = "../samples/audio_subclip.mp3"

start_time = 0
end_time = 30

clip = VideoFileClip(input_video)
trimmed_clip = clip.subclip(start_time, end_time)
trimmed_clip.write_videofile(output_video, fps=clip.fps)

clip = AudioFileClip(input_audio)
clip = clip.subclip(start_time, end_time)
clip.write_audiofile(output_audio)

# Testing

In [3]:
def extract_frames(video_path):
    cap = cv2.VideoCapture(video_path)
    frames = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frames.append(frame)
    cap.release()
    return np.array(frames)


video_frames = extract_frames("../samples/subclip.mp4")
print(f"Extracted {len(video_frames)} frames from video.")

Extracted 750 frames from video.


In [4]:
def detect_face(
    image: np.array,
    model_path="models/YUnet/face_detection_yunet_2023mar_int8bq.onnx",
) -> np.array:
    detector = cv2.FaceDetectorYN.create(
        model=model_path,
        config="",
        input_size=(image.shape[0], image.shape[1]),
        score_threshold=0.9,
        nms_threshold=0.3,
        top_k=5000,
    )

    height, width = image.shape[:2]

    detector.setInputSize((width, height))
    faces = detector.detect(image)

    x, y, w, h = map(int, faces[1][0][:4])
    cropped_image = image[y : y + h, x : x + w, :]
    # converting to shape [n_channels, height, width]
    # height and width are predifined to be 96
    cropped_image = cv2.resize(
        cropped_image, dsize=(96, 96), interpolation=cv2.INTER_CUBIC
    )
    cropped_image = np.transpose(cropped_image, (2, 0, 1))
    # cropped_image = cropped_image[np.newaxis, :, :, :]
    return cropped_image

In [None]:
def audio_to_mel(audio_path, num_frames, fps):
    secs_per_frame = 1 / fps
    hop_len = int(secs_per_frame * 1000)

    audio, sr = librosa.load(audio_path, sr=None)
    audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
    mel_spec = librosa.feature.melspectrogram(
        y=audio, sr=16000, n_mels=80, hop_length=hop_len
    )
    mel_spec = librosa.power_to_db(mel_spec)

    # converting to shape [batch_size, n_frames, n_channels, n_mels, chunk_size]
    # chunk_size is predifined to be 16, n_mels is 80, n_channels is 1
    n_mels, n_frames = mel_spec.shape
    n_chunks = n_frames // 16
    mel_spec = mel_spec[:, : n_chunks * 16]
    mel_spec = mel_spec.reshape(n_mels, n_chunks, 16)
    mel_spec = np.transpose(mel_spec, (1, 0, 2))

    mel_spec = mel_spec[:, np.newaxis, :, :]
    return mel_spec


mel_spec = audio_to_mel("../samples/audio_subclip.mp3", num_frames=750, fps=25)
mel_spec.shape

(750, 1, 80, 16)

In [7]:
faces = []
for frame in video_frames:
    face = detect_face(frame)
    faces.append(face)
faces = np.array(faces)
faces.shape

(750, 3, 96, 96)

In [11]:
regular_face = cv2.imread("../samples/regular_face.png")
regular_face = cv2.resize(regular_face, dsize=(96, 96), interpolation=cv2.INTER_CUBIC)
regular_face_frames = np.tile(regular_face, (750, 1, 1, 1))
regular_face_frames = np.transpose(regular_face_frames, (0, 3, 1, 2))
regular_face_frames.shape

(750, 3, 96, 96)

In [16]:
frames = np.concatenate((faces, regular_face_frames), axis=1)
frames.shape

(750, 6, 96, 96)

In [39]:
def run_wav2lip(model, frames, mel_spec):
    frames = torch.tensor(frames).float()
    mel = torch.tensor(mel_spec).float()
    outputs = []
    with torch.no_grad():
        output = model(mel, frames)  # Now both tensors have batch size 750
        outputs.append(output.cpu().numpy())
    return np.array(outputs)


output_frames = run_wav2lip(model, frames, mel_spec)

In [48]:
output_frames[0][0][0]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]], dtype=float32)

In [30]:
output_frames = np.transpose(output_frames[0], (0, 2, 3, 1))
output_frames.shape

(750, 96, 96, 3)

In [36]:
output_frames = output_frames.astype(np.uint8)

In [38]:
def save_video(frames, output_path, fps=25):
    height, width, _ = frames[0].shape
    print(height, width)
    out = cv2.VideoWriter(
        output_path, cv2.VideoWriter_fourcc(*"mp4v"), fps, (width, height)
    )

    for frame in frames:
        out.write(cv2.cvtColor(frame, cv2.COLOR_RGB2BGR))

    out.release()
    cv2.destroyAllWindows()


save_video(output_frames, "output_lipsynced.mp4")


96 96
