In [2]:
# Select the GPU before any CUDA library is loaded: enumerate devices in PCI
# bus order and expose only device index 1 to this kernel.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=1

env: CUDA_DEVICE_ORDER=PCI_BUS_ID
env: CUDA_VISIBLE_DEVICES=1


In [1]:
# --- Identifiers ------------------------------------------------------------
seqid = "seq0001"  # driving sequence: selects the melspec / real frames / real landmarks
refid = "ref0002"  # names the reference folder under Demo/output/reference
root = "/root/TalkingHead"

# Shared prefixes (helper names only; every path below resolves to the same
# string as before).
demo_data_dir = f"{root}/Demo/data"
demo_output_dir = f"{root}/Demo/output"

# --- Input data for the chosen sequence -------------------------------------
audio_data_path = f"{demo_data_dir}/melspec/{seqid}.npy"
real_landmark_data_path = f"{demo_data_dir}/real_landmarks/{seqid}"
real_image_data_path = f"{demo_data_dir}/real_images/{seqid}"

# --- Stage 1: audio -> landmarks --------------------------------------------
audio2landmark_model_path = f"{root}/audio2landmark/lstm_melspec"
# Relative path: resolved against the current working directory when the
# inference script is run.
audio2landmark_model_pretrained = "saved_model/backups/audio2feature_700000.bak"
audio2landmark_output_path = f"{demo_output_dir}/fake_landmarks/{seqid}.npy"

# --- Stage 2: landmarks + reference image -> video --------------------------
landmark2image_driving_path = f"{demo_output_dir}/driving/{seqid}"
landmark2image_ref_path = f"{demo_output_dir}/reference/{refid}"
landmark2image_real_image_input_path = f"{landmark2image_driving_path}/images"
landmark2image_fake_landmark_input_path = f"{landmark2image_driving_path}/landmarks-dlib68"
landmark2image_ref_image_input_path = f"{landmark2image_ref_path}/images"
landmark2image_ref_landmark_input_path = f"{landmark2image_ref_path}/landmarks-dlib68"

landmark2image_model_path = "/root/imaginaire"
# Relative path: resolved against the current working directory when the
# inference script is run.
landmark2image_model_pretrained = "logs/2022_0821_1502_30_ampO1/epoch_00119_iteration_000028000_checkpoint.pt"
landmark2image_model_config = f"{demo_output_dir}/ampO1.yaml"
landmark2image_output_path = f"{demo_output_dir}/fake_images/{seqid}-{refid}"


In [3]:
import yaml

# Rewrite the test-data roots in the landmark2image config so that inference
# reads the reference and driving folders chosen in the configuration cell.
fname = landmark2image_model_config

# Read inside a context manager so the handle is closed before the same file
# is reopened for writing below.
with open(fname, 'r') as stream:
    data = yaml.safe_load(stream)

# roots[0] is set to the reference folder and roots[1] to the driving folder.
data['test_data']['test']['roots'][0] = landmark2image_ref_path
data['test_data']['test']['roots'][1] = landmark2image_driving_path

with open(fname, 'w') as yaml_file:
    yaml_file.write(yaml.dump(data, default_flow_style=False))

# Audio to 68-point landmarks

In [4]:
# Work from the audio2landmark model directory: the next cell runs
# `inference.py` by relative name and passes a relative checkpoint path.
%cd {audio2landmark_model_path}

/root/TalkingHead/audio2landmark/lstm_melspec


In [5]:
# Run the audio -> landmark inference script on the mel-spectrogram file and
# save the predicted landmarks to `audio2landmark_output_path`.
# `--model_path` is a relative path, so this cell depends on the preceding %cd.
%run inference.py --test_data_path {audio_data_path} \
--model_path {audio2landmark_model_pretrained} \
--save_path {audio2landmark_output_path}


Trainable Parameters: 2.703M
Load pretrained model audio2feature_700000.bak | Step: 700000
torch.Size([1, 112, 257])
torch.Size([1, 112, 136])
Saved: /root/TalkingHead/Demo/output/fake_landmarks/seq0001.npy


# Preprocessing: real images and predicted (fake) landmarks

In [6]:
# `os` is only imported by a later cell; import it here so this cell does not
# depend on names leaked into the namespace by an earlier `%run`.
import os

# Create the folder layout read by the landmark2image stage: an images folder
# and a landmarks-dlib68 folder for both the driving sequence and the reference.
for input_dir in (
    landmark2image_real_image_input_path,
    landmark2image_fake_landmark_input_path,
    landmark2image_ref_image_input_path,
    landmark2image_ref_landmark_input_path,
):
    # exist_ok=True keeps the cell safe to re-run.
    os.makedirs(input_dir, exist_ok=True)


In [7]:
# Copy the real frames of the sequence into the driving `images` folder.
%cp -r {real_image_data_path}/* {landmark2image_real_image_input_path}


In [8]:
import cv2
import numpy as np
import json
import os
import shutil

# Convert the predicted landmarks from the 256x256 prediction space to the
# pixel space of each real frame, write one JSON file per frame, and use the
# first frame (image + landmarks) as the reference input.

# The prediction file gets its own name so the project-level `root` defined in
# the configuration cell is not overwritten by this cell.
fake_landmark_file = audio2landmark_output_path
real_img = real_image_data_path
real_lm = landmark2image_fake_landmark_input_path
ref_img = landmark2image_ref_image_input_path
ref_lm = landmark2image_ref_landmark_input_path


lm_pred = np.load(fake_landmark_file)
# Axis 1 is used as the frame count; each frame becomes 68 (x, y) points.
# NOTE(review): this drops axis 0, so it assumes a batch of exactly one sequence.
lm_pred = lm_pred.reshape(lm_pred.shape[1], 68, 2)
dsize = (256, 256)  # (width, height) of the space the predictions are expressed in
for index, frame in enumerate(lm_pred):
    # Frames are numbered from 1 with six-digit zero padding.
    frame_name = f"{(index+1):06d}"
    img_path = os.path.join(real_img, f"{frame_name}.png")
    image = cv2.imread(img_path)
    if image is None:
        # cv2.imread signals a missing/unreadable file by returning None.
        raise FileNotFoundError(f"Could not read real frame: {img_path}")
    # Only the frame size is needed, so no colour conversion is performed.
    height, width = image.shape[:2]

    # Scale each point to the real frame size and truncate to integer pixels.
    lm = [
        [int(p[0] / dsize[0] * width), int(p[1] / dsize[1] * height)]
        for p in frame
    ]
    json_path = os.path.join(real_lm, f"{frame_name}.json")
    with open(json_path, "w") as outfile:
        json.dump(lm, outfile)

    if index == 0:
        # The first frame doubles as the reference image/landmark pair.
        shutil.copy(img_path, ref_img)
        shutil.copy(json_path, ref_lm)
        
        

# Fake Landmark + Ref Image => Video

In [9]:
# Work from the landmark2image model directory: the next cell runs
# `inference.py` by relative name and passes a relative checkpoint path.
%cd {landmark2image_model_path}

/root/imaginaire


In [10]:
# Run the landmark -> image inference script with the patched YAML config,
# writing results under `landmark2image_output_path`.
# `--checkpoint` is a relative path, so this cell depends on the preceding %cd.
%run inference.py --single_gpu --num_workers 0 \
--config {landmark2image_model_config} \
--output_dir {landmark2image_output_path} \
--checkpoint {landmark2image_model_pretrained}


Using random seed 0
cudnn benchmark: True
cudnn deterministic: False
Creating metadata
['images', 'landmarks-dlib68']
Data file extensions: {'images': 'png', 'landmarks-dlib68': 'json'}
Searching in dir: images
Found 1 sequences
Found 1 files
['images', 'landmarks-dlib68']
Data file extensions: {'images': 'png', 'landmarks-dlib68': 'json'}
Searching in dir: images
Found 1 sequences
Found 112 files
Folder at /root/TalkingHead/Demo/output/reference/ref0002/images opened.
Folder at /root/TalkingHead/Demo/output/reference/ref0002/landmarks-dlib68 opened.
Folder at /root/TalkingHead/Demo/output/driving/seq0001/images opened.
Folder at /root/TalkingHead/Demo/output/driving/seq0001/landmarks-dlib68 opened.
Num datasets: 2
Num sequences: 2
Max sequence length: 112
Epoch length: 1
Using random seed 0
Concatenate images:
    ext: png
    normalize: True
    num_channels: 3 for input.
	Num. of channels in the input image: 3
Concatenate images:
    ext: png
    normalize: True
    num_channels: 3 



Perceptual loss:
	Mode: vgg19
Loss GAN                  Weight 1.0
Loss FeatureMatching      Weight 10.0
Loss Perceptual           Weight 10.0
Loss Flow                 Weight 10.0
Loss Flow_L1              Weight 10.0
Loss Flow_Warp            Weight 10.0
Loss Flow_Mask            Weight 10.0


  0%|                                                                                       | 0/112 [00:00<?, ?it/s]

Done with loading the checkpoint.
Epoch length: 2


100%|█████████████████████████████████████████████████████████████████████████████| 112/112 [00:29<00:00,  3.78it/s]


# Display video

In [3]:
from IPython.display import HTML
from base64 import b64encode

def play(filename):
    """Return an inline HTML5 video player for the file at ``filename``.

    The file is read fully into memory and embedded in the page as a
    base64 ``data:video/mp4`` URI, so no web server is needed to view it.

    Args:
        filename: Path to the video file to embed.

    Returns:
        An ``IPython.display.HTML`` object wrapping a ``<video>`` tag with
        controls, autoplay and loop enabled.

    Raises:
        OSError: If ``filename`` cannot be opened for reading.
    """
    # Context manager so the file handle is closed deterministically.
    with open(filename, 'rb') as video_file:
        video = video_file.read()
    src = 'data:video/mp4;base64,' + b64encode(video).decode()
    html = '<video width=720 controls autoplay loop><source src="%s" type="video/mp4"></video>' % src 
    return HTML(html)

# Show the generated clip inline.
# NOTE(review): assumes the inference step wrote `001.mp4` into the output folder — confirm.
play(f'{landmark2image_output_path}/001.mp4')

# Evaluation

In [2]:
import json
import cv2
import os
import numpy as np

# Load the ground-truth landmark JSON files, rescale them from image pixels to
# the 256x256 space, and reshape the predictions to the same layout so the two
# arrays can be compared frame by frame.
gt_landmark_datas = []
del_rows = []  # not used in the visible cells; kept in case a later cell reads it
# os.listdir returns entries in arbitrary order. Sort so the enumeration index
# lines up with the numbered image files and with the prediction frame order.
# NOTE(review): lexicographic sort matches frame order only if the file names
# are zero-padded — confirm against the contents of real_landmarks.
for index, jf in enumerate(sorted(os.listdir(real_landmark_data_path))):
    path = os.path.join(real_landmark_data_path, jf)
    with open(path) as json_file:
        data = json.load(json_file)

    # The matching frame is read only for its size.
    img_path = os.path.join(real_image_data_path, f"{(index+1):06d}.png")
    image = cv2.imread(img_path)
    if image is None:
        # cv2.imread signals a missing/unreadable file by returning None.
        raise FileNotFoundError(f"Could not read real frame: {img_path}")
    height, width = image.shape[:2]

    # Each point is [x, y] in image pixels; map both axes to a 256-wide range.
    lm = [[p[0] * 256 / width, p[1] * 256 / height] for p in data]
    gt_landmark_datas.append(np.asarray(lm))

gt_landmark_datas = np.asarray(gt_landmark_datas) # (Frame, 68,2)
gt_landmark_datas = np.expand_dims(gt_landmark_datas, axis=0)  # add a leading axis of size 1
print(gt_landmark_datas.shape)

pred_landmarks_datas = np.load(audio2landmark_output_path)
# Reshape the predictions to the ground-truth layout; this raises if the
# number of elements differs.
pred_landmarks_datas = pred_landmarks_datas.reshape(gt_landmark_datas.shape)
print(pred_landmarks_datas.shape)

(1, 112, 68, 2)
(1, 112, 68, 2)


In [3]:
from utils.evaluation_metric import evaluate_LMD

# Per-frame normaliser: Euclidean distance between ground-truth landmarks 36
# and 45 (presumably the outer eye corners of the dlib 68-point layout — confirm).
gt_norm = np.subtract(gt_landmark_datas[:, :, 36], gt_landmark_datas[:, :, 45])
interocular_distance = np.sqrt(np.square(gt_norm).sum(axis=2))

lmd = evaluate_LMD(
    pred_landmarks_datas,
    gt_landmark_datas,
    norm_distance=interocular_distance,
)
print(f'LMD: {lmd}')

LMD: 0.06704060935028368


In [4]:
from utils.evaluation_metric import evaluate_LMD

# Per-frame normaliser: Euclidean distance between ground-truth landmarks 0
# and 16 (presumably the two jaw-line end points of the dlib 68-point layout — confirm).
gt_norm = np.subtract(gt_landmark_datas[:, :, 0], gt_landmark_datas[:, :, 16])
face_width_distance = np.sqrt(np.square(gt_norm).sum(axis=2))

lmd = evaluate_LMD(
    pred_landmarks_datas,
    gt_landmark_datas,
    norm_distance=face_width_distance,
)
print(f'LMD: {lmd}')

LMD: 0.04090140669836443


In [5]:
from utils.evaluation_metric import evaluate_LMV

# Per-frame normaliser: Euclidean distance between ground-truth landmarks 0
# and 16 (presumably the two jaw-line end points of the dlib 68-point layout — confirm).
gt_norm = np.subtract(gt_landmark_datas[:, :, 0], gt_landmark_datas[:, :, 16])
face_width_distance = np.sqrt(np.square(gt_norm).sum(axis=2))

lmv = evaluate_LMV(
    pred_landmarks_datas,
    gt_landmark_datas,
    norm_distance=face_width_distance,
)
print(f'LMV: {lmv}')

LMV: 0.010704419431810072


In [9]:
from utils.evaluation_metric import evaluate_SSIM

# Compare frames 1 and 10 of the driving images, both resized to 256x256.
img_path1 = '/root/TalkingHead/Demo/output/driving/seq0001/images/000001.png'
img_path2 = '/root/TalkingHead/Demo/output/driving/seq0001/images/000010.png'

image1, image2 = (
    cv2.resize(cv2.imread(frame_path), (256, 256))
    for frame_path in (img_path1, img_path2)
)

ssim = evaluate_SSIM(image1, image2, multichannel=True)
print(f'SSIM: {ssim}')

SSIM: 0.5846491001631783


In [9]:
from utils.evaluation_metric import evaluate_CPBD

# Score frame 1 of the driving images, loaded as a single-channel grayscale image.
img_path1 = '/root/TalkingHead/Demo/output/driving/seq0001/images/000001.png'
image1 = cv2.imread(img_path1, cv2.IMREAD_GRAYSCALE)

cpbd = evaluate_CPBD(image1)
print(f'CPBD: {cpbd}')

CPBD: 0.15349290286668524


In [10]:
from utils.evaluation_metric import evaluate_AV

# Audio-visual sync check on a MEAD clip: 30 fps, ffmpeg disabled.
av_video_path = '/root//Datasets/MEAD/M003/video/front/neutral/level_1/001.mp4'
syncnet_weights = './utils/syncnet/data/syncnet_v2.model'  # relative to the current working directory

offset, conf = evaluate_AV(
    av_video_path,
    syncnet_weights,
    fps=30,
    use_ffmpeg=False,
)
print(f'AV offset: {offset}\nAV confident: {conf}')

                                                                                                                    

MoviePy - Writing audio in tmp/demo/audio.wav
MoviePy - Done.




Compute time 0.759 sec.
Framewise conf: 
[ 0.124  1.531  1.657  1.783  1.783  1.783  1.531  2.068  2.482  2.482
  2.482  2.068  2.068  3.063  3.063  3.063  3.063  0.855  0.855 -0.521
  0.035 -0.521 -0.792 -0.792 -0.848 -1.005 -1.682 -2.704 -2.704 -3.006
 -2.704 -2.704 -1.682 -1.682 -3.006 -3.006 -1.028 -1.028 -0.864 -0.774
 -0.540  0.161  2.327  2.437  2.437  2.437  2.327  0.161  0.120 -0.544
 -1.555 -1.744 -2.215 -2.854 -3.172 -3.227 -3.227 -3.227 -3.172 -2.215
 -1.244 -0.479  0.386  2.118  2.290  2.290  1.225  1.225  1.225  1.225
  0.479 -0.263 -0.263 -0.263 -0.263  0.479  0.541  1.037  1.224  2.433
  2.437  2.437  2.433  2.437  1.405  1.405  1.405  1.079]
AV offset: 2
AV confident: 0.34594249725341797


In [3]:
from utils.evaluation_metric import evaluate_AV

# Audio-visual sync check on the example clip under utils/syncnet/data: 25 fps, ffmpeg enabled.
av_video_path = '/root/TalkingHead/Demo/utils/syncnet/data/example.avi'
syncnet_weights = './utils/syncnet/data/syncnet_v2.model'  # relative to the current working directory

offset, conf = evaluate_AV(
    av_video_path,
    syncnet_weights,
    fps=25,
    use_ffmpeg=True,
)
print(f'AV offset: {offset}\nAV confident: {conf}')

Compute time 2.600 sec.
Framewise conf: 
[ 3.079  4.003  5.885  7.139  7.408  7.658  8.228  8.499  8.499  11.430
  11.502  12.130  12.130  12.130  12.130  11.972  11.880  11.430  9.869
  9.834  9.511  9.511  9.834  9.834  9.511  9.374  9.834  10.330  10.330
  11.036  10.330  9.477  9.477  9.477  9.477  9.477  9.477  9.485  9.485
  9.833  9.833  9.833  9.833  9.833  9.351  9.351  9.351  9.351  9.351
  10.689  11.066  11.495  11.534  11.648  11.661  11.726  12.505  12.505
  12.505  12.505  11.882  11.734  11.726  11.726  10.965  10.965  10.481
  9.919  10.481  10.481  10.965  11.191  11.191  11.226  11.226  11.226
  11.226  11.110  11.226  11.226  11.226  11.110  10.434  10.220  9.644
  9.379  8.801  8.477  8.477  8.477  8.477  9.110  9.318  9.379  11.517
  11.755  11.953  11.953  11.953  11.755  11.755  11.755  11.755  11.690
  11.690  11.128  11.128  9.377  9.377  9.377  7.736  7.736  7.736  6.818
  6.818  7.736  8.916  8.935  10.924  10.942  11.145  11.582  11.582
  11.721  11.877  11

In [14]:
from utils.evaluation_metric import evaluate_FID

# Both arguments are the same frame folder, so this compares a set of frames
# with itself.
fid_frames_dir = '/root/Datasets/Features/M003/video_frames/front/neutral/level_1/001'

fid = evaluate_FID(fid_frames_dir, fid_frames_dir)
print(f'FID: {fid}')


100%|█████████████████████████████████████████████████████████████████████████████| 112/112 [00:05<00:00, 18.76it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 112/112 [00:06<00:00, 17.69it/s]


FID: -8.95276614443219e-06


In [3]:
import os
import cv2

from utils.evaluation_metric import evaluate_PSNR

# Compare frames 1 and 2 of the driving images, both resized to 256x256.
img_path1 = '/root/TalkingHead/Demo/output/driving/seq0001/images/000001.png'
img_path2 = '/root/TalkingHead/Demo/output/driving/seq0001/images/000002.png'

image1 = cv2.resize(cv2.imread(img_path1), (256, 256))
image2 = cv2.resize(cv2.imread(img_path2), (256, 256))

psnr = evaluate_PSNR(image1, image2)
print(f'PSNR: {psnr}')


PSNR: 36.43885228404482


In [2]:
import os
import cv2
import torch

from utils.evaluation_metric import evaluate_LPIPS


def _load_chw_tensor(frame_path):
    """Read an image, resize it to 256x256 and return it as a (C, H, W) tensor."""
    pixels = cv2.resize(cv2.imread(frame_path), (256, 256))
    # Channels-last array -> channels-first tensor.
    return torch.tensor(pixels).permute(2, 0, 1)


# Compare frames 1 and 2 of the driving images.
img_path1 = '/root/TalkingHead/Demo/output/driving/seq0001/images/000001.png'
img_path2 = '/root/TalkingHead/Demo/output/driving/seq0001/images/000002.png'
image1 = _load_chw_tensor(img_path1)
image2 = _load_chw_tensor(img_path2)

# NOTE(review): the tensors are passed exactly as loaded by OpenCV (no dtype,
# range or channel-order conversion here) — confirm evaluate_LPIPS handles
# whatever preprocessing the metric needs.
lpips = evaluate_LPIPS(image1, image2, net='vgg')
print(f'LPIPS: {lpips[0,0,0,0]}')

Setting up [LPIPS] perceptual loss: trunk [vgg], v[0.1], spatial [off]
Loading model from: /root/miniconda3/envs/imaginaire/lib/python3.8/site-packages/lpips/weights/v0.1/vgg.pth
LPIPS: 0.013878915458917618
